# Core Tokenization

The core tokenization functionality provides the main `Tokenizer` and `TokenizerImpl` structs that orchestrate the tokenization pipeline, along with the `Encoding` struct that represents tokenization results with full offset tracking and metadata.

## Capabilities

### Tokenizer Construction

Main tokenizer creation with various initialization methods including from models, files, and remote sources.

```rust { .api }
impl Tokenizer {
    /// Create a new tokenizer with the specified model
    pub fn new(model: impl Into<ModelWrapper>) -> Self;

    /// Load a tokenizer from a JSON file
    pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>;

    /// Load a tokenizer from JSON bytes
    pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>;

    /// Load a pre-trained tokenizer from Hugging Face Hub (requires 'http' feature)
    #[cfg(feature = "http")]
    pub fn from_pretrained<S: AsRef<str>>(
        identifier: S,
        params: Option<FromPretrainedParameters>
    ) -> Result<Self>;

    /// Unwrap the inner TokenizerImpl
    pub fn into_inner(self) -> TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>;
}
```

### TokenizerImpl Configuration

Generic tokenizer implementation with configurable pipeline components.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
    M: Model,
    N: Normalizer,
    PT: PreTokenizer,
    PP: PostProcessor,
    D: Decoder,
{
    /// Create a new tokenizer implementation with a model
    pub fn new(model: M) -> Self;

    /// Load from file
    pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>;

    /// Load from bytes
    pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>;

    /// Set the normalizer
    pub fn with_normalizer(&mut self, normalizer: Option<impl Into<N>>) -> &mut Self;

    /// Get the normalizer
    pub fn get_normalizer(&self) -> Option<&N>;

    /// Set the pre-tokenizer
    pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Option<impl Into<PT>>) -> &mut Self;

    /// Get the pre-tokenizer
    pub fn get_pre_tokenizer(&self) -> Option<&PT>;

    /// Set the post-processor
    pub fn with_post_processor(&mut self, post_processor: Option<impl Into<PP>>) -> &mut Self;

    /// Get the post-processor
    pub fn get_post_processor(&self) -> Option<&PP>;

    /// Set the decoder
    pub fn with_decoder(&mut self, decoder: Option<impl Into<D>>) -> &mut Self;

    /// Get the decoder
    pub fn get_decoder(&self) -> Option<&D>;

    /// Set the model
    pub fn with_model(&mut self, model: impl Into<M>) -> &mut Self;

    /// Get the model
    pub fn get_model(&self) -> &M;
}
```

### Vocabulary Management

Access to vocabulary information and token conversion methods.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Get the vocabulary as a HashMap
    pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>;

    /// Get the vocabulary size
    pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize;

    /// Convert token to ID
    pub fn token_to_id(&self, token: &str) -> Option<u32>;

    /// Convert ID to token
    pub fn id_to_token(&self, id: u32) -> Option<String>;

    /// Get added tokens decoder
    pub fn get_added_tokens_decoder(&self) -> AHashMap<u32, AddedToken>;

    /// Set whether to encode special tokens
    pub fn set_encode_special_tokens(&mut self, value: bool);

    /// Get whether special tokens are encoded
    pub fn get_encode_special_tokens(&self) -> bool;
}
```

### Text Encoding

Methods for converting text to token sequences with various encoding options.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Encode a single input
    pub fn encode<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
    where
        E: Into<EncodeInput<'s>>;

    /// Fast encoding variant
    pub fn encode_fast<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
    where
        E: Into<EncodeInput<'s>>;

    /// Encode with character-level offsets
    pub fn encode_char_offsets<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
    where
        E: Into<EncodeInput<'s>>;

    /// Encode multiple inputs in batch
    pub fn encode_batch<'s, E>(&self, inputs: Vec<E>, add_special_tokens: bool) -> Result<Vec<Encoding>>
    where
        E: Into<EncodeInput<'s>>;

    /// Fast batch encoding
    pub fn encode_batch_fast<'s, E>(&self, inputs: Vec<E>, add_special_tokens: bool) -> Result<Vec<Encoding>>
    where
        E: Into<EncodeInput<'s>>;

    /// Batch encoding with character-level offsets
    pub fn encode_batch_char_offsets<'s, E>(&self, inputs: Vec<E>, add_special_tokens: bool) -> Result<Vec<Encoding>>
    where
        E: Into<EncodeInput<'s>>;
}
```

### Token Decoding

Methods for converting token IDs back to text.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Decode token IDs to text
    pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String>;

    /// Create a streaming decoder
    pub fn decode_stream(&self, skip_special_tokens: bool) -> DecodeStream<'_, M, N, PT, PP, D>;

    /// Decode multiple sequences in batch
    pub fn decode_batch(&self, sentences: &[&[u32]], skip_special_tokens: bool) -> Result<Vec<String>>;
}
```

### Token Management

Methods for adding special tokens and managing added vocabulary.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Add special tokens to the tokenizer
    pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize;

    /// Add regular tokens to the tokenizer
    pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize;

    /// Set the added vocabulary
    pub fn with_added_vocabulary(&mut self, added_vocabulary: AddedVocabulary) -> &mut Self;

    /// Get the added vocabulary
    pub fn get_added_vocabulary(&self) -> &AddedVocabulary;
}
```

### Configuration Management

Methods for managing padding, truncation, and other processing parameters.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Set truncation parameters
    pub fn with_truncation(&mut self, trunc: Option<TruncationParams>) -> Result<&mut Self>;

    /// Get truncation parameters
    pub fn get_truncation(&self) -> Option<&TruncationParams>;

    /// Get mutable truncation parameters
    pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>;

    /// Set padding parameters
    pub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self;

    /// Get padding parameters
    pub fn get_padding(&self) -> Option<&PaddingParams>;

    /// Get mutable padding parameters
    pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>;
}
```

### Serialization

Methods for saving and loading tokenizers.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Convert tokenizer to JSON string
    pub fn to_string(&self, pretty: bool) -> Result<String>;

    /// Save tokenizer to file
    pub fn save<P: AsRef<Path>>(&self, path: P, pretty: bool) -> Result<()>;
}
```

### Encoding Structure

The `Encoding` struct represents the result of tokenization with complete metadata.

```rust { .api }
pub struct Encoding {
    // Private fields containing all tokenization results
}

impl Encoding {
    /// Create a new encoding
    pub fn new(
        ids: Vec<u32>,
        type_ids: Vec<u32>,
        tokens: Vec<String>,
        words: Vec<Option<u32>>,
        offsets: Vec<Offsets>,
        special_tokens_mask: Vec<u32>,
        attention_mask: Vec<u32>,
        overflowing: Vec<Encoding>,
    ) -> Self;

    /// Create encoding with specified capacity
    pub fn with_capacity(len: usize) -> Self;

    /// Create encoding from tokens
    pub fn from_tokens(tokens: Vec<Token>, type_id: u32) -> Self;

    /// Get token IDs
    pub fn get_ids(&self) -> &[u32];

    /// Get type IDs
    pub fn get_type_ids(&self) -> &[u32];

    /// Get token strings
    pub fn get_tokens(&self) -> &[String];

    /// Get word indices
    pub fn get_words(&self) -> &[Option<u32>];

    /// Get character offsets
    pub fn get_offsets(&self) -> &[Offsets];

    /// Get special tokens mask
    pub fn get_special_tokens_mask(&self) -> &[u32];

    /// Get attention mask
    pub fn get_attention_mask(&self) -> &[u32];

    /// Get overflowing encodings
    pub fn get_overflowing(&self) -> &[Encoding];

    /// Get mutable overflowing encodings
    pub fn get_overflowing_mut(&mut self) -> &mut Vec<Encoding>;

    /// Set sequence ID for the encoding
    pub fn set_sequence_id(&mut self, sequence_id: usize);

    /// Get the length of the encoding
    pub fn len(&self) -> usize;

    /// Check if encoding is empty
    pub fn is_empty(&self) -> bool;
}
```

### Streaming Decoder

The `DecodeStream` provides incremental decoding capabilities.

```rust { .api }
pub struct DecodeStream<'tok, M, N, PT, PP, D> {
    // Private state for streaming decode
}

impl<'tok, M, N, PT, PP, D> DecodeStream<'tok, M, N, PT, PP, D> {
    /// Decode the next token ID and return any resulting text
    pub fn step(&mut self, id: u32) -> Result<Option<String>>;
}
```

## Usage Examples

### Basic Tokenization

```rust
use tokenizers::tokenizer::{Result, Tokenizer};
use tokenizers::models::bpe::BPE;

fn basic_tokenization() -> Result<()> {
    // Create BPE model
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let tokenizer = Tokenizer::new(bpe);

    // Encode text
    let encoding = tokenizer.encode("Hello world!", false)?;
    println!("Tokens: {:?}", encoding.get_tokens());
    println!("IDs: {:?}", encoding.get_ids());
    println!("Offsets: {:?}", encoding.get_offsets());

    // Decode back
    let decoded = tokenizer.decode(encoding.get_ids(), false)?;
    println!("Decoded: {}", decoded);

    Ok(())
}
```

### Batch Processing

```rust
use tokenizers::tokenizer::{Result, Tokenizer};
use tokenizers::models::bpe::BPE;

fn batch_processing() -> Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let tokenizer = Tokenizer::new(bpe);

    let texts = vec![
        "First sentence.",
        "Second sentence is longer.",
        "Third sentence is the longest of all."
    ];

    // Batch encode
    let encodings = tokenizer.encode_batch(texts, false)?;

    for (i, encoding) in encodings.iter().enumerate() {
        println!("Text {}: {:?}", i, encoding.get_tokens());
    }

    // Batch decode
    let ids_batch: Vec<&[u32]> = encodings.iter()
        .map(|enc| enc.get_ids())
        .collect();

    let decoded_texts = tokenizer.decode_batch(&ids_batch, false)?;
    for (i, text) in decoded_texts.iter().enumerate() {
        println!("Decoded {}: {}", i, text);
    }

    Ok(())
}
```

### Streaming Decode

```rust
use tokenizers::tokenizer::{Result, Tokenizer};
use tokenizers::models::bpe::BPE;

fn streaming_decode() -> Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let tokenizer = Tokenizer::new(bpe);

    let mut stream = tokenizer.decode_stream(false);

    let token_ids = vec![123, 456, 789];

    for id in token_ids {
        if let Some(text) = stream.step(id)? {
            print!("{}", text);
        }
    }
    println!();

    Ok(())
}
```