# Core Tokenization

The core tokenization functionality provides the main `Tokenizer` and `TokenizerImpl` structs that orchestrate the tokenization pipeline, along with the `Encoding` struct that represents tokenization results with full offset tracking and metadata.

## Capabilities

### Tokenizer Construction

Main tokenizer creation with various initialization methods including from models, files, and remote sources.

```rust { .api }
impl Tokenizer {
    /// Create a new tokenizer with the specified model
    pub fn new(model: impl Into<ModelWrapper>) -> Self;

    /// Load a tokenizer from a JSON file
    pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>;

    /// Load a tokenizer from JSON bytes
    pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>;

    /// Load a pre-trained tokenizer from Hugging Face Hub (requires 'http' feature)
    #[cfg(feature = "http")]
    pub fn from_pretrained<S: AsRef<str>>(
        identifier: S,
        params: Option<FromPretrainedParameters>
    ) -> Result<Self>;

    /// Unwrap the inner TokenizerImpl
    pub fn into_inner(self) -> TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>;
}
```

### TokenizerImpl Configuration

Generic tokenizer implementation with configurable pipeline components.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
    M: Model,
    N: Normalizer,
    PT: PreTokenizer,
    PP: PostProcessor,
    D: Decoder,
{
    /// Create a new tokenizer implementation with a model
    pub fn new(model: M) -> Self;

    /// Load from file
    pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>;

    /// Load from bytes
    pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>;

    /// Set the normalizer
    pub fn with_normalizer(&mut self, normalizer: Option<impl Into<N>>) -> &mut Self;

    /// Get the normalizer
    pub fn get_normalizer(&self) -> Option<&N>;

    /// Set the pre-tokenizer
    pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Option<impl Into<PT>>) -> &mut Self;

    /// Get the pre-tokenizer
    pub fn get_pre_tokenizer(&self) -> Option<&PT>;

    /// Set the post-processor
    pub fn with_post_processor(&mut self, post_processor: Option<impl Into<PP>>) -> &mut Self;

    /// Get the post-processor
    pub fn get_post_processor(&self) -> Option<&PP>;

    /// Set the decoder
    pub fn with_decoder(&mut self, decoder: Option<impl Into<D>>) -> &mut Self;

    /// Get the decoder
    pub fn get_decoder(&self) -> Option<&D>;

    /// Set the model
    pub fn with_model(&mut self, model: impl Into<M>) -> &mut Self;

    /// Get the model
    pub fn get_model(&self) -> &M;
}
```

### Vocabulary Management

Access to vocabulary information and token conversion methods.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Get the vocabulary as a HashMap
    pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>;

    /// Get the vocabulary size
    pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize;

    /// Convert token to ID
    pub fn token_to_id(&self, token: &str) -> Option<u32>;

    /// Convert ID to token
    pub fn id_to_token(&self, id: u32) -> Option<String>;

    /// Get added tokens decoder
    pub fn get_added_tokens_decoder(&self) -> AHashMap<u32, AddedToken>;

    /// Set whether to encode special tokens
    pub fn set_encode_special_tokens(&mut self, value: bool);

    /// Get whether special tokens are encoded
    pub fn get_encode_special_tokens(&self) -> bool;
}
```

### Text Encoding

Methods for converting text to token sequences with various encoding options.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Encode a single input
    pub fn encode<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
    where
        E: Into<EncodeInput<'s>>;

    /// Fast encoding variant
    pub fn encode_fast<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
    where
        E: Into<EncodeInput<'s>>;

    /// Encode with character-level offsets
    pub fn encode_char_offsets<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
    where
        E: Into<EncodeInput<'s>>;

    /// Encode multiple inputs in batch
    pub fn encode_batch<'s, E>(&self, inputs: Vec<E>, add_special_tokens: bool) -> Result<Vec<Encoding>>
    where
        E: Into<EncodeInput<'s>>;

    /// Fast batch encoding
    pub fn encode_batch_fast<'s, E>(&self, inputs: Vec<E>, add_special_tokens: bool) -> Result<Vec<Encoding>>
    where
        E: Into<EncodeInput<'s>>;

    /// Batch encoding with character-level offsets
    pub fn encode_batch_char_offsets<'s, E>(&self, inputs: Vec<E>, add_special_tokens: bool) -> Result<Vec<Encoding>>
    where
        E: Into<EncodeInput<'s>>;
}
```

### Token Decoding

Methods for converting token IDs back to text.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Decode token IDs to text
    pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String>;

    /// Create a streaming decoder
    pub fn decode_stream(&self, skip_special_tokens: bool) -> DecodeStream<'_, M, N, PT, PP, D>;

    /// Decode multiple sequences in batch
    pub fn decode_batch(&self, sentences: &[&[u32]], skip_special_tokens: bool) -> Result<Vec<String>>;
}
```

### Token Management

Methods for adding special tokens and managing added vocabulary.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Add special tokens to the tokenizer
    pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize;

    /// Add regular tokens to the tokenizer
    pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize;

    /// Set the added vocabulary
    pub fn with_added_vocabulary(&mut self, added_vocabulary: AddedVocabulary) -> &mut Self;

    /// Get the added vocabulary
    pub fn get_added_vocabulary(&self) -> &AddedVocabulary;
}
```

### Configuration Management

Methods for managing padding, truncation, and other processing parameters.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Set truncation parameters
    pub fn with_truncation(&mut self, trunc: Option<TruncationParams>) -> Result<&mut Self>;

    /// Get truncation parameters
    pub fn get_truncation(&self) -> Option<&TruncationParams>;

    /// Get mutable truncation parameters
    pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>;

    /// Set padding parameters
    pub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self;

    /// Get padding parameters
    pub fn get_padding(&self) -> Option<&PaddingParams>;

    /// Get mutable padding parameters
    pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>;
}
```

### Serialization

Methods for saving and loading tokenizers.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Convert tokenizer to JSON string
    pub fn to_string(&self, pretty: bool) -> Result<String>;

    /// Save tokenizer to file
    pub fn save<P: AsRef<Path>>(&self, path: P, pretty: bool) -> Result<()>;
}
```

### Encoding Structure

The `Encoding` struct represents the result of tokenization with complete metadata.

```rust { .api }
pub struct Encoding {
    // Private fields containing all tokenization results
}

impl Encoding {
    /// Create a new encoding
    pub fn new(
        ids: Vec<u32>,
        type_ids: Vec<u32>,
        tokens: Vec<String>,
        words: Vec<Option<u32>>,
        offsets: Vec<Offsets>,
        special_tokens_mask: Vec<u32>,
        attention_mask: Vec<u32>,
        overflowing: Vec<Encoding>,
    ) -> Self;

    /// Create encoding with specified capacity
    pub fn with_capacity(len: usize) -> Self;

    /// Create encoding from tokens
    pub fn from_tokens(tokens: Vec<Token>, type_id: u32) -> Self;

    /// Get token IDs
    pub fn get_ids(&self) -> &[u32];

    /// Get type IDs
    pub fn get_type_ids(&self) -> &[u32];

    /// Get token strings
    pub fn get_tokens(&self) -> &[String];

    /// Get word indices
    pub fn get_words(&self) -> &[Option<u32>];

    /// Get character offsets
    pub fn get_offsets(&self) -> &[Offsets];

    /// Get special tokens mask
    pub fn get_special_tokens_mask(&self) -> &[u32];

    /// Get attention mask
    pub fn get_attention_mask(&self) -> &[u32];

    /// Get overflowing encodings
    pub fn get_overflowing(&self) -> &[Encoding];

    /// Get mutable overflowing encodings
    pub fn get_overflowing_mut(&mut self) -> &mut Vec<Encoding>;

    /// Set sequence ID for the encoding
    pub fn set_sequence_id(&mut self, sequence_id: usize);

    /// Get the length of the encoding
    pub fn len(&self) -> usize;

    /// Check if encoding is empty
    pub fn is_empty(&self) -> bool;
}
```

### Streaming Decoder

The `DecodeStream` provides incremental decoding capabilities.

```rust { .api }
pub struct DecodeStream<'tok, M, N, PT, PP, D> {
    // Private state for streaming decode
}

impl<'tok, M, N, PT, PP, D> DecodeStream<'tok, M, N, PT, PP, D> {
    /// Decode the next token ID and return any resulting text
    pub fn step(&mut self, id: u32) -> Result<Option<String>>;
}
```

## Usage Examples

### Basic Tokenization

```rust
use tokenizers::tokenizer::{Result, Tokenizer};
use tokenizers::models::bpe::BPE;

fn basic_tokenization() -> Result<()> {
    // Create BPE model
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let tokenizer = Tokenizer::new(bpe);

    // Encode text
    let encoding = tokenizer.encode("Hello world!", false)?;
    println!("Tokens: {:?}", encoding.get_tokens());
    println!("IDs: {:?}", encoding.get_ids());
    println!("Offsets: {:?}", encoding.get_offsets());

    // Decode back
    let decoded = tokenizer.decode(encoding.get_ids(), false)?;
    println!("Decoded: {}", decoded);

    Ok(())
}
```

### Batch Processing

```rust
use tokenizers::tokenizer::{Result, Tokenizer};
use tokenizers::models::bpe::BPE;

fn batch_processing() -> Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let tokenizer = Tokenizer::new(bpe);

    let texts = vec![
        "First sentence.",
        "Second sentence is longer.",
        "Third sentence is the longest of all."
    ];

    // Batch encode
    let encodings = tokenizer.encode_batch(texts, false)?;

    for (i, encoding) in encodings.iter().enumerate() {
        println!("Text {}: {:?}", i, encoding.get_tokens());
    }

    // Batch decode
    let ids_batch: Vec<&[u32]> = encodings.iter()
        .map(|enc| enc.get_ids())
        .collect();

    let decoded_texts = tokenizer.decode_batch(&ids_batch, false)?;
    for (i, text) in decoded_texts.iter().enumerate() {
        println!("Decoded {}: {}", i, text);
    }

    Ok(())
}
```

### Streaming Decode

```rust
use tokenizers::tokenizer::{Result, Tokenizer};
use tokenizers::models::bpe::BPE;

fn streaming_decode() -> Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let tokenizer = Tokenizer::new(bpe);

    let mut stream = tokenizer.decode_stream(false);

    let token_ids = vec![123, 456, 789];

    for id in token_ids {
        if let Some(text) = stream.step(id)? {
            print!("{}", text);
        }
    }
    println!();

    Ok(())
}
```