# Tokenization Models

Tokenization models implement the core algorithms for converting text into tokens. The library supports four main algorithms: BPE (Byte-Pair Encoding), WordPiece, WordLevel, and Unigram, each with its own builder and trainer.

## Capabilities

### Model Trait

All tokenization models implement the `Model` trait, which defines the core interface for tokenization.

```rust { .api }
pub trait Model {
    type Trainer: Trainer + Sync;

    /// Tokenize a sequence into tokens
    fn tokenize(&self, sequence: &str) -> Result<Vec<Token>>;

    /// Convert a token string to its ID
    fn token_to_id(&self, token: &str) -> Option<u32>;

    /// Convert an ID back to its token string
    fn id_to_token(&self, id: u32) -> Option<String>;

    /// Get the full vocabulary mapping
    fn get_vocab(&self) -> HashMap<String, u32>;

    /// Get the vocabulary size
    fn get_vocab_size(&self) -> usize;

    /// Save the model to disk
    fn save(&self, folder: &Path, prefix: Option<&str>) -> Result<Vec<PathBuf>>;

    /// Get a trainer for this model type
    fn get_trainer(&self) -> Self::Trainer;
}
```
### Model Wrapper

The `ModelWrapper` enum provides a unified interface over all model types.

```rust { .api }
pub enum ModelWrapper {
    BPE(BPE),
    WordPiece(WordPiece),
    WordLevel(WordLevel),
    Unigram(Unigram),
}
```
### BPE Model

Byte-Pair Encoding model with configurable dropout, unknown-token handling, and subword formatting.

```rust { .api }
pub struct BPE {
    pub dropout: Option<f32>,
    pub unk_token: Option<String>,
    pub continuing_subword_prefix: Option<String>,
    pub end_of_word_suffix: Option<String>,
    pub fuse_unk: bool,
    pub byte_fallback: bool,
    pub ignore_merges: bool,
}

impl BPE {
    /// Create a new BPE builder
    pub fn builder() -> BpeBuilder;

    /// Create a BPE model from a vocabulary and merges
    pub fn new(vocab: Vocab, merges: Merges) -> Self;

    /// Create a BPE builder from files
    pub fn from_file(vocab: &str, merges: &str) -> BpeBuilder;

    /// Read the vocabulary and merges from files
    pub fn read_file(vocab: &str, merges: &str) -> Result<(Vocab, Merges)>;

    /// Clear the internal cache
    pub fn clear_cache(&mut self);

    /// Resize the internal cache
    pub fn resize_cache(&mut self, capacity: usize);
}
```
### BPE Builder

Builder pattern for configuring BPE models.

```rust { .api }
pub struct BpeBuilder {
    // Private configuration fields
}

impl BpeBuilder {
    /// Create a new BPE builder
    pub fn new() -> Self;

    /// Set the vocabulary and merges files
    pub fn files(mut self, vocab: String, merges: String) -> Self;

    /// Set the vocabulary and merges data directly
    pub fn vocab_and_merges<V: Into<AHashMap<String, u32>>>(
        mut self,
        vocab: V,
        merges: Merges,
    ) -> Self;

    /// Set the cache capacity
    pub fn cache_capacity(mut self, capacity: usize) -> Self;

    /// Set the dropout rate for regularization
    pub fn dropout(mut self, dropout: f32) -> Self;

    /// Set the unknown token
    pub fn unk_token(mut self, unk_token: String) -> Self;

    /// Set the continuing subword prefix
    pub fn continuing_subword_prefix(mut self, prefix: String) -> Self;

    /// Set the end-of-word suffix
    pub fn end_of_word_suffix(mut self, suffix: String) -> Self;

    /// Set whether to fuse unknown tokens
    pub fn fuse_unk(mut self, fuse_unk: bool) -> Self;

    /// Enable byte fallback for unknown characters
    pub fn byte_fallback(mut self, byte_fallback: bool) -> Self;

    /// Set whether to ignore merges during encoding
    pub fn ignore_merges(mut self, ignore_merges: bool) -> Self;

    /// Build the BPE model
    pub fn build(mut self) -> Result<BPE>;
}
```
### WordPiece Model

WordPiece tokenization model, commonly used in BERT and similar models.

```rust { .api }
pub struct WordPiece {
    // Private fields
}

pub struct WordPieceBuilder {
    // Private configuration fields
}

impl WordPieceBuilder {
    /// Create a new WordPiece builder
    pub fn new() -> Self;

    /// Set the vocabulary file
    pub fn files(mut self, vocab: String) -> Self;

    /// Set the vocabulary data directly
    pub fn vocab<V: Into<HashMap<String, u32>>>(mut self, vocab: V) -> Self;

    /// Set the unknown token
    pub fn unk_token(mut self, unk_token: String) -> Self;

    /// Set the continuing subword prefix (typically "##")
    pub fn continuing_subword_prefix(mut self, prefix: String) -> Self;

    /// Set the maximum number of characters per word
    pub fn max_input_chars_per_word(mut self, max_input_chars_per_word: usize) -> Self;

    /// Build the WordPiece model
    pub fn build(self) -> Result<WordPiece>;
}

impl WordPiece {
    /// Create a new WordPiece builder
    pub fn builder() -> WordPieceBuilder;

    /// Create a WordPiece builder from a vocabulary file
    pub fn from_file(vocab: &str) -> WordPieceBuilder;
}
```
### WordLevel Model

Word-level tokenization that treats each word as a single token.

```rust { .api }
pub struct WordLevel {
    // Private fields
}

pub struct WordLevelBuilder {
    // Private configuration fields
}

impl WordLevelBuilder {
    /// Create a new WordLevel builder
    pub fn new() -> Self;

    /// Set the vocabulary file
    pub fn files(mut self, vocab: String) -> Self;

    /// Set the vocabulary data directly
    pub fn vocab<V: Into<HashMap<String, u32>>>(mut self, vocab: V) -> Self;

    /// Set the unknown token
    pub fn unk_token(mut self, unk_token: String) -> Self;

    /// Build the WordLevel model
    pub fn build(self) -> Result<WordLevel>;
}

impl WordLevel {
    /// Create a new WordLevel builder
    pub fn builder() -> WordLevelBuilder;

    /// Create a WordLevel builder from a vocabulary file
    pub fn from_file(vocab: &str) -> WordLevelBuilder;
}
```
### Unigram Model

Unigram language-model-based tokenization with subword regularization. Uses a vocabulary with scores to select tokenizations probabilistically.

```rust { .api }
pub struct Unigram {
    // Private fields containing the vocabulary and model parameters
}

impl Unigram {
    /// Create a Unigram model from a vocabulary with scores
    pub fn from(vocab: Vec<(String, f64)>, unk_id: Option<usize>, byte_fallback: bool) -> Result<Self>;

    /// Load a Unigram model from a file
    pub fn load<P: AsRef<Path>>(path: P) -> Result<Self>;

    /// Encode a sentence into subword tokens
    pub fn encode(&self, sentence: &str) -> Result<Vec<String>>;

    /// Check whether byte fallback is enabled
    pub fn byte_fallback(&self) -> bool;

    /// Clear internal caches
    pub fn clear_cache(&mut self);

    /// Iterate over vocabulary entries
    pub fn iter(&self) -> UnigramIterator<'_>;
}

/// Iterator over Unigram vocabulary entries
pub struct UnigramIterator<'a> {
    // Private iterator state
}

impl<'a> Iterator for UnigramIterator<'a> {
    type Item = (&'a str, f64);
    fn next(&mut self) -> Option<Self::Item>;
}

/// Errors specific to Unigram model operations
#[derive(Debug)]
pub enum UnigramError {
    /// The vocabulary is empty
    EmptyVocabulary,
    /// Invalid vocabulary format
    InvalidVocabulary(String),
    /// The unknown token was not found in the vocabulary
    UnknownToken,
    /// File I/O error
    IoError(std::io::Error),
}
```
### Type Definitions

Common types used across model implementations.

```rust { .api }
// BPE-specific types
pub type Vocab = AHashMap<String, u32>;
pub type MergeMap = AHashMap<Pair, (u32, u32)>;
pub type Merges = Vec<(String, String)>;

pub struct Pair(pub String, pub String);

// General token type
pub struct Token {
    pub id: u32,
    pub value: String,
    pub offsets: (usize, usize),
}

impl Token {
    pub fn new(id: u32, value: String, offsets: (usize, usize)) -> Self;
}
```
### Error Types

Model-specific error types for error handling.

```rust { .api }
// BPE errors
pub enum BpeError {
    Io(std::io::Error),
    JsonError(serde_json::Error),
    BadVocabulary,
    BadMerges(usize),
    MergeTokenOutOfVocabulary(String),
    UnkTokenOutOfVocabulary(String),
    InvalidDropout,
}

// WordPiece errors
pub enum WordPieceError {
    MissingUnkToken,
}

// WordLevel errors
pub enum WordLevelError {
    MissingUnkToken,
}

// Unigram errors: `UnigramError` is defined in the Unigram Model section above
```
## Usage Examples

### BPE Model Usage

```rust
use tokenizers::models::bpe::BPE;
use tokenizers::tokenizer::Tokenizer;

fn create_bpe_model() -> tokenizers::Result<()> {
    // Create a BPE model from vocabulary and merges files
    let bpe = BPE::from_file("./vocab.json", "./merges.txt")
        .dropout(0.1)
        .unk_token("[UNK]".into())
        .continuing_subword_prefix("##".into())
        .build()?;

    let tokenizer = Tokenizer::new(bpe);

    let encoding = tokenizer.encode("Hello world!", false)?;
    println!("BPE Tokens: {:?}", encoding.get_tokens());

    Ok(())
}
```
### WordPiece Model Usage

```rust
use tokenizers::models::wordpiece::WordPiece;
use tokenizers::tokenizer::Tokenizer;

fn create_wordpiece_model() -> tokenizers::Result<()> {
    // Create a WordPiece model from a vocabulary file
    let wordpiece = WordPiece::from_file("./wordpiece_vocab.txt")
        .unk_token("[UNK]".into())
        .continuing_subword_prefix("##".into())
        .build()?;

    let tokenizer = Tokenizer::new(wordpiece);

    let encoding = tokenizer.encode("Hello world!", false)?;
    println!("WordPiece Tokens: {:?}", encoding.get_tokens());

    Ok(())
}
```
### Custom Vocabulary

```rust
use tokenizers::models::wordlevel::WordLevel;
use tokenizers::tokenizer::Tokenizer;
use std::collections::HashMap;

fn create_wordlevel_with_custom_vocab() -> tokenizers::Result<()> {
    // Create a custom vocabulary
    let mut vocab = HashMap::new();
    vocab.insert("hello".to_string(), 0);
    vocab.insert("world".to_string(), 1);
    vocab.insert("[UNK]".to_string(), 2);

    let wordlevel = WordLevel::builder()
        .vocab(vocab)
        .unk_token("[UNK]".into())
        .build()?;

    let tokenizer = Tokenizer::new(wordlevel);

    let encoding = tokenizer.encode("hello world", false)?;
    println!("WordLevel Tokens: {:?}", encoding.get_tokens());
    println!("WordLevel IDs: {:?}", encoding.get_ids());

    Ok(())
}
```
### Unigram Model

```rust
use tokenizers::models::unigram::Unigram;
use tokenizers::Tokenizer;

fn create_unigram_model() -> tokenizers::Result<()> {
    // Create a vocabulary with scores (log probabilities; higher,
    // i.e. less negative, scores are more likely to be selected)
    let vocab = vec![
        ("hello".to_string(), -1.0),
        ("world".to_string(), -1.5),
        ("he".to_string(), -2.0),
        ("llo".to_string(), -2.5),
        ("[UNK]".to_string(), -100.0),
    ];

    // Create the Unigram model with the vocabulary, unknown-token ID, and byte fallback
    let unigram = Unigram::from(vocab, Some(4), false)?; // [UNK] is at index 4

    let tokenizer = Tokenizer::new(unigram);

    let encoding = tokenizer.encode("hello world", false)?;
    println!("Unigram Tokens: {:?}", encoding.get_tokens());

    Ok(())
}
```
### Model Information Access

```rust
use tokenizers::models::bpe::BPE;
use tokenizers::Model;

fn model_introspection() -> tokenizers::Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;

    // Access vocabulary information
    let vocab = bpe.get_vocab();
    println!("Vocabulary size: {}", vocab.len());

    // Token conversion
    if let Some(id) = bpe.token_to_id("hello") {
        println!("Token 'hello' has ID: {}", id);
        if let Some(token) = bpe.id_to_token(id) {
            println!("ID {} maps back to: {}", id, token);
        }
    }

    // Tokenize directly
    let tokens = bpe.tokenize("Hello world!")?;
    for token in tokens {
        println!("Token: {} (ID: {}, offsets: {:?})",
            token.value, token.id, token.offsets);
    }

    Ok(())
}
```