# Tokenizers

A high-performance Rust tokenization library providing implementations of modern tokenization algorithms, including BPE, WordPiece, WordLevel, and Unigram. The library is built around a composable pipeline of five core components, combining the speed of a native Rust implementation with the flexibility and ease of use needed for natural language processing applications.

## Package Information

- **Package Name**: tokenizers
- **Package Type**: cargo
- **Language**: Rust
- **Installation**: Add to `Cargo.toml`: `tokenizers = "0.22.0"`
## Core Imports

```rust
use tokenizers::{Tokenizer, Encoding, Result};
use tokenizers::models::bpe::BPE;
```

For specific components:

```rust
use tokenizers::{
    AddedToken,
    Model,
    NormalizerWrapper,
    PreTokenizerWrapper,
    PostProcessorWrapper,
    DecoderWrapper,
};
```
## Basic Usage

```rust
use tokenizers::{Tokenizer, Result};
use tokenizers::models::bpe::BPE;

fn main() -> Result<()> {
    // Create a BPE model from vocabulary and merges files
    let bpe = BPE::from_file("./vocab.json", "./merges.txt")
        .dropout(0.1)
        .unk_token("[UNK]".into())
        .build()?;

    // Create a tokenizer with the BPE model
    let tokenizer = Tokenizer::new(bpe);

    // Encode text (without adding special tokens)
    let encoding = tokenizer.encode("Hello, world!", false)?;
    println!("Tokens: {:?}", encoding.get_tokens());
    println!("IDs: {:?}", encoding.get_ids());

    // Decode back to text (without skipping special tokens)
    let decoded = tokenizer.decode(encoding.get_ids(), false)?;
    println!("Decoded: {}", decoded);

    Ok(())
}
```
## Architecture

The tokenizers library uses a composable pipeline architecture with five core components:

- **Normalizer**: Text normalization (Unicode normalization, case conversion, etc.)
- **PreTokenizer**: Pre-segmentation that splits text while tracking offsets
- **Model**: Core tokenization algorithm (BPE, WordPiece, WordLevel, Unigram)
- **PostProcessor**: Post-processing that adds special tokens for language models
- **Decoder**: Converts raw tokens back to readable text

This modular design allows mixing and matching components to create custom tokenization pipelines while providing sensible defaults for common use cases. The library prioritizes performance through its native Rust implementation and supports CPU parallelism for batch operations.
## Capabilities

### Core Tokenization

Primary tokenizer implementation with encoding, decoding, and configuration management. Provides the main `Tokenizer` struct and pipeline orchestration.
```rust { .api }
pub struct Tokenizer(TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>);

impl Tokenizer {
    pub fn new(model: impl Into<ModelWrapper>) -> Self;
    pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>;
    pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>;
    pub fn encode<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>;
    pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String>;

    // Batch processing methods
    pub fn encode_batch<'s, E>(&self, input: Vec<E>, add_special_tokens: bool) -> Result<Vec<Encoding>>;
    pub fn decode_batch(&self, sequences: Vec<&[u32]>, skip_special_tokens: bool) -> Result<Vec<String>>;

    // Vocabulary access
    pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>;
    pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize;
    pub fn token_to_id(&self, token: &str) -> Option<u32>;
    pub fn id_to_token(&self, id: u32) -> Option<String>;

    // Training methods
    pub fn train_from_files(&mut self, trainer: &mut dyn Trainer, files: Vec<String>) -> Result<()>;
    pub fn train<I, S>(&mut self, trainer: &mut dyn Trainer, sequences: I) -> Result<()>
    where
        I: Iterator<Item = S> + Send,
        S: AsRef<str> + Send;

    // Token management
    pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize;
    pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize;

    // Component access
    pub fn get_model(&self) -> &ModelWrapper;
    pub fn get_normalizer(&self) -> Option<&NormalizerWrapper>;
    pub fn get_pre_tokenizer(&self) -> Option<&PreTokenizerWrapper>;
    pub fn get_post_processor(&self) -> Option<&PostProcessorWrapper>;
    pub fn get_decoder(&self) -> Option<&DecoderWrapper>;
}

pub struct Encoding {
    // Contains ids, tokens, offsets, type_ids, attention_mask, special_tokens_mask, etc.
}

impl Encoding {
    // Basic data access
    pub fn get_ids(&self) -> &[u32];
    pub fn get_tokens(&self) -> &[String];
    pub fn get_offsets(&self) -> &[Offsets];
    pub fn get_word_ids(&self) -> &[Option<u32>];
    pub fn get_sequence_ids(&self) -> &[Option<u32>];
    pub fn get_type_ids(&self) -> &[u32];
    pub fn get_attention_mask(&self) -> &[u32];
    pub fn get_special_tokens_mask(&self) -> &[u32];
    pub fn get_overflowing(&self) -> &[Encoding];

    // Character and word mapping
    pub fn word_to_tokens(&self, word_index: u32) -> Option<(usize, usize)>;
    pub fn word_to_chars(&self, word_index: u32) -> Option<Offsets>;
    pub fn token_to_chars(&self, token_index: usize) -> Option<Offsets>;
    pub fn token_to_word(&self, token_index: usize) -> Option<u32>;
    pub fn char_to_token(&self, char_index: usize) -> Option<usize>;
    pub fn char_to_word(&self, char_index: usize) -> Option<u32>;

    // Length information
    pub fn len(&self) -> usize;
    pub fn is_empty(&self) -> bool;

    // Modification methods
    pub fn truncate(&mut self, max_length: usize, stride: usize, direction: TruncationDirection);
    pub fn pad(&mut self, length: usize, pad_id: u32, pad_type_id: u32, pad_token: &str, direction: PaddingDirection);

    // Combination
    pub fn merge_with(&mut self, pair: Encoding, growing_offsets: bool) -> Result<()>;
}
```
[Core Tokenization](./core-tokenization.md)

### Tokenization Models

Implementations of different tokenization algorithms, including BPE (Byte-Pair Encoding), WordPiece, WordLevel, and Unigram, with their respective trainers and builders.

```rust { .api }
pub trait Model {
    type Trainer: Trainer + Sync;
    fn tokenize(&self, sequence: &str) -> Result<Vec<Token>>;
    fn token_to_id(&self, token: &str) -> Option<u32>;
    fn id_to_token(&self, id: u32) -> Option<String>;
    fn get_vocab(&self) -> HashMap<String, u32>;
    fn get_vocab_size(&self) -> usize;
}

pub struct BPE {
    pub dropout: Option<f32>,
    pub unk_token: Option<String>,
    // Other configuration fields
}

impl BPE {
    pub fn builder() -> BpeBuilder;
    pub fn from_file(vocab: &str, merges: &str) -> BpeBuilder;
}
```
[Tokenization Models](./models.md)

### Text Normalization

Text preprocessing capabilities including Unicode normalization, case conversion, accent stripping, and custom text transformations applied before tokenization.

```rust { .api }
pub trait Normalizer {
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;
}

pub enum NormalizerWrapper {
    BertNormalizer(BertNormalizer),
    StripNormalizer(Strip),
    StripAccents(StripAccents),
    NFC(NFC),
    NFD(NFD),
    NFKC(NFKC),
    NFKD(NFKD),
    Sequence(Sequence),
    Lowercase(Lowercase),
    // Other normalizers
}
```
[Text Normalization](./normalizers.md)

### Pre-tokenization

Text splitting strategies that divide input into initial segments while preserving offset mapping, including whitespace-based, punctuation-aware, and custom pattern splitting.

```rust { .api }
pub trait PreTokenizer {
    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()>;
}

pub enum PreTokenizerWrapper {
    BertPreTokenizer(BertPreTokenizer),
    ByteLevel(ByteLevel),
    Whitespace(Whitespace),
    Sequence(Sequence),
    // Other pre-tokenizers
}
```
223
224
[Pre-tokenization](./pre-tokenizers.md)
225
226
### Post-processing
227
228
Special token addition and encoding modifications for different model architectures including BERT, RoBERTa, and custom template-based processing.
229
230
```rust { .api }
231
pub trait PostProcessor {
232
fn added_tokens(&self, is_pair: bool) -> usize;
233
fn process(
234
&self,
235
encoding: Encoding,
236
pair_encoding: Option<Encoding>,
237
add_special_tokens: bool,
238
) -> Result<Encoding>;
239
}
240
241
pub enum PostProcessorWrapper {
242
Roberta(RobertaProcessing),
243
Bert(BertProcessing),
244
Template(TemplateProcessing),
245
// Other processors
246
}
247
```
[Post-processing](./post-processors.md)

### Decoding

Token-to-text conversion with support for different decoding strategies, including BPE decoding, WordPiece decoding, byte-level decoding, and custom decoders.

```rust { .api }
pub trait Decoder {
    fn decode(&self, tokens: Vec<String>) -> Result<String>;
    fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>>;
}

pub enum DecoderWrapper {
    BPE(BPEDecoder),
    ByteLevel(ByteLevel),
    WordPiece(WordPiece),
    // Other decoders
}
```
268
269
[Decoding](./decoders.md)
270
271
### Model Training
272
273
Training capabilities for all supported tokenization models with configurable parameters, progress tracking, and special token handling.
274
275
```rust { .api }
276
pub trait Trainer {
277
type Model: Model + Sized;
278
fn should_show_progress(&self) -> bool;
279
fn train(&self, model: &mut Self::Model) -> Result<Vec<AddedToken>>;
280
fn feed<I, S, F>(&mut self, iterator: I, process: F) -> Result<()>;
281
}
282
283
pub struct BpeTrainer;
284
pub struct WordPieceTrainer;
285
pub struct WordLevelTrainer;
286
pub struct UnigramTrainer;
287
```
[Model Training](./training.md)

### Utilities

Support utilities including padding, truncation, parallelism configuration, and HTTP downloading capabilities.

```rust { .api }
pub struct PaddingParams {
    pub strategy: PaddingStrategy,
    pub direction: PaddingDirection,
    pub pad_id: u32,
    // Other padding fields
}

pub struct TruncationParams {
    pub direction: TruncationDirection,
    pub max_length: usize,
    pub strategy: TruncationStrategy,
    // Other truncation fields
}

pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Result<()>;
pub fn truncate_encodings(encoding: Encoding, pair_encoding: Option<Encoding>, params: &TruncationParams) -> Result<(Encoding, Option<Encoding>)>;
```
[Utilities](./utilities.md)

## Types

```rust { .api }
use std::collections::HashMap;

pub type Error = Box<dyn std::error::Error + Send + Sync>;
pub type Result<T> = std::result::Result<T, Error>;
pub type Offsets = (usize, usize);

pub struct Token {
    pub id: u32,
    pub value: String,
    pub offsets: (usize, usize),
}

pub struct AddedToken {
    pub content: String,
    pub single_word: bool,
    pub lstrip: bool,
    pub rstrip: bool,
    pub normalized: bool,
    pub special: bool,
}

pub struct AddedVocabulary {
    // Manages special tokens and added vocabulary
}

pub enum InputSequence<'s> {
    Raw(Cow<'s, str>),
    PreTokenized(Cow<'s, [&'s str]>),
    PreTokenizedOwned(Cow<'s, [String]>),
    PreTokenizedCow(Cow<'s, [Cow<'s, str>]>),
}

pub enum EncodeInput<'s> {
    Single(InputSequence<'s>),
    Dual(InputSequence<'s>, InputSequence<'s>),
}

pub enum TruncationDirection {
    Left,
    Right,
}

pub enum PaddingDirection {
    Left,
    Right,
}

pub trait Trainer {
    type Model: Model + Sized;
    fn should_show_progress(&self) -> bool;
    fn train(&self, model: &mut Self::Model) -> Result<Vec<AddedToken>>;
    fn feed<I, S, F>(&mut self, iterator: I, process: F) -> Result<()>
    where
        I: Iterator<Item = S> + Send,
        S: AsRef<str> + Send,
        F: Fn(&str) -> Result<Vec<String>> + Sync;
}
```