# Tokenizers

A high-performance Rust tokenization library implementing modern tokenization algorithms, including BPE, WordPiece, WordLevel, and Unigram models. The library is designed around a composable pipeline architecture with five core components, combining the speed of a native Rust implementation with the flexibility and ease of use needed for natural language processing applications.

## Package Information

- **Package Name**: tokenizers
- **Package Type**: cargo
- **Language**: Rust
- **Installation**: Add to Cargo.toml: `tokenizers = "0.22.0"`

## Core Imports

```rust
use tokenizers::{Tokenizer, Encoding, Result};
use tokenizers::models::bpe::BPE;
```

For specific components:

```rust
use tokenizers::{
    AddedToken,
    Model,
    NormalizerWrapper,
    PreTokenizerWrapper,
    PostProcessorWrapper,
    DecoderWrapper,
};
```

## Basic Usage

```rust
use tokenizers::{Tokenizer, Result};
use tokenizers::models::bpe::BPE;

fn main() -> Result<()> {
    // Create a BPE model from vocabulary and merges files
    let bpe = BPE::from_file("./vocab.json", "./merges.txt")
        .dropout(0.1)
        .unk_token("[UNK]".into())
        .build()?;

    // Create a tokenizer with the BPE model
    let mut tokenizer = Tokenizer::new(bpe);

    // Encode text
    let encoding = tokenizer.encode("Hello, world!", false)?;
    println!("Tokens: {:?}", encoding.get_tokens());
    println!("IDs: {:?}", encoding.get_ids());

    // Decode back to text
    let decoded = tokenizer.decode(encoding.get_ids(), false)?;
    println!("Decoded: {}", decoded);

    Ok(())
}
```

## Architecture

The tokenizers library uses a composable pipeline architecture with five core components:

- **Normalizer**: Text normalization (Unicode normalization, case conversion, etc.)
- **PreTokenizer**: Pre-segmentation that splits text while tracking offsets
- **Model**: Core tokenization algorithm (BPE, WordPiece, WordLevel, Unigram)
- **PostProcessor**: Post-processing to add special tokens for language models
- **Decoder**: Converts raw tokens back to readable text

This modular design allows mixing and matching components to create custom tokenization pipelines while providing sensible defaults for common use cases. The library prioritizes performance through its native Rust implementation and supports CPU parallelism.
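
As an illustration, here is a minimal sketch of wiring these stages together by hand. It assumes trained `vocab.json`/`merges.txt` files on disk, and that the `with_normalizer`/`with_pre_tokenizer`/`with_decoder` setters take `Option` values as in recent releases; treat it as a sketch rather than a version-exact recipe:

```rust
use tokenizers::decoders::bpe::BPEDecoder;
use tokenizers::models::bpe::BPE;
use tokenizers::normalizers::NFC;
use tokenizers::pre_tokenizers::whitespace::Whitespace;
use tokenizers::{Result, Tokenizer};

fn build_pipeline() -> Result<Tokenizer> {
    // Model: the core algorithm, here a BPE loaded from trained files
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let mut tokenizer = Tokenizer::new(bpe);

    // Normalizer and PreTokenizer run before the model;
    // the Decoder reverses the process after it.
    tokenizer.with_normalizer(Some(NFC));
    tokenizer.with_pre_tokenizer(Some(Whitespace::default()));
    tokenizer.with_decoder(Some(BPEDecoder::default()));
    Ok(tokenizer)
}
```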

## Capabilities

### Core Tokenization

Primary tokenizer implementation with encoding, decoding, and configuration management. Provides the main `Tokenizer` struct and pipeline orchestration.

```rust { .api }
pub struct Tokenizer(TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>);

impl Tokenizer {
    pub fn new(model: impl Into<ModelWrapper>) -> Self;
    pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>;
    pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>;
    pub fn encode<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>;
    pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String>;

    // Batch processing methods
    pub fn encode_batch<'s, E>(&self, input: Vec<E>, add_special_tokens: bool) -> Result<Vec<Encoding>>;
    pub fn decode_batch(&self, sequences: Vec<&[u32]>, skip_special_tokens: bool) -> Result<Vec<String>>;

    // Vocabulary access
    pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>;
    pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize;
    pub fn token_to_id(&self, token: &str) -> Option<u32>;
    pub fn id_to_token(&self, id: u32) -> Option<String>;

    // Training methods
    pub fn train_from_files(&mut self, trainer: &mut dyn Trainer, files: Vec<String>) -> Result<()>;
    pub fn train<I, S>(&mut self, trainer: &mut dyn Trainer, sequences: I) -> Result<()>
    where
        I: Iterator<Item = S> + Send,
        S: AsRef<str> + Send;

    // Token management
    pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize;
    pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize;

    // Component access
    pub fn get_model(&self) -> &ModelWrapper;
    pub fn get_normalizer(&self) -> Option<&NormalizerWrapper>;
    pub fn get_pre_tokenizer(&self) -> Option<&PreTokenizerWrapper>;
    pub fn get_post_processor(&self) -> Option<&PostProcessorWrapper>;
    pub fn get_decoder(&self) -> Option<&DecoderWrapper>;
}

pub struct Encoding {
    // Contains ids, tokens, offsets, type_ids, attention_mask, special_tokens_mask, etc.
}

impl Encoding {
    // Basic data access
    pub fn get_ids(&self) -> &[u32];
    pub fn get_tokens(&self) -> &[String];
    pub fn get_offsets(&self) -> &[Offsets];
    pub fn get_word_ids(&self) -> &[Option<u32>];
    pub fn get_sequence_ids(&self) -> &[Option<u32>];
    pub fn get_type_ids(&self) -> &[u32];
    pub fn get_attention_mask(&self) -> &[u32];
    pub fn get_special_tokens_mask(&self) -> &[u32];
    pub fn get_overflowing(&self) -> &[Encoding];

    // Character and word mapping
    pub fn word_to_tokens(&self, word_index: u32) -> Option<(usize, usize)>;
    pub fn word_to_chars(&self, word_index: u32) -> Option<Offsets>;
    pub fn token_to_chars(&self, token_index: usize) -> Option<Offsets>;
    pub fn token_to_word(&self, token_index: usize) -> Option<u32>;
    pub fn char_to_token(&self, char_index: usize) -> Option<usize>;
    pub fn char_to_word(&self, char_index: usize) -> Option<u32>;

    // Length information
    pub fn len(&self) -> usize;
    pub fn is_empty(&self) -> bool;

    // Modification methods
    pub fn truncate(&mut self, max_length: usize, stride: usize, direction: TruncationDirection);
    pub fn pad(&mut self, length: usize, pad_id: u32, pad_type_id: u32, pad_token: &str, direction: PaddingDirection);

    // Combination
    pub fn merge_with(&mut self, pair: Encoding, growing_offsets: bool) -> Result<()>;
}
```

[Core Tokenization](./core-tokenization.md)
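
To make the `Encoding` accessors concrete, the sketch below pairs each token with its byte offsets into the original input, using only the accessors listed above:

```rust
use tokenizers::{Encoding, Result, Tokenizer};

fn show_offsets(tokenizer: &Tokenizer) -> Result<()> {
    let encoding: Encoding = tokenizer.encode("Hello, world!", false)?;

    // get_tokens() and get_offsets() are parallel slices:
    // entry i of each describes the same token.
    for (token, (start, end)) in encoding
        .get_tokens()
        .iter()
        .zip(encoding.get_offsets().iter().copied())
    {
        println!("{token:?} -> bytes {start}..{end}");
    }
    Ok(())
}
```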

### Tokenization Models

Implementations of the different tokenization algorithms, including BPE (Byte-Pair Encoding), WordPiece, WordLevel, and Unigram, each with its respective trainer and builder.

```rust { .api }
pub trait Model {
    type Trainer: Trainer + Sync;
    fn tokenize(&self, sequence: &str) -> Result<Vec<Token>>;
    fn token_to_id(&self, token: &str) -> Option<u32>;
    fn id_to_token(&self, id: u32) -> Option<String>;
    fn get_vocab(&self) -> HashMap<String, u32>;
    fn get_vocab_size(&self) -> usize;
}

pub struct BPE {
    pub dropout: Option<f32>,
    pub unk_token: Option<String>,
    // Other configuration fields
}

impl BPE {
    pub fn builder() -> BpeBuilder;
    pub fn from_file(vocab: &str, merges: &str) -> BpeBuilder;
}
```

[Tokenization Models](./models.md)
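
Models can also be used standalone through the `Model` trait, bypassing the rest of the pipeline. A small sketch, assuming trained vocabulary files on disk:

```rust
use tokenizers::models::bpe::BPE;
use tokenizers::{Model, Result};

fn main() -> Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;

    // Model::tokenize operates on a single pre-segmented word; in a full
    // pipeline the PreTokenizer produces these segments.
    for token in bpe.tokenize("hello")? {
        println!("{} -> id {} at {:?}", token.value, token.id, token.offsets);
    }
    Ok(())
}
```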

### Text Normalization

Text preprocessing capabilities including Unicode normalization, case conversion, accent stripping, and custom text transformations applied before tokenization.

```rust { .api }
pub trait Normalizer {
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;
}

pub enum NormalizerWrapper {
    BertNormalizer(BertNormalizer),
    StripNormalizer(Strip),
    StripAccents(StripAccents),
    NFC(NFC),
    NFD(NFD),
    NFKC(NFKC),
    NFKD(NFKD),
    Sequence(Sequence),
    Lowercase(Lowercase),
    // Other normalizers
}
```

[Text Normalization](./normalizers.md)
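
As a brief example, normalizers can be composed with `Sequence` and run directly on a `NormalizedString` outside a full tokenizer. A sketch assuming the crate-root re-exports of `NormalizedString` and `Normalizer`:

```rust
use tokenizers::normalizers::{Lowercase, Sequence, StripAccents, NFD};
use tokenizers::{NormalizedString, Normalizer, Result};

fn main() -> Result<()> {
    // NFD decomposition first so StripAccents can remove the combining
    // marks, then lowercase the remainder.
    let normalizer = Sequence::new(vec![
        NFD.into(),
        StripAccents.into(),
        Lowercase.into(),
    ]);

    let mut text = NormalizedString::from("Héllò Wörld");
    normalizer.normalize(&mut text)?;
    assert_eq!(text.get(), "hello world");
    Ok(())
}
```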

### Pre-tokenization

Text splitting strategies that divide input into initial segments while preserving offset mapping, including whitespace-based, punctuation-aware, and custom pattern splitting.

```rust { .api }
pub trait PreTokenizer {
    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()>;
}

pub enum PreTokenizerWrapper {
    BertPreTokenizer(BertPreTokenizer),
    ByteLevel(ByteLevel),
    Whitespace(Whitespace),
    Sequence(Sequence),
    // Other pre-tokenizers
}
```

[Pre-tokenization](./pre-tokenizers.md)
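
For example, a pre-tokenizer can be run directly on a `PreTokenizedString` to inspect the produced segments and their offsets. A sketch assuming the crate-root re-exports of `PreTokenizedString`, `OffsetReferential`, and `OffsetType`:

```rust
use tokenizers::pre_tokenizers::whitespace::Whitespace;
use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer, Result};

fn main() -> Result<()> {
    let mut pretokenized = PreTokenizedString::from("Hello, world!");
    Whitespace::default().pre_tokenize(&mut pretokenized)?;

    // Each split keeps its (start, end) offsets into the original input.
    for (segment, offsets, _tokens) in
        pretokenized.get_splits(OffsetReferential::Original, OffsetType::Byte)
    {
        println!("{segment:?} at {offsets:?}");
    }
    Ok(())
}
```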

### Post-processing

Special token addition and encoding modifications for different model architectures, including BERT, RoBERTa, and custom template-based processing.

```rust { .api }
pub trait PostProcessor {
    fn added_tokens(&self, is_pair: bool) -> usize;
    fn process(
        &self,
        encoding: Encoding,
        pair_encoding: Option<Encoding>,
        add_special_tokens: bool,
    ) -> Result<Encoding>;
}

pub enum PostProcessorWrapper {
    Roberta(RobertaProcessing),
    Bert(BertProcessing),
    Template(TemplateProcessing),
    // Other processors
}
```

[Post-processing](./post-processors.md)
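
For instance, `TemplateProcessing` can express the classic BERT layout. The token ids below (1 for `[CLS]`, 0 for `[SEP]`) are placeholders and must match the ids in the tokenizer's actual vocabulary:

```rust
use tokenizers::processors::template::TemplateProcessing;

fn main() {
    // Single sequences become "[CLS] A [SEP]"; pairs get type ids 0 and 1.
    let processor = TemplateProcessing::builder()
        .try_single("[CLS] $A [SEP]")
        .unwrap()
        .try_pair("[CLS] $A:0 [SEP] $B:1 [SEP]:1")
        .unwrap()
        .special_tokens(vec![("[CLS]", 1), ("[SEP]", 0)])
        .build()
        .unwrap();

    // Attach it to a tokenizer via its `with_post_processor` setter.
    let _ = processor;
}
```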

### Decoding

Token-to-text conversion with support for different decoding strategies including BPE decoding, WordPiece decoding, byte-level decoding, and custom decoders.

```rust { .api }
pub trait Decoder {
    fn decode(&self, tokens: Vec<String>) -> Result<String>;
    fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>>;
}

pub enum DecoderWrapper {
    BPE(BPEDecoder),
    ByteLevel(ByteLevel),
    WordPiece(WordPiece),
    // Other decoders
}
```

[Decoding](./decoders.md)
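
As a quick example, the WordPiece decoder merges `##` continuation pieces back into whole words. A minimal sketch using the `Decoder` trait re-exported at the crate root:

```rust
use tokenizers::decoders::wordpiece::WordPiece;
use tokenizers::{Decoder, Result};

fn main() -> Result<()> {
    // Default configuration: "##" continuation prefix, cleanup enabled.
    let decoder = WordPiece::default();

    let tokens = vec!["un".to_string(), "##believ".to_string(), "##able".to_string()];
    println!("{}", decoder.decode(tokens)?); // prints "unbelievable"
    Ok(())
}
```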

### Model Training

Training capabilities for all supported tokenization models with configurable parameters, progress tracking, and special token handling.

```rust { .api }
pub trait Trainer {
    type Model: Model + Sized;
    fn should_show_progress(&self) -> bool;
    fn train(&self, model: &mut Self::Model) -> Result<Vec<AddedToken>>;
    fn feed<I, S, F>(&mut self, iterator: I, process: F) -> Result<()>;
}

pub struct BpeTrainer;
pub struct WordPieceTrainer;
pub struct WordLevelTrainer;
pub struct UnigramTrainer;
```

[Model Training](./training.md)
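
A sketch of the training flow, following the `train_from_files` signature summarized earlier. The builder options shown (`vocab_size`, `special_tokens`) are the common ones; the exact trainer wiring may vary by version:

```rust
use tokenizers::models::bpe::{BpeTrainer, BPE};
use tokenizers::{AddedToken, Result, Tokenizer};

fn main() -> Result<()> {
    // Configure a trainer: target vocabulary size plus special tokens
    // that must survive training.
    let mut trainer = BpeTrainer::builder()
        .vocab_size(30_000)
        .special_tokens(vec![AddedToken::from("[UNK]", true)])
        .build();

    // Train a fresh BPE model from plain-text files, then persist the
    // whole pipeline as a standard tokenizer.json file.
    let mut tokenizer = Tokenizer::new(BPE::default());
    tokenizer.train_from_files(&mut trainer, vec!["./corpus.txt".to_string()])?;
    tokenizer.save("./tokenizer.json", false)?;
    Ok(())
}
```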

### Utilities

Support utilities including padding, truncation, parallelism configuration, and HTTP downloading capabilities.

```rust { .api }
pub struct PaddingParams {
    pub strategy: PaddingStrategy,
    pub direction: PaddingDirection,
    pub pad_id: u32,
    // Other padding fields
}

pub struct TruncationParams {
    pub direction: TruncationDirection,
    pub max_length: usize,
    pub strategy: TruncationStrategy,
    // Other truncation fields
}

pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Result<()>;
pub fn truncate_encodings(encoding: Encoding, pair_encoding: Option<Encoding>, params: &TruncationParams) -> Result<(Encoding, Option<Encoding>)>;
```

[Utilities](./utilities.md)
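
Padding is typically configured once on the tokenizer and then applied automatically during batch encoding. A sketch assuming the crate-root re-exports of `PaddingParams`/`PaddingStrategy` and the `with_padding` setter:

```rust
use tokenizers::{PaddingParams, PaddingStrategy, Result, Tokenizer};

fn pad_to_longest(tokenizer: &mut Tokenizer) -> Result<()> {
    // Pad every sequence in a batch up to its longest member; unset fields
    // fall back to the crate defaults ("[PAD]" with id 0).
    tokenizer.with_padding(Some(PaddingParams {
        strategy: PaddingStrategy::BatchLongest,
        ..Default::default()
    }));

    let encodings = tokenizer.encode_batch(vec!["short", "a much longer sentence"], true)?;
    assert_eq!(encodings[0].len(), encodings[1].len());
    Ok(())
}
```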

## Types

```rust { .api }
use std::borrow::Cow;
use std::collections::HashMap;

pub type Error = Box<dyn std::error::Error + Send + Sync>;
pub type Result<T> = std::result::Result<T, Error>;
pub type Offsets = (usize, usize);

pub struct Token {
    pub id: u32,
    pub value: String,
    pub offsets: (usize, usize),
}

pub struct AddedToken {
    pub content: String,
    pub single_word: bool,
    pub lstrip: bool,
    pub rstrip: bool,
    pub normalized: bool,
    pub special: bool,
}

pub struct AddedVocabulary {
    // Manages special tokens and added vocabulary
}

pub enum InputSequence<'s> {
    Raw(Cow<'s, str>),
    PreTokenized(Cow<'s, [&'s str]>),
    PreTokenizedOwned(Cow<'s, [String]>),
    PreTokenizedCow(Cow<'s, [Cow<'s, str>]>),
}

pub enum EncodeInput<'s> {
    Single(InputSequence<'s>),
    Dual(InputSequence<'s>, InputSequence<'s>),
}

pub enum TruncationDirection {
    Left,
    Right,
}

pub enum PaddingDirection {
    Left,
    Right,
}

pub trait Trainer {
    type Model: Model + Sized;
    fn should_show_progress(&self) -> bool;
    fn train(&self, model: &mut Self::Model) -> Result<Vec<AddedToken>>;
    fn feed<I, S, F>(&mut self, iterator: I, process: F) -> Result<()>
    where
        I: Iterator<Item = S> + Send,
        S: AsRef<str> + Send,
        F: Fn(&str) -> Result<Vec<String>> + Sync;
}
```