or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-tokenization.md · decoders.md · index.md · models.md · normalizers.md · post-processors.md · pre-tokenizers.md · training.md · utilities.md

docs/models.md

0

# Tokenization Models

1

2

Tokenization models implement the core algorithms for converting text into tokens. The library supports four main algorithms: BPE (Byte-Pair Encoding), WordPiece, WordLevel, and Unigram, each with its own builders and trainers.

3

4

## Capabilities

5

6

### Model Trait

7

8

All tokenization models implement the `Model` trait which defines the core interface for tokenization.

9

10

```rust { .api }

11

pub trait Model {

12

type Trainer: Trainer + Sync;

13

14

/// Tokenize a sequence into tokens

15

fn tokenize(&self, sequence: &str) -> Result<Vec<Token>>;

16

17

/// Convert token string to ID

18

fn token_to_id(&self, token: &str) -> Option<u32>;

19

20

/// Convert ID to token string

21

fn id_to_token(&self, id: u32) -> Option<String>;

22

23

/// Get the full vocabulary mapping

24

fn get_vocab(&self) -> HashMap<String, u32>;

25

26

/// Get vocabulary size

27

fn get_vocab_size(&self) -> usize;

28

29

/// Save the model to disk

30

fn save(&self, folder: &Path, prefix: Option<&str>) -> Result<Vec<PathBuf>>;

31

32

/// Get a trainer for this model type

33

fn get_trainer(&self) -> Self::Trainer;

34

}

35

```

36

37

### Model Wrapper

38

39

The `ModelWrapper` enum provides a unified interface for all model types.

40

41

```rust { .api }

42

pub enum ModelWrapper {

43

BPE(BPE),

44

WordPiece(WordPiece),

45

WordLevel(WordLevel),

46

Unigram(Unigram),

47

}

48

```

49

50

### BPE Model

51

52

Byte-Pair Encoding model with configurable dropout, unknown token handling, and subword formatting.

53

54

```rust { .api }

55

pub struct BPE {

56

pub dropout: Option<f32>,

57

pub unk_token: Option<String>,

58

pub continuing_subword_prefix: Option<String>,

59

pub end_of_word_suffix: Option<String>,

60

pub fuse_unk: bool,

61

pub byte_fallback: bool,

62

pub ignore_merges: bool,

63

}

64

65

impl BPE {

66

/// Create a new BPE builder

67

pub fn builder() -> BpeBuilder;

68

69

/// Create BPE from vocabulary and merges

70

pub fn new(vocab: Vocab, merges: Merges) -> Self;

71

72

/// Create BPE builder from files

73

pub fn from_file(vocab: &str, merges: &str) -> BpeBuilder;

74

75

/// Read vocabulary and merges from files

76

pub fn read_file(vocab: &str, merges: &str) -> Result<(Vocab, Merges)>;

77

78

/// Clear the internal cache

79

pub fn clear_cache(&mut self);

80

81

/// Resize the internal cache

82

pub fn resize_cache(&mut self, capacity: usize);

83

}

84

```

85

86

### BPE Builder

87

88

Builder pattern for configuring BPE models.

89

90

```rust { .api }

91

pub struct BpeBuilder {

92

// Private configuration fields

93

}

94

95

impl BpeBuilder {

96

/// Create a new BPE builder

97

pub fn new() -> Self;

98

99

/// Set vocabulary and merges files

100

pub fn files(mut self, vocab: String, merges: String) -> Self;

101

102

/// Set vocabulary and merges data directly

103

pub fn vocab_and_merges<V: Into<AHashMap<String, u32>>>(

104

mut self,

105

vocab: V,

106

merges: Merges

107

) -> Self;

108

109

/// Set cache capacity

110

pub fn cache_capacity(mut self, capacity: usize) -> Self;

111

112

/// Set dropout rate for regularization

113

pub fn dropout(mut self, dropout: f32) -> Self;

114

115

/// Set unknown token

116

pub fn unk_token(mut self, unk_token: String) -> Self;

117

118

/// Set continuing subword prefix

119

pub fn continuing_subword_prefix(mut self, prefix: String) -> Self;

120

121

/// Set end of word suffix

122

pub fn end_of_word_suffix(mut self, suffix: String) -> Self;

123

124

/// Set whether to fuse unknown tokens

125

pub fn fuse_unk(mut self, fuse_unk: bool) -> Self;

126

127

/// Enable byte fallback for unknown characters

128

pub fn byte_fallback(mut self, byte_fallback: bool) -> Self;

129

130

/// Set whether to ignore merges during encoding

131

pub fn ignore_merges(mut self, ignore_merges: bool) -> Self;

132

133

/// Build the BPE model

134

pub fn build(mut self) -> Result<BPE>;

135

}

136

```

137

138

### WordPiece Model

139

140

WordPiece tokenization model commonly used in BERT and similar models.

141

142

```rust { .api }

143

pub struct WordPiece {

144

// Private fields

145

}

146

147

pub struct WordPieceBuilder {

148

// Private configuration fields

149

}

150

151

impl WordPieceBuilder {

152

/// Create a new WordPiece builder

153

pub fn new() -> Self;

154

155

/// Set vocabulary file

156

pub fn files(mut self, vocab: String) -> Self;

157

158

/// Set vocabulary data directly

159

pub fn vocab<V: Into<HashMap<String, u32>>>(mut self, vocab: V) -> Self;

160

161

/// Set unknown token

162

pub fn unk_token(mut self, unk_token: String) -> Self;

163

164

/// Set continuing subword prefix (typically "##")

165

pub fn continuing_subword_prefix(mut self, prefix: String) -> Self;

166

167

/// Set maximum word characters

168

pub fn max_input_chars_per_word(mut self, max_input_chars_per_word: usize) -> Self;

169

170

/// Build the WordPiece model

171

pub fn build(self) -> Result<WordPiece>;

172

}

173

174

impl WordPiece {

175

/// Create a new WordPiece builder

176

pub fn builder() -> WordPieceBuilder;

177

178

/// Create WordPiece from vocabulary file

179

pub fn from_file(vocab: &str) -> WordPieceBuilder;

180

}

181

```

182

183

### WordLevel Model

184

185

Word-level tokenization that treats each word as a single token.

186

187

```rust { .api }

188

pub struct WordLevel {

189

// Private fields

190

}

191

192

pub struct WordLevelBuilder {

193

// Private configuration fields

194

}

195

196

impl WordLevelBuilder {

197

/// Create a new WordLevel builder

198

pub fn new() -> Self;

199

200

/// Set vocabulary file

201

pub fn files(mut self, vocab: String) -> Self;

202

203

/// Set vocabulary data directly

204

pub fn vocab<V: Into<HashMap<String, u32>>>(mut self, vocab: V) -> Self;

205

206

/// Set unknown token

207

pub fn unk_token(mut self, unk_token: String) -> Self;

208

209

/// Build the WordLevel model

210

pub fn build(self) -> Result<WordLevel>;

211

}

212

213

impl WordLevel {

214

/// Create a new WordLevel builder

215

pub fn builder() -> WordLevelBuilder;

216

217

/// Create WordLevel from vocabulary file

218

pub fn from_file(vocab: &str) -> WordLevelBuilder;

219

}

220

```

221

222

### Unigram Model

223

224

Unigram language model-based tokenization with subword regularization. Uses vocabulary with scores to probabilistically select tokenization.

225

226

```rust { .api }

227

pub struct Unigram {

228

// Private fields containing vocabulary and model parameters

229

}

230

231

impl Unigram {

232

/// Create a Unigram model from vocabulary with scores

233

pub fn from(vocab: Vec<(String, f64)>, unk_id: Option<usize>, byte_fallback: bool) -> Result<Self>;

234

235

/// Load a Unigram model from file

236

pub fn load<P: AsRef<Path>>(path: P) -> Result<Self>;

237

238

/// Encode a sentence into subword tokens

239

pub fn encode(&self, sentence: &str) -> Result<Vec<String>>;

240

241

/// Check if byte fallback is enabled

242

pub fn byte_fallback(&self) -> bool;

243

244

/// Clear internal caches

245

pub fn clear_cache(&mut self);

246

247

/// Iterate over vocabulary entries

248

pub fn iter(&self) -> UnigramIterator<'_>;

249

}

250

251

/// Iterator over Unigram vocabulary entries

252

pub struct UnigramIterator<'a> {

253

// Private iterator state

254

}

255

256

impl<'a> Iterator for UnigramIterator<'a> {

257

type Item = (&'a str, f64);

258

fn next(&mut self) -> Option<Self::Item>;

259

}

260

261

/// Errors specific to Unigram model operations

262

#[derive(Debug)]

263

pub enum UnigramError {

264

/// Vocabulary is empty

265

EmptyVocabulary,

266

/// Invalid vocabulary format

267

InvalidVocabulary(String),

268

/// Unknown token not found in vocabulary

269

UnknownToken,

270

/// File I/O error

271

IoError(std::io::Error),

272

}

273

```

274

275

### Type Definitions

276

277

Common types used across model implementations.

278

279

```rust { .api }

280

// BPE specific types

281

pub type Vocab = AHashMap<String, u32>;

282

pub type MergeMap = AHashMap<Pair, (u32, u32)>;

283

pub type Merges = Vec<(String, String)>;

284

285

pub struct Pair(pub String, pub String);

286

287

// General token type

288

pub struct Token {

289

pub id: u32,

290

pub value: String,

291

pub offsets: (usize, usize),

292

}

293

294

impl Token {

295

pub fn new(id: u32, value: String, offsets: (usize, usize)) -> Self;

296

}

297

```

298

299

### Error Types

300

301

Model-specific error types for error handling.

302

303

```rust { .api }

304

// BPE errors

305

pub enum BpeError {

306

Io(std::io::Error),

307

JsonError(serde_json::Error),

308

BadVocabulary,

309

BadMerges(usize),

310

MergeTokenOutOfVocabulary(String),

311

UnkTokenOutOfVocabulary(String),

312

InvalidDropout,

313

}

314

315

// WordPiece errors

316

pub enum WordPieceError {

317

MissingUnkToken,

318

}

319

320

// WordLevel errors

321

pub enum WordLevelError {

322

MissingUnkToken,

323

}

324

325

// Unigram errors

326

pub enum UnigramError {

327

// See the full `UnigramError` definition in the "Unigram Model" section above

328

}

329

```

330

331

## Usage Examples

332

333

### BPE Model Usage

334

335

```rust

336

use tokenizers::models::bpe::BPE;

337

use tokenizers::tokenizer::Tokenizer;

338

339

fn create_bpe_model() -> tokenizers::Result<()> {

340

// Create BPE from files

341

let bpe = BPE::from_file("./vocab.json", "./merges.txt")

342

.dropout(0.1)

343

.unk_token("[UNK]".into())

344

.continuing_subword_prefix("##".into())

345

.build()?;

346

347

let tokenizer = Tokenizer::new(bpe);

348

349

let encoding = tokenizer.encode("Hello world!", false)?;

350

println!("BPE Tokens: {:?}", encoding.get_tokens());

351

352

Ok(())

353

}

354

```

355

356

### WordPiece Model Usage

357

358

```rust

359

use tokenizers::models::wordpiece::WordPiece;

360

use tokenizers::tokenizer::Tokenizer;

361

362

fn create_wordpiece_model() -> tokenizers::Result<()> {

363

// Create WordPiece from vocabulary file

364

let wordpiece = WordPiece::from_file("./wordpiece_vocab.txt")

365

.unk_token("[UNK]".into())

366

.continuing_subword_prefix("##".into())

367

.build()?;

368

369

let tokenizer = Tokenizer::new(wordpiece);

370

371

let encoding = tokenizer.encode("Hello world!", false)?;

372

println!("WordPiece Tokens: {:?}", encoding.get_tokens());

373

374

Ok(())

375

}

376

```

377

378

### Custom Vocabulary

379

380

```rust

381

use tokenizers::models::wordlevel::WordLevel;

382

use tokenizers::tokenizer::Tokenizer;

383

use std::collections::HashMap;

384

385

fn create_wordlevel_with_custom_vocab() -> tokenizers::Result<()> {

386

// Create custom vocabulary

387

let mut vocab = HashMap::new();

388

vocab.insert("hello".to_string(), 0);

389

vocab.insert("world".to_string(), 1);

390

vocab.insert("[UNK]".to_string(), 2);

391

392

let wordlevel = WordLevel::builder()

393

.vocab(vocab)

394

.unk_token("[UNK]".into())

395

.build()?;

396

397

let tokenizer = Tokenizer::new(wordlevel);

398

399

let encoding = tokenizer.encode("hello world", false)?;

400

println!("WordLevel Tokens: {:?}", encoding.get_tokens());

401

println!("WordLevel IDs: {:?}", encoding.get_ids());

402

403

Ok(())

404

}

405

```

406

407

### Unigram Model Usage

408

409

```rust

410

use tokenizers::models::unigram::Unigram;

411

use tokenizers::Tokenizer;

412

413

fn create_unigram_model() -> tokenizers::Result<()> {

414

// Create vocabulary with scores (log-probabilities: higher, i.e. less negative, values are preferred during tokenization)

415

let vocab = vec![

416

("hello".to_string(), -1.0),

417

("world".to_string(), -1.5),

418

("he".to_string(), -2.0),

419

("llo".to_string(), -2.5),

420

("[UNK]".to_string(), -100.0),

421

];

422

423

// Create Unigram model with vocabulary, unknown token ID, and byte fallback

424

let unigram = Unigram::from(vocab, Some(4), false)?; // [UNK] is at index 4

425

426

let tokenizer = Tokenizer::new(unigram);

427

428

let encoding = tokenizer.encode("hello world", false)?;

429

println!("Unigram Tokens: {:?}", encoding.get_tokens());

430

431

Ok(())

432

}

433

```

434

435

### Model Information Access

436

437

```rust

438

use tokenizers::models::bpe::BPE;

439

use tokenizers::Model;

440

441

fn model_introspection() -> tokenizers::Result<()> {

442

let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;

443

444

// Access vocabulary information

445

let vocab = bpe.get_vocab();

446

println!("Vocabulary size: {}", vocab.len());

447

448

// Token conversion

449

if let Some(id) = bpe.token_to_id("hello") {

450

println!("Token 'hello' has ID: {}", id);

451

if let Some(token) = bpe.id_to_token(id) {

452

println!("ID {} maps back to: {}", id, token);

453

}

454

}

455

456

// Tokenize directly

457

let tokens = bpe.tokenize("Hello world!")?;

458

for token in tokens {

459

println!("Token: {} (ID: {}, offsets: {:?})",

460

token.value, token.id, token.offsets);

461

}

462

463

Ok(())

464

}

465

```