or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-tokenization.md, decoders.md, index.md, models.md, normalizers.md, post-processors.md, pre-tokenizers.md, training.md, utilities.md

docs/core-tokenization.md

# Core Tokenization

The core tokenization functionality provides the main `Tokenizer` and `TokenizerImpl` structs that orchestrate the tokenization pipeline, along with the `Encoding` struct that represents tokenization results with full offset tracking and metadata.

## Capabilities

### Tokenizer Construction

Main tokenizer creation with various initialization methods including from models, files, and remote sources.

```rust { .api }
impl Tokenizer {
    /// Create a new tokenizer with the specified model
    pub fn new(model: impl Into<ModelWrapper>) -> Self;

    /// Load a tokenizer from a JSON file
    pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>;

    /// Load a tokenizer from JSON bytes
    pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>;

    /// Load a pre-trained tokenizer from Hugging Face Hub (requires 'http' feature)
    #[cfg(feature = "http")]
    pub fn from_pretrained<S: AsRef<str>>(
        identifier: S,
        params: Option<FromPretrainedParameters>
    ) -> Result<Self>;

    /// Unwrap the inner TokenizerImpl
    pub fn into_inner(self) -> TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>;
}
```

### TokenizerImpl Configuration

Generic tokenizer implementation with configurable pipeline components.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
where
    M: Model,
    N: Normalizer,
    PT: PreTokenizer,
    PP: PostProcessor,
    D: Decoder,
{
    /// Create a new tokenizer implementation with a model
    pub fn new(model: M) -> Self;

    /// Load from file
    pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>;

    /// Load from bytes
    pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>;

    /// Set the normalizer
    pub fn with_normalizer(&mut self, normalizer: Option<impl Into<N>>) -> &mut Self;

    /// Get the normalizer
    pub fn get_normalizer(&self) -> Option<&N>;

    /// Set the pre-tokenizer
    pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Option<impl Into<PT>>) -> &mut Self;

    /// Get the pre-tokenizer
    pub fn get_pre_tokenizer(&self) -> Option<&PT>;

    /// Set the post-processor
    pub fn with_post_processor(&mut self, post_processor: Option<impl Into<PP>>) -> &mut Self;

    /// Get the post-processor
    pub fn get_post_processor(&self) -> Option<&PP>;

    /// Set the decoder
    pub fn with_decoder(&mut self, decoder: Option<impl Into<D>>) -> &mut Self;

    /// Get the decoder
    pub fn get_decoder(&self) -> Option<&D>;

    /// Set the model
    pub fn with_model(&mut self, model: impl Into<M>) -> &mut Self;

    /// Get the model
    pub fn get_model(&self) -> &M;
}
```

### Vocabulary Management

Access to vocabulary information and token conversion methods.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Get the vocabulary as a HashMap
    pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>;

    /// Get the vocabulary size
    pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize;

    /// Convert token to ID
    pub fn token_to_id(&self, token: &str) -> Option<u32>;

    /// Convert ID to token
    pub fn id_to_token(&self, id: u32) -> Option<String>;

    /// Get added tokens decoder
    pub fn get_added_tokens_decoder(&self) -> AHashMap<u32, AddedToken>;

    /// Set whether to encode special tokens
    pub fn set_encode_special_tokens(&mut self, value: bool);

    /// Get whether special tokens are encoded
    pub fn get_encode_special_tokens(&self) -> bool;
}
```

### Text Encoding

Methods for converting text to token sequences with various encoding options.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Encode a single input
    pub fn encode<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
    where
        E: Into<EncodeInput<'s>>;

    /// Fast encoding variant
    pub fn encode_fast<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
    where
        E: Into<EncodeInput<'s>>;

    /// Encode with character-level offsets
    pub fn encode_char_offsets<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
    where
        E: Into<EncodeInput<'s>>;

    /// Encode multiple inputs in batch
    pub fn encode_batch<'s, E>(&self, inputs: Vec<E>, add_special_tokens: bool) -> Result<Vec<Encoding>>
    where
        E: Into<EncodeInput<'s>>;

    /// Fast batch encoding
    pub fn encode_batch_fast<'s, E>(&self, inputs: Vec<E>, add_special_tokens: bool) -> Result<Vec<Encoding>>
    where
        E: Into<EncodeInput<'s>>;

    /// Batch encoding with character-level offsets
    pub fn encode_batch_char_offsets<'s, E>(&self, inputs: Vec<E>, add_special_tokens: bool) -> Result<Vec<Encoding>>
    where
        E: Into<EncodeInput<'s>>;
}
```

### Token Decoding

Methods for converting token IDs back to text.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Decode token IDs to text
    pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String>;

    /// Create a streaming decoder
    pub fn decode_stream(&self, skip_special_tokens: bool) -> DecodeStream<'_, M, N, PT, PP, D>;

    /// Decode multiple sequences in batch
    pub fn decode_batch(&self, sentences: &[&[u32]], skip_special_tokens: bool) -> Result<Vec<String>>;
}
```

### Token Management

Methods for adding special tokens and managing added vocabulary.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Add special tokens to the tokenizer
    pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize;

    /// Add regular tokens to the tokenizer
    pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize;

    /// Set the added vocabulary
    pub fn with_added_vocabulary(&mut self, added_vocabulary: AddedVocabulary) -> &mut Self;

    /// Get the added vocabulary
    pub fn get_added_vocabulary(&self) -> &AddedVocabulary;
}
```

### Configuration Management

Methods for managing padding, truncation, and other processing parameters.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Set truncation parameters
    pub fn with_truncation(&mut self, trunc: Option<TruncationParams>) -> Result<&mut Self>;

    /// Get truncation parameters
    pub fn get_truncation(&self) -> Option<&TruncationParams>;

    /// Get mutable truncation parameters
    pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>;

    /// Set padding parameters
    pub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self;

    /// Get padding parameters
    pub fn get_padding(&self) -> Option<&PaddingParams>;

    /// Get mutable padding parameters
    pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>;
}
```

### Serialization

Methods for saving and loading tokenizers.

```rust { .api }
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D> {
    /// Convert tokenizer to JSON string
    pub fn to_string(&self, pretty: bool) -> Result<String>;

    /// Save tokenizer to file
    pub fn save<P: AsRef<Path>>(&self, path: P, pretty: bool) -> Result<()>;
}
```

### Encoding Structure

The `Encoding` struct represents the result of tokenization with complete metadata.

```rust { .api }
pub struct Encoding {
    // Private fields containing all tokenization results
}

impl Encoding {
    /// Create a new encoding
    pub fn new(
        ids: Vec<u32>,
        type_ids: Vec<u32>,
        tokens: Vec<String>,
        words: Vec<Option<u32>>,
        offsets: Vec<Offsets>,
        special_tokens_mask: Vec<u32>,
        attention_mask: Vec<u32>,
        overflowing: Vec<Encoding>,
    ) -> Self;

    /// Create encoding with specified capacity
    pub fn with_capacity(len: usize) -> Self;

    /// Create encoding from tokens
    pub fn from_tokens(tokens: Vec<Token>, type_id: u32) -> Self;

    /// Get token IDs
    pub fn get_ids(&self) -> &[u32];

    /// Get type IDs
    pub fn get_type_ids(&self) -> &[u32];

    /// Get token strings
    pub fn get_tokens(&self) -> &[String];

    /// Get word indices
    pub fn get_words(&self) -> &[Option<u32>];

    /// Get character offsets
    pub fn get_offsets(&self) -> &[Offsets];

    /// Get special tokens mask
    pub fn get_special_tokens_mask(&self) -> &[u32];

    /// Get attention mask
    pub fn get_attention_mask(&self) -> &[u32];

    /// Get overflowing encodings
    pub fn get_overflowing(&self) -> &[Encoding];

    /// Get mutable overflowing encodings
    pub fn get_overflowing_mut(&mut self) -> &mut Vec<Encoding>;

    /// Set sequence ID for the encoding
    pub fn set_sequence_id(&mut self, sequence_id: usize);

    /// Get the length of the encoding
    pub fn len(&self) -> usize;

    /// Check if encoding is empty
    pub fn is_empty(&self) -> bool;
}
```

### Streaming Decoder

The `DecodeStream` provides incremental decoding capabilities.

```rust { .api }
pub struct DecodeStream<'tok, M, N, PT, PP, D> {
    // Private state for streaming decode
}

impl<'tok, M, N, PT, PP, D> DecodeStream<'tok, M, N, PT, PP, D> {
    /// Decode the next token ID and return any resulting text
    pub fn step(&mut self, id: u32) -> Result<Option<String>>;
}
```

## Usage Examples

### Basic Tokenization

```rust
use tokenizers::tokenizer::{Result, Tokenizer};
use tokenizers::models::bpe::BPE;

fn basic_tokenization() -> Result<()> {
    // Create BPE model
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let tokenizer = Tokenizer::new(bpe);

    // Encode text
    let encoding = tokenizer.encode("Hello world!", false)?;
    println!("Tokens: {:?}", encoding.get_tokens());
    println!("IDs: {:?}", encoding.get_ids());
    println!("Offsets: {:?}", encoding.get_offsets());

    // Decode back
    let decoded = tokenizer.decode(encoding.get_ids(), false)?;
    println!("Decoded: {}", decoded);

    Ok(())
}
```

### Batch Processing

```rust
use tokenizers::tokenizer::{Result, Tokenizer};
use tokenizers::models::bpe::BPE;

fn batch_processing() -> Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let tokenizer = Tokenizer::new(bpe);

    let texts = vec![
        "First sentence.",
        "Second sentence is longer.",
        "Third sentence is the longest of all."
    ];

    // Batch encode
    let encodings = tokenizer.encode_batch(texts, false)?;

    for (i, encoding) in encodings.iter().enumerate() {
        println!("Text {}: {:?}", i, encoding.get_tokens());
    }

    // Batch decode
    let ids_batch: Vec<&[u32]> = encodings.iter()
        .map(|enc| enc.get_ids())
        .collect();

    let decoded_texts = tokenizer.decode_batch(&ids_batch, false)?;
    for (i, text) in decoded_texts.iter().enumerate() {
        println!("Decoded {}: {}", i, text);
    }

    Ok(())
}
```

### Streaming Decode

```rust
use tokenizers::tokenizer::{Result, Tokenizer};
use tokenizers::models::bpe::BPE;

fn streaming_decode() -> Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let tokenizer = Tokenizer::new(bpe);

    let mut stream = tokenizer.decode_stream(false);

    let token_ids = vec![123, 456, 789];

    for id in token_ids {
        if let Some(text) = stream.step(id)? {
            print!("{}", text);
        }
    }
    println!();

    Ok(())
}
```