or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-tokenization.md, decoders.md, index.md, models.md, normalizers.md, post-processors.md, pre-tokenizers.md, training.md, utilities.md

docs/training.md

0

# Model Training

1

2

Training capabilities for tokenization models. Each model type has its own specific trainer implementation located within the model's module, not in a separate trainers module.

3

4

## Capabilities

5

6

### Trainer Trait

7

8

All trainers implement the `Trainer` trait which defines the interface for training tokenization models.

9

10

```rust { .api }

11

pub trait Trainer {

12

type Model: Model + Sized;

13

14

fn should_show_progress(&self) -> bool;

15

fn train(&self, model: &mut Self::Model) -> Result<Vec<AddedToken>>;

16

fn feed<I, S, F>(&mut self, iterator: I, process: F) -> Result<()>

17

where

18

I: Iterator<Item = S> + Send,

19

S: AsRef<str> + Send,

20

F: Fn(&str) -> Result<Vec<String>> + Sync;

21

}

22

```

23

24

### BPE Trainer

25

26

BPE model trainer for learning Byte-Pair Encoding vocabularies.

27

28

```rust { .api }

29

use tokenizers::models::bpe::BpeTrainer;

30

use std::collections::BTreeSet;

31

32

pub struct BpeTrainer {

33

// Configuration for BPE training

34

}

35

36

impl BpeTrainer {

37

pub fn new() -> Self;

38

pub fn vocab_size(&mut self, vocab_size: usize) -> &mut Self;

39

pub fn min_frequency(&mut self, freq: u32) -> &mut Self;

40

pub fn show_progress(&mut self, show_progress: bool) -> &mut Self;

41

pub fn special_tokens(&mut self, tokens: Vec<AddedToken>) -> &mut Self;

42

pub fn limit_alphabet(&mut self, limit: Option<usize>) -> &mut Self;

43

pub fn initial_alphabet(&mut self, alphabet: BTreeSet<char>) -> &mut Self;

44

pub fn continuing_subword_prefix(&mut self, prefix: Option<String>) -> &mut Self;

45

pub fn end_of_word_suffix(&mut self, suffix: Option<String>) -> &mut Self;

46

}

47

```

48

49

### WordPiece Trainer

50

51

WordPiece model trainer for learning WordPiece vocabularies.

52

53

```rust { .api }

54

use tokenizers::models::wordpiece::WordPieceTrainer;

55

56

pub struct WordPieceTrainer {

57

// Configuration for WordPiece training

58

}

59

60

impl WordPieceTrainer {

61

pub fn new() -> Self;

62

pub fn vocab_size(&mut self, vocab_size: usize) -> &mut Self;

63

pub fn min_frequency(&mut self, freq: u32) -> &mut Self;

64

pub fn show_progress(&mut self, show_progress: bool) -> &mut Self;

65

pub fn special_tokens(&mut self, tokens: Vec<AddedToken>) -> &mut Self;

66

pub fn limit_alphabet(&mut self, limit: Option<usize>) -> &mut Self;

67

pub fn initial_alphabet(&mut self, alphabet: BTreeSet<char>) -> &mut Self;

68

pub fn continuing_subword_prefix(&mut self, prefix: String) -> &mut Self;

69

pub fn end_of_word_suffix(&mut self, suffix: String) -> &mut Self;

70

}

71

```

72

73

### WordLevel Trainer

74

75

WordLevel model trainer for learning word-level vocabularies.

76

77

```rust { .api }

78

use tokenizers::models::wordlevel::WordLevelTrainer;

79

80

pub struct WordLevelTrainer {

81

// Configuration for WordLevel training

82

}

83

84

impl WordLevelTrainer {

85

pub fn new() -> Self;

86

pub fn vocab_size(&mut self, vocab_size: usize) -> &mut Self;

87

pub fn min_frequency(&mut self, freq: u32) -> &mut Self;

88

pub fn show_progress(&mut self, show_progress: bool) -> &mut Self;

89

pub fn special_tokens(&mut self, tokens: Vec<AddedToken>) -> &mut Self;

90

}

91

```

92

93

### Unigram Trainer

94

95

Unigram model trainer for learning Unigram language model vocabularies.

96

97

```rust { .api }

98

use tokenizers::models::unigram::UnigramTrainer;

99

100

pub struct UnigramTrainer {

101

// Configuration for Unigram training

102

}

103

104

impl UnigramTrainer {

105

pub fn new() -> Self;

106

pub fn vocab_size(&mut self, vocab_size: usize) -> &mut Self;

107

pub fn show_progress(&mut self, show_progress: bool) -> &mut Self;

108

pub fn special_tokens(&mut self, tokens: Vec<AddedToken>) -> &mut Self;

109

pub fn unk_token(&mut self, unk_token: Option<String>) -> &mut Self;

110

pub fn max_piece_length(&mut self, max_piece_length: usize) -> &mut Self;

111

pub fn n_sub_iterations(&mut self, n_sub_iterations: usize) -> &mut Self;

112

pub fn shrinking_factor(&mut self, shrinking_factor: f64) -> &mut Self;

113

}

114

```

115

116

### Training Methods

117

118

Training is performed using the tokenizer's training methods, which work with the appropriate trainer for the model type.

119

120

```rust { .api }

121

impl Tokenizer {

122

/// Train from file paths

123

pub fn train_from_files(&mut self, trainer: &mut dyn Trainer, files: Vec<String>) -> Result<()>;

124

125

/// Train from iterator of sequences

126

pub fn train<I, S>(&mut self, trainer: &mut dyn Trainer, sequences: I) -> Result<()>

127

where

128

I: Iterator<Item = S> + Send,

129

S: AsRef<str> + Send;

130

}

131

```

132

133

## Usage Examples

134

135

### BPE Training

136

137

```rust

138

use tokenizers::models::bpe::{BPE, BpeTrainer};

139

use tokenizers::{Tokenizer, AddedToken};

140

141

fn train_bpe_model() -> tokenizers::Result<()> {

142

// Create a BPE model and trainer

143

let mut model = BPE::default();

144

let mut trainer = BpeTrainer::new();

145

146

// Configure the trainer

147

trainer

148

.vocab_size(30000)

149

.min_frequency(2)

150

.show_progress(true)

151

.special_tokens(vec![

152

AddedToken::from("<s>", true),

153

AddedToken::from("</s>", true),

154

AddedToken::from("<unk>", true),

155

AddedToken::from("<pad>", true),

156

]);

157

158

// Create tokenizer and train

159

let mut tokenizer = Tokenizer::new(model);

160

161

// Train from files

162

let files = vec!["./training_data.txt".to_string()];

163

tokenizer.train_from_files(&mut trainer, files)?;

164

165

// Save the trained model

166

tokenizer.save("./tokenizer.json", false)?;

167

168

Ok(())

169

}

170

```

171

172

### WordPiece Training

173

174

```rust

175

use tokenizers::models::wordpiece::{WordPiece, WordPieceTrainer};

176

use tokenizers::{Tokenizer, AddedToken};

177

178

fn train_wordpiece_model() -> tokenizers::Result<()> {

179

// Create a WordPiece model and trainer

180

let mut model = WordPiece::default();

181

let mut trainer = WordPieceTrainer::new();

182

183

// Configure the trainer

184

trainer

185

.vocab_size(30000)

186

.min_frequency(2)

187

.show_progress(true)

188

.continuing_subword_prefix("##".to_string())

189

.special_tokens(vec![

190

AddedToken::from("[CLS]", true),

191

AddedToken::from("[SEP]", true),

192

AddedToken::from("[UNK]", true),

193

AddedToken::from("[PAD]", true),

194

AddedToken::from("[MASK]", true),

195

]);

196

197

// Create tokenizer and train

198

let mut tokenizer = Tokenizer::new(model);

199

200

// Train from iterator of text lines

201

let training_data = vec![

202

"Hello world!",

203

"This is training data.",

204

"WordPiece tokenization is powerful.",

205

];

206

207

tokenizer.train(

208

&mut trainer,

209

training_data.into_iter()

210

)?;

211

212

Ok(())

213

}

214

```

215

216

### Training Progress and Configuration

217

218

```rust

219

use tokenizers::models::bpe::{BPE, BpeTrainer};

220

use tokenizers::{Tokenizer, AddedToken};

221

use std::collections::BTreeSet;

222

223

fn advanced_bpe_training() -> tokenizers::Result<()> {

224

let mut model = BPE::default();

225

let mut trainer = BpeTrainer::new();

226

227

// Advanced configuration

228

let alphabet: BTreeSet<char> = "abcdefghijklmnopqrstuvwxyz".chars().collect();

229

230

trainer

231

.vocab_size(50000)

232

.min_frequency(5)

233

.show_progress(true)

234

.limit_alphabet(Some(1000))

235

.initial_alphabet(alphabet)

236

.continuing_subword_prefix(Some("##".to_string()))

237

.end_of_word_suffix(Some("</w>".to_string()))

238

.special_tokens(vec![

239

AddedToken::from("<unk>", true),

240

AddedToken::from("<s>", true),

241

AddedToken::from("</s>", true),

242

]);

243

244

let mut tokenizer = Tokenizer::new(model);

245

246

// Large-scale training from multiple files

247

let training_files = vec![

248

"./corpus1.txt".to_string(),

249

"./corpus2.txt".to_string(),

250

"./corpus3.txt".to_string(),

251

];

252

253

tokenizer.train_from_files(&mut trainer, training_files)?;

254

255

// Test the trained tokenizer

256

let encoding = tokenizer.encode("Hello world!", false)?;

257

println!("Tokens: {:?}", encoding.get_tokens());

258

259

Ok(())

260

}

261

```