# Model Training

Training capabilities for tokenization models. Each model type has its own specific trainer implementation located within the model's module, not in a separate trainers module.

## Capabilities

### Trainer Trait

All trainers implement the `Trainer` trait which defines the interface for training tokenization models.

```rust { .api }
pub trait Trainer {
    type Model: Model + Sized;

    fn should_show_progress(&self) -> bool;
    fn train(&self, model: &mut Self::Model) -> Result<Vec<AddedToken>>;
    fn feed<I, S, F>(&mut self, iterator: I, process: F) -> Result<()>
    where
        I: Iterator<Item = S> + Send,
        S: AsRef<str> + Send,
        F: Fn(&str) -> Result<Vec<String>> + Sync;
}
```

### BPE Trainer

BPE model trainer for learning Byte-Pair Encoding vocabularies.

```rust { .api }
use tokenizers::models::bpe::BpeTrainer;
use std::collections::BTreeSet;

pub struct BpeTrainer {
    // Configuration for BPE training
}

impl BpeTrainer {
    pub fn new() -> Self;
    pub fn vocab_size(&mut self, vocab_size: usize) -> &mut Self;
    pub fn min_frequency(&mut self, freq: u32) -> &mut Self;
    pub fn show_progress(&mut self, show_progress: bool) -> &mut Self;
    pub fn special_tokens(&mut self, tokens: Vec<AddedToken>) -> &mut Self;
    pub fn limit_alphabet(&mut self, limit: Option<usize>) -> &mut Self;
    pub fn initial_alphabet(&mut self, alphabet: BTreeSet<char>) -> &mut Self;
    pub fn continuing_subword_prefix(&mut self, prefix: Option<String>) -> &mut Self;
    pub fn end_of_word_suffix(&mut self, suffix: Option<String>) -> &mut Self;
}
```

### WordPiece Trainer

WordPiece model trainer for learning WordPiece vocabularies.

```rust { .api }
use tokenizers::models::wordpiece::WordPieceTrainer;
use std::collections::BTreeSet;

pub struct WordPieceTrainer {
    // Configuration for WordPiece training
}

impl WordPieceTrainer {
    pub fn new() -> Self;
    pub fn vocab_size(&mut self, vocab_size: usize) -> &mut Self;
    pub fn min_frequency(&mut self, freq: u32) -> &mut Self;
    pub fn show_progress(&mut self, show_progress: bool) -> &mut Self;
    pub fn special_tokens(&mut self, tokens: Vec<AddedToken>) -> &mut Self;
    pub fn limit_alphabet(&mut self, limit: Option<usize>) -> &mut Self;
    pub fn initial_alphabet(&mut self, alphabet: BTreeSet<char>) -> &mut Self;
    pub fn continuing_subword_prefix(&mut self, prefix: String) -> &mut Self;
    pub fn end_of_word_suffix(&mut self, suffix: String) -> &mut Self;
}
```

### WordLevel Trainer

WordLevel model trainer for learning word-level vocabularies.

```rust { .api }
use tokenizers::models::wordlevel::WordLevelTrainer;

pub struct WordLevelTrainer {
    // Configuration for WordLevel training
}

impl WordLevelTrainer {
    pub fn new() -> Self;
    pub fn vocab_size(&mut self, vocab_size: usize) -> &mut Self;
    pub fn min_frequency(&mut self, freq: u32) -> &mut Self;
    pub fn show_progress(&mut self, show_progress: bool) -> &mut Self;
    pub fn special_tokens(&mut self, tokens: Vec<AddedToken>) -> &mut Self;
}
```

### Unigram Trainer

Unigram model trainer for learning Unigram language model vocabularies.

```rust { .api }
use tokenizers::models::unigram::UnigramTrainer;

pub struct UnigramTrainer {
    // Configuration for Unigram training
}

impl UnigramTrainer {
    pub fn new() -> Self;
    pub fn vocab_size(&mut self, vocab_size: usize) -> &mut Self;
    pub fn show_progress(&mut self, show_progress: bool) -> &mut Self;
    pub fn special_tokens(&mut self, tokens: Vec<AddedToken>) -> &mut Self;
    pub fn unk_token(&mut self, unk_token: Option<String>) -> &mut Self;
    pub fn max_piece_length(&mut self, max_piece_length: usize) -> &mut Self;
    pub fn n_sub_iterations(&mut self, n_sub_iterations: usize) -> &mut Self;
    pub fn shrinking_factor(&mut self, shrinking_factor: f64) -> &mut Self;
}
```

### Training Methods

Training is performed using the tokenizer's training methods, which work with the appropriate trainer for the model type.

```rust { .api }
impl Tokenizer {
    /// Train from file paths
    pub fn train_from_files(&mut self, trainer: &mut dyn Trainer, files: Vec<String>) -> Result<()>;

    /// Train from iterator of sequences
    pub fn train<I, S>(&mut self, trainer: &mut dyn Trainer, sequences: I) -> Result<()>
    where
        I: Iterator<Item = S> + Send,
        S: AsRef<str> + Send;
}
```

## Usage Examples

### BPE Training

```rust
use tokenizers::models::bpe::{BPE, BpeTrainer};
use tokenizers::{Tokenizer, AddedToken};

fn train_bpe_model() -> tokenizers::Result<()> {
    // Create a BPE model and trainer
    let mut model = BPE::default();
    let mut trainer = BpeTrainer::new();

    // Configure the trainer
    trainer
        .vocab_size(30000)
        .min_frequency(2)
        .show_progress(true)
        .special_tokens(vec![
            AddedToken::from("<s>", true),
            AddedToken::from("</s>", true),
            AddedToken::from("<unk>", true),
            AddedToken::from("<pad>", true),
        ]);

    // Create tokenizer and train
    let mut tokenizer = Tokenizer::new(model);

    // Train from files
    let files = vec!["./training_data.txt".to_string()];
    tokenizer.train_from_files(&mut trainer, files)?;

    // Save the trained model
    tokenizer.save("./tokenizer.json", false)?;

    Ok(())
}
```

### WordPiece Training

```rust
use tokenizers::models::wordpiece::{WordPiece, WordPieceTrainer};
use tokenizers::{Tokenizer, AddedToken};

fn train_wordpiece_model() -> tokenizers::Result<()> {
    // Create a WordPiece model and trainer
    let mut model = WordPiece::default();
    let mut trainer = WordPieceTrainer::new();

    // Configure the trainer
    trainer
        .vocab_size(30000)
        .min_frequency(2)
        .show_progress(true)
        .continuing_subword_prefix("##".to_string())
        .special_tokens(vec![
            AddedToken::from("[CLS]", true),
            AddedToken::from("[SEP]", true),
            AddedToken::from("[UNK]", true),
            AddedToken::from("[PAD]", true),
            AddedToken::from("[MASK]", true),
        ]);

    // Create tokenizer and train
    let mut tokenizer = Tokenizer::new(model);

    // Train from iterator of text lines
    let training_data = vec![
        "Hello world!",
        "This is training data.",
        "WordPiece tokenization is powerful.",
    ];

    tokenizer.train(
        &mut trainer,
        training_data.into_iter()
    )?;

    Ok(())
}
```

### Training Progress and Configuration

```rust
use tokenizers::models::bpe::{BPE, BpeTrainer};
use tokenizers::{Tokenizer, AddedToken};
use std::collections::BTreeSet;

fn advanced_bpe_training() -> tokenizers::Result<()> {
    let mut model = BPE::default();
    let mut trainer = BpeTrainer::new();

    // Advanced configuration
    let alphabet: BTreeSet<char> = "abcdefghijklmnopqrstuvwxyz".chars().collect();

    trainer
        .vocab_size(50000)
        .min_frequency(5)
        .show_progress(true)
        .limit_alphabet(Some(1000))
        .initial_alphabet(alphabet)
        .continuing_subword_prefix(Some("##".to_string()))
        .end_of_word_suffix(Some("</w>".to_string()))
        .special_tokens(vec![
            AddedToken::from("<unk>", true),
            AddedToken::from("<s>", true),
            AddedToken::from("</s>", true),
        ]);

    let mut tokenizer = Tokenizer::new(model);

    // Large-scale training from multiple files
    let training_files = vec![
        "./corpus1.txt".to_string(),
        "./corpus2.txt".to_string(),
        "./corpus3.txt".to_string(),
    ];

    tokenizer.train_from_files(&mut trainer, training_files)?;

    // Test the trained tokenizer
    let encoding = tokenizer.encode("Hello world!", false)?;
    println!("Tokens: {:?}", encoding.get_tokens());

    Ok(())
}
```