# Utilities

Support utilities including padding, truncation, parallelism configuration, and additional helper functions for tokenization workflows.

## Capabilities

### Padding

Configuration and functions for padding token sequences to uniform lengths.
```rust { .api }
pub struct PaddingParams {
    pub strategy: PaddingStrategy,
    pub direction: PaddingDirection,
    pub pad_to_multiple_of: Option<usize>,
    pub pad_id: u32,
    pub pad_type_id: u32,
    pub pad_token: String,
}

pub enum PaddingStrategy {
    BatchLongest,
    Fixed(usize),
}

pub enum PaddingDirection {
    Left,
    Right,
}

/// Pad encodings to uniform length
pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Result<()>;
```
### Truncation

Configuration and functions for truncating long sequences.

```rust { .api }
pub struct TruncationParams {
    pub direction: TruncationDirection,
    pub max_length: usize,
    pub strategy: TruncationStrategy,
    pub stride: usize,
}

pub enum TruncationDirection {
    Left,
    Right,
}

pub enum TruncationStrategy {
    LongestFirst,
    OnlyFirst,
    OnlySecond,
}

pub enum TruncationError {
    SecondSequenceNotProvided,
    SequenceTooShort,
}

/// Truncate encodings to specified maximum length
pub fn truncate_encodings(
    encoding: Encoding,
    pair_encoding: Option<Encoding>,
    params: &TruncationParams,
) -> Result<(Encoding, Option<Encoding>)>;
```
### Parallelism

Parallelism configuration and utilities.

```rust { .api }
/// Environment variable for controlling parallelism
pub const ENV_VARIABLE: &str = "TOKENIZERS_PARALLELISM";
```

### Iterator Utilities

```rust { .api }
/// Iterator that preserves line endings
pub use crate::utils::iter::LinesWithEnding;
```

### HTTP Support

HTTP downloading capabilities when the `http` feature is enabled.

```rust { .api }
#[cfg(feature = "http")]
pub struct FromPretrainedParameters {
    // Configuration for downloading from Hugging Face Hub
}
```
## Usage Examples

### Padding Configuration

```rust
use tokenizers::utils::padding::{PaddingParams, PaddingStrategy, PaddingDirection, pad_encodings};
use tokenizers::tokenizer::Tokenizer;
use tokenizers::models::bpe::BPE;

fn configure_padding() -> tokenizers::Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let mut tokenizer = Tokenizer::new(bpe);

    // Configure padding parameters
    let padding_params = PaddingParams {
        strategy: PaddingStrategy::BatchLongest,
        direction: PaddingDirection::Right,
        pad_to_multiple_of: Some(8),
        pad_id: 0,
        pad_type_id: 0,
        pad_token: "[PAD]".to_string(),
    };

    tokenizer.with_padding(Some(padding_params));

    // Encode batch - will be automatically padded
    let texts = vec!["Short text", "This is a much longer text that will need padding"];
    let encodings = tokenizer.encode_batch(texts, false)?;

    for (i, encoding) in encodings.iter().enumerate() {
        println!("Sequence {}: {:?}", i, encoding.get_tokens());
        println!("Length: {}", encoding.len());
    }

    Ok(())
}
```
### Truncation Configuration

```rust
use tokenizers::utils::truncation::{TruncationParams, TruncationStrategy, TruncationDirection};
use tokenizers::tokenizer::Tokenizer;
use tokenizers::models::bpe::BPE;

fn configure_truncation() -> tokenizers::Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let mut tokenizer = Tokenizer::new(bpe);

    // Configure truncation parameters
    let truncation_params = TruncationParams {
        direction: TruncationDirection::Right,
        max_length: 512,
        strategy: TruncationStrategy::LongestFirst,
        stride: 0,
    };

    tokenizer.with_truncation(Some(truncation_params))?;

    // Long text will be truncated
    let long_text = "Very long text that exceeds the maximum length...".repeat(100);
    let encoding = tokenizer.encode(long_text, false)?;

    println!("Truncated length: {}", encoding.len()); // Will be <= 512
    println!("Tokens: {:?}", &encoding.get_tokens()[..10]); // First 10 tokens

    Ok(())
}
```
### Parallelism Control

```rust
use std::env;

fn configure_parallelism() {
    // TOKENIZERS_PARALLELISM is a boolean switch ("true"/"false"), not a
    // thread count: it enables or disables parallel batch processing.
    env::set_var("TOKENIZERS_PARALLELISM", "false");

    // With parallelism disabled, batch operations and training run
    // single-threaded. To cap the number of worker threads instead, set
    // RAYON_NUM_THREADS, which controls the rayon thread pool the
    // tokenizers crate uses.
}
```
### Loading from Hugging Face Hub

```rust
#[cfg(feature = "http")]
use tokenizers::{Tokenizer, FromPretrainedParameters};

#[cfg(feature = "http")]
fn load_from_hub() -> tokenizers::Result<()> {
    // Load a pre-trained tokenizer from Hugging Face Hub
    let tokenizer = Tokenizer::from_pretrained("bert-base-uncased", None)?;

    let encoding = tokenizer.encode("Hello from Hugging Face!", false)?;
    println!("Tokens: {:?}", encoding.get_tokens());

    // With custom parameters (e.g. a specific revision or auth token);
    // unspecified fields fall back to their defaults
    let params = FromPretrainedParameters {
        ..Default::default()
    };

    let tokenizer = Tokenizer::from_pretrained("gpt2", Some(params))?;

    Ok(())
}
```
### Combined Configuration

```rust
use tokenizers::tokenizer::Tokenizer;
use tokenizers::models::bpe::BPE;
use tokenizers::utils::padding::{PaddingParams, PaddingStrategy, PaddingDirection};
use tokenizers::utils::truncation::{TruncationParams, TruncationStrategy, TruncationDirection};

fn complete_configuration() -> tokenizers::Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let mut tokenizer = Tokenizer::new(bpe);

    // Configure both padding and truncation
    let padding = PaddingParams {
        strategy: PaddingStrategy::Fixed(512),
        direction: PaddingDirection::Right,
        pad_to_multiple_of: None,
        pad_id: 0,
        pad_type_id: 0,
        pad_token: "[PAD]".to_string(),
    };

    let truncation = TruncationParams {
        direction: TruncationDirection::Right,
        max_length: 512,
        strategy: TruncationStrategy::LongestFirst,
        stride: 0,
    };

    tokenizer
        .with_padding(Some(padding))
        .with_truncation(Some(truncation))?;

    // Now all encodings will be exactly 512 tokens.
    // All elements are `String` so the vec is homogeneous
    // (`"...".repeat(100)` returns a `String`).
    let texts = vec![
        "Short text".to_string(),
        "Medium length text with more words".to_string(),
        "Very long text that will be truncated...".repeat(100),
    ];

    let encodings = tokenizer.encode_batch(texts, false)?;

    for encoding in encodings {
        assert_eq!(encoding.len(), 512);
        println!("All sequences now have length: {}", encoding.len());
    }

    Ok(())
}
```