<!-- docs/utilities.md -->

# Utilities

Support utilities including padding, truncation, parallelism configuration, and additional helper functions for tokenization workflows.

## Capabilities

### Padding

Configuration and functions for padding token sequences to uniform lengths.

```rust { .api }
pub struct PaddingParams {
    pub strategy: PaddingStrategy,
    pub direction: PaddingDirection,
    pub pad_to_multiple_of: Option<usize>,
    pub pad_id: u32,
    pub pad_type_id: u32,
    pub pad_token: String,
}

pub enum PaddingStrategy {
    BatchLongest,
    Fixed(usize),
}

pub enum PaddingDirection {
    Left,
    Right,
}

/// Pad encodings to uniform length
pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Result<()>;
```

### Truncation

Configuration and functions for truncating long sequences.

```rust { .api }
pub struct TruncationParams {
    pub direction: TruncationDirection,
    pub max_length: usize,
    pub strategy: TruncationStrategy,
    pub stride: usize,
}

pub enum TruncationDirection {
    Left,
    Right,
}

pub enum TruncationStrategy {
    LongestFirst,
    OnlyFirst,
    OnlySecond,
}

pub enum TruncationError {
    SecondSequenceNotProvided,
    SequenceTooShort,
}

/// Truncate encodings to the specified maximum length
pub fn truncate_encodings(
    encoding: Encoding,
    pair_encoding: Option<Encoding>,
    params: &TruncationParams,
) -> Result<(Encoding, Option<Encoding>)>;
```

### Parallelism

Parallelism configuration and utilities.

```rust { .api }
/// Environment variable for controlling parallelism
pub const ENV_VARIABLE: &str = "TOKENIZERS_PARALLELISM";
```

### Iterator Utilities

```rust { .api }
/// Iterator that preserves line endings
pub use crate::utils::iter::LinesWithEnding;
```

### HTTP Support

HTTP downloading capabilities when the `http` feature is enabled.

```rust { .api }
#[cfg(feature = "http")]
pub struct FromPretrainedParameters {
    // Configuration for downloading from Hugging Face Hub
}
```

## Usage Examples

### Padding Configuration

```rust
use tokenizers::utils::padding::{PaddingParams, PaddingStrategy, PaddingDirection, pad_encodings};
use tokenizers::tokenizer::Tokenizer;
use tokenizers::models::bpe::BPE;

fn configure_padding() -> tokenizers::Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let mut tokenizer = Tokenizer::new(bpe);

    // Configure padding parameters
    let padding_params = PaddingParams {
        strategy: PaddingStrategy::BatchLongest,
        direction: PaddingDirection::Right,
        pad_to_multiple_of: Some(8),
        pad_id: 0,
        pad_type_id: 0,
        pad_token: "[PAD]".to_string(),
    };

    tokenizer.with_padding(Some(padding_params));

    // Encode batch - will be automatically padded
    let texts = vec!["Short text", "This is a much longer text that will need padding"];
    let encodings = tokenizer.encode_batch(texts, false)?;

    for (i, encoding) in encodings.iter().enumerate() {
        println!("Sequence {}: {:?}", i, encoding.get_tokens());
        println!("Length: {}", encoding.len());
    }

    Ok(())
}
```

### Truncation Configuration

```rust
use tokenizers::utils::truncation::{TruncationParams, TruncationStrategy, TruncationDirection};
use tokenizers::tokenizer::Tokenizer;
use tokenizers::models::bpe::BPE;

fn configure_truncation() -> tokenizers::Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let mut tokenizer = Tokenizer::new(bpe);

    // Configure truncation parameters
    let truncation_params = TruncationParams {
        direction: TruncationDirection::Right,
        max_length: 512,
        strategy: TruncationStrategy::LongestFirst,
        stride: 0,
    };

    tokenizer.with_truncation(Some(truncation_params))?;

    // Long text will be truncated
    let long_text = "Very long text that exceeds the maximum length...".repeat(100);
    let encoding = tokenizer.encode(long_text, false)?;

    println!("Truncated length: {}", encoding.len()); // Will be <= 512
    println!("Tokens: {:?}", &encoding.get_tokens()[..10]); // First 10 tokens

    Ok(())
}
```

### Parallelism Control

```rust
use std::env;

fn configure_parallelism() {
    // TOKENIZERS_PARALLELISM is a boolean flag: "true" enables the library's
    // internal parallelism, "false" disables it (useful to silence the
    // fork-after-parallelism warning). The actual thread count is governed by
    // the rayon thread pool (e.g. via RAYON_NUM_THREADS).
    env::set_var("TOKENIZERS_PARALLELISM", "true");

    // This affects batch operations and training
}
```

### Loading from Hugging Face Hub

```rust
#[cfg(feature = "http")]
use tokenizers::{Tokenizer, FromPretrainedParameters};

#[cfg(feature = "http")]
fn load_from_hub() -> tokenizers::Result<()> {
    // Load a pre-trained tokenizer from Hugging Face Hub
    let tokenizer = Tokenizer::from_pretrained("bert-base-uncased", None)?;

    let encoding = tokenizer.encode("Hello from Hugging Face!", false)?;
    println!("Tokens: {:?}", encoding.get_tokens());

    // With custom parameters: start from the defaults and override
    // individual fields as needed
    let params = FromPretrainedParameters {
        ..Default::default()
    };

    let tokenizer = Tokenizer::from_pretrained("gpt2", Some(params))?;

    Ok(())
}
```

### Combined Configuration

```rust
use tokenizers::tokenizer::Tokenizer;
use tokenizers::models::bpe::BPE;
use tokenizers::utils::padding::{PaddingParams, PaddingStrategy, PaddingDirection};
use tokenizers::utils::truncation::{TruncationParams, TruncationStrategy, TruncationDirection};

fn complete_configuration() -> tokenizers::Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let mut tokenizer = Tokenizer::new(bpe);

    // Configure both padding and truncation
    let padding = PaddingParams {
        strategy: PaddingStrategy::Fixed(512),
        direction: PaddingDirection::Right,
        pad_to_multiple_of: None,
        pad_id: 0,
        pad_type_id: 0,
        pad_token: "[PAD]".to_string(),
    };

    let truncation = TruncationParams {
        direction: TruncationDirection::Right,
        max_length: 512,
        strategy: TruncationStrategy::LongestFirst,
        stride: 0,
    };

    tokenizer
        .with_padding(Some(padding))
        .with_truncation(Some(truncation))?;

    // Now all encodings will be exactly 512 tokens.
    // All vec! elements must share one type, so the literals are converted
    // to String to match the String produced by `.repeat(100)`.
    let texts = vec![
        "Short text".to_string(),
        "Medium length text with more words".to_string(),
        "Very long text that will be truncated...".repeat(100),
    ];

    let encodings = tokenizer.encode_batch(texts, false)?;

    for encoding in encodings {
        assert_eq!(encoding.len(), 512);
        println!("All sequences now have length: {}", encoding.len());
    }

    Ok(())
}
```