or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-tokenization.md decoders.md index.md models.md normalizers.md post-processors.md pre-tokenizers.md training.md utilities.md

docs/normalizers.md

0

# Text Normalization

1

2

Text normalization components preprocess input text before tokenization. They cover Unicode normalization, case conversion, accent removal, and custom text transformations.

3

4

## Capabilities

5

6

### Normalizer Trait

7

8

All normalizers implement the `Normalizer` trait.

9

10

```rust { .api }

11

pub trait Normalizer {

12

fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;

13

}

14

```

15

16

### Normalizer Wrapper

17

18

The `NormalizerWrapper` enum provides a unified interface for all normalizer types.

19

20

```rust { .api }

21

pub enum NormalizerWrapper {

22

BertNormalizer(BertNormalizer),

23

StripNormalizer(Strip),

24

StripAccents(StripAccents),

25

NFC(NFC),

26

NFD(NFD),

27

NFKC(NFKC),

28

NFKD(NFKD),

29

Sequence(Sequence),

30

Lowercase(Lowercase),

31

Nmt(Nmt),

32

Precompiled(Precompiled),

33

Replace(Replace),

34

Prepend(Prepend),

35

ByteLevel(ByteLevel),

36

}

37

```

38

39

### Common Normalizers

40

41

```rust { .api }

42

// BERT-style normalization with configurable options

43

pub struct BertNormalizer {

44

pub clean_text: bool,

45

pub handle_chinese_chars: bool,

46

pub strip_accents: Option<bool>,

47

pub lowercase: bool,

48

}

49

50

impl BertNormalizer {

51

pub fn new(

52

clean_text: bool,

53

handle_chinese_chars: bool,

54

strip_accents: Option<bool>,

55

lowercase: bool,

56

) -> Self;

57

}

58

59

impl Default for BertNormalizer {

60

fn default() -> Self;

61

}

62

63

// Strip whitespace

64

pub struct Strip;

65

impl Strip {

66

pub fn new(left: bool, right: bool) -> Self;

67

}

68

69

// Remove accents

70

pub struct StripAccents;

71

72

// Unicode normalization forms

73

pub struct NFC;

74

pub struct NFD;

75

pub struct NFKC;

76

pub struct NFKD;

77

78

// Convert to lowercase

79

pub struct Lowercase;

80

81

// Sequence of normalizers

82

pub struct Sequence;

83

impl Sequence {

84

pub fn new(normalizers: Vec<NormalizerWrapper>) -> Self;

85

}

86

87

// Pattern replacement

88

pub struct Replace;

89

90

// Prepend text

91

pub struct Prepend;

92

93

// Byte-level normalization

94

pub struct ByteLevel;

95

```

96

97

### NormalizedString

98

99

`NormalizedString` is a string type that tracks normalization changes and maintains an offset mapping between the original and the normalized text.

100

101

```rust { .api }

102

pub struct NormalizedString {

103

// Private fields for normalized content and offset tracking

104

}

105

106

pub enum OffsetReferential {

107

Original,

108

Normalized,

109

}

110

111

pub enum SplitDelimiterBehavior {

112

Removed,

113

Isolated,

114

MergedWithPrevious,

115

MergedWithNext,

116

Contiguous,

117

}

118

```

119

120

## Usage Examples

121

122

```rust

123

use tokenizers::normalizers::{NormalizerWrapper, Sequence, Strip, Lowercase, NFC};

124

use tokenizers::tokenizer::Tokenizer;

125

use tokenizers::models::bpe::BPE;

126

127

fn create_normalizer_sequence() -> tokenizers::Result<()> {

128

let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;

129

let mut tokenizer = Tokenizer::new(bpe);

130

131

// Create a sequence of normalizers

132

let normalizers = vec![

133

Strip::new(true, true).into(), // Strip leading/trailing whitespace

134

NFC.into(), // Unicode NFC normalization

135

Lowercase.into(), // Convert to lowercase

136

];

137

138

let sequence = Sequence::new(normalizers);

139

tokenizer.with_normalizer(Some(sequence));

140

141

let encoding = tokenizer.encode(" Hello WORLD! ", false)?;

142

println!("Normalized tokens: {:?}", encoding.get_tokens());

143

144

Ok(())

145

}

146

```