or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-tokenization.md decoders.md index.md models.md normalizers.md post-processors.md pre-tokenizers.md training.md utilities.md

docs/pre-tokenizers.md

# Pre-tokenization

Pre-tokenization components that split input text into initial segments while preserving offset mapping, providing various splitting strategies before the main tokenization algorithm is applied.

## Capabilities

### PreTokenizer Trait

All pre-tokenizers implement the `PreTokenizer` trait.

```rust { .api }
pub trait PreTokenizer {
    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()>;
}
```

### PreTokenizer Wrapper

```rust { .api }
pub enum PreTokenizerWrapper {
    BertPreTokenizer(BertPreTokenizer),
    ByteLevel(ByteLevel),
    Delimiter(CharDelimiterSplit),
    Metaspace(Metaspace),
    Whitespace(Whitespace),
    Sequence(Sequence),
    Split(Split),
    Punctuation(Punctuation),
    WhitespaceSplit(WhitespaceSplit),
    Digits(Digits),
    UnicodeScripts(UnicodeScripts),
    FixedLength(FixedLength),
}
```

### Common Pre-Tokenizers

```rust { .api }
// BERT-style pre-tokenization
pub struct BertPreTokenizer;

// Byte-level pre-tokenization
pub struct ByteLevel;
impl ByteLevel {
    pub fn new(add_prefix_space: bool, trim_offsets: bool, use_regex: bool) -> Self;
    pub fn alphabet() -> Vec<String>;
}

// Basic whitespace splitting
pub struct Whitespace;
pub struct WhitespaceSplit;

// Metaspace replacement
pub struct Metaspace;
impl Metaspace {
    pub fn new(replacement: char, prepend_scheme: PrependScheme, split: bool) -> Self;
}

pub enum PrependScheme {
    Always,
    Never,
    First,
}

// Sequence of pre-tokenizers
pub struct Sequence;
impl Sequence {
    pub fn new(pretokenizers: Vec<PreTokenizerWrapper>) -> Self;
}

// Character delimiter splitting
pub struct CharDelimiterSplit;

// Pattern-based splitting
pub struct Split;

// Punctuation handling
pub struct Punctuation;

// Digit-aware splitting
pub struct Digits;

// Unicode script-based splitting
pub struct UnicodeScripts;

// Fixed-length tokenization
pub struct FixedLength;
```

### PreTokenizedString

String type that maintains pre-tokenization splits and offset information.

```rust { .api }
pub struct PreTokenizedString {
    // Private fields for splits and offset tracking
}
```

## Usage Examples

```rust
use tokenizers::pre_tokenizers::{PreTokenizerWrapper, Sequence, Whitespace, Punctuation};
use tokenizers::tokenizer::Tokenizer;
use tokenizers::models::bpe::BPE;

fn create_pretokenizer_sequence() -> tokenizers::Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let mut tokenizer = Tokenizer::new(bpe);

    // Combine whitespace and punctuation splitting
    let pretokenizers = vec![
        Whitespace.into(),
        Punctuation::new(tokenizers::pre_tokenizers::punctuation::Behavior::Isolated).into(),
    ];

    let sequence = Sequence::new(pretokenizers);
    tokenizer.with_pre_tokenizer(Some(sequence));

    let encoding = tokenizer.encode("Hello, world!", false)?;
    println!("Pre-tokenized: {:?}", encoding.get_tokens());

    Ok(())
}
```