or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-tokenization.mddecoders.mdindex.mdmodels.mdnormalizers.mdpost-processors.mdpre-tokenizers.mdtraining.mdutilities.md

post-processors.mddocs/

# Post-processing

Post-processing components that add special tokens and modify encodings for different model architectures like BERT, RoBERTa, and custom templates.

## Capabilities

### PostProcessor Trait

All post-processors implement the `PostProcessor` trait.

```rust { .api }
pub trait PostProcessor {
    fn added_tokens(&self, is_pair: bool) -> usize;
    fn process(
        &self,
        encoding: Encoding,
        pair_encoding: Option<Encoding>,
        add_special_tokens: bool,
    ) -> Result<Encoding>;
    fn process_encodings(
        &self,
        encodings: Vec<Encoding>,
        add_special_tokens: bool,
    ) -> Result<Vec<Encoding>>;
}
```

### PostProcessor Wrapper

```rust { .api }
pub enum PostProcessorWrapper {
    Roberta(RobertaProcessing),
    Bert(BertProcessing),
    ByteLevel(ByteLevel),
    Template(TemplateProcessing),
    Sequence(Sequence),
}
```

### Common Post-Processors

```rust { .api }
// BERT-style processing with [CLS] and [SEP] tokens
pub struct BertProcessing;
impl BertProcessing {
    pub fn new(sep: (String, u32), cls: (String, u32)) -> Self;
}

// RoBERTa-style processing
pub struct RobertaProcessing;
impl RobertaProcessing {
    pub fn new(
        sep: (String, u32),
        cls: (String, u32),
        trim_offsets: bool,
        add_prefix_space: bool
    ) -> Self;
}

// Template-based processing
pub struct TemplateProcessing;

// Sequence of post-processors
pub struct Sequence;

// Byte-level post-processing
pub struct ByteLevel;
```

## Usage Examples

```rust
use tokenizers::processors::{PostProcessorWrapper, BertProcessing};
use tokenizers::tokenizer::Tokenizer;
use tokenizers::models::bpe::BPE;

fn create_bert_processor() -> tokenizers::Result<()> {
    let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;
    let mut tokenizer = Tokenizer::new(bpe);

    // Add BERT-style [CLS] and [SEP] tokens
    let processor = BertProcessing::new(
        ("[SEP]".to_string(), 102),
        ("[CLS]".to_string(), 101),
    );

    tokenizer.with_post_processor(Some(processor));

    let encoding = tokenizer.encode("Hello world!", true)?;
    println!("BERT processed tokens: {:?}", encoding.get_tokens());
    // Output: ["[CLS]", "Hello", "world", "!", "[SEP]"]

    Ok(())
}
```