or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-tokenization.md decoders.md index.md models.md normalizers.md post-processors.md pre-tokenizers.md training.md utilities.md

docs/decoders.md

0

# Decoding

1

2

Decoding components convert token sequences back into readable text. Supported decoding strategies include BPE, WordPiece, byte-level, and custom decoders.

3

4

## Capabilities

5

6

### Decoder Trait

7

8

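All decoders implement the `Decoder` trait: `decode` returns the decoded text as a single `String`, while `decode_chain` returns the processed tokens as a `Vec<String>`.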

9

10

```rust { .api }

11

pub trait Decoder {

12

fn decode(&self, tokens: Vec<String>) -> Result<String>;

13

fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>>;

14

}

15

```

16

17

### Decoder Wrapper

18

19

```rust { .api }

20

pub enum DecoderWrapper {

21

BPE(BPEDecoder),

22

ByteLevel(ByteLevel),

23

WordPiece(WordPiece),

24

Metaspace(Metaspace),

25

CTC(CTC),

26

Sequence(Sequence),

27

Replace(Replace),

28

Fuse(Fuse),

29

Strip(Strip),

30

ByteFallback(ByteFallback),

31

}

32

```

33

34

### Common Decoders

35

36

```rust { .api }

37

// BPE decoding

38

pub struct BPEDecoder {

39

pub suffix: String,

40

}

41

42

impl BPEDecoder {

43

pub fn new(suffix: String) -> Self;

44

}

45

46

// WordPiece decoding (removes ## prefixes)

47

pub struct WordPiece;

48

49

// Byte-level decoding

50

pub struct ByteLevel;

51

52

// Metaspace decoding (replaces the metaspace character, `▁` by default, with a space)

53

pub struct Metaspace;

54

55

// CTC decoding for speech recognition

56

pub struct CTC;

57

58

// Sequence of decoders

59

pub struct Sequence;

60

61

// Pattern replacement during decoding

62

pub struct Replace;

63

64

// Fuse all tokens into a single string

65

pub struct Fuse;

66

67

// Strip a configured character from the start and/or end of each token

68

pub struct Strip;

69

70

// Byte fallback decoding (converts tokens such as `<0x61>` back into raw bytes)

71

pub struct ByteFallback;

72

```

73

74

## Usage Examples

75

76

```rust

77

use tokenizers::decoders::{DecoderWrapper, BPEDecoder};

78

use tokenizers::tokenizer::Tokenizer;

79

use tokenizers::models::bpe::BPE;

80

81

fn create_bpe_decoder() -> tokenizers::Result<()> {

82

let bpe = BPE::from_file("./vocab.json", "./merges.txt").build()?;

83

let mut tokenizer = Tokenizer::new(bpe);

84

85

// Set BPE decoder (usually matches the model type)

86

tokenizer.with_decoder(Some(BPEDecoder::new("</w>".to_string())));

87

88

let encoding = tokenizer.encode("Hello world!", false)?;

89

let decoded = tokenizer.decode(encoding.get_ids(), false)?;

90

println!("Decoded: {}", decoded);

91

92

Ok(())

93

}

94

```