High-performance tokenization library with BPE, WordPiece, WordLevel, and Unigram models for natural language processing.
pkg:cargo/tokenizers@0.22.x
npx @tessl/cli install tessl/cargo-tokenizers@0.22.0

A high-performance Rust tokenization library that provides implementations of modern tokenization algorithms, including BPE, WordPiece, WordLevel, and Unigram models. The library is designed around a composable pipeline architecture with five core components, offering high performance through its Rust implementation while remaining flexible and easy to use for natural language processing applications.
tokenizers = "0.22.0"use tokenizers::{Tokenizer, Encoding, Result};
use tokenizers::models::bpe::BPE;

For specific components:
use tokenizers::{
AddedToken,
Model,
NormalizerWrapper,
PreTokenizerWrapper,
PostProcessorWrapper,
DecoderWrapper
};

use tokenizers::{Tokenizer, Result, EncodeInput};
use tokenizers::models::bpe::BPE;
fn main() -> Result<()> {
// Create a BPE model from vocabulary and merges files
let bpe = BPE::from_file("./vocab.json", "./merges.txt")
.dropout(0.1)
.unk_token("[UNK]".into())
.build()?;
// Create a tokenizer with the BPE model
let mut tokenizer = Tokenizer::new(bpe);
// Encode text
let encoding = tokenizer.encode("Hello, world!", false)?;
println!("Tokens: {:?}", encoding.get_tokens());
println!("IDs: {:?}", encoding.get_ids());
// Decode back to text
let decoded = tokenizer.decode(encoding.get_ids(), false)?;
println!("Decoded: {}", decoded);
Ok(())
}

The tokenizers library uses a composable pipeline architecture with five core components: normalization, pre-tokenization, the tokenization model, post-processing, and decoding.
This modular design allows mixing and matching components to create custom tokenization pipelines while providing sensible defaults for common use cases. The library prioritizes performance through Rust implementation and supports CPU parallelism.
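As an illustration of composing a custom pipeline, the sketch below follows the crate's builder-style API (TokenizerBuilder); the trainer settings, corpus path, and output path are placeholders.

use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence};
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::{
    AddedToken, DecoderWrapper, NormalizerWrapper, PostProcessorWrapper, PreTokenizerWrapper,
    Result, TokenizerBuilder, TokenizerImpl,
};

fn main() -> Result<()> {
    // Trainer for the BPE model (vocabulary size and special tokens are illustrative).
    let mut trainer = BpeTrainerBuilder::new()
        .show_progress(true)
        .vocab_size(30_000)
        .min_frequency(0)
        .special_tokens(vec![AddedToken::from(String::from("[UNK]"), true)])
        .build();

    // Compose the five pipeline components around a BPE model.
    let mut tokenizer: TokenizerImpl<
        BPE,
        NormalizerWrapper,
        PreTokenizerWrapper,
        PostProcessorWrapper,
        DecoderWrapper,
    > = TokenizerBuilder::new()
        .with_model(BPE::default())
        .with_normalizer(Some(Sequence::new(vec![
            Strip::new(true, true).into(),
            NFC.into(),
        ])))
        .with_pre_tokenizer(Some(ByteLevel::default()))
        .with_post_processor(Some(ByteLevel::default()))
        .with_decoder(Some(ByteLevel::default()))
        .build()?;

    // Train from plain-text files and serialize the result.
    tokenizer
        .train_from_files(&mut trainer, vec!["corpus.txt".to_string()])?
        .save("tokenizer.json", false)?;

    Ok(())
}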
Primary tokenizer implementation with encoding, decoding, and configuration management. Provides the main Tokenizer struct and pipeline orchestration.
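A short usage sketch of the encoding, batch, and vocabulary methods listed below; the tokenizer.json file is assumed to exist (for example, saved by a previous training run).

use tokenizers::{Result, Tokenizer};

fn main() -> Result<()> {
    // Load a serialized tokenizer from disk.
    let tokenizer = Tokenizer::from_file("tokenizer.json")?;

    // Encode a whole batch in one call.
    let encodings = tokenizer.encode_batch(vec!["Hello, world!", "How are you?"], true)?;
    for encoding in &encodings {
        println!("tokens: {:?}", encoding.get_tokens());
        println!("ids: {:?}", encoding.get_ids());
        println!("attention mask: {:?}", encoding.get_attention_mask());
    }

    // Vocabulary lookups.
    println!("vocab size: {}", tokenizer.get_vocab_size(true));
    if let Some(id) = tokenizer.token_to_id("[UNK]") {
        println!("[UNK] -> {}", id);
    }
    Ok(())
}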
pub struct Tokenizer(TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>);
impl Tokenizer {
pub fn new(model: impl Into<ModelWrapper>) -> Self;
pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>;
pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>;
pub fn encode<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>;
pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String>;
// Batch processing methods
pub fn encode_batch<'s, E>(&self, input: Vec<E>, add_special_tokens: bool) -> Result<Vec<Encoding>>;
pub fn decode_batch(&self, sequences: Vec<&[u32]>, skip_special_tokens: bool) -> Result<Vec<String>>;
// Vocabulary access
pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>;
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize;
pub fn token_to_id(&self, token: &str) -> Option<u32>;
pub fn id_to_token(&self, id: u32) -> Option<String>;
// Training methods
pub fn train_from_files(&mut self, trainer: &mut dyn Trainer, files: Vec<String>) -> Result<()>;
pub fn train<I, S>(&mut self, trainer: &mut dyn Trainer, sequences: I) -> Result<()>
where
I: Iterator<Item = S> + Send,
S: AsRef<str> + Send;
// Token management
pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize;
pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize;
// Component access
pub fn get_model(&self) -> &ModelWrapper;
pub fn get_normalizer(&self) -> Option<&NormalizerWrapper>;
pub fn get_pre_tokenizer(&self) -> Option<&PreTokenizerWrapper>;
pub fn get_post_processor(&self) -> Option<&PostProcessorWrapper>;
pub fn get_decoder(&self) -> Option<&DecoderWrapper>;
}
pub struct Encoding {
// Contains ids, tokens, offsets, type_ids, attention_mask, special_tokens_mask, etc.
}
impl Encoding {
// Basic data access
pub fn get_ids(&self) -> &[u32];
pub fn get_tokens(&self) -> &[String];
pub fn get_offsets(&self) -> &[Offsets];
pub fn get_word_ids(&self) -> &[Option<u32>];
pub fn get_sequence_ids(&self) -> &[Option<u32>];
pub fn get_type_ids(&self) -> &[u32];
pub fn get_attention_mask(&self) -> &[u32];
pub fn get_special_tokens_mask(&self) -> &[u32];
pub fn get_overflowing(&self) -> &[Encoding];
// Character and word mapping
pub fn word_to_tokens(&self, word_index: u32) -> Option<(usize, usize)>;
pub fn word_to_chars(&self, word_index: u32) -> Option<Offsets>;
pub fn token_to_chars(&self, token_index: usize) -> Option<Offsets>;
pub fn token_to_word(&self, token_index: usize) -> Option<u32>;
pub fn char_to_token(&self, char_index: usize) -> Option<usize>;
pub fn char_to_word(&self, char_index: usize) -> Option<u32>;
// Length information
pub fn len(&self) -> usize;
pub fn is_empty(&self) -> bool;
// Modification methods
pub fn truncate(&mut self, max_length: usize, stride: usize, direction: TruncationDirection);
pub fn pad(&mut self, length: usize, pad_id: u32, pad_type_id: u32, pad_token: &str, direction: PaddingDirection);
// Combination
pub fn merge_with(&mut self, pair: Encoding, growing_offsets: bool) -> Result<()>;
}

Implementation of different tokenization algorithms including BPE (Byte-Pair Encoding), WordPiece, WordLevel, and Unigram models with their respective trainers and builders.
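Because each algorithm implements the shared Model trait shown below, calling code can stay generic over the concrete model; a minimal sketch:

use tokenizers::{Model, Result};

// Works for BPE, WordPiece, WordLevel, or Unigram alike, since they all implement `Model`.
fn inspect_model(model: &impl Model, text: &str) -> Result<()> {
    println!("vocab size: {}", model.get_vocab_size());
    for token in model.tokenize(text)? {
        println!("{:?} -> id {} at offsets {:?}", token.value, token.id, token.offsets);
    }
    Ok(())
}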
pub trait Model {
type Trainer: Trainer + Sync;
fn tokenize(&self, sequence: &str) -> Result<Vec<Token>>;
fn token_to_id(&self, token: &str) -> Option<u32>;
fn id_to_token(&self, id: u32) -> Option<String>;
fn get_vocab(&self) -> HashMap<String, u32>;
fn get_vocab_size(&self) -> usize;
}
pub struct BPE {
pub dropout: Option<f32>,
pub unk_token: Option<String>,
// Other configuration fields
}
impl BPE {
pub fn builder() -> BpeBuilder;
pub fn from_file(vocab: &str, merges: &str) -> BpeBuilder;
}

Text preprocessing capabilities including unicode normalization, case conversion, accent stripping, and custom text transformations applied before tokenization.
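A sketch of running normalizers directly on a NormalizedString, assuming NFD, StripAccents, Lowercase, and Sequence are re-exported from tokenizers::normalizers and that NormalizedString converts from a &str:

use tokenizers::normalizers::{Lowercase, Sequence, StripAccents, NFD};
use tokenizers::{NormalizedString, Normalizer, Result};

fn main() -> Result<()> {
    // Chain several normalizers; they are applied in order.
    let normalizer = Sequence::new(vec![
        NFD.into(),          // unicode decomposition
        StripAccents.into(), // drop combining accents
        Lowercase.into(),    // case folding
    ]);

    let mut normalized = NormalizedString::from("Héllo, WORLD!");
    normalizer.normalize(&mut normalized)?;
    println!("{}", normalized.get()); // expected: "hello, world!"
    Ok(())
}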
pub trait Normalizer {
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;
}
pub enum NormalizerWrapper {
BertNormalizer(BertNormalizer),
StripNormalizer(Strip),
StripAccents(StripAccents),
NFC(NFC), NFD(NFD), NFKC(NFKC), NFKD(NFKD),
Sequence(Sequence),
Lowercase(Lowercase),
// Other normalizers
}

Text splitting strategies that divide input into initial segments while preserving offset mapping, including whitespace-based, punctuation-aware, and custom pattern splitting.
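A sketch of running a pre-tokenizer on its own, following the Whitespace / PreTokenizedString pattern from the upstream documentation; the OffsetReferential and OffsetType re-exports at the crate root are assumed:

use tokenizers::pre_tokenizers::whitespace::Whitespace;
use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer, Result};

fn main() -> Result<()> {
    let pre_tokenizer = Whitespace {};
    let mut pre_tokenized = PreTokenizedString::from("Hello, world! How are you?");
    pre_tokenizer.pre_tokenize(&mut pre_tokenized)?;

    // Each split keeps its offsets into the original input.
    for (segment, offsets, _tokens) in
        pre_tokenized.get_splits(OffsetReferential::Original, OffsetType::Byte)
    {
        println!("{:?} at {:?}", segment, offsets);
    }
    Ok(())
}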
pub trait PreTokenizer {
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()>;
}
pub enum PreTokenizerWrapper {
BertPreTokenizer(BertPreTokenizer),
ByteLevel(ByteLevel),
Whitespace(Whitespace),
Sequence(Sequence),
// Other pre-tokenizers
}

Special token addition and encoding modifications for different model architectures including BERT, RoBERTa, and custom template-based processing.
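A sketch of a BERT-style template post-processor; the [CLS]/[SEP] ids are placeholders, and the result would be attached through the tokenizer's post-processor setter:

use tokenizers::processors::template::TemplateProcessing;

fn main() {
    // "$A" and "$B" stand for the first and second sequence; ":1" assigns type id 1.
    let _post_processor = TemplateProcessing::builder()
        .try_single("[CLS] $A [SEP]")
        .unwrap()
        .try_pair("[CLS] $A [SEP] $B:1 [SEP]:1")
        .unwrap()
        .special_tokens(vec![("[CLS]", 1), ("[SEP]", 2)])
        .build()
        .unwrap();
}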
pub trait PostProcessor {
fn added_tokens(&self, is_pair: bool) -> usize;
fn process(
&self,
encoding: Encoding,
pair_encoding: Option<Encoding>,
add_special_tokens: bool,
) -> Result<Encoding>;
}
pub enum PostProcessorWrapper {
Roberta(RobertaProcessing),
Bert(BertProcessing),
Template(TemplateProcessing),
// Other processors
}

Token-to-text conversion with support for different decoding strategies including BPE decoding, WordPiece decoding, byte-level decoding, and custom decoders.
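A sketch of using a decoder on its own, here the WordPiece decoder merging "##" continuation pieces back into words (its Default configuration is assumed to use the standard "##" prefix):

use tokenizers::decoders::wordpiece::WordPiece;
use tokenizers::{Decoder, Result};

fn main() -> Result<()> {
    let decoder = WordPiece::default();
    let tokens = vec!["un".to_string(), "##aff".to_string(), "##able".to_string()];
    println!("{}", decoder.decode(tokens)?); // expected: "unaffable"
    Ok(())
}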
pub trait Decoder {
fn decode(&self, tokens: Vec<String>) -> Result<String>;
fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>>;
}
pub enum DecoderWrapper {
BPE(BPEDecoder),
ByteLevel(ByteLevel),
WordPiece(WordPiece),
// Other decoders
}

Training capabilities for all supported tokenization models with configurable parameters, progress tracking, and special token handling.
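A sketch of configuring a trainer (BPE shown; the other trainers expose analogous builders); vocabulary size, frequency cutoff, and special tokens are illustrative:

use tokenizers::models::bpe::BpeTrainerBuilder;
use tokenizers::AddedToken;

fn main() {
    let _trainer = BpeTrainerBuilder::new()
        .show_progress(true)
        .vocab_size(30_000)
        .min_frequency(2)
        .special_tokens(vec![
            AddedToken::from(String::from("[UNK]"), true),
            AddedToken::from(String::from("[PAD]"), true),
        ])
        .build();
    // Pass the trainer to `train` or `train_from_files` on a tokenizer.
}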
pub trait Trainer {
type Model: Model + Sized;
fn should_show_progress(&self) -> bool;
fn train(&self, model: &mut Self::Model) -> Result<Vec<AddedToken>>;
fn feed<I, S, F>(&mut self, iterator: I, process: F) -> Result<()>
where
I: Iterator<Item = S> + Send,
S: AsRef<str> + Send,
F: Fn(&str) -> Result<Vec<String>> + Sync;
}
pub struct BpeTrainer;
pub struct WordPieceTrainer;
pub struct WordLevelTrainer;
pub struct UnigramTrainer;

Support utilities including padding, truncation, parallelism configuration, and HTTP downloading capabilities.
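A sketch of enabling padding, assuming PaddingParams implements Default and the tokenizer exposes a with_padding setter taking Option<PaddingParams>:

use tokenizers::{PaddingParams, Result, Tokenizer};

fn main() -> Result<()> {
    let mut tokenizer = Tokenizer::from_file("tokenizer.json")?;

    // Pad each batch to its longest sequence with a shared pad token.
    let padding = PaddingParams {
        pad_id: 0,
        pad_token: String::from("[PAD]"),
        ..Default::default()
    };
    tokenizer.with_padding(Some(padding));

    let encodings = tokenizer.encode_batch(vec!["short", "a much longer input"], true)?;
    println!(
        "padded lengths: {:?}",
        encodings.iter().map(|e| e.len()).collect::<Vec<_>>()
    );
    Ok(())
}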
pub struct PaddingParams {
pub strategy: PaddingStrategy,
pub direction: PaddingDirection,
pub pad_id: u32,
// Other padding fields
}
pub struct TruncationParams {
pub direction: TruncationDirection,
pub max_length: usize,
pub strategy: TruncationStrategy,
// Other truncation fields
}
pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Result<()>;
pub fn truncate_encodings(encoding: Encoding, pair_encoding: Option<Encoding>, params: &TruncationParams) -> Result<(Encoding, Option<Encoding>)>;

use std::borrow::Cow;
use std::collections::HashMap;
pub type Error = Box<dyn std::error::Error + Send + Sync>;
pub type Result<T> = std::result::Result<T, Error>;
pub type Offsets = (usize, usize);
pub struct Token {
pub id: u32,
pub value: String,
pub offsets: (usize, usize),
}
pub struct AddedToken {
pub content: String,
pub single_word: bool,
pub lstrip: bool,
pub rstrip: bool,
pub normalized: bool,
pub special: bool,
}
pub struct AddedVocabulary {
// Manages special tokens and added vocabulary
}
pub enum InputSequence<'s> {
Raw(Cow<'s, str>),
PreTokenized(Cow<'s, [&'s str]>),
PreTokenizedOwned(Cow<'s, [String]>),
PreTokenizedCow(Cow<'s, [Cow<'s, str>]>),
}
pub enum EncodeInput<'s> {
Single(InputSequence<'s>),
Dual(InputSequence<'s>, InputSequence<'s>),
}
pub enum TruncationDirection {
Left,
Right,
}
pub enum PaddingDirection {
Left,
Right,
}
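Single and Dual inputs correspond to single sentences and sentence pairs; in practice a plain &str or a tuple of two &str converts into EncodeInput, as in this sketch (the usual From conversions are assumed):

use tokenizers::{Result, Tokenizer};

fn main() -> Result<()> {
    let tokenizer = Tokenizer::from_file("tokenizer.json")?;

    // A single sequence becomes EncodeInput::Single...
    let single = tokenizer.encode("How old are you?", true)?;
    // ...and a tuple becomes EncodeInput::Dual, e.g. a question/context pair.
    let pair = tokenizer.encode(("How old are you?", "I'm 30 years old."), true)?;

    println!("single type ids: {:?}", single.get_type_ids());
    println!("pair type ids: {:?}", pair.get_type_ids());
    Ok(())
}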