CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-llama-cpp-python

Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.

Pending
Overview
Eval results
Files

docs/tokenization.md

Tokenization

Native llama.cpp tokenization and HuggingFace tokenizer integration supporting different vocabulary types, encoding/decoding operations, and model-specific preprocessing.

Capabilities

Native Tokenization

Use the model's built-in tokenizer for consistent text processing.

class LlamaTokenizer:
    """Tokenizer backed by a Llama model's native (llama.cpp) vocabulary."""

    def __init__(self, llama: "Llama"):
        """Bind this tokenizer to an already-loaded Llama model.

        Args:
            llama: Llama model instance whose vocabulary is used.
        """

    def tokenize(self, text: str, add_bos: bool = True, special: bool = False) -> List[int]:
        """Encode text into a sequence of token IDs.

        Args:
            text: Text to encode.
            add_bos: Prepend the beginning-of-sequence token when True.
            special: Permit special/control tokens to appear in the output.

        Returns:
            Token IDs produced by the model's native tokenizer.
        """

    def detokenize(self, tokens: List[int], decode: bool = True) -> str:
        """Reconstruct text from a sequence of token IDs.

        Args:
            tokens: Token IDs to convert back to text.
            decode: When True, decode the raw bytes into a string.

        Returns:
            The reconstructed text.
        """

    def encode(self, text: str, add_bos: bool = True, special: bool = False) -> List[int]:
        """Alias for ``tokenize``.

        Args:
            text: Text to encode.
            add_bos: Prepend the beginning-of-sequence token when True.
            special: Permit special tokens in the output.

        Returns:
            Token IDs for the input text.
        """

    def decode(self, tokens: List[int], **kwargs) -> str:
        """Alias for ``detokenize``.

        Args:
            tokens: Token IDs to decode.
            **kwargs: Extra decoding options forwarded to the implementation.

        Returns:
            The decoded text.
        """

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """Alternate constructor: build a tokenizer from a GGML tokenizer file.

        Args:
            path: Filesystem path to the GGML tokenizer file.

        Returns:
            A new ``LlamaTokenizer`` instance.
        """

HuggingFace Tokenizer Integration

Use HuggingFace tokenizers for compatibility with Transformers ecosystem.

class LlamaHFTokenizer:
    """Adapter exposing a HuggingFace tokenizer through the llama-cpp interface."""

    def __init__(self, hf_tokenizer):
        """Wrap an existing HuggingFace tokenizer object.

        Args:
            hf_tokenizer: The HuggingFace tokenizer instance to delegate to.
        """

    def tokenize(self, text: str, add_bos: bool = True, special: bool = False) -> List[int]:
        """Encode text with the wrapped HuggingFace tokenizer.

        Args:
            text: Text to encode.
            add_bos: Prepend the beginning-of-sequence token when True.
            special: Permit special tokens in the output.

        Returns:
            Token IDs for the input text.
        """

    def detokenize(self, tokens: List[int], decode: bool = True) -> str:
        """Decode token IDs back to text via the wrapped tokenizer.

        Args:
            tokens: Token IDs to decode.
            decode: When True, decode to a string.

        Returns:
            The decoded text.
        """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs) -> "LlamaHFTokenizer":
        """Alternate constructor: load a tokenizer from the HuggingFace Hub or disk.

        Args:
            pretrained_model_name_or_path: Hub model name or local path.
            **kwargs: Extra options forwarded to the HuggingFace loader.

        Returns:
            A new ``LlamaHFTokenizer`` instance.
        """

Base Tokenizer Interface

Abstract base class for tokenizer implementations.

class BaseLlamaTokenizer:
    """Abstract interface that all tokenizer implementations must satisfy."""

    def tokenize(self, text: str, add_bos: bool = True, special: bool = False) -> List[int]:
        """Encode text into token IDs."""

    def detokenize(self, tokens: List[int], decode: bool = True) -> str:
        """Decode token IDs back into text."""

    def encode(self, text: str, add_bos: bool = True, special: bool = False) -> List[int]:
        """Alias for ``tokenize``."""

    def decode(self, tokens: List[int], **kwargs) -> str:
        """Alias for ``detokenize``."""

Vocabulary Type Constants

# Vocabulary types supported by llama.cpp
# NOTE(review): annotation-only declarations — the concrete int values come
# from the llama.cpp `llama_vocab_type` enum; confirm against the installed
# library version before relying on specific numbers.
LLAMA_VOCAB_TYPE_NONE: int  # No vocabulary
LLAMA_VOCAB_TYPE_SPM: int   # SentencePiece model
LLAMA_VOCAB_TYPE_BPE: int   # Byte pair encoding  
LLAMA_VOCAB_TYPE_WPM: int   # WordPiece model
LLAMA_VOCAB_TYPE_UGM: int   # Unigram model
LLAMA_VOCAB_TYPE_RWKV: int  # RWKV tokenizer

Preprocessing Type Constants

# Text preprocessing types for different models
# NOTE(review): annotation-only declarations — values mirror the llama.cpp
# `llama_vocab_pre_type` enum and may grow/renumber between llama.cpp
# releases; confirm against the installed library.
LLAMA_VOCAB_PRE_TYPE_DEFAULT: int        # Default preprocessing
LLAMA_VOCAB_PRE_TYPE_LLAMA3: int         # Llama 3 preprocessing
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: int   # DeepSeek preprocessing
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: int # DeepSeek Coder preprocessing
LLAMA_VOCAB_PRE_TYPE_FALCON: int         # Falcon preprocessing
LLAMA_VOCAB_PRE_TYPE_MPT: int            # MPT preprocessing
LLAMA_VOCAB_PRE_TYPE_STARCODER: int      # StarCoder preprocessing
LLAMA_VOCAB_PRE_TYPE_GPT2: int           # GPT-2 preprocessing
LLAMA_VOCAB_PRE_TYPE_REFACT: int         # Refact preprocessing
LLAMA_VOCAB_PRE_TYPE_COMMAND_R: int      # Command-R preprocessing
LLAMA_VOCAB_PRE_TYPE_QWEN2: int          # Qwen2 preprocessing
LLAMA_VOCAB_PRE_TYPE_OLMO: int           # OLMo preprocessing
LLAMA_VOCAB_PRE_TYPE_DBRX: int           # DBRX preprocessing
LLAMA_VOCAB_PRE_TYPE_SMAUG: int          # Smaug preprocessing
LLAMA_VOCAB_PRE_TYPE_PORO: int           # Poro preprocessing
LLAMA_VOCAB_PRE_TYPE_CHATGLM3: int       # ChatGLM3 preprocessing
LLAMA_VOCAB_PRE_TYPE_CHATGLM4: int       # ChatGLM4 preprocessing
LLAMA_VOCAB_PRE_TYPE_VIKING: int         # Viking preprocessing
LLAMA_VOCAB_PRE_TYPE_JAIS: int           # Jais preprocessing
LLAMA_VOCAB_PRE_TYPE_TEKKEN: int         # Tekken preprocessing
LLAMA_VOCAB_PRE_TYPE_SMOLLM: int         # SmolLM preprocessing
LLAMA_VOCAB_PRE_TYPE_CODESHELL: int      # CodeShell preprocessing
LLAMA_VOCAB_PRE_TYPE_BLOOM: int          # BLOOM preprocessing
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH: int   # GPT-3 Finnish preprocessing
LLAMA_VOCAB_PRE_TYPE_EXAONE: int         # EXAONE preprocessing

Usage Examples

Basic Tokenization

from llama_cpp import Llama

# Initialize model with tokenizer access
# NOTE(review): the path is illustrative — point this at a real GGUF model file.
llm = Llama(model_path="./models/llama-2-7b.gguf")

# Tokenize text; add_bos=True prepends the beginning-of-sequence token
text = "Hello, world! How are you today?"
tokens = llm.tokenize(text, add_bos=True)
print(f"Tokens: {tokens}")
print(f"Token count: {len(tokens)}")

# Detokenize back to text (round-trip of the IDs above)
decoded = llm.detokenize(tokens)
print(f"Decoded: {decoded}")

Native Tokenizer Usage

from llama_cpp.llama_tokenizer import LlamaTokenizer

# Create standalone tokenizer directly from a GGML tokenizer file
# (no full model load required)
tokenizer = LlamaTokenizer.from_ggml_file("./tokenizer.ggml")

# Tokenize without BOS token
tokens = tokenizer.tokenize("Python is awesome", add_bos=False)
print(f"Without BOS: {tokens}")

# Tokenize with BOS token — expect one extra leading token vs. the list above
tokens_bos = tokenizer.tokenize("Python is awesome", add_bos=True)
print(f"With BOS: {tokens_bos}")

# Handle special tokens: special=True lets control markers like <|im_start|>
# map to their dedicated token IDs instead of being split as plain text
tokens_special = tokenizer.tokenize(
    "<|im_start|>user\nHello<|im_end|>", 
    special=True
)
print(f"Special tokens: {tokens_special}")

HuggingFace Integration

from llama_cpp.llama_tokenizer import LlamaHFTokenizer

# Load HuggingFace tokenizer; extra kwargs (e.g. use_fast) are forwarded
# to the HuggingFace loader
hf_tokenizer = LlamaHFTokenizer.from_pretrained(
    "microsoft/DialoGPT-medium",
    use_fast=True
)

# Use with consistent interface — same tokenize/detokenize API as the
# native tokenizer, so the two are interchangeable
text = "Tell me a joke about programming"
tokens = hf_tokenizer.tokenize(text)
decoded = hf_tokenizer.detokenize(tokens)

print(f"Original: {text}")
print(f"Tokens: {tokens}")
print(f"Decoded: {decoded}")

Token Analysis

# Analyze tokenization behavior: punctuation, casing, and word boundaries
# all change how many tokens a string produces
# NOTE(review): assumes `llm` from the earlier example is in scope.
texts = [
    "Hello world",
    "Hello, world!",
    "Hello world.",
    "HelloWorld",
    "HELLO WORLD",
]

for text in texts:
    tokens = llm.tokenize(text, add_bos=False)
    print(f"'{text}' -> {len(tokens)} tokens: {tokens}")

Batch Processing

# Process multiple texts efficiently
# NOTE(review): assumes `llm` from the earlier example is in scope.
texts = [
    "First example text",
    "Second example with more words",
    "Third text for processing",
]

# Tokenize all texts, keeping each result for later padding analysis
all_tokens = []
for text in texts:
    tokens = llm.tokenize(text, add_bos=True)
    all_tokens.append(tokens)
    print(f"'{text}' -> {len(tokens)} tokens")

# Find maximum length for padding (useful when batching into fixed-size arrays)
max_length = max(len(tokens) for tokens in all_tokens)
print(f"Maximum token length: {max_length}")

Special Token Handling

# Check special token IDs.
# Fix: token_bos/token_eos/token_nl are *methods* on Llama — without the
# call parentheses these prints would show bound-method reprs, not IDs.
print(f"BOS token: {llm.token_bos()}")
print(f"EOS token: {llm.token_eos()}")
print(f"Newline token: {llm.token_nl()}")

# Create text with explicit special tokens.
# Fix: plain literal — the original used an f-string with no placeholders.
text_with_special = "<|begin_of_text|>Hello<|end_of_text|>"
tokens = llm.tokenize(text_with_special, special=True)
print(f"With special tokens: {tokens}")

# Compare with normal tokenization: with special=False the markers are
# tokenized as ordinary text rather than mapped to control-token IDs
tokens_normal = llm.tokenize(text_with_special, special=False)
print(f"Normal tokenization: {tokens_normal}")

Vocabulary Analysis

# Get vocabulary information.
# Fix: n_vocab/n_ctx are *methods* on Llama — the original passed the bound
# method itself to min(), which would raise TypeError.
print(f"Vocabulary size: {llm.n_vocab()}")
print(f"Context size: {llm.n_ctx()}")

# Sample some token IDs and their text representations
import random

sample_ids = random.sample(range(min(1000, llm.n_vocab())), 10)
for token_id in sample_ids:
    try:
        text = llm.detokenize([token_id])
        print(f"Token {token_id}: '{text}'")
    # Fix: narrowed from a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit; this keeps the best-effort behavior.
    except Exception:
        print(f"Token {token_id}: <unable to decode>")

Custom Tokenizer Integration

from llama_cpp.llama_tokenizer import BaseLlamaTokenizer

class CustomTokenizer(BaseLlamaTokenizer):
    """Tokenizer wrapper that lower-cases and trims text before delegating."""

    def __init__(self, base_tokenizer):
        # Underlying tokenizer that performs the actual encoding/decoding.
        self.base_tokenizer = base_tokenizer

    def tokenize(self, text, add_bos=True, special=False):
        # Normalize case and surrounding whitespace, then delegate.
        normalized = text.lower().strip()
        return self.base_tokenizer.tokenize(normalized, add_bos, special)

    def detokenize(self, tokens, decode=True):
        # Detokenization needs no preprocessing — pass straight through.
        return self.base_tokenizer.detokenize(tokens, decode)

# Use custom tokenizer — the Llama instance itself satisfies the tokenizer
# interface, so it can serve as the wrapped base tokenizer
# NOTE(review): assumes `llm` and `CustomTokenizer` from above are in scope.
custom_tokenizer = CustomTokenizer(llm)
tokens = custom_tokenizer.tokenize("HELLO WORLD!")
print(f"Custom tokenized: {tokens}")

Install with Tessl CLI

npx tessl i tessl/pypi-llama-cpp-python

docs

caching.md

chat-completion.md

grammar.md

index.md

llama-model.md

low-level.md

server.md

tokenization.md

vision.md

tile.json