Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.
—
Native llama.cpp tokenization and HuggingFace tokenizer integration supporting different vocabulary types, encoding/decoding operations, and model-specific preprocessing.
Use the model's built-in tokenizer for consistent text processing.
class LlamaTokenizer:
    """Tokenizer backed by a llama.cpp model's built-in vocabulary.

    Interface stub for API documentation: methods carry the contract in
    their docstrings; bodies are intentionally empty.
    """

    def __init__(self, llama: "Llama"):
        """
        Initialize tokenizer with Llama model instance.

        Args:
            llama: Llama model instance
        """

    def tokenize(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False,
    ) -> List[int]:
        """
        Convert text to token IDs.

        Args:
            text: Input text to tokenize
            add_bos: Add beginning-of-sequence token
            special: Allow special tokens in output

        Returns:
            List of token IDs
        """

    def detokenize(
        self,
        tokens: List[int],
        decode: bool = True,
    ) -> str:
        """
        Convert token IDs to text.

        Args:
            tokens: List of token IDs to convert
            decode: Decode bytes to string

        Returns:
            Decoded text string
        """

    def encode(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False,
    ) -> List[int]:
        """
        Encode text to tokens (alias for tokenize).

        Args:
            text: Text to encode
            add_bos: Add beginning-of-sequence token
            special: Allow special tokens

        Returns:
            List of token IDs
        """

    def decode(
        self,
        tokens: List[int],
        **kwargs,
    ) -> str:
        """
        Decode tokens to text (alias for detokenize).

        Args:
            tokens: Token IDs to decode
            **kwargs: Additional decoding parameters

        Returns:
            Decoded text
        """

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """
        Create tokenizer from GGML tokenizer file.

        Args:
            path: Path to GGML tokenizer file

        Returns:
            LlamaTokenizer instance
        """


# Use HuggingFace tokenizers for compatibility with Transformers ecosystem.
class LlamaHFTokenizer:
    """Tokenizer adapter wrapping a HuggingFace tokenizer instance.

    Interface stub for API documentation: methods carry the contract in
    their docstrings; bodies are intentionally empty.
    """

    def __init__(self, hf_tokenizer):
        """
        Initialize with HuggingFace tokenizer.

        Args:
            hf_tokenizer: HuggingFace tokenizer instance
        """

    def tokenize(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False,
    ) -> List[int]:
        """
        Tokenize text using HuggingFace tokenizer.

        Args:
            text: Input text
            add_bos: Add beginning-of-sequence token
            special: Allow special tokens

        Returns:
            List of token IDs
        """

    def detokenize(
        self,
        tokens: List[int],
        decode: bool = True,
    ) -> str:
        """
        Detokenize using HuggingFace tokenizer.

        Args:
            tokens: Token IDs to decode
            decode: Decode to string

        Returns:
            Decoded text
        """

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs,
    ) -> "LlamaHFTokenizer":
        """
        Load tokenizer from HuggingFace model.

        Args:
            pretrained_model_name_or_path: Model name or path
            **kwargs: Additional tokenizer arguments

        Returns:
            LlamaHFTokenizer instance
        """


# Abstract base class for tokenizer implementations.
class BaseLlamaTokenizer:
    """Abstract base class for tokenizer implementations."""

    def tokenize(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False,
    ) -> List[int]:
        """Convert text to tokens."""

    def detokenize(
        self,
        tokens: List[int],
        decode: bool = True,
    ) -> str:
        """Convert tokens to text."""

    def encode(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = False,
    ) -> List[int]:
        """Encode text (alias for tokenize)."""

    def decode(self, tokens: List[int], **kwargs) -> str:
        """Decode tokens (alias for detokenize)."""


# Vocabulary types supported by llama.cpp
# Integer constants identifying the vocabulary type used by a model.
LLAMA_VOCAB_TYPE_NONE: int   # No vocabulary
LLAMA_VOCAB_TYPE_SPM: int    # SentencePiece model
LLAMA_VOCAB_TYPE_BPE: int    # Byte pair encoding
LLAMA_VOCAB_TYPE_WPM: int    # WordPiece model
LLAMA_VOCAB_TYPE_UGM: int    # Unigram model
LLAMA_VOCAB_TYPE_RWKV: int   # RWKV tokenizer

# Text preprocessing types for different models
# Integer constants selecting model-specific text preprocessing.
LLAMA_VOCAB_PRE_TYPE_DEFAULT: int         # Default preprocessing
LLAMA_VOCAB_PRE_TYPE_LLAMA3: int          # Llama 3 preprocessing
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: int    # DeepSeek preprocessing
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: int  # DeepSeek Coder preprocessing
LLAMA_VOCAB_PRE_TYPE_FALCON: int          # Falcon preprocessing
LLAMA_VOCAB_PRE_TYPE_MPT: int             # MPT preprocessing
LLAMA_VOCAB_PRE_TYPE_STARCODER: int       # StarCoder preprocessing
LLAMA_VOCAB_PRE_TYPE_GPT2: int            # GPT-2 preprocessing
LLAMA_VOCAB_PRE_TYPE_REFACT: int          # Refact preprocessing
LLAMA_VOCAB_PRE_TYPE_COMMAND_R: int       # Command-R preprocessing
LLAMA_VOCAB_PRE_TYPE_QWEN2: int           # Qwen2 preprocessing
LLAMA_VOCAB_PRE_TYPE_OLMO: int            # OLMo preprocessing
LLAMA_VOCAB_PRE_TYPE_DBRX: int            # DBRX preprocessing
LLAMA_VOCAB_PRE_TYPE_SMAUG: int           # Smaug preprocessing
LLAMA_VOCAB_PRE_TYPE_PORO: int            # Poro preprocessing
LLAMA_VOCAB_PRE_TYPE_CHATGLM3: int        # ChatGLM3 preprocessing
LLAMA_VOCAB_PRE_TYPE_CHATGLM4: int        # ChatGLM4 preprocessing
LLAMA_VOCAB_PRE_TYPE_VIKING: int          # Viking preprocessing
LLAMA_VOCAB_PRE_TYPE_JAIS: int            # Jais preprocessing
LLAMA_VOCAB_PRE_TYPE_TEKKEN: int          # Tekken preprocessing
LLAMA_VOCAB_PRE_TYPE_SMOLLM: int          # SmolLM preprocessing
LLAMA_VOCAB_PRE_TYPE_CODESHELL: int       # CodeShell preprocessing
LLAMA_VOCAB_PRE_TYPE_BLOOM: int           # BLOOM preprocessing
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH: int    # GPT-3 Finnish preprocessing
LLAMA_VOCAB_PRE_TYPE_EXAONE: int          # EXAONE preprocessing

from llama_cpp import Llama
# Initialize model with tokenizer access
llm = Llama(model_path="./models/llama-2-7b.gguf")

# Tokenize text
text = "Hello, world! How are you today?"
tokens = llm.tokenize(text, add_bos=True)
print(f"Tokens: {tokens}")
print(f"Token count: {len(tokens)}")

# Detokenize back to text
decoded = llm.detokenize(tokens)
print(f"Decoded: {decoded}")

from llama_cpp.llama_tokenizer import LlamaTokenizer
# Create standalone tokenizer
tokenizer = LlamaTokenizer.from_ggml_file("./tokenizer.ggml")

# Tokenize without BOS token
tokens = tokenizer.tokenize("Python is awesome", add_bos=False)
print(f"Without BOS: {tokens}")

# Tokenize with BOS token
tokens_bos = tokenizer.tokenize("Python is awesome", add_bos=True)
print(f"With BOS: {tokens_bos}")

# Handle special tokens
tokens_special = tokenizer.tokenize(
    "<|im_start|>user\nHello<|im_end|>",
    special=True,
)
print(f"Special tokens: {tokens_special}")

from llama_cpp.llama_tokenizer import LlamaHFTokenizer
# Load HuggingFace tokenizer
hf_tokenizer = LlamaHFTokenizer.from_pretrained(
    "microsoft/DialoGPT-medium",
    use_fast=True,
)

# Use with consistent interface
text = "Tell me a joke about programming"
tokens = hf_tokenizer.tokenize(text)
decoded = hf_tokenizer.detokenize(tokens)
print(f"Original: {text}")
print(f"Tokens: {tokens}")
print(f"Decoded: {decoded}")

# Analyze tokenization behavior
# Compare token counts for small variations of the same phrase.
texts = [
    "Hello world",
    "Hello, world!",
    "Hello world.",
    "HelloWorld",
    "HELLO WORLD",
]
for text in texts:
    tokens = llm.tokenize(text, add_bos=False)
    print(f"'{text}' -> {len(tokens)} tokens: {tokens}")

# Process multiple texts efficiently
texts = [
    "First example text",
    "Second example with more words",
    "Third text for processing",
]

# Tokenize all texts
all_tokens = []
for text in texts:
    tokens = llm.tokenize(text, add_bos=True)
    all_tokens.append(tokens)
    print(f"'{text}' -> {len(tokens)} tokens")

# Find maximum length for padding
max_length = max(len(tokens) for tokens in all_tokens)
print(f"Maximum token length: {max_length}")

# Check special token IDs
print(f"BOS token: {llm.token_bos}")
print(f"EOS token: {llm.token_eos}")
print(f"Newline token: {llm.token_nl}")

# Create text with explicit special tokens
# (no placeholders, so a plain string literal suffices — f-prefix dropped)
text_with_special = "<|begin_of_text|>Hello<|end_of_text|>"
tokens = llm.tokenize(text_with_special, special=True)
print(f"With special tokens: {tokens}")

# Compare with normal tokenization
tokens_normal = llm.tokenize(text_with_special, special=False)
print(f"Normal tokenization: {tokens_normal}")

# Get vocabulary information
print(f"Vocabulary size: {llm.n_vocab}")
print(f"Context size: {llm.n_ctx}")

# Sample some token IDs and their text representations
import random

sample_ids = random.sample(range(min(1000, llm.n_vocab)), 10)
for token_id in sample_ids:
    try:
        text = llm.detokenize([token_id])
        print(f"Token {token_id}: '{text}'")
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
    # are not swallowed while still tolerating undecodable tokens.
    except Exception:
        print(f"Token {token_id}: <unable to decode>")

from llama_cpp.llama_tokenizer import BaseLlamaTokenizer
class CustomTokenizer(BaseLlamaTokenizer):
    """Tokenizer that lowercases and strips text before delegating."""

    def __init__(self, base_tokenizer):
        self.base_tokenizer = base_tokenizer

    def tokenize(self, text, add_bos=True, special=False):
        # Add custom preprocessing before delegating to the wrapped tokenizer.
        processed_text = text.lower().strip()
        return self.base_tokenizer.tokenize(processed_text, add_bos, special)

    def detokenize(self, tokens, decode=True):
        return self.base_tokenizer.detokenize(tokens, decode)


# Use custom tokenizer
custom_tokenizer = CustomTokenizer(llm)
tokens = custom_tokenizer.tokenize("HELLO WORLD!")
print(f"Custom tokenized: {tokens}")

# Install with Tessl CLI
npx tessl i tessl/pypi-llama-cpp-python