Fast inference engine for Transformer models
--------------------------------------------
Core inference functionality for running Transformer models with high performance. CTranslate2 provides three main inference classes: Translator for sequence-to-sequence models, Generator for language models, and Encoder for encoder-only models. All classes support batching, streaming, and asynchronous processing with advanced optimization techniques.
The Translator class handles sequence-to-sequence models like T5, BART, and traditional Transformer models for tasks such as machine translation, summarization, and text-to-text generation.
class Translator:
    """Runs sequence-to-sequence models (e.g. T5, BART, Transformer NMT) for
    machine translation, summarization, and other text-to-text tasks."""

    def __init__(self, model_path: str, device: str = "auto",
                 device_index: int = 0, compute_type: str = "default",
                 inter_threads: int = 1, intra_threads: int = 0,
                 max_queued_batches: int = 0, flash_attention: bool = False,
                 tensor_parallel: bool = False, files: dict = None):
        """
        Initialize a Translator for sequence-to-sequence models.

        Args:
            model_path (str): Path to the CTranslate2 model directory
            device (str): Device to run on ("cpu", "cuda", "auto")
            device_index (int): Device index for multi-GPU setups
            compute_type (str): Computation precision ("default", "float32",
                "float16", "int8")
            inter_threads (int): Number of inter-op threads
            intra_threads (int): Number of intra-op threads (0 for auto)
            max_queued_batches (int): Maximum number of batches in queue
            flash_attention (bool): Enable Flash Attention optimization
            tensor_parallel (bool): Enable tensor parallelism
            files (dict): Additional model files mapping
        """

    def translate_batch(self, source: list, target_prefix: list = None,
                        beam_size: int = 1, patience: float = 1.0,
                        length_penalty: float = 1.0, coverage_penalty: float = 0.0,
                        repetition_penalty: float = 1.0, no_repeat_ngram_size: int = 0,
                        prefix_bias_beta: float = 0.0, max_length: int = 512,
                        min_length: int = 0, use_vmap: bool = False,
                        return_end_token: bool = False, max_input_length: int = 1024,
                        max_decoding_length: int = 256, min_decoding_length: int = 1,
                        sampling_topk: int = 1, sampling_topp: float = 1.0,
                        sampling_temperature: float = 1.0, return_scores: bool = False,
                        return_attention: bool = False, return_alternatives: bool = False,
                        min_alternative_expansion_prob: float = 0.0,
                        num_hypotheses: int = 1, **kwargs) -> list:
        """
        Translate a batch of source sequences.

        Args:
            source (list): List of source sequences (each sequence is a list of tokens)
            target_prefix (list, optional): List of target prefixes to condition generation
            beam_size (int): Beam search size for decoding (1 means greedy decoding)
            patience (float): Beam search patience factor
            length_penalty (float): Length penalty for beam search
            coverage_penalty (float): Coverage penalty to avoid repetition
            repetition_penalty (float): Repetition penalty for generated tokens
            no_repeat_ngram_size (int): Size of n-grams that cannot be repeated
            prefix_bias_beta (float): Strength of the bias toward the target prefix
                (0 disables biasing)
            max_length (int): Maximum length of generated sequences
            min_length (int): Minimum length of generated sequences
            use_vmap (bool): Use the vocabulary mapping file saved with the model, if any
            return_end_token (bool): Include the end token in the returned sequences
            max_input_length (int): Maximum input sequence length
            max_decoding_length (int): Maximum number of decoding steps
            min_decoding_length (int): Minimum number of decoding steps
            sampling_topk (int): Top-k sampling parameter
            sampling_topp (float): Top-p (nucleus) sampling parameter
            sampling_temperature (float): Temperature for sampling
            return_scores (bool): Whether to return scores
            return_attention (bool): Whether to return attention weights
            return_alternatives (bool): Return alternative expansions at the first
                unconstrained decoding position
            min_alternative_expansion_prob (float): Minimum initial probability
                required to expand an alternative
            num_hypotheses (int): Number of hypotheses to return per input
            **kwargs: Additional translation options

        Note:
            NOTE(review): both max_length/min_length and
            max_decoding_length/min_decoding_length appear in this signature;
            confirm which pair the backend actually honors.

        Returns:
            list: List of TranslationResult objects
        """

    def score_batch(self, source: list, target: list, max_input_length: int = 1024,
                    use_vmap: bool = False, **kwargs) -> list:
        """
        Score a batch of source-target sequence pairs.

        Args:
            source (list): List of source sequences
            target (list): List of target sequences to score
            max_input_length (int): Maximum input sequence length
            use_vmap (bool): Whether to use vocabulary mapping
            **kwargs: Additional scoring options

        Returns:
            list: List of ScoringResult objects with scores
        """

    def translate_iterable(self, source, target_prefix=None, batch_size: int = 32,
                           batch_type: str = "examples", **kwargs):
        """
        Translate an iterable of source sequences with efficient batching.

        Args:
            source: Iterable of source sequences
            target_prefix: Iterable of target prefixes (optional)
            batch_size (int): Maximum batch size
            batch_type (str): Batching strategy ("examples" or "tokens")
            **kwargs: Additional arguments passed to translate_batch

        Yields:
            TranslationResult: Results for each input sequence
        """

    def score_iterable(self, source, target, batch_size: int = 32,
                       batch_type: str = "examples", **kwargs):
        """
        Score an iterable of source-target pairs with efficient batching.

        Args:
            source: Iterable of source sequences
            target: Iterable of target sequences
            batch_size (int): Maximum batch size
            batch_type (str): Batching strategy ("examples" or "tokens")
            **kwargs: Additional arguments passed to score_batch

        Yields:
            ScoringResult: Scoring results for each input pair
        """

    def generate_tokens(self, source: list, target_prefix: list = None, **kwargs):
        """
        Generate tokens step-by-step for a single input.

        Args:
            source (list): Source sequence as list of tokens
            target_prefix (list, optional): Target prefix tokens
            **kwargs: Additional generation parameters

        Yields:
            GenerationStepResult: Each generated token with metadata
        """

    @property
    def model_is_loaded(self) -> bool:
        """Whether the model is loaded in memory."""

    @property
    def device(self) -> str:
        """Device name where the model is running."""

    @property
    def device_index(self) -> list:
        """List of device indices being used."""

    @property
    def num_translators(self) -> int:
        """Number of translator instances."""

    @property
    def num_queued_batches(self) -> int:
        """Current number of queued batches."""
    @property
    def compute_type(self) -> str:
        """Compute type being used for inference."""

The Generator class handles decoder-only language models like GPT-2, Llama, and Mistral for text generation, completion, and scoring tasks.
class Generator:
    """Runs decoder-only language models (e.g. GPT-2, Llama, Mistral) for
    text generation, completion, and scoring."""

    def __init__(self, model_path: str, device: str = "auto",
                 device_index: int = 0, compute_type: str = "default",
                 inter_threads: int = 1, intra_threads: int = 0,
                 max_queued_batches: int = 0, flash_attention: bool = False,
                 tensor_parallel: bool = False, files: dict = None):
        """
        Initialize a Generator for language models.

        Args:
            model_path (str): Path to the CTranslate2 model directory
            device (str): Device to run on ("cpu", "cuda", "auto")
            device_index (int): Device index for multi-GPU setups
            compute_type (str): Computation precision ("default", "float32",
                "float16", "int8")
            inter_threads (int): Number of inter-op threads
            intra_threads (int): Number of intra-op threads (0 for auto)
            max_queued_batches (int): Maximum number of batches in queue
            flash_attention (bool): Enable Flash Attention optimization
            tensor_parallel (bool): Enable tensor parallelism
            files (dict): Additional model files mapping
        """

    def generate_batch(self, start_tokens: list, max_length: int = 512,
                       min_length: int = 0, sampling_topk: int = 1,
                       sampling_topp: float = 1.0, sampling_temperature: float = 1.0,
                       repetition_penalty: float = 1.0, no_repeat_ngram_size: int = 0,
                       disable_unk: bool = False, suppress_sequences: list = None,
                       end_token: str = None, return_end_token: bool = False,
                       max_input_length: int = 1024, static_prompt: bool = False,
                       cache_static_prompt: bool = True, include_prompt_in_result: bool = True,
                       return_scores: bool = False, **kwargs) -> list:
        """
        Generate sequences from a batch of start tokens.

        Args:
            start_tokens (list): List of start token sequences
            max_length (int): Maximum length of generated sequences
            min_length (int): Minimum length of generated sequences
            sampling_topk (int): Top-k sampling parameter (1 means greedy decoding)
            sampling_topp (float): Top-p (nucleus) sampling parameter
            sampling_temperature (float): Temperature for sampling
            repetition_penalty (float): Repetition penalty for generated tokens
            no_repeat_ngram_size (int): Size of n-grams that cannot be repeated
            disable_unk (bool): Whether to disable unknown token generation
            suppress_sequences (list): List of token sequences to suppress
            end_token (str): Token that ends generation
            return_end_token (bool): Whether to include end token in result
            max_input_length (int): Maximum input sequence length
            static_prompt (bool): Whether prompt is static across calls
            cache_static_prompt (bool): Whether to cache static prompt
            include_prompt_in_result (bool): Whether to include prompt in output
            return_scores (bool): Whether to return generation scores
            **kwargs: Additional generation options

        Returns:
            list: List of GenerationResult objects
        """

    def score_batch(self, tokens: list, max_length: int = 1024, **kwargs) -> list:
        """
        Score a batch of token sequences.

        Args:
            tokens (list): List of token sequences to score
            max_length (int): Maximum sequence length to consider
            **kwargs: Additional scoring options

        Returns:
            list: List of ScoringResult objects with scores
        """

    def generate_iterable(self, start_tokens, batch_size: int = 32,
                          batch_type: str = "examples", **kwargs):
        """
        Generate from an iterable of start token sequences with efficient batching.

        Args:
            start_tokens: Iterable of start token sequences
            batch_size (int): Maximum batch size
            batch_type (str): Batching strategy ("examples" or "tokens")
            **kwargs: Additional arguments passed to generate_batch

        Yields:
            GenerationResult: Results for each input sequence
        """

    def score_iterable(self, tokens, batch_size: int = 32,
                       batch_type: str = "examples", **kwargs):
        """
        Score an iterable of token sequences with efficient batching.

        Args:
            tokens: Iterable of token sequences
            batch_size (int): Maximum batch size
            batch_type (str): Batching strategy ("examples" or "tokens")
            **kwargs: Additional arguments passed to score_batch

        Yields:
            ScoringResult: Scoring results for each input sequence
        """

    def generate_tokens(self, prompt: list, **kwargs):
        """
        Generate tokens step-by-step for a single prompt.

        Args:
            prompt (list): Prompt tokens as list
            **kwargs: Additional generation parameters

        Yields:
            GenerationStepResult: Each generated token with metadata
        """
    def async_generate_tokens(self, prompt: list, **kwargs):
        """
        Generate tokens asynchronously step-by-step for a single prompt.

        Args:
            prompt (list): Prompt tokens as list
            **kwargs: Additional generation parameters

        Returns:
            AsyncGenerationResult: Async result object for streaming
        """

The Encoder class handles encoder-only models like BERT and RoBERTa for feature extraction and representation learning tasks.
class Encoder:
    """Runs encoder-only models (e.g. BERT, RoBERTa) for feature extraction
    and representation learning."""

    def __init__(self, model_path: str, device: str = "auto",
                 device_index: int = 0, compute_type: str = "default",
                 inter_threads: int = 1, intra_threads: int = 0,
                 max_queued_batches: int = 0, files: dict = None):
        """
        Initialize an Encoder for encoder-only models.

        Args:
            model_path (str): Path to the CTranslate2 model directory
            device (str): Device to run on ("cpu", "cuda", "auto")
            device_index (int): Device index for multi-GPU setups
            compute_type (str): Computation precision ("default", "float32",
                "float16", "int8")
            inter_threads (int): Number of inter-op threads
            intra_threads (int): Number of intra-op threads (0 for auto)
            max_queued_batches (int): Maximum number of batches in queue
            files (dict): Additional model files mapping
        """
    def forward_batch(self, inputs: list, normalize: bool = False,
                      max_input_length: int = 1024, **kwargs) -> list:
        """
        Forward pass on a batch of input sequences.

        Args:
            inputs (list): List of input token sequences
            normalize (bool): Whether to normalize output embeddings
            max_input_length (int): Maximum input sequence length

        Returns:
            list: List of EncoderForwardOutput objects
        """

import ctranslate2
# Load a translation model
translator = ctranslate2.Translator("path/to/model", device="cpu")

# Translate a single pre-tokenized sentence (inputs are lists of tokens)
source = [["Hello", "world", "!"]]
results = translator.translate_batch(source)
print(results[0].hypotheses[0])  # ['Bonjour', 'le', 'monde', '!']

# Translate with beam search, requesting scores so they can be read below
# (return_scores defaults to False, so scores must be requested explicitly)
results = translator.translate_batch(source, beam_size=4, num_hypotheses=2,
                                     return_scores=True)
for i, hypothesis in enumerate(results[0].hypotheses):
    score = results[0].scores[i]
    print(f"Hypothesis {i+1} (score: {score:.4f}): {' '.join(hypothesis)}")

import ctranslate2
# Load a language model
generator = ctranslate2.Generator("path/to/model", device="cpu")

# Generate text from a tokenized prompt
prompt = [["The", "quick", "brown", "fox"]]
results = generator.generate_batch(prompt, max_length=50, sampling_temperature=0.8)
print(" ".join(results[0].sequences[0]))

# Step-by-step (streaming) generation for a single prompt
for step in generator.generate_tokens(["The", "quick", "brown"]):
    print(f"Token: {step.token}, Probability: {step.log_prob:.4f}")
    if step.is_last:
        break

import ctranslate2
translator = ctranslate2.Translator("path/to/model")

# Process a large dataset efficiently
source_sentences = [["sentence", "1"], ["sentence", "2"], ...]  # Large list

# Stream processing with batching: results are yielded as batches complete
for result in translator.translate_iterable(source_sentences, batch_size=32):
    translated = " ".join(result.hypotheses[0])
    print(translated)

class TranslationResult:
    """Result from translation operations."""

    hypotheses: list[list[str]]  # List of hypothesis token sequences
    scores: list[float]          # Scores for each hypothesis (if requested)
    attention: list              # Attention weights (if requested)


class GenerationResult:
    """Result from generation operations."""

    sequences: list[list[str]]      # Generated token sequences
    scores: list[float]             # Generation scores
    sequences_ids: list[list[int]]  # Token IDs for generated sequences


class ScoringResult:
    """Result from scoring operations."""

    # NOTE(review): the upstream ctranslate2 API names the per-token log
    # probabilities `log_probs` — confirm this attribute name against the package.
    scores: list[float]      # Log probabilities for each sequence
    tokens_count: list[int]  # Token counts for each sequence


class GenerationStepResult:
    """Result from step-by-step generation."""

    token: str       # Generated token
    token_id: int    # Token ID
    is_last: bool    # Whether this is the last token
    log_prob: float  # Log probability of the token


class EncoderForwardOutput:
    """Output from encoder forward pass."""

    last_hidden_state: StorageView  # Final hidden states
    pooler_output: StorageView      # Pooled output (if available)


class AsyncTranslationResult:
    """Async result wrapper for translation."""

    def result(self) -> TranslationResult: ...

    def is_done(self) -> bool: ...


class AsyncGenerationResult:
    """Async result wrapper for generation."""

    def result(self) -> GenerationResult: ...

    def is_done(self) -> bool: ...


class AsyncScoringResult:
    """Async result wrapper for scoring."""

    def result(self) -> ScoringResult: ...
    def is_done(self) -> bool: ...

Install with Tessl CLI
npx tessl i tessl/pypi-ctranslate2