CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-ctranslate2

Fast inference engine for Transformer models

Pending
Overview
Eval results
Files

docs/inference.md

Model Inference

Core inference functionality for running Transformer models with high performance. CTranslate2 provides three main inference classes: Translator for sequence-to-sequence models, Generator for language models, and Encoder for encoder-only models. All classes support batching, streaming, and asynchronous processing with advanced optimization techniques.

Capabilities

Translation (Seq2Seq Models)

The Translator class handles sequence-to-sequence models like T5, BART, and traditional Transformer models for tasks such as machine translation, summarization, and text-to-text generation.

class Translator:
    """Sequence-to-sequence inference engine for models such as T5, BART,
    and standard encoder-decoder Transformers.

    Supports synchronous batch translation and scoring, streaming variants
    over iterables with automatic batching, and step-by-step token
    generation.
    """

    def __init__(self, model_path: str, device: str = "auto", 
                 device_index: int = 0, compute_type: str = "default", 
                 inter_threads: int = 1, intra_threads: int = 0, 
                 max_queued_batches: int = 0, flash_attention: bool = False, 
                 tensor_parallel: bool = False, files: dict = None):
        """
        Initialize a Translator for sequence-to-sequence models.
        
        Args:
            model_path (str): Path to the CTranslate2 model directory
            device (str): Device to run on ("cpu", "cuda", "auto")
            device_index (int): Device index for multi-GPU setups
            compute_type (str): Computation precision ("default", "float32", "float16", "int8")
            inter_threads (int): Number of inter-op threads
            intra_threads (int): Number of intra-op threads (0 for auto)
            max_queued_batches (int): Maximum number of batches in queue
            flash_attention (bool): Enable Flash Attention optimization
            tensor_parallel (bool): Enable tensor parallelism
            files (dict): Additional model files mapping
        """
    
    def translate_batch(self, source: list, target_prefix: list = None, 
                       beam_size: int = 1, patience: float = 1.0, 
                       length_penalty: float = 1.0, coverage_penalty: float = 0.0, 
                       repetition_penalty: float = 1.0, no_repeat_ngram_size: int = 0, 
                       prefix_bias_beta: float = 0.0, max_length: int = 512, 
                       min_length: int = 0, use_vmap: bool = False, 
                       return_end_token: bool = False, max_input_length: int = 1024, 
                       max_decoding_length: int = 256, min_decoding_length: int = 1, 
                       sampling_topk: int = 1, sampling_topp: float = 1.0, 
                       sampling_temperature: float = 1.0, return_scores: bool = False, 
                       return_attention: bool = False, return_alternatives: bool = False, 
                       min_alternative_expansion_prob: float = 0.0, 
                       num_hypotheses: int = 1, **kwargs) -> list:
        """
        Translate a batch of source sequences.
        
        Args:
            source (list): List of source sequences (each sequence is a list of tokens)
            target_prefix (list, optional): List of target prefixes to condition generation
            beam_size (int): Beam search size for decoding
            patience (float): Beam search patience factor
            length_penalty (float): Length penalty for beam search
            coverage_penalty (float): Coverage penalty to avoid repetition
            repetition_penalty (float): Repetition penalty for generated tokens
            no_repeat_ngram_size (int): Size of n-grams that cannot be repeated
            prefix_bias_beta (float): Strength of the bias toward the target
                prefix (0.0 disables; exact semantics per the CTranslate2
                reference — confirm)
            max_length (int): Maximum length of generated sequences
                NOTE(review): overlaps with max_decoding_length below —
                confirm which one the implementation honors
            min_length (int): Minimum length of generated sequences
            use_vmap (bool): Whether to use the vocabulary map embedded in
                the model, if any — confirm against CTranslate2 docs
            return_end_token (bool): Whether to include the end token in the
                returned sequences
            max_input_length (int): Maximum input sequence length
            max_decoding_length (int): Maximum number of decoding steps
            min_decoding_length (int): Minimum number of decoding steps
            sampling_topk (int): Top-k sampling parameter
            sampling_topp (float): Top-p (nucleus) sampling parameter
            sampling_temperature (float): Temperature for sampling
            return_scores (bool): Whether to return scores
            return_attention (bool): Whether to return attention weights
            return_alternatives (bool): Whether to return alternative
                expansions at the first unconstrained decoding position
                (presumably for interactive completion — confirm)
            min_alternative_expansion_prob (float): Minimum probability
                required to expand an alternative
            num_hypotheses (int): Number of hypotheses to return per input
            
        Returns:
            list: List of TranslationResult objects
        """
    
    def score_batch(self, source: list, target: list, max_input_length: int = 1024, 
                   use_vmap: bool = False, **kwargs) -> list:
        """
        Score a batch of source-target sequence pairs.
        
        Args:
            source (list): List of source sequences
            target (list): List of target sequences to score
            max_input_length (int): Maximum input sequence length
            use_vmap (bool): Whether to use vocabulary mapping
            
        Returns:
            list: List of ScoringResult objects with scores
        """
    
    def translate_iterable(self, source, target_prefix=None, batch_size: int = 32, 
                          batch_type: str = "examples", **kwargs):
        """
        Translate an iterable of source sequences with efficient batching.
        
        Args:
            source: Iterable of source sequences
            target_prefix: Iterable of target prefixes (optional)
            batch_size (int): Maximum batch size
            batch_type (str): Batching strategy ("examples" or "tokens")
            **kwargs: Additional arguments passed to translate_batch
            
        Yields:
            TranslationResult: Results for each input sequence
        """
    
    def score_iterable(self, source, target, batch_size: int = 32, 
                      batch_type: str = "examples", **kwargs):
        """
        Score an iterable of source-target pairs with efficient batching.
        
        Args:
            source: Iterable of source sequences
            target: Iterable of target sequences
            batch_size (int): Maximum batch size
            batch_type (str): Batching strategy ("examples" or "tokens")
            **kwargs: Additional arguments passed to score_batch
            
        Yields:
            ScoringResult: Scoring results for each input pair
        """
    
    def generate_tokens(self, source: list, target_prefix: list = None, **kwargs):
        """
        Generate tokens step-by-step for a single input.

        Unlike translate_batch, this takes a single (unbatched) sequence.
        
        Args:
            source (list): Source sequence as list of tokens
            target_prefix (list, optional): Target prefix tokens
            **kwargs: Additional generation parameters
            
        Yields:
            GenerationStepResult: Each generated token with metadata
        """
    
    @property
    def model_is_loaded(self) -> bool:
        """Whether the model is loaded in memory."""
    
    @property
    def device(self) -> str:
        """Device name where the model is running."""
    
    @property
    def device_index(self) -> list:
        """List of device indices being used."""
    
    @property
    def num_translators(self) -> int:
        """Number of translator instances."""
    
    @property
    def num_queued_batches(self) -> int:
        """Current number of queued batches."""
    
    @property
    def compute_type(self) -> str:
        """Compute type being used for inference."""

Text Generation (Language Models)

The Generator class handles decoder-only language models like GPT-2, Llama, and Mistral for text generation, completion, and scoring tasks.

class Generator:
    """Inference engine for decoder-only language models (e.g. GPT-2,
    Llama, Mistral).

    Supports batch generation and scoring, streaming variants over
    iterables with automatic batching, and synchronous or asynchronous
    step-by-step token generation.
    """

    def __init__(self, model_path: str, device: str = "auto", 
                 device_index: int = 0, compute_type: str = "default", 
                 inter_threads: int = 1, intra_threads: int = 0, 
                 max_queued_batches: int = 0, flash_attention: bool = False, 
                 tensor_parallel: bool = False, files: dict = None):
        """
        Initialize a Generator for language models.
        
        Args:
            model_path (str): Path to the CTranslate2 model directory
            device (str): Device to run on ("cpu", "cuda", "auto")
            device_index (int): Device index for multi-GPU setups
            compute_type (str): Computation precision ("default", "float32", "float16", "int8")
            inter_threads (int): Number of inter-op threads
            intra_threads (int): Number of intra-op threads (0 for auto)
            max_queued_batches (int): Maximum number of batches in queue
            flash_attention (bool): Enable Flash Attention optimization
            tensor_parallel (bool): Enable tensor parallelism
            files (dict): Additional model files mapping
        """
    
    def generate_batch(self, start_tokens: list, max_length: int = 512, 
                      min_length: int = 0, sampling_topk: int = 1, 
                      sampling_topp: float = 1.0, sampling_temperature: float = 1.0, 
                      repetition_penalty: float = 1.0, no_repeat_ngram_size: int = 0, 
                      disable_unk: bool = False, suppress_sequences: list = None, 
                      end_token: str = None, return_end_token: bool = False, 
                      max_input_length: int = 1024, static_prompt: bool = False, 
                      cache_static_prompt: bool = True, include_prompt_in_result: bool = True, 
                      return_scores: bool = False, **kwargs) -> list:
        """
        Generate sequences from a batch of start tokens.
        
        Args:
            start_tokens (list): List of start token sequences
            max_length (int): Maximum length of generated sequences
            min_length (int): Minimum length of generated sequences
            sampling_topk (int): Top-k sampling parameter
            sampling_topp (float): Top-p (nucleus) sampling parameter
            sampling_temperature (float): Temperature for sampling
            repetition_penalty (float): Repetition penalty for generated tokens
            no_repeat_ngram_size (int): Size of n-grams that cannot be repeated
            disable_unk (bool): Whether to disable unknown token generation
            suppress_sequences (list): List of token sequences to suppress
            end_token (str): Token that ends generation
            return_end_token (bool): Whether to include end token in result
            max_input_length (int): Maximum input sequence length
            static_prompt (bool): Whether prompt is static across calls
            cache_static_prompt (bool): Whether to cache static prompt
            include_prompt_in_result (bool): Whether to include prompt in output
            return_scores (bool): Whether to return generation scores
            
        Returns:
            list: List of GenerationResult objects
        """
    
    def score_batch(self, tokens: list, max_length: int = 1024, **kwargs) -> list:
        """
        Score a batch of token sequences.
        
        Args:
            tokens (list): List of token sequences to score
            max_length (int): Maximum sequence length to consider
            
        Returns:
            list: List of ScoringResult objects with scores
        """
    
    def generate_iterable(self, start_tokens, batch_size: int = 32, 
                         batch_type: str = "examples", **kwargs):
        """
        Generate from an iterable of start token sequences with efficient batching.
        
        Args:
            start_tokens: Iterable of start token sequences
            batch_size (int): Maximum batch size
            batch_type (str): Batching strategy ("examples" or "tokens")
            **kwargs: Additional arguments passed to generate_batch
            
        Yields:
            GenerationResult: Results for each input sequence
        """
    
    def score_iterable(self, tokens, batch_size: int = 32, 
                      batch_type: str = "examples", **kwargs):
        """
        Score an iterable of token sequences with efficient batching.
        
        Args:
            tokens: Iterable of token sequences
            batch_size (int): Maximum batch size
            batch_type (str): Batching strategy ("examples" or "tokens")
            **kwargs: Additional arguments passed to score_batch
            
        Yields:
            ScoringResult: Scoring results for each input sequence
        """
    
    def generate_tokens(self, prompt: list, **kwargs):
        """
        Generate tokens step-by-step for a single prompt.

        Unlike generate_batch, this takes a single (unbatched) prompt.
        
        Args:
            prompt (list): Prompt tokens as list
            **kwargs: Additional generation parameters
            
        Yields:
            GenerationStepResult: Each generated token with metadata
        """
    
    def async_generate_tokens(self, prompt: list, **kwargs):
        """
        Generate tokens asynchronously step-by-step for a single prompt.
        
        Args:
            prompt (list): Prompt tokens as list
            **kwargs: Additional generation parameters
            
        Returns:
            AsyncGenerationResult: Async result object for streaming
                NOTE(review): the name suggests per-token streaming (an
                async iterator), while the stated return is a single
                future-like result — confirm against the CTranslate2 API
        """

Encoding (Encoder-Only Models)

The Encoder class handles encoder-only models like BERT and RoBERTa for feature extraction and representation learning tasks.

class Encoder:
    """Inference engine for encoder-only models (e.g. BERT, RoBERTa) used
    for feature extraction and representation learning.

    Unlike Translator and Generator, this class exposes only a forward
    pass and has no flash_attention/tensor_parallel options.
    """

    def __init__(self, model_path: str, device: str = "auto", 
                 device_index: int = 0, compute_type: str = "default", 
                 inter_threads: int = 1, intra_threads: int = 0, 
                 max_queued_batches: int = 0, files: dict = None):
        """
        Initialize an Encoder for encoder-only models.
        
        Args:
            model_path (str): Path to the CTranslate2 model directory
            device (str): Device to run on ("cpu", "cuda", "auto")
            device_index (int): Device index for multi-GPU setups
            compute_type (str): Computation precision ("default", "float32", "float16", "int8")
            inter_threads (int): Number of inter-op threads
            intra_threads (int): Number of intra-op threads (0 for auto)
            max_queued_batches (int): Maximum number of batches in queue
            files (dict): Additional model files mapping
        """
    
    def forward_batch(self, inputs: list, normalize: bool = False, 
                     max_input_length: int = 1024, **kwargs) -> list:
        """
        Forward pass on a batch of input sequences.
        
        Args:
            inputs (list): List of input token sequences
            normalize (bool): Whether to normalize output embeddings
            max_input_length (int): Maximum input sequence length
            
        Returns:
            list: List of EncoderForwardOutput objects
        """

Usage Examples

Basic Translation

import ctranslate2

# Load a translation model
# (the path must point to a converted CTranslate2 model directory)
translator = ctranslate2.Translator("path/to/model", device="cpu")

# Translate single sentence; inputs are pre-tokenized (lists of tokens)
source = [["Hello", "world", "!"]]
results = translator.translate_batch(source)
print(results[0].hypotheses[0])  # ['Bonjour', 'le', 'monde', '!']

# Translate with beam search, requesting two hypotheses per input
# NOTE(review): scores are read below, but translate_batch documents
# return_scores=False as the default — this call may need
# return_scores=True; confirm
results = translator.translate_batch(source, beam_size=4, num_hypotheses=2)
for i, hypothesis in enumerate(results[0].hypotheses):
    score = results[0].scores[i]  # scores align with hypotheses by index
    print(f"Hypothesis {i+1} (score: {score:.4f}): {' '.join(hypothesis)}")

Text Generation

import ctranslate2

# Load a language model
generator = ctranslate2.Generator("path/to/model", device="cpu")

# Generate text from a tokenized prompt (batch of one)
prompt = [["The", "quick", "brown", "fox"]]
results = generator.generate_batch(prompt, max_length=50, sampling_temperature=0.8)
print(" ".join(results[0].sequences[0]))

# Step-by-step generation; generate_tokens takes a single unbatched prompt
# NOTE(review): step.log_prob is a log probability, but the label printed
# below says "Probability" — consider relabeling
for step in generator.generate_tokens(["The", "quick", "brown"]):
    print(f"Token: {step.token}, Probability: {step.log_prob:.4f}")
    if step.is_last:
        break

Streaming Processing

import ctranslate2

translator = ctranslate2.Translator("path/to/model")

# Process large dataset efficiently
source_sentences = [["sentence", "1"], ["sentence", "2"], ...]  # Large list (the `...` is a placeholder)

# Stream processing with batching: translate_iterable yields one result
# per input sequence instead of materializing all results at once
for result in translator.translate_iterable(source_sentences, batch_size=32):
    translated = " ".join(result.hypotheses[0])
    print(translated)

Types

class TranslationResult:
    """Result from translation operations (Translator.translate_batch
    and translate_iterable)."""
    hypotheses: list[list[str]]  # Hypothesis token sequences; length matches num_hypotheses
    scores: list[float]          # Scores for each hypothesis (presumably only when return_scores=True — confirm)
    attention: list              # Attention weights (if requested via return_attention=True)

class GenerationResult:
    """Result from generation operations (Generator.generate_batch
    and generate_iterable)."""
    sequences: list[list[str]]   # Generated token sequences
    scores: list[float]          # Generation scores (presumably only when return_scores=True — confirm)
    sequences_ids: list[list[int]]  # Token IDs for generated sequences, parallel to sequences

class ScoringResult:
    """Result from scoring operations (Translator.score_batch,
    Generator.score_batch, and their iterable variants)."""
    scores: list[float]          # Log probabilities for each sequence
    tokens_count: list[int]      # Token counts for each sequence

class GenerationStepResult:
    """Result from step-by-step generation (yielded by the
    generate_tokens methods of Translator and Generator)."""
    token: str                   # Generated token
    token_id: int               # Token ID in the model vocabulary
    is_last: bool               # Whether this is the last token of the sequence
    log_prob: float             # Log probability of the token

class EncoderForwardOutput:
    """Output from encoder forward pass (Encoder.forward_batch)."""
    # NOTE(review): StorageView is CTranslate2's array/tensor type; it is
    # not defined in this document — see the library reference.
    last_hidden_state: StorageView  # Final hidden states
    pooler_output: StorageView      # Pooled output (if available)

class AsyncTranslationResult:
    """Async result wrapper for translation.

    Future-like interface: poll with is_done(), retrieve with result()
    (presumably blocking until completion — confirm).
    """
    def result(self) -> TranslationResult: ...
    def is_done(self) -> bool: ...

class AsyncGenerationResult:
    """Async result wrapper for generation.

    Future-like interface: poll with is_done(), retrieve with result()
    (presumably blocking until completion — confirm).
    """
    def result(self) -> GenerationResult: ...
    def is_done(self) -> bool: ...

class AsyncScoringResult:
    """Async result wrapper for scoring.

    Future-like interface: poll with is_done(), retrieve with result()
    (presumably blocking until completion — confirm).
    """
    def result(self) -> ScoringResult: ...
    def is_done(self) -> bool: ...

Install with Tessl CLI

npx tessl i tessl/pypi-ctranslate2

docs

converters.md

index.md

inference.md

specialized.md

specifications.md

utilities.md

tile.json