Fast inference engine for Transformer models
```bash
npx @tessl/cli install tessl/pypi-ctranslate2@4.6.0
```

A high-performance C++ and Python library designed for efficient inference with Transformer models across various architectures, including encoder-decoder models (Transformer, BART, T5, Whisper), decoder-only models (GPT-2, Llama, Mistral), and encoder-only models (BERT, RoBERTa). The library implements a custom runtime that applies advanced performance optimizations such as weight quantization, layer fusion, batch reordering, and memory management to significantly accelerate inference and reduce memory usage on both CPU and GPU.
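For example, weight quantization can be selected when a model is loaded. A minimal sketch using the real `compute_type` option; the model path is illustrative:

```python
import ctranslate2

# Load a converted model with int8 weight quantization to reduce memory use.
translator = ctranslate2.Translator("path/to/ct2_model", device="cpu",
                                    compute_type="int8")
```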
```bash
pip install ctranslate2
```

```python
import ctranslate2
```

Common usage patterns:
```python
import ctranslate2
from ctranslate2 import Translator, Generator, Encoder, contains_model
from ctranslate2.converters import TransformersConverter

# Translation example (seq2seq models)
translator = ctranslate2.Translator("path/to/ct2_model", device="cpu")
results = translator.translate_batch([["Hello", "world"]])
print(results[0].hypotheses[0])  # best hypothesis, as a list of tokens

# Generation example (language models)
generator = ctranslate2.Generator("path/to/ct2_model", device="cpu")
results = generator.generate_batch([["The quick brown"]])
print(results[0].sequences[0])  # generated continuation, as a list of tokens

# Model conversion example
converter = TransformersConverter("microsoft/DialoGPT-medium")
converter.convert("ct2_model_output")
```

CTranslate2 follows a modular architecture:
- `Translator`, `Generator`, and `Encoder` for the different model types
- `Whisper` for speech recognition
- `StorageView` for efficient tensor operations and device management

Core inference functionality for running Transformer models with high performance. Supports translation, generation, and encoding tasks with batching, streaming, and asynchronous processing.
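A minimal sketch of asynchronous batching, using the real `asynchronous=True` option of `translate_batch` (the model path and tokens are illustrative); token-by-token streaming is sketched at the end of this document:

```python
import ctranslate2

translator = ctranslate2.Translator("path/to/ct2_model", device="cpu",
                                    inter_threads=2)

# Submit the batch without blocking; each async result is resolved on demand.
async_results = translator.translate_batch([["Hello", "world"]],
                                           asynchronous=True)
for r in async_results:
    print(r.result().hypotheses[0])  # result() blocks until this example is done
```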
```python
class Translator:
    def __init__(self, model_path: str, device: str = "auto",
                 device_index: int = 0, compute_type: str = "default",
                 inter_threads: int = 1, intra_threads: int = 0,
                 max_queued_batches: int = 0, flash_attention: bool = False,
                 tensor_parallel: bool = False, files: dict = None): ...
    def translate_batch(self, source: list, target_prefix: list = None, **kwargs) -> list: ...
    def score_batch(self, source: list, target: list, **kwargs) -> list: ...

class Generator:
    def __init__(self, model_path: str, device: str = "auto",
                 device_index: int = 0, compute_type: str = "default",
                 inter_threads: int = 1, intra_threads: int = 0,
                 max_queued_batches: int = 0, flash_attention: bool = False,
                 tensor_parallel: bool = False, files: dict = None): ...
    def generate_batch(self, start_tokens: list, **kwargs) -> list: ...
    def score_batch(self, tokens: list, **kwargs) -> list: ...

class Encoder:
    def __init__(self, model_path: str, device: str = "auto",
                 device_index: int = 0, compute_type: str = "default",
                 inter_threads: int = 1, intra_threads: int = 0,
                 max_queued_batches: int = 0, files: dict = None): ...
    def forward_batch(self, inputs: list, **kwargs) -> list: ...
```

Convert models from popular frameworks (Transformers, Fairseq, OpenNMT, etc.) to CTranslate2 format for optimized inference. Supports quantization, file copying, and various framework-specific options.
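A conversion sketch with int8 weight quantization; the `quantization` and `force` arguments come from the `convert()` signature below, and the model name is only an example:

```python
from ctranslate2.converters import TransformersConverter

converter = TransformersConverter("microsoft/DialoGPT-medium")
# Quantize weights to int8 during conversion and overwrite any existing output.
converter.convert("ct2_model_output", quantization="int8", force=True)
```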
```python
class TransformersConverter:
    def __init__(self, model_name_or_path: str, activation_scales: str = None,
                 copy_files: list = None, load_as_float16: bool = False,
                 revision: str = None, low_cpu_mem_usage: bool = False,
                 trust_remote_code: bool = False): ...
    def convert(self, output_dir: str, vmap: str = None,
                quantization: str = None, force: bool = False): ...

# Additional converters
class FairseqConverter: ...
class OpenNMTPyConverter: ...
class OpenNMTTFConverter: ...
class MarianConverter: ...
class OpusMTConverter: ...
class OpenAIGPT2Converter: ...
```

Programmatically define and build Transformer model architectures from scratch. Supports various model types including sequence-to-sequence, decoder-only, and encoder-only models with extensive configuration options.
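A minimal sketch of defining a spec programmatically, using only the `from_config`, `validate`, and `save` methods listed below; the layer counts are illustrative, and the spec's weight variables still have to be populated before it can be saved:

```python
from ctranslate2.specs import TransformerSpec

# Describe a 6-layer, 8-head encoder-decoder Transformer.
spec = TransformerSpec.from_config(num_layers=6, num_heads=8)

# ... assign trained weights to the spec's variables here ...

spec.validate()                # check that all required weights are set
spec.save("ct2_custom_model")  # write the CTranslate2 model directory
```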
```python
class TransformerSpec:
    def __init__(self, encoder: TransformerEncoderSpec, decoder: TransformerDecoderSpec): ...
    @classmethod
    def from_config(cls, num_layers: int, num_heads: int, **kwargs): ...
    def save(self, output_dir: str): ...
    def validate(self): ...
    def optimize(self, quantization: str = None): ...

class TransformerDecoderModelSpec:
    def __init__(self, decoder: TransformerDecoderSpec): ...
    @classmethod
    def from_config(cls, num_layers: int, num_heads: int, **kwargs): ...

class TransformerEncoderModelSpec:
    def __init__(self, encoder: TransformerEncoderSpec, pooling_layer: bool = False): ...
```

Domain-specific model classes for speech recognition and audio processing tasks. Includes Whisper for speech-to-text and Wav2Vec2 for speech representation learning.
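A language-detection sketch for Whisper. It assumes 80-bin log-mel features computed externally and uses `StorageView.from_array`, a helper from the upstream library that is not part of the `StorageView` stub shown later in this document:

```python
import numpy as np
import ctranslate2

model = ctranslate2.models.Whisper("whisper_ct2_model", device="cpu")

# Batch of one 30-second window: 80 mel bins x 3000 frames (illustrative shape).
features = ctranslate2.StorageView.from_array(
    np.zeros((1, 80, 3000), dtype=np.float32))

results = model.detect_language(features)
language, probability = results[0][0]  # most probable language for example 0
print(language, probability)
```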
```python
class Whisper:
    def __init__(self, model_path: str, device: str = "auto", **kwargs): ...
    def generate(self, features: list, prompts: list, **kwargs) -> list: ...
    def detect_language(self, features: list, **kwargs) -> list: ...

class Wav2Vec2:
    def __init__(self, model_path: str, device: str = "auto", **kwargs): ...
    def encode(self, features: list, **kwargs) -> list: ...

class Wav2Vec2Bert:
    def __init__(self, model_path: str, device: str = "auto", **kwargs): ...
    def encode(self, features: list, **kwargs) -> list: ...
```

Helper functions for model management, device configuration, logging, and tensor operations. Includes utilities for checking model compatibility and managing computational resources.
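A short sketch of environment checks before loading a model, using the helpers listed below (the model path is illustrative):

```python
import ctranslate2

if ctranslate2.contains_model("path/to/ct2_model"):
    print("found a CTranslate2 model")

print(ctranslate2.get_cuda_device_count())             # number of visible CUDA devices
print(ctranslate2.get_supported_compute_types("cpu"))  # e.g. float32, int8
ctranslate2.set_random_seed(42)                        # reproducible sampling
```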
```python
def contains_model(path: str) -> bool: ...
def get_cuda_device_count() -> int: ...
def get_supported_compute_types(device: str, device_index: int = 0) -> list: ...
def set_random_seed(seed: int): ...
def get_log_level() -> str: ...
def set_log_level(level: str): ...

class StorageView:
    def __init__(self, array=None, dtype=None): ...
    def numpy(self): ...
    def copy(self): ...
    def to(self, dtype: str): ...

    @property
    def shape(self) -> tuple: ...

    @property
    def size(self) -> int: ...

    @property
    def dtype(self) -> str: ...
```
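A NumPy interop sketch for `StorageView`. It uses `from_array`, an upstream helper not listed in the stub above, to wrap an existing array, and converts back through NumPy's array interface:

```python
import numpy as np
import ctranslate2

arr = np.arange(6, dtype=np.float32).reshape(2, 3)
view = ctranslate2.StorageView.from_array(arr)  # wraps the array without copying

print(view.shape)        # shape of the wrapped tensor
print(view.dtype)        # element type
back = np.asarray(view)  # back to NumPy via the array interface
```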
```python
# Result classes
class TranslationResult:
    hypotheses: list[list[str]]
    scores: list[float]

class GenerationResult:
    sequences: list[list[str]]
    scores: list[float]

class ScoringResult:
    scores: list[float]

class GenerationStepResult:
    token: str
    token_id: int
    is_last: bool
    log_prob: float

class EncoderForwardOutput:
    last_hidden_state: StorageView
    pooler_output: StorageView

# Enumerations
class DataType:
    FLOAT32: str
    FLOAT16: str
    INT8: str
    INT16: str
    INT32: str

class Device:
    CPU: str
    CUDA: str
    AUTO: str

# Configuration classes
class ExecutionStats:
    num_tokens: int
    num_examples: int
    total_time_in_ms: float

class MpiInfo:
    rank: int
    size: int
```
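A token-streaming sketch using `Generator.generate_tokens`, an upstream method not listed in the stubs above, which yields `GenerationStepResult` objects like the one described in the result classes (the model path and prompt are illustrative):

```python
import ctranslate2

generator = ctranslate2.Generator("path/to/ct2_model", device="cpu")

# Stream tokens one at a time instead of waiting for the full sequence.
for step in generator.generate_tokens(["The", "quick", "brown"], max_length=16):
    print(step.token, end="", flush=True)
    if step.is_last:
        print()
```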