CTranslate2: a fast inference engine for Transformer models.

Convert models from popular frameworks (Transformers, Fairseq, OpenNMT, etc.) to the CTranslate2 format for optimized inference. The CTranslate2 converters support quantization, file copying, and various framework-specific options to ensure optimal performance and compatibility.
class TransformersConverter:
    """Convert Hugging Face Transformers models to CTranslate2 format.

    Supports most popular model architectures including BERT, GPT-2, T5,
    BART, and more.
    """

    def __init__(self, model_name_or_path: str, activation_scales: str = None,
                 copy_files: list = None, load_as_float16: bool = False,
                 revision: str = None, low_cpu_mem_usage: bool = False,
                 trust_remote_code: bool = False):
        """
        Initialize a converter for Hugging Face Transformers models.

        Args:
            model_name_or_path (str): Model name on the Hugging Face Hub or a local path.
            activation_scales (str): Path to activation scales for SmoothQuant.
            copy_files (list): Additional files to copy to the output directory.
            load_as_float16 (bool): Load the model weights in float16.
            revision (str): Model revision/branch to use.
            low_cpu_mem_usage (bool): Enable low CPU memory loading.
            trust_remote_code (bool): Allow custom code execution.
        """

    def convert(self, output_dir: str, vmap: str = None,
                quantization: str = None, force: bool = False) -> str:
        """
        Convert the model to CTranslate2 format.

        Args:
            output_dir (str): Output directory for the converted model.
            vmap (str): Path to a vocabulary mapping file.
            quantization (str): Quantization type ("int8", "int8_float16", "int16", "float16").
            force (bool): Overwrite the output directory if it exists.

        Returns:
            str: Path to the converted model directory.
        """

    def convert_from_args(self, args) -> str:
        """
        Convert the model using parsed command-line arguments.

        Args:
            args: Parsed arguments object.

        Returns:
            str: Path to the converted model directory.
        """

    @staticmethod
    def declare_arguments(parser):
        """
        Add converter-specific arguments to an argument parser.

        Args:
            parser: ArgumentParser instance to modify.
        """
class FairseqConverter:
    """Convert Fairseq models to CTranslate2 format.

    Supports various Fairseq model architectures.
    """

    def __init__(self, model_path: str, data_dir: str = None):
        """
        Initialize a converter for Fairseq models.

        Args:
            model_path (str): Path to the Fairseq model checkpoint.
            data_dir (str): Path to the data directory with vocabularies.
        """

    def convert(self, output_dir: str, vmap: str = None,
                quantization: str = None, force: bool = False) -> str:
        """
        Convert the Fairseq model to CTranslate2 format.

        Args:
            output_dir (str): Output directory for the converted model.
            vmap (str): Path to a vocabulary mapping file.
            quantization (str): Quantization type.
            force (bool): Overwrite the output directory if it exists.

        Returns:
            str: Path to the converted model directory.
        """
class OpenNMTPyConverter:
    """Convert OpenNMT-py models to CTranslate2 format."""

    def __init__(self, model_path: str):
        """
        Initialize a converter for OpenNMT-py models.

        Args:
            model_path (str): Path to the OpenNMT-py model file.
        """

    def convert(self, output_dir: str, vmap: str = None,
                quantization: str = None, force: bool = False) -> str:
        """Convert the OpenNMT-py model to CTranslate2 format."""
class OpenNMTTFConverter:
    """Convert OpenNMT-tf models to CTranslate2 format."""

    def __init__(self, model_path: str):
        """
        Initialize a converter for OpenNMT-tf models.

        Args:
            model_path (str): Path to the OpenNMT-tf model checkpoint.
        """

    def convert(self, output_dir: str, vmap: str = None,
                quantization: str = None, force: bool = False) -> str:
        """Convert the OpenNMT-tf model to CTranslate2 format."""
class MarianConverter:
    """Convert Marian NMT models to CTranslate2 format."""

    def __init__(self, model_path: str):
        """
        Initialize a converter for Marian models.

        Args:
            model_path (str): Path to the Marian model directory.
        """

    def convert(self, output_dir: str, vmap: str = None,
                quantization: str = None, force: bool = False) -> str:
        """Convert the Marian model to CTranslate2 format."""
class OpusMTConverter:
    """Convert OPUS-MT models to CTranslate2 format."""

    def __init__(self, model_name: str):
        """
        Initialize a converter for OPUS-MT models.

        Args:
            model_name (str): OPUS-MT model name from the Hugging Face Hub.
        """

    def convert(self, output_dir: str, vmap: str = None,
                quantization: str = None, force: bool = False) -> str:
        """Convert the OPUS-MT model to CTranslate2 format."""
class OpenAIGPT2Converter:
    """Convert OpenAI GPT-2 models to CTranslate2 format."""

    def __init__(self, model_name: str = "124M"):
        """
        Initialize a converter for OpenAI GPT-2 models.

        Args:
            model_name (str): GPT-2 model size ("124M", "355M", "774M", "1558M").
        """

    def convert(self, output_dir: str, vmap: str = None,
                quantization: str = None, force: bool = False) -> str:
        """Convert the GPT-2 model to CTranslate2 format."""
class Converter:
    """Abstract base class for model converters.

    All converters inherit from this base class, which provides the common
    conversion functionality.
    """

    def convert(self, output_dir: str, vmap: str = None,
                quantization: str = None, force: bool = False) -> str:
        """
        Convert the model to CTranslate2 format.

        Args:
            output_dir (str): Output directory for the converted model.
            vmap (str): Path to a vocabulary mapping file.
            quantization (str): Quantization type.
            force (bool): Overwrite the output directory if it exists.

        Returns:
            str: Path to the converted model directory.
        """

    def convert_from_args(self, args) -> str:
        """
        Convert the model using parsed command-line arguments.

        Args:
            args: Parsed arguments object with conversion parameters.

        Returns:
            str: Path to the converted model directory.
        """

    @staticmethod
    def declare_arguments(parser):
        """
        Add common converter arguments to an argument parser.

        Args:
            parser: ArgumentParser instance to modify.
        """
# CTranslate2 provides command-line tools for model conversion.
# Available console scripts (entry points):
#   ct2-transformers-converter  - Convert Transformers models
#   ct2-fairseq-converter       - Convert Fairseq models
#   ct2-opennmt-py-converter    - Convert OpenNMT-py models
#   ct2-opennmt-tf-converter    - Convert OpenNMT-tf models
#   ct2-marian-converter        - Convert Marian models
#   ct2-opus-mt-converter       - Convert OPUS-MT models
#   ct2-openai-gpt2-converter   - Convert OpenAI GPT-2 models

# Helper functions for model conversion and optimization.
def fuse_linear(spec, layers: list):
    """
    Fuse multiple linear layers for optimization.

    Args:
        spec: Model specification object.
        layers (list): List of linear layers to fuse.
    """
def fuse_linear_prequant(spec, layers: list, axis: int):
    """
    Fuse pre-quantized linear layers.

    Args:
        spec: Model specification object.
        layers (list): List of pre-quantized linear layers.
        axis (int): Axis along which to fuse.
    """
def permute_for_sliced_rotary(weight, num_heads: int, rotary_dim: int = None):
    """
    Permute weights for rotary position embeddings.

    Args:
        weight: Weight tensor to permute.
        num_heads (int): Number of attention heads.
        rotary_dim (int): Rotary embedding dimension.

    Returns:
        The permuted weight tensor.
    """
def smooth_activation(layer_norm, linear, activation_scales):
    """
    Apply the SmoothQuant activation smoothing technique.

    Args:
        layer_norm: Layer normalization module.
        linear: Linear layer module.
        activation_scales: Activation scaling factors.
    """
# Example usage:
import ctranslate2

# Convert a Hugging Face model
converter = ctranslate2.converters.TransformersConverter("microsoft/DialoGPT-medium")
converter.convert("ct2_model", quantization="int8")

# Convert with additional options
converter = ctranslate2.converters.TransformersConverter(
    "t5-small",
    copy_files=["config.json", "tokenizer.json"],
    load_as_float16=True
)
converter.convert("t5_ct2", quantization="int8_float16")

# Convert a local model
converter = ctranslate2.converters.TransformersConverter("/path/to/local/model")
converter.convert("output_dir", force=True)
# Convert Fairseq model
fairseq_converter = ctranslate2.converters.FairseqConverter(
"checkpoint_best.pt",
data_dir="data-bin/wmt14_en_de"
)
fairseq_converter.convert("fairseq_ct2")
# Convert OpenNMT-py model
opennmt_converter = ctranslate2.converters.OpenNMTPyConverter("model.pt")
opennmt_converter.convert("opennmt_ct2")
# Convert OPUS-MT model
opus_converter = ctranslate2.converters.OpusMTConverter("Helsinki-NLP/opus-mt-en-de")
opus_converter.convert("opus_ct2")# Convert Transformers model
# Convert a Transformers model
ct2-transformers-converter --model microsoft/DialoGPT-medium --output_dir ct2_model --quantization int8

# Convert with custom options
ct2-transformers-converter \
    --model t5-small \
    --output_dir t5_ct2 \
    --quantization int8_float16 \
    --copy_files config.json tokenizer.json \
    --load_as_float16

# Convert a Fairseq model
ct2-fairseq-converter \
    --model_path checkpoint_best.pt \
    --data_dir data-bin/wmt14_en_de \
    --output_dir fairseq_ct2 \
    --quantization int8
# Available quantization types:
quantization_options = [
    "int8",          # 8-bit integer quantization
    "int8_float16",  # 8-bit weights, 16-bit activations
    "int16",         # 16-bit integer quantization
    "float16",       # 16-bit floating point
    "int8_float32",  # 8-bit weights, 32-bit activations
    "int4",          # 4-bit integer quantization (experimental)
]

# Example with different quantization levels
converter = ctranslate2.converters.TransformersConverter("gpt2")

# Fastest inference, smaller model
converter.convert("gpt2_int8", quantization="int8")

# Balanced speed/quality
converter.convert("gpt2_fp16", quantization="float16")

# Highest quality, larger model
converter.convert("gpt2_fp32")  # No quantization (default)
# Quantization types
class Quantization:
    """Identifiers for the supported quantization methods."""

    CT2: str       # Standard CTranslate2 quantization
    AWQ_GEMM: str  # AWQ quantization with GEMM
    AWQ_GEMV: str  # AWQ quantization with GEMV

# Install with the Tessl CLI:
npx tessl i tessl/pypi-ctranslate2