Fast inference engine for Transformer models
Programmatically define and build Transformer model architectures from scratch using CTranslate2's specification system. Model specifications enable creating custom models, modifying existing architectures, and building models without relying on external frameworks.
Core specification classes that provide the foundation for building different types of Transformer models.
class ModelSpec:
    """Abstract base class for all model specifications."""

    def save(self, output_dir: str):
        """
        Save the model specification to a directory.

        Args:
            output_dir (str): Directory to save the model
        """

    def validate(self):
        """Validate the model specification for correctness."""

    def optimize(self, quantization: str = None):
        """
        Optimize model weights with optional quantization.

        Args:
            quantization (str): Quantization type ("int8", "float16", etc.)
        """

    def register_file(self, path: str, filename: str = None):
        """
        Register an additional file to include with the model.

        Args:
            path (str): Path to the file to register
            filename (str): Optional custom filename in the model directory
        """
class LayerSpec:
    """Abstract base class for layer specifications."""

    def variables(self, prefix: str = "", ordered: bool = False) -> dict:
        """
        Get layer variables with an optional name prefix.

        Args:
            prefix (str): Prefix for variable names
            ordered (bool): Whether to return an ordered dictionary

        Returns:
            dict: Dictionary of layer variables
        """

    def validate(self):
        """Validate the layer specification."""
class SequenceToSequenceModelSpec(ModelSpec):
    """Base class for sequence-to-sequence model specifications."""

    def register_source_vocabulary(self, tokens: list):
        """
        Register source vocabulary tokens.

        Args:
            tokens (list): List of source vocabulary tokens
        """

    def register_target_vocabulary(self, tokens: list):
        """
        Register target vocabulary tokens.

        Args:
            tokens (list): List of target vocabulary tokens
        """

    def register_vocabulary_mapping(self, path: str):
        """
        Register a vocabulary mapping file.

        Args:
            path (str): Path to the vocabulary mapping file
        """
class LanguageModelSpec(ModelSpec):
    """Base class for language model specifications."""

    def register_vocabulary(self, tokens: list):
        """
        Register vocabulary tokens.

        Args:
            tokens (list): List of vocabulary tokens
        """

Specific implementations for different Transformer model architectures.
class TransformerSpec(SequenceToSequenceModelSpec):
    """Specification for sequence-to-sequence Transformer models."""

    def __init__(self, encoder: 'TransformerEncoderSpec', decoder: 'TransformerDecoderSpec'):
        """
        Initialize Transformer specification.

        Args:
            encoder (TransformerEncoderSpec): Encoder specification
            decoder (TransformerDecoderSpec): Decoder specification
        """

    @classmethod
    def from_config(cls, num_layers: int, num_heads: int,
                    d_model: int = 512, d_ff: int = 2048, **kwargs):
        """
        Create Transformer specification from configuration.

        Args:
            num_layers (int): Number of encoder/decoder layers
            num_heads (int): Number of attention heads
            d_model (int): Model dimension
            d_ff (int): Feed-forward dimension
            **kwargs: Additional configuration parameters

        Returns:
            TransformerSpec: Configured Transformer specification
        """
class TransformerDecoderModelSpec(LanguageModelSpec):
    """Specification for decoder-only Transformer models (GPT-style)."""

    def __init__(self, decoder: 'TransformerDecoderSpec'):
        """
        Initialize decoder-only Transformer specification.

        Args:
            decoder (TransformerDecoderSpec): Decoder specification
        """

    @classmethod
    def from_config(cls, num_layers: int, num_heads: int,
                    d_model: int = 512, vocab_size: int = 50257, **kwargs):
        """
        Create decoder-only Transformer from configuration.

        Args:
            num_layers (int): Number of decoder layers
            num_heads (int): Number of attention heads
            d_model (int): Model dimension
            vocab_size (int): Vocabulary size
            **kwargs: Additional configuration parameters

        Returns:
            TransformerDecoderModelSpec: Configured decoder model
        """
class TransformerEncoderModelSpec(ModelSpec):
    """Specification for encoder-only Transformer models (BERT-style)."""

    def __init__(self, encoder: 'TransformerEncoderSpec', pooling_layer: bool = False):
        """
        Initialize encoder-only Transformer specification.

        Args:
            encoder (TransformerEncoderSpec): Encoder specification
            pooling_layer (bool): Whether to include a pooling layer
        """
Detailed specifications for Transformer encoder and decoder layers.

class TransformerEncoderSpec(LayerSpec):
    """Specification for Transformer encoder layers."""

    def __init__(self, num_layers: int, num_heads: int,
                 pre_norm: bool = True, activation: str = "relu",
                 num_source_embeddings: int = None,
                 embeddings_merge: str = "concat",
                 layernorm_embedding: bool = False,
                 relative_position: bool = False,
                 relative_attention_bias: bool = False,
                 ffn_glu: bool = False, rms_norm: bool = False,
                 multi_query_attention: bool = False):
        """
        Initialize Transformer encoder specification.

        Args:
            num_layers (int): Number of encoder layers
            num_heads (int): Number of attention heads
            pre_norm (bool): Whether to use pre-normalization
            activation (str): Activation function ("relu", "gelu", etc.)
            num_source_embeddings (int): Number of source embeddings
            embeddings_merge (str): How to merge embeddings ("concat", "add")
            layernorm_embedding (bool): Whether to normalize embeddings
            relative_position (bool): Whether to use relative positions
            relative_attention_bias (bool): Whether to use relative attention bias
            ffn_glu (bool): Whether to use GLU in the feed-forward layer
            rms_norm (bool): Whether to use RMS normalization
            multi_query_attention (bool): Whether to use multi-query attention
        """
class TransformerDecoderSpec(LayerSpec):
    """Specification for Transformer decoder layers."""

    def __init__(self, num_layers: int, num_heads: int,
                 pre_norm: bool = True, activation: str = "relu",
                 layernorm_embedding: bool = False,
                 with_encoder_attention: bool = True,
                 no_final_norm: bool = False,
                 project_in_out: bool = False,
                 relative_position: bool = False,
                 relative_attention_bias: bool = False,
                 alignment_layer: int = None,
                 alignment_heads: int = None,
                 ffn_glu: bool = False, rms_norm: bool = False,
                 alibi: bool = False,
                 alibi_use_positive_positions: bool = False,
                 scale_alibi: bool = False,
                 rotary_dim: int = None,
                 rotary_interleave: bool = True,
                 rotary_scaling_type: str = None,
                 rotary_scaling_factor: float = 1.0,
                 rotary_base: float = 10000.0,
                 parallel_residual: bool = False,
                 shared_layer_norm: bool = False,
                 pre_post_layer_norm: bool = False,
                 multi_query_attention: bool = False,
                 num_heads_kv: int = None,
                 head_dim: int = None,
                 sliding_window: int = None):
        """
        Initialize Transformer decoder specification.

        Args:
            num_layers (int): Number of decoder layers
            num_heads (int): Number of attention heads
            pre_norm (bool): Whether to use pre-normalization
            activation (str): Activation function
            layernorm_embedding (bool): Whether to normalize embeddings
            with_encoder_attention (bool): Whether to use encoder-decoder attention
            no_final_norm (bool): Whether to skip the final normalization
            project_in_out (bool): Whether to project input/output
            relative_position (bool): Whether to use relative positions
            relative_attention_bias (bool): Whether to use relative attention bias
            alignment_layer (int): Layer index for alignment attention
            alignment_heads (int): Number of alignment heads
            ffn_glu (bool): Whether to use GLU in the feed-forward layer
            rms_norm (bool): Whether to use RMS normalization
            alibi (bool): Whether to use ALiBi position encoding
            alibi_use_positive_positions (bool): Whether to use positive positions in ALiBi
            scale_alibi (bool): Whether to scale ALiBi
            rotary_dim (int): Rotary embedding dimension
            rotary_interleave (bool): Whether to interleave rotary embeddings
            rotary_scaling_type (str): Type of rotary scaling
            rotary_scaling_factor (float): Rotary scaling factor
            rotary_base (float): Rotary base frequency
            parallel_residual (bool): Whether to use parallel residual connections
            shared_layer_norm (bool): Whether to share layer normalization
            pre_post_layer_norm (bool): Whether to apply both pre- and post-layer normalization
            multi_query_attention (bool): Whether to use multi-query attention
            num_heads_kv (int): Number of key-value heads
            head_dim (int): Dimension per attention head
            sliding_window (int): Sliding window size for attention
        """
Building blocks for constructing Transformer architectures.

class LayerNormSpec(LayerSpec):
    """Layer normalization specification."""

    def __init__(self, normalized_shape: int, eps: float = 1e-5):
        """
        Initialize layer normalization.

        Args:
            normalized_shape (int): Size of normalized dimensions
            eps (float): Epsilon for numerical stability
        """

class LinearSpec(LayerSpec):
    """Linear/dense layer specification."""

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        """
        Initialize linear layer.

        Args:
            in_features (int): Input feature dimension
            out_features (int): Output feature dimension
            bias (bool): Whether to include bias term
        """

class Conv1DSpec(LayerSpec):
    """1D convolution layer specification."""

    def __init__(self, in_channels: int, out_channels: int,
                 kernel_size: int, stride: int = 1, padding: int = 0):
        """
        Initialize 1D convolution layer.

        Args:
            in_channels (int): Number of input channels
            out_channels (int): Number of output channels
            kernel_size (int): Convolution kernel size
            stride (int): Convolution stride
            padding (int): Convolution padding
        """

class EmbeddingsSpec(LayerSpec):
    """Embedding layer specification."""

    def __init__(self, num_embeddings: int, embedding_dim: int,
                 padding_idx: int = None):
        """
        Initialize embedding layer.

        Args:
            num_embeddings (int): Vocabulary size
            embedding_dim (int): Embedding dimension
            padding_idx (int): Index for padding token
        """

class MultiHeadAttentionSpec(LayerSpec):
    """Multi-head attention layer specification."""

    def __init__(self, d_model: int, num_heads: int, dropout: float = 0.0):
        """
        Initialize multi-head attention.

        Args:
            d_model (int): Model dimension
            num_heads (int): Number of attention heads
            dropout (float): Dropout probability
        """
Configuration objects for different model types.

class ModelConfig:
    """Base configuration class for models."""

    def to_dict(self) -> dict:
        """Convert the configuration to a dictionary."""

    def save_as_json(self, path: str):
        """
        Save the configuration as a JSON file.

        Args:
            path (str): Path to save the JSON file
        """

class SequenceToSequenceModelConfig(ModelConfig):
    """Configuration for sequence-to-sequence models."""

    def __init__(self, unk_token: str = "<unk>", bos_token: str = "<s>",
                 eos_token: str = "</s>", decoder_start_token: str = None,
                 add_source_bos: bool = False, add_source_eos: bool = False):
        """
        Initialize seq2seq model configuration.

        Args:
            unk_token (str): Unknown token
            bos_token (str): Beginning-of-sequence token
            eos_token (str): End-of-sequence token
            decoder_start_token (str): Decoder start token
            add_source_bos (bool): Add BOS to source sequences
            add_source_eos (bool): Add EOS to source sequences
        """

class LanguageModelConfig(ModelConfig):
    """Configuration for language models."""

    def __init__(self, unk_token: str = "<unk>", bos_token: str = "<s>",
                 eos_token: str = "</s>"):
        """
        Initialize language model configuration.

        Args:
            unk_token (str): Unknown token
            bos_token (str): Beginning-of-sequence token
            eos_token (str): End-of-sequence token
        """
Specifications for domain-specific models like Whisper and Wav2Vec2.

class WhisperSpec(ModelSpec):
    """Specification for Whisper speech recognition models."""

    def __init__(self, num_encoder_layers: int, num_encoder_heads: int,
                 num_decoder_layers: int, num_decoder_heads: int,
                 d_model: int = 512, vocab_size: int = 51865):
        """
        Initialize Whisper specification.

        Args:
            num_encoder_layers (int): Number of encoder layers
            num_encoder_heads (int): Number of encoder attention heads
            num_decoder_layers (int): Number of decoder layers
            num_decoder_heads (int): Number of decoder attention heads
            d_model (int): Model dimension
            vocab_size (int): Vocabulary size
        """
class WhisperConfig(ModelConfig):
    """Configuration for Whisper models."""

    def __init__(self, suppress_ids: list = None, suppress_ids_begin: list = None,
                 lang_ids: dict = None, alignment_heads: list = None):
        """
        Initialize Whisper configuration.

        Args:
            suppress_ids (list): Token IDs to suppress during generation
            suppress_ids_begin (list): Token IDs to suppress at the beginning
            lang_ids (dict): Language ID mappings
            alignment_heads (list): Attention heads for alignment
        """
class Wav2Vec2Spec(ModelSpec):
    """Specification for Wav2Vec2 models."""

    def __init__(self, feat_layers: list, num_layers: int, num_heads: int,
                 vocab_size: int, return_hidden: bool = False):
        """
        Initialize Wav2Vec2 specification.

        Args:
            feat_layers (list): Feature extraction layer configuration
            num_layers (int): Number of Transformer layers
            num_heads (int): Number of attention heads
            vocab_size (int): Vocabulary size
            return_hidden (bool): Whether to return hidden states
        """

class Wav2Vec2BertSpec(ModelSpec):
    """Specification for Wav2Vec2-BERT models."""

    def __init__(self, num_hidden_layers: int, num_adapter_layers: int,
                 vocab_size: int, return_hidden: bool = False):
        """
        Initialize Wav2Vec2-BERT specification.

        Args:
            num_hidden_layers (int): Number of hidden layers
            num_adapter_layers (int): Number of adapter layers
            vocab_size (int): Vocabulary size
            return_hidden (bool): Whether to return hidden states
        """

Example: building a custom sequence-to-sequence Transformer.

import ctranslate2.specs as specs
# Create encoder specification
encoder_spec = specs.TransformerEncoderSpec(
    num_layers=6,
    num_heads=8,
    pre_norm=True,
    activation="gelu",
    ffn_glu=True
)

# Create decoder specification
decoder_spec = specs.TransformerDecoderSpec(
    num_layers=6,
    num_heads=8,
    pre_norm=True,
    activation="gelu",
    with_encoder_attention=True,
    ffn_glu=True
)

# Create full Transformer specification
transformer_spec = specs.TransformerSpec(encoder_spec, decoder_spec)

# Register vocabularies
source_vocab = ["<unk>", "<s>", "</s>"] + ["token_" + str(i) for i in range(1000)]
target_vocab = ["<unk>", "<s>", "</s>"] + ["token_" + str(i) for i in range(1000)]
transformer_spec.register_source_vocabulary(source_vocab)
transformer_spec.register_target_vocabulary(target_vocab)

# Save the model
transformer_spec.save("custom_transformer_model")

Example: building a decoder-only language model (GPT-style).

import ctranslate2.specs as specs
# Create decoder-only model (GPT-style)
decoder_spec = specs.TransformerDecoderSpec(
    num_layers=12,
    num_heads=12,
    pre_norm=True,
    activation="gelu",
    with_encoder_attention=False,  # no encoder for language models
    rotary_dim=64,                 # use rotary position embeddings
    parallel_residual=True
)

# Create language model specification
lm_spec = specs.TransformerDecoderModelSpec(decoder_spec)

# Register vocabulary
vocab = ["<unk>", "<s>", "</s>"] + ["token_" + str(i) for i in range(50000)]
lm_spec.register_vocabulary(vocab)

# Configure model
config = specs.LanguageModelConfig(
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>"
)

# Save the model
lm_spec.save("custom_language_model")

Example: creating specifications with the from_config factory methods.

import ctranslate2.specs as specs
# Create a Transformer using the factory method
transformer_spec = specs.TransformerSpec.from_config(
    num_layers=6,
    num_heads=8,
    d_model=512,
    d_ff=2048,
    activation="gelu",
    pre_norm=True
)

# Create a decoder-only model using the factory method
decoder_spec = specs.TransformerDecoderModelSpec.from_config(
    num_layers=12,
    num_heads=12,
    d_model=768,
    vocab_size=50257,
    activation="gelu"
)

Enumerations for specifications.
class Activation:
    RELU: str = "relu"
    GELU: str = "gelu"
    SWISH: str = "swish"
    SILU: str = "silu"
    TANH: str = "tanh"
    SIGMOID: str = "sigmoid"

class EmbeddingsMerge:
    CONCAT: str = "concat"
    ADD: str = "add"

class RotaryScalingType:
    LINEAR: str = "linear"
    SU: str = "su"
    LLAMA3: str = "llama3"

class Quantization:
    CT2: str = "ct2"
    AWQ_GEMM: str = "awq_gemm"
    AWQ_GEMV: str = "awq_gemv"
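The enumeration values are plain strings, so they can be passed wherever the corresponding string arguments are expected; for example:

import ctranslate2.specs as specs

# Use enumeration values instead of raw string literals.
encoder_spec = specs.TransformerEncoderSpec(
    num_layers=6,
    num_heads=8,
    activation=specs.Activation.GELU,               # same as "gelu"
    embeddings_merge=specs.EmbeddingsMerge.CONCAT,  # same as "concat"
)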
Install with Tessl CLI

npx tessl i tessl/pypi-ctranslate2