0
# Model Specifications
1
2
Programmatically define and build Transformer model architectures from scratch using CTranslate2's specification system. Model specifications let you create custom models, modify existing architectures, and build models without relying on external frameworks.
3
4
## Capabilities
5
6
### Base Model Specifications
7
8
Core specification classes that provide the foundation for building different types of Transformer models.
9
10
```python { .api }
11
class ModelSpec:
12
"""Abstract base class for all model specifications."""
13
14
def save(self, output_dir: str):
15
"""
16
Save the model specification to a directory.
17
18
Args:
19
output_dir (str): Directory to save the model
20
"""
21
22
def validate(self):
23
"""Validate the model specification for correctness."""
24
25
def optimize(self, quantization: str = None):
26
"""
27
Optimize model weights with optional quantization.
28
29
Args:
30
quantization (str): Quantization type ("int8", "float16", etc.)
31
"""
32
33
def register_file(self, path: str, filename: str = None):
34
"""
35
Register additional files to include with the model.
36
37
Args:
38
path (str): Path to the file to register
39
filename (str): Optional custom filename in model directory
40
"""
41
42
class LayerSpec:
43
"""Abstract base class for layer specifications."""
44
45
def variables(self, prefix: str = "", ordered: bool = False) -> dict:
46
"""
47
Get layer variables with optional prefix.
48
49
Args:
50
prefix (str): Prefix for variable names
51
ordered (bool): Whether to return ordered dictionary
52
53
Returns:
54
dict: Dictionary of layer variables
55
"""
56
57
def validate(self):
58
"""Validate the layer specification."""
59
60
class SequenceToSequenceModelSpec(ModelSpec):
61
"""Base class for sequence-to-sequence model specifications."""
62
63
def register_source_vocabulary(self, tokens: list):
64
"""
65
Register source vocabulary tokens.
66
67
Args:
68
tokens (list): List of source vocabulary tokens
69
"""
70
71
def register_target_vocabulary(self, tokens: list):
72
"""
73
Register target vocabulary tokens.
74
75
Args:
76
tokens (list): List of target vocabulary tokens
77
"""
78
79
def register_vocabulary_mapping(self, path: str):
80
"""
81
Register vocabulary mapping file.
82
83
Args:
84
path (str): Path to vocabulary mapping file
85
"""
86
87
class LanguageModelSpec(ModelSpec):
88
"""Base class for language model specifications."""
89
90
def register_vocabulary(self, tokens: list):
91
"""
92
Register vocabulary tokens.
93
94
Args:
95
tokens (list): List of vocabulary tokens
96
"""
97
```
98
99
### Transformer Model Specifications
100
101
Model-level specifications for sequence-to-sequence, decoder-only, and encoder-only Transformer architectures.
102
103
```python { .api }
104
class TransformerSpec(SequenceToSequenceModelSpec):
105
"""Specification for sequence-to-sequence Transformer models."""
106
107
def __init__(self, encoder: 'TransformerEncoderSpec', decoder: 'TransformerDecoderSpec'):
108
"""
109
Initialize Transformer specification.
110
111
Args:
112
encoder (TransformerEncoderSpec): Encoder specification
113
decoder (TransformerDecoderSpec): Decoder specification
114
"""
115
116
@classmethod
117
def from_config(cls, num_layers: int, num_heads: int,
118
d_model: int = 512, d_ff: int = 2048, **kwargs):
119
"""
120
Create Transformer specification from configuration.
121
122
Args:
123
num_layers (int): Number of encoder/decoder layers
124
num_heads (int): Number of attention heads
125
d_model (int): Model dimension
126
d_ff (int): Feed-forward dimension
127
**kwargs: Additional configuration parameters
128
129
Returns:
130
TransformerSpec: Configured Transformer specification
131
"""
132
133
class TransformerDecoderModelSpec(LanguageModelSpec):
134
"""Specification for decoder-only Transformer models (GPT-style)."""
135
136
def __init__(self, decoder: 'TransformerDecoderSpec'):
137
"""
138
Initialize decoder-only Transformer specification.
139
140
Args:
141
decoder (TransformerDecoderSpec): Decoder specification
142
"""
143
144
@classmethod
145
def from_config(cls, num_layers: int, num_heads: int,
146
d_model: int = 512, vocab_size: int = 50257, **kwargs):
147
"""
148
Create decoder-only Transformer from configuration.
149
150
Args:
151
num_layers (int): Number of decoder layers
152
num_heads (int): Number of attention heads
153
d_model (int): Model dimension
154
vocab_size (int): Vocabulary size
155
**kwargs: Additional configuration parameters
156
157
Returns:
158
TransformerDecoderModelSpec: Configured decoder model
159
"""
160
161
class TransformerEncoderModelSpec(ModelSpec):
162
"""Specification for encoder-only Transformer models (BERT-style)."""
163
164
def __init__(self, encoder: 'TransformerEncoderSpec', pooling_layer: bool = False):
165
"""
166
Initialize encoder-only Transformer specification.
167
168
Args:
169
encoder (TransformerEncoderSpec): Encoder specification
170
pooling_layer (bool): Whether to include pooling layer
171
"""
172
```
173
174
### Transformer Layer Specifications
175
176
Detailed specifications for Transformer encoder and decoder layers.
177
178
```python { .api }
179
class TransformerEncoderSpec(LayerSpec):
180
"""Specification for Transformer encoder layers."""
181
182
def __init__(self, num_layers: int, num_heads: int,
183
pre_norm: bool = True, activation: str = "relu",
184
num_source_embeddings: int = None,
185
embeddings_merge: str = "concat",
186
layernorm_embedding: bool = False,
187
relative_position: bool = False,
188
relative_attention_bias: bool = False,
189
ffn_glu: bool = False, rms_norm: bool = False,
190
multi_query_attention: bool = False):
191
"""
192
Initialize Transformer encoder specification.
193
194
Args:
195
num_layers (int): Number of encoder layers
196
num_heads (int): Number of attention heads
197
pre_norm (bool): Whether to use pre-normalization
198
activation (str): Activation function ("relu", "gelu", etc.)
199
num_source_embeddings (int): Number of source embeddings
200
embeddings_merge (str): How to merge embeddings ("concat", "add")
201
layernorm_embedding (bool): Whether to normalize embeddings
202
relative_position (bool): Whether to use relative position representations
203
relative_attention_bias (bool): Whether to use relative attention bias
204
ffn_glu (bool): Whether to use GLU in feed-forward
205
rms_norm (bool): Whether to use RMS normalization
206
multi_query_attention (bool): Whether to use multi-query attention
207
"""
208
209
class TransformerDecoderSpec(LayerSpec):
210
"""Specification for Transformer decoder layers."""
211
212
def __init__(self, num_layers: int, num_heads: int,
213
pre_norm: bool = True, activation: str = "relu",
214
layernorm_embedding: bool = False,
215
with_encoder_attention: bool = True,
216
no_final_norm: bool = False,
217
project_in_out: bool = False,
218
relative_position: bool = False,
219
relative_attention_bias: bool = False,
220
alignment_layer: int = None,
221
alignment_heads: int = None,
222
ffn_glu: bool = False, rms_norm: bool = False,
223
alibi: bool = False,
224
alibi_use_positive_positions: bool = False,
225
scale_alibi: bool = False,
226
rotary_dim: int = None,
227
rotary_interleave: bool = True,
228
rotary_scaling_type: str = None,
229
rotary_scaling_factor: float = 1.0,
230
rotary_base: float = 10000.0,
231
parallel_residual: bool = False,
232
shared_layer_norm: bool = False,
233
pre_post_layer_norm: bool = False,
234
multi_query_attention: bool = False,
235
num_heads_kv: int = None,
236
head_dim: int = None,
237
sliding_window: int = None):
238
"""
239
Initialize Transformer decoder specification.
240
241
Args:
242
num_layers (int): Number of decoder layers
243
num_heads (int): Number of attention heads
244
pre_norm (bool): Whether to use pre-normalization
245
activation (str): Activation function
246
layernorm_embedding (bool): Whether to normalize embeddings
247
with_encoder_attention (bool): Whether to use encoder-decoder attention
248
no_final_norm (bool): Whether to skip final normalization
249
project_in_out (bool): Whether to project input/output
250
relative_position (bool): Whether to use relative position representations
251
relative_attention_bias (bool): Whether to use relative attention bias
252
alignment_layer (int): Layer for alignment attention
253
alignment_heads (int): Number of alignment heads
254
ffn_glu (bool): Whether to use GLU in feed-forward
255
rms_norm (bool): Whether to use RMS normalization
256
alibi (bool): Whether to use ALiBi position encoding
257
alibi_use_positive_positions (bool): Use positive positions in ALiBi
258
scale_alibi (bool): Whether to scale ALiBi
259
rotary_dim (int): Rotary embedding dimension
260
rotary_interleave (bool): Whether to interleave rotary embeddings
261
rotary_scaling_type (str): Type of rotary scaling
262
rotary_scaling_factor (float): Rotary scaling factor
263
rotary_base (float): Rotary base frequency
264
parallel_residual (bool): Whether to use parallel residual
265
shared_layer_norm (bool): Whether to share layer norm
266
pre_post_layer_norm (bool): Whether to apply layer normalization both before and after each sub-layer
267
multi_query_attention (bool): Whether to use multi-query attention
268
num_heads_kv (int): Number of key-value heads
269
head_dim (int): Dimension per attention head
270
sliding_window (int): Sliding window size for attention
271
"""
272
```
273
274
### Common Layer Specifications
275
276
Building blocks for constructing Transformer architectures.
277
278
```python { .api }
279
class LayerNormSpec(LayerSpec):
280
"""Layer normalization specification."""
281
282
def __init__(self, normalized_shape: int, eps: float = 1e-5):
283
"""
284
Initialize layer normalization.
285
286
Args:
287
normalized_shape (int): Size of normalized dimensions
288
eps (float): Epsilon for numerical stability
289
"""
290
291
class LinearSpec(LayerSpec):
292
"""Linear/dense layer specification."""
293
294
def __init__(self, in_features: int, out_features: int, bias: bool = True):
295
"""
296
Initialize linear layer.
297
298
Args:
299
in_features (int): Input feature dimension
300
out_features (int): Output feature dimension
301
bias (bool): Whether to include bias term
302
"""
303
304
class Conv1DSpec(LayerSpec):
305
"""1D convolution layer specification."""
306
307
def __init__(self, in_channels: int, out_channels: int,
308
kernel_size: int, stride: int = 1, padding: int = 0):
309
"""
310
Initialize 1D convolution layer.
311
312
Args:
313
in_channels (int): Number of input channels
314
out_channels (int): Number of output channels
315
kernel_size (int): Convolution kernel size
316
stride (int): Convolution stride
317
padding (int): Convolution padding
318
"""
319
320
class EmbeddingsSpec(LayerSpec):
321
"""Embedding layer specification."""
322
323
def __init__(self, num_embeddings: int, embedding_dim: int,
324
padding_idx: int = None):
325
"""
326
Initialize embedding layer.
327
328
Args:
329
num_embeddings (int): Vocabulary size
330
embedding_dim (int): Embedding dimension
331
padding_idx (int): Index for padding token
332
"""
333
334
class MultiHeadAttentionSpec(LayerSpec):
335
"""Multi-head attention layer specification."""
336
337
def __init__(self, d_model: int, num_heads: int, dropout: float = 0.0):
338
"""
339
Initialize multi-head attention.
340
341
Args:
342
d_model (int): Model dimension
343
num_heads (int): Number of attention heads
344
dropout (float): Dropout probability
345
"""
346
```
347
348
### Configuration Classes
349
350
Configuration objects for different model types.
351
352
```python { .api }
353
class ModelConfig:
354
"""Base configuration class for models."""
355
356
def to_dict(self) -> dict:
357
"""Convert configuration to dictionary."""
358
359
def save_as_json(self, path: str):
360
"""
361
Save configuration as JSON file.
362
363
Args:
364
path (str): Path to save JSON file
365
"""
366
367
class SequenceToSequenceModelConfig(ModelConfig):
368
"""Configuration for sequence-to-sequence models."""
369
370
def __init__(self, unk_token: str = "<unk>", bos_token: str = "<s>",
371
eos_token: str = "</s>", decoder_start_token: str = None,
372
add_source_bos: bool = False, add_source_eos: bool = False):
373
"""
374
Initialize seq2seq model configuration.
375
376
Args:
377
unk_token (str): Unknown token
378
bos_token (str): Beginning of sequence token
379
eos_token (str): End of sequence token
380
decoder_start_token (str): Decoder start token
381
add_source_bos (bool): Add BOS to source sequences
382
add_source_eos (bool): Add EOS to source sequences
383
"""
384
385
class LanguageModelConfig(ModelConfig):
386
"""Configuration for language models."""
387
388
def __init__(self, unk_token: str = "<unk>", bos_token: str = "<s>",
389
eos_token: str = "</s>"):
390
"""
391
Initialize language model configuration.
392
393
Args:
394
unk_token (str): Unknown token
395
bos_token (str): Beginning of sequence token
396
eos_token (str): End of sequence token
397
"""
398
```
399
400
### Specialized Model Specifications
401
402
Specifications for domain-specific speech models: Whisper, Wav2Vec2, and Wav2Vec2-BERT.
403
404
```python { .api }
405
class WhisperSpec(ModelSpec):
406
"""Specification for Whisper speech recognition models."""
407
408
def __init__(self, num_encoder_layers: int, num_encoder_heads: int,
409
num_decoder_layers: int, num_decoder_heads: int,
410
d_model: int = 512, vocab_size: int = 51865):
411
"""
412
Initialize Whisper specification.
413
414
Args:
415
num_encoder_layers (int): Number of encoder layers
416
num_encoder_heads (int): Number of encoder attention heads
417
num_decoder_layers (int): Number of decoder layers
418
num_decoder_heads (int): Number of decoder attention heads
419
d_model (int): Model dimension
420
vocab_size (int): Vocabulary size
421
"""
422
423
class WhisperConfig(ModelConfig):
424
"""Configuration for Whisper models."""
425
426
def __init__(self, suppress_ids: list = None, suppress_ids_begin: list = None,
427
lang_ids: dict = None, alignment_heads: list = None):
428
"""
429
Initialize Whisper configuration.
430
431
Args:
432
suppress_ids (list): Token IDs to suppress during generation
433
suppress_ids_begin (list): Token IDs to suppress at beginning
434
lang_ids (dict): Language ID mappings
435
alignment_heads (list): Attention heads for alignment
436
"""
437
438
class Wav2Vec2Spec(ModelSpec):
439
"""Specification for Wav2Vec2 models."""
440
441
def __init__(self, feat_layers: list, num_layers: int, num_heads: int,
442
vocab_size: int, return_hidden: bool = False):
443
"""
444
Initialize Wav2Vec2 specification.
445
446
Args:
447
feat_layers (list): Feature extraction layer configuration
448
num_layers (int): Number of transformer layers
449
num_heads (int): Number of attention heads
450
vocab_size (int): Vocabulary size
451
return_hidden (bool): Whether to return hidden states
452
"""
453
454
class Wav2Vec2BertSpec(ModelSpec):
455
"""Specification for Wav2Vec2-BERT models."""
456
457
def __init__(self, num_hidden_layers: int, num_adapter_layers: int,
458
vocab_size: int, return_hidden: bool = False):
459
"""
460
Initialize Wav2Vec2-BERT specification.
461
462
Args:
463
num_hidden_layers (int): Number of hidden layers
464
num_adapter_layers (int): Number of adapter layers
465
vocab_size (int): Vocabulary size
466
return_hidden (bool): Whether to return hidden states
467
"""
468
```
469
470
## Usage Examples
471
472
### Building a Custom Transformer
473
474
```python
475
import ctranslate2.specs as specs

# Encoder stack: 6 pre-norm layers with 8 attention heads and a GLU
# feed-forward block. The activation is given as a member of the Activation
# enumeration rather than as a raw string.
encoder_spec = specs.TransformerEncoderSpec(
    num_layers=6,
    num_heads=8,
    pre_norm=True,
    activation=specs.Activation.GELU,
    ffn_glu=True,
)

# Decoder stack mirroring the encoder, with encoder-decoder attention enabled
decoder_spec = specs.TransformerDecoderSpec(
    num_layers=6,
    num_heads=8,
    pre_norm=True,
    activation=specs.Activation.GELU,
    with_encoder_attention=True,
    ffn_glu=True,
)

# Combine both stacks into the full sequence-to-sequence specification
transformer_spec = specs.TransformerSpec(encoder_spec, decoder_spec)

# Register vocabularies: three special tokens followed by 1000 placeholders
special_tokens = ["<unk>", "<s>", "</s>"]
source_vocab = special_tokens + [f"token_{i}" for i in range(1000)]
target_vocab = special_tokens + [f"token_{i}" for i in range(1000)]

transformer_spec.register_source_vocabulary(source_vocab)
transformer_spec.register_target_vocabulary(target_vocab)

# Save the model
# NOTE(review): no weights are assigned in this example; confirm whether
# save() succeeds on a specification whose variables are not populated.
transformer_spec.save("custom_transformer_model")
508
```
509
510
### Building a Language Model
511
512
```python
513
import ctranslate2.specs as specs

# Create the decoder stack of a decoder-only model (GPT-style).
# The activation is given as a member of the Activation enumeration rather
# than as a raw string.
decoder_spec = specs.TransformerDecoderSpec(
    num_layers=12,
    num_heads=12,
    pre_norm=True,
    activation=specs.Activation.GELU,
    with_encoder_attention=False,  # No encoder for language models
    rotary_dim=64,  # Use rotary position embeddings
    parallel_residual=True,
)

# Create language model specification
lm_spec = specs.TransformerDecoderModelSpec(decoder_spec)

# Register vocabulary: three special tokens followed by 50000 placeholders
vocab = ["<unk>", "<s>", "</s>"] + [f"token_{i}" for i in range(50000)]
lm_spec.register_vocabulary(vocab)

# Configure model: set the special tokens on the configuration owned by the
# specification, so that the settings are actually attached to the model
# (a standalone config object would be discarded).
lm_spec.config.unk_token = "<unk>"
lm_spec.config.bos_token = "<s>"
lm_spec.config.eos_token = "</s>"

# Save the model
lm_spec.save("custom_language_model")
542
```
543
544
### Using Factory Methods
545
546
```python
547
import ctranslate2.specs as specs

# NOTE(review): upstream from_config appears to take no size arguments
# (d_model, d_ff, vocab_size), so none are passed here — confirm against
# the installed CTranslate2 release.

# Create transformer using factory method
transformer_spec = specs.TransformerSpec.from_config(
    num_layers=6,
    num_heads=8,
    activation=specs.Activation.GELU,
    pre_norm=True,
)

# Create decoder-only model using factory method.
# The result is a complete model specification, not a bare decoder stack,
# hence the name lm_spec.
lm_spec = specs.TransformerDecoderModelSpec.from_config(
    num_layers=12,
    num_heads=12,
    activation=specs.Activation.GELU,
)
567
```
568
569
## Types
570
571
```python { .api }
572
# Enumerations for specifications
573
class Activation:
574
RELU: str = "relu"
575
GELU: str = "gelu"
576
SWISH: str = "swish"
577
SILU: str = "silu"
578
TANH: str = "tanh"
579
SIGMOID: str = "sigmoid"
580
581
class EmbeddingsMerge:
582
CONCAT: str = "concat"
583
ADD: str = "add"
584
585
class RotaryScalingType:
586
LINEAR: str = "linear"
587
SU: str = "su"
588
LLAMA3: str = "llama3"
589
590
class Quantization:
591
CT2: str = "ct2"
592
AWQ_GEMM: str = "awq_gemm"
593
AWQ_GEMV: str = "awq_gemv"
594
```