CTranslate2 — fast inference engine for Transformer models.

This page documents helper functions for model management, device configuration, logging, and tensor operations. CTranslate2 provides utilities for checking model compatibility, managing computational resources, and working with multi-dimensional arrays efficiently.

Utilities for checking and managing CTranslate2 model directories and compatibility:
def contains_model(path: str) -> bool:
    """
    Check if a directory contains a valid CTranslate2 model.

    Args:
        path (str): Path to directory to check

    Returns:
        bool: True if directory contains a valid CTranslate2 model
    """

# Functions for managing computational resources and checking hardware capabilities.
def get_cuda_device_count() -> int:
    """
    Get the number of available CUDA devices.

    Returns:
        int: Number of CUDA devices available
    """
def get_supported_compute_types(device: str, device_index: int = 0) -> list:
    """
    Get supported compute types for a specific device.

    Args:
        device (str): Device type ("cpu" or "cuda")
        device_index (int): Device index for multi-device setups

    Returns:
        list: List of supported compute types for the device
    """

# Control random number generation for reproducible results.
def set_random_seed(seed: int):
    """
    Set random seed for reproducible inference.

    Args:
        seed (int): Random seed value
    """

# Functions for configuring CTranslate2 logging levels and output.
def get_log_level() -> str:
    """
    Get current logging level.

    Returns:
        str: Current log level ("trace", "debug", "info", "warning", "error", "critical", "off")
    """
def set_log_level(level: str):
    """
    Set logging level for CTranslate2.

    Args:
        level (str): Log level to set ("trace", "debug", "info", "warning", "error", "critical", "off")
    """

# The StorageView class provides efficient multi-dimensional array operations with device management.
class StorageView:
    """Efficient multi-dimensional array container with device management."""

    def __init__(self, array=None, dtype=None):
        """
        Initialize a StorageView for efficient tensor operations.

        Args:
            array: Input array data (numpy array, list, etc.)
            dtype: Data type for the storage ("float32", "float16", "int32", "int16", "int8")
        """

    def numpy(self):
        """
        Convert StorageView to NumPy array.

        Returns:
            numpy.ndarray: NumPy array representation
        """

    def copy(self) -> 'StorageView':
        """
        Create a copy of the StorageView.

        Returns:
            StorageView: Copied StorageView object
        """

    def to(self, dtype: str) -> 'StorageView':
        """
        Convert StorageView to different data type.

        Args:
            dtype (str): Target data type

        Returns:
            StorageView: New StorageView with converted data type
        """

    @property
    def device(self) -> str:
        """Device where the storage is located."""

    @property
    def device_index(self) -> int:
        """Device index for multi-device setups."""

    @property
    def dtype(self) -> str:
        """Data type of the stored elements."""

    @property
    def shape(self) -> tuple:
        """Shape of the multi-dimensional array."""

    @property
    def size(self) -> int:
        """Total number of elements in the array."""

    @property
    def rank(self) -> int:
        """Number of dimensions in the array."""

# Monitor performance and resource usage during model inference.
class ExecutionStats:
    """Statistics from model execution."""

    @property
    def num_tokens(self) -> int:
        """Total number of tokens processed."""

    @property
    def num_examples(self) -> int:
        """Total number of examples processed."""

    @property
    def total_time_in_ms(self) -> float:
        """Total execution time in milliseconds."""

# Information about distributed processing setups.
class MpiInfo:
    """MPI (Message Passing Interface) information."""

    @property
    def rank(self) -> int:
        """Current process rank in MPI setup."""

    @property
    def size(self) -> int:
        """Total number of processes in MPI setup."""

# Constants for specifying data types and devices.
class DataType:
    """Data type constants for StorageView and model operations."""
    FLOAT32: str = "float32"
    FLOAT16: str = "float16"
    BFLOAT16: str = "bfloat16"
    INT32: str = "int32"
    INT16: str = "int16"
    INT8: str = "int8"
class Device:
    """Device constants for model placement."""
    CPU: str = "cpu"
    CUDA: str = "cuda"
    AUTO: str = "auto"

# Example: check a directory for a valid model before loading it.
import ctranslate2
# Check if directory contains valid model
model_path = "path/to/potential/model"
if ctranslate2.contains_model(model_path):
    print("Valid CTranslate2 model found")
    translator = ctranslate2.Translator(model_path)
else:
    print("No valid model found in directory")

# Example: query available devices and compute types.
import ctranslate2
# Check available CUDA devices
cuda_count = ctranslate2.get_cuda_device_count()
print(f"Available CUDA devices: {cuda_count}")
if cuda_count > 0:
    # Check supported compute types for GPU
    gpu_compute_types = ctranslate2.get_supported_compute_types("cuda", 0)
    print(f"GPU compute types: {gpu_compute_types}")
    # Use optimal compute type
    if "int8" in gpu_compute_types:
        translator = ctranslate2.Translator(
            "model_path",
            device="cuda",
            compute_type="int8"
        )
# Check supported compute types for CPU
cpu_compute_types = ctranslate2.get_supported_compute_types("cpu")
print(f"CPU compute types: {cpu_compute_types}")

# Example: reproducible sampling with a fixed random seed.
import ctranslate2
# Set seed for reproducible inference
ctranslate2.set_random_seed(42)
# Now all inference will be deterministic
generator = ctranslate2.Generator("model_path", device="cpu")
results1 = generator.generate_batch([["Hello"]], sampling_temperature=0.8)
# Reset seed and run again - should get same results
ctranslate2.set_random_seed(42)
results2 = generator.generate_batch([["Hello"]], sampling_temperature=0.8)
assert results1[0].sequences == results2[0].sequences

# Example: controlling CTranslate2 log verbosity.
import ctranslate2
# Set logging level to see detailed information
ctranslate2.set_log_level("debug")
# Load model with debug logging
translator = ctranslate2.Translator("model_path", device="cpu")
# Get current log level
current_level = ctranslate2.get_log_level()
print(f"Current log level: {current_level}")
# Reduce logging for production
ctranslate2.set_log_level("warning")

# Example: StorageView tensor operations.
import ctranslate2
import numpy as np

# Create StorageView from numpy array
np_array = np.random.randn(3, 4).astype(np.float32)
storage = ctranslate2.StorageView(np_array)
print(f"Shape: {storage.shape}")
print(f"Size: {storage.size}")
print(f"Data type: {storage.dtype}")
print(f"Device: {storage.device}")
print(f"Rank: {storage.rank}")
# Convert to different data type
storage_fp16 = storage.to("float16")
print(f"New data type: {storage_fp16.dtype}")
# Convert back to numpy
np_result = storage_fp16.numpy()
print(f"Result shape: {np_result.shape}")
# Create copy
storage_copy = storage.copy()
print(f"Copy device: {storage_copy.device}")

# Example: execution statistics during batch translation.
import ctranslate2
# Create translator with statistics enabled
translator = ctranslate2.Translator("model_path", device="cpu")
# Perform translation
source = [["Hello", "world"] for _ in range(100)]
results = translator.translate_batch(source)
# Note: ExecutionStats would be available through specific API calls
# or integrated profiling tools (implementation-specific)

# Example: multi-GPU placement and tensor parallelism.
import ctranslate2
# Check available devices
cuda_count = ctranslate2.get_cuda_device_count()
if cuda_count >= 2:
    # Use specific GPU device
    translator_gpu0 = ctranslate2.Translator(
        "model_path",
        device="cuda",
        device_index=0
    )
    translator_gpu1 = ctranslate2.Translator(
        "model_path",
        device="cuda",
        device_index=1
    )
    # Or use multiple devices with tensor parallelism
    translator_parallel = ctranslate2.Translator(
        "model_path",
        device="cuda",
        device_index=[0, 1],  # Use both GPUs
        tensor_parallel=True
    )

# Example: an optimized end-to-end configuration.
import ctranslate2
# Configure for optimal performance
ctranslate2.set_log_level("warning")  # Reduce logging overhead
ctranslate2.set_random_seed(42)  # Reproducible results
# Check optimal compute type for device
device = "cuda" if ctranslate2.get_cuda_device_count() > 0 else "cpu"
compute_types = ctranslate2.get_supported_compute_types(device)
# Select best compute type (prefer quantized for speed)
compute_type = "int8" if "int8" in compute_types else "default"
# Create optimized model instance
model = ctranslate2.Translator(
    "model_path",
    device=device,
    compute_type=compute_type,
    inter_threads=4,  # Parallel processing
    max_queued_batches=16,  # Larger batch queue
    flash_attention=True  # Enable Flash Attention if available
)
print(f"Model loaded on {model.device} with {model.compute_type} precision")

# Core utility types
class StorageView:
    """Multi-dimensional array container for efficient tensor operations."""
    device: str        # Device location ("cpu", "cuda")
    device_index: int  # Device index for multi-device setups
    dtype: str         # Data type of elements
    shape: tuple       # Array dimensions
    size: int          # Total number of elements
    rank: int          # Number of dimensions
class ExecutionStats:
    """Performance statistics from model execution."""
    num_tokens: int           # Number of tokens processed
    num_examples: int         # Number of examples processed
    total_time_in_ms: float   # Total execution time
class MpiInfo:
    """Multi-process interface information."""
    rank: int  # Process rank in distributed setup
    size: int  # Total number of processes

# Enumeration classes
class DataType:
    """Available data types for tensors and computations."""
    FLOAT32: str = "float32"
    FLOAT16: str = "float16"
    BFLOAT16: str = "bfloat16"
    INT32: str = "int32"
    INT16: str = "int16"
    INT8: str = "int8"
class Device:
    """Available device types for model execution."""
    CPU: str = "cpu"
    CUDA: str = "cuda"
    AUTO: str = "auto"

# Install with Tessl CLI:
npx tessl i tessl/pypi-ctranslate2