Language model clients with support for multiple providers through LiteLLM, including caching, finetuning, embeddings, and provider-specific features.
Main language model class supporting chat and text completion via LiteLLM.
class LM:
"""
Main language model class.
Provides a unified interface to 100+ LLM providers through LiteLLM,
with built-in caching, retry logic, and finetuning support.
"""
def __init__(
self,
model: str,
model_type: str = "chat",
temperature: float = None,
max_tokens: int = None,
cache: bool = True,
callbacks: list = None,
num_retries: int = 3,
provider=None,
finetuning_model: str = None,
launch_kwargs: dict = None,
train_kwargs: dict = None,
use_developer_role: bool = False,
**kwargs
):
"""
Initialize language model.
Args:
model (str): Model identifier (e.g., "openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet-20241022")
model_type (str): Model type - "chat", "text", or "responses" (default: "chat")
temperature (float | None): Sampling temperature (0.0 to 2.0)
max_tokens (int | None): Maximum tokens per response
cache (bool): Enable response caching (default: True)
callbacks (list | None): Callback functions for monitoring
num_retries (int): Number of retry attempts on failure (default: 3)
provider (Provider | None): Provider instance for advanced features
finetuning_model (str | None): Model identifier for finetuning
launch_kwargs (dict | None): Launch configuration for local models
train_kwargs (dict | None): Default training configuration
use_developer_role (bool): Use developer role for responses model (default: False)
**kwargs: Additional LM parameters:
- n (int): Number of completions to generate
- rollout_id (int): Seed for deterministic generation
- stop (list[str]): Stop sequences
- presence_penalty (float): Presence penalty
- frequency_penalty (float): Frequency penalty
- top_p (float): Nucleus sampling parameter
"""
pass
def __call__(self, prompt: str = None, messages: list = None, **kwargs):
"""
Generate completion.
Args:
prompt (str | None): Text prompt (for text models)
messages (list | None): List of message dicts (for chat models)
**kwargs: Override default parameters (temperature, max_tokens, etc.)
Returns:
List of response strings (length = n parameter)
"""
pass
def forward(self, prompt: str = None, messages: list = None, **kwargs):
"""
Same as __call__. Generate completion.
Args:
prompt (str | None): Text prompt
messages (list | None): Message list
**kwargs: Parameter overrides
Returns:
List of response strings
"""
pass
def copy(self, **kwargs):
"""
Create copy with updated parameters.
Args:
**kwargs: Parameters to override
Returns:
New LM instance
"""
pass
def inspect_history(self, n: int = 1):
"""
Pretty-print recent calls.
Args:
n (int): Number of recent calls to display (default: 1)
"""
pass
def get_convo(self, index: int):
"""
Get conversation at index.
Args:
index (int): Conversation index in history
Returns:
Conversation dict with messages and responses
"""
pass
def dump_state(self) -> dict:
"""
Serialize state to dictionary.
Returns:
Dictionary representation of LM state
"""
pass
def launch(self, **launch_kwargs):
"""
Launch model instance (for local/vLLM models).
Args:
**launch_kwargs: Launch configuration parameters
"""
pass
def kill(self):
"""Kill launched model instance."""
pass
def finetune(
self,
train_data: list,
train_kwargs: dict = None,
cache_finetune: bool = False
):
"""
Start finetuning job.
Args:
train_data (list): Training examples
train_kwargs (dict | None): Training configuration
cache_finetune (bool): Cache finetuning job (default: False)
Returns:
TrainingJob instance
"""
pass
def push_to_hf_hub(self, repo_id: str, **kwargs):
"""
Push finetuned model to Hugging Face Hub.
Args:
repo_id (str): Repository ID (e.g., "username/model-name")
**kwargs: Additional push parameters
"""
pass
Usage:
import dspy
# Basic usage
lm = dspy.LM('openai/gpt-4o-mini')
dspy.configure(lm=lm)
# With parameters
lm = dspy.LM(
'openai/gpt-4o-mini',
temperature=0.7,
max_tokens=500,
cache=True
)
# Direct call
response = lm(prompt="Translate to French: Hello")
print(response[0]) # "Bonjour"
# Chat messages
response = lm(messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is 2+2?"}
])
# Multiple completions
lm = dspy.LM('openai/gpt-4o-mini', n=5)
responses = lm(prompt="Write a haiku about coding")
for i, resp in enumerate(responses):
print(f"Completion {i+1}: {resp}")
# Copy with different parameters
lm_creative = lm.copy(temperature=1.5, max_tokens=1000)
# Inspect history
lm.inspect_history(n=3) # Show last 3 calls
# Different providers
anthropic_lm = dspy.LM('anthropic/claude-3-5-sonnet-20241022')
gemini_lm = dspy.LM('gemini/gemini-1.5-pro')
mistral_lm = dspy.LM('mistral/mistral-large-latest')
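The state and history helpers documented on LM above (dump_state, get_convo) fit the same pattern; a minimal sketch, assuming the return shapes described in their docstrings:
import dspy
lm = dspy.LM('openai/gpt-4o-mini')
lm(prompt="Say hello")
# Serialize the LM's configuration to a plain dict (see dump_state above)
state = lm.dump_state()
print(state)
# Fetch the first recorded conversation from the call history (see get_convo above)
convo = lm.get_convo(0)
print(convo)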
Abstract base class for custom language model implementations.
class BaseLM:
"""
Base class for language models.
Defines the interface that custom LM implementations should follow.
Extend this class to create custom model integrations.
"""
def __call__(self, prompt: str = None, messages: list = None, **kwargs):
"""
Generate completion (must be implemented).
Args:
prompt (str | None): Text prompt
messages (list | None): Message list
**kwargs: Model-specific parameters
Returns:
List of response strings
"""
pass
def dump_state(self) -> dict:
"""
Serialize state (must be implemented).
Returns:
Dictionary representation of state
"""
pass
Usage:
import dspy
class CustomLM(dspy.BaseLM):
"""Custom language model integration."""
def __init__(self, model_path: str, **kwargs):
self.model_path = model_path
self.config = kwargs
# Initialize your model here
def __call__(self, prompt=None, messages=None, **kwargs):
# Implement your model's generation logic
# Must return list of strings
response = self._generate(prompt or messages)
return [response]
def dump_state(self):
return {
"model_path": self.model_path,
"config": self.config
}
# Use custom LM
custom_lm = CustomLM("/path/to/model")
dspy.configure(lm=custom_lm)
Text embedding class for vectorization.
class Embedder:
"""
Embedding class for text vectorization.
Supports both hosted embedding models and custom embedding functions.
"""
def __init__(
self,
model,
batch_size: int = 200,
caching: bool = True,
**kwargs
):
"""
Initialize embedder.
Args:
model (str | callable): Model name or custom embedding function
- str: Hosted model (e.g., "openai/text-embedding-3-small")
- callable: Custom function that takes list of texts
batch_size (int): Batch size for processing (default: 200)
caching (bool): Cache embedding responses (default: True)
**kwargs: Additional model parameters
"""
pass
def __call__(self, texts: list, batch_size: int = None, **kwargs):
"""
Compute embeddings for texts.
Args:
texts (list): List of strings to embed
batch_size (int | None): Override batch size
**kwargs: Additional parameters
Returns:
numpy.ndarray of shape (len(texts), embedding_dim)
"""
pass
Usage:
import dspy
import numpy as np
# Hosted model
embedder = dspy.Embedder("openai/text-embedding-3-small")
embeddings = embedder(["hello", "world", "machine learning"])
print(embeddings.shape) # (3, 1536)
# Custom function
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
embedder = dspy.Embedder(model.encode, batch_size=32)
embeddings = embedder(["hello", "world"])
print(embeddings.shape) # (2, 384)
# Use with KNN
knn = dspy.KNN(k=5, trainset=trainset, vectorizer=embedder)
similar = knn(query="machine learning")
Base class for LM providers handling launching, finetuning, and provider-specific operations.
class Provider:
"""
Base class for LM providers.
Handles provider-specific operations like model launching,
finetuning, and reinforcement learning.
"""
finetunable: bool
"""Whether this provider supports finetuning."""
reinforceable: bool
"""Whether this provider supports reinforcement learning."""
TrainingJob: type
"""TrainingJob class for this provider."""
ReinforceJob: type
"""ReinforceJob class for this provider."""
@staticmethod
def is_provider_model(model: str) -> bool:
"""
Check if model belongs to this provider.
Args:
model (str): Model identifier
Returns:
True if model is from this provider
"""
pass
def launch(self, lm, launch_kwargs: dict):
"""
Launch model instance.
Args:
lm: LM instance
launch_kwargs (dict): Launch configuration
"""
pass
def kill(self, lm, launch_kwargs: dict):
"""
Kill launched instance.
Args:
lm: LM instance
launch_kwargs (dict): Launch configuration
"""
pass
def finetune(self, job, model: str, train_data: list, train_kwargs: dict):
"""
Start finetuning job.
Args:
job: TrainingJob instance
model (str): Model to finetune
train_data (list): Training examples
train_kwargs (dict): Training configuration
"""
pass
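Provider subclasses are normally supplied through the provider argument of dspy.LM. A minimal sketch of a custom provider, assuming only the attributes and method signatures documented above; the import path, MyLocalProvider, and its serving logic are illustrative, not part of the library:
import dspy
from dspy.clients.provider import Provider  # import path is an assumption; adjust to your install

class MyLocalProvider(Provider):
    """Hypothetical provider for models served from a local endpoint."""
    finetunable = False
    reinforceable = False

    @staticmethod
    def is_provider_model(model: str) -> bool:
        # Claim model identifiers that use a custom prefix
        return model.startswith("mylocal/")

    def launch(self, lm, launch_kwargs: dict):
        # Start the local server here (implementation-specific)
        print(f"Launching {lm.model} with {launch_kwargs}")

    def kill(self, lm, launch_kwargs: dict):
        # Shut the local server down
        print(f"Stopping {lm.model}")

# Attach the provider when constructing the LM (see the provider argument above)
lm = dspy.LM("mylocal/my-model", provider=MyLocalProvider())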
Represents a finetuning job with async monitoring.
class TrainingJob:
"""
Training job for model finetuning.
Extends concurrent.futures.Future for async job monitoring.
"""
def __init__(
self,
thread=None,
model: str = None,
train_data: list = None,
train_data_format: str = None,
train_kwargs: dict = None
):
"""
Initialize training job.
Args:
thread: Thread running the job
model (str | None): Model being finetuned
train_data (list | None): Training data
train_data_format (str | None): Format of training data
train_kwargs (dict | None): Training configuration
"""
pass
def status(self) -> str:
"""
Get job status.
Returns:
Status string ("running", "completed", "failed", etc.)
"""
pass
def cancel(self) -> bool:
"""
Cancel job.
Returns:
True if successfully cancelled
"""
pass
def result(self, timeout: float = None):
"""
Wait for and get result.
Args:
timeout (float | None): Maximum wait time in seconds
Returns:
Finetuned model identifier
"""
pass
def done(self) -> bool:
"""
Check if job is done.
Returns:
True if job completed (success or failure)
"""
pass
Usage:
import dspy
# Start finetuning
lm = dspy.LM('openai/gpt-4o-mini', finetuning_model='gpt-4o-mini-2024-07-18')
train_data = [
{"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is ML?"},
{"role": "assistant", "content": "Machine Learning is..."}
]},
# ... more examples
]
job = lm.finetune(
train_data=train_data,
train_kwargs={
"n_epochs": 3,
"batch_size": 4,
"learning_rate_multiplier": 0.1
}
)
# Check status
print(job.status()) # "running"
# Wait for completion
finetuned_model_id = job.result(timeout=3600) # Wait up to 1 hour
print(f"Finetuned model: {finetuned_model_id}")
# Use finetuned model
finetuned_lm = dspy.LM(finetuned_model_id)
dspy.configure(lm=finetuned_lm)
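The remaining TrainingJob helpers (done, cancel) and push_to_hf_hub follow the same pattern; a short sketch continuing the example above (the repository name is a placeholder, and pushing to the Hub applies when the finetuned weights are available locally):
# Non-blocking check instead of waiting on result()
if not job.done():
    print("Still training:", job.status())
# Abandon a job that is no longer needed
# job.cancel()
# For locally finetuned weights, publish them to the Hugging Face Hub
# lm.push_to_hf_hub("username/my-finetuned-model")  # placeholder repo_id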
Inspect call history of language models or modules.
def inspect_history(lm_or_module, n: int = 1):
"""
Inspect call history with pretty printing.
Args:
lm_or_module: LM instance or Module instance
n (int): Number of recent calls to display (default: 1)
"""
pass
Usage:
import dspy
lm = dspy.LM('openai/gpt-4o-mini')
dspy.configure(lm=lm)
# Make some calls
qa = dspy.Predict("question -> answer")
qa(question="What is 2+2?")
qa(question="What is the capital of France?")
# Inspect LM history
dspy.inspect_history(lm, n=2)
# Inspect module history
dspy.inspect_history(qa, n=2)
Configure the caching system for language model responses.
def configure_cache(
enable_disk_cache: bool = None,
enable_memory_cache: bool = None,
disk_cache_dir: str = None,
disk_size_limit_bytes: int = None,
memory_max_entries: int = None
):
"""
Configure DSPy caching system.
Args:
enable_disk_cache (bool | None): Enable disk cache (default: True)
enable_memory_cache (bool | None): Enable memory cache (default: True)
disk_cache_dir (str | None): Directory path for disk cache
disk_size_limit_bytes (int | None): Maximum disk cache size in bytes
memory_max_entries (int | None): Maximum number of memory cache entries
"""
pass
Usage:
import dspy
# Configure cache
dspy.configure_cache(
enable_disk_cache=True,
enable_memory_cache=True,
disk_cache_dir="/tmp/dspy_cache",
disk_size_limit_bytes=5_000_000_000, # 5GB
memory_max_entries=5000
)
# Disable caching entirely
dspy.configure_cache(
enable_disk_cache=False,
enable_memory_cache=False
)
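Caching can also be controlled per LM instance through the cache flag documented on the LM constructor, independently of the global settings above:
# Keep the global cache enabled, but skip caching for this particular LM
uncached_lm = dspy.LM('openai/gpt-4o-mini', cache=False)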
Control LiteLLM logging output.
def enable_litellm_logging():
"""Enable LiteLLM debug logging."""
pass
def disable_litellm_logging():
"""Disable LiteLLM debug logging."""
pass
Usage:
import dspy
# Disable verbose LiteLLM logs
dspy.disable_litellm_logging()
# Enable for debugging
dspy.enable_litellm_logging()
Use different models for different purposes:
import dspy
# Fast model for simple tasks
fast_lm = dspy.LM('openai/gpt-4o-mini', temperature=0.0)
# Powerful model for complex reasoning
strong_lm = dspy.LM('openai/gpt-4o', temperature=0.7)
# Configure default
dspy.configure(lm=fast_lm)
# Use strong model in specific context
with dspy.context(lm=strong_lm):
result = complex_module(input=data)
Try cheap model first, fall back to expensive:
import dspy
# Stronger fallback model referenced inside the module below
expensive_lm = dspy.LM('openai/gpt-4o')
class CascadingQA(dspy.Module):
def __init__(self):
super().__init__()
self.cheap_qa = dspy.Predict("question -> answer")
self.expensive_qa = dspy.ChainOfThought("question -> answer")
self.judge = dspy.Predict("question, answer -> confident: bool")
def forward(self, question):
# Try cheap model
cheap_result = self.cheap_qa(question=question)
# Check confidence
judgment = self.judge(
question=question,
answer=cheap_result.answer
)
if judgment.confident:
return cheap_result
else:
# Fall back to expensive model
with dspy.context(lm=expensive_lm):
return self.expensive_qa(question=question)
Complete finetuning workflow:
import dspy
# 1. Bootstrap high-quality training data
dspy.configure(lm=dspy.LM('openai/gpt-4o'))
program = MyModule()
optimizer = dspy.BootstrapFinetune(metric=my_metric)
compiled, finetune_data = optimizer.compile(program, trainset=trainset)
# 2. Finetune model
base_lm = dspy.LM(
'openai/gpt-4o-mini',
finetuning_model='gpt-4o-mini-2024-07-18'
)
job = base_lm.finetune(
train_data=finetune_data,
train_kwargs={"n_epochs": 3}
)
# 3. Wait for completion
finetuned_model_id = job.result(timeout=7200)
# 4. Use finetuned model
finetuned_lm = dspy.LM(finetuned_model_id)
dspy.configure(lm=finetuned_lm)
# 5. Evaluate improvement
evaluator = dspy.Evaluate(devset=test_set, metric=my_metric)
score = evaluator(program)
Use local models with vLLM:
import dspy
# Launch local model with vLLM
lm = dspy.LM(
'meta-llama/Llama-3.1-8B-Instruct',
launch_kwargs={
"gpu_memory_utilization": 0.9,
"max_model_len": 8192,
"tensor_parallel_size": 2
}
)
lm.launch() # Start vLLM server
# Use local model
dspy.configure(lm=lm)
result = my_program(input=data)
# Clean up
lm.kill()
Use embeddings for retrieval and similarity:
import dspy
# Create embedder
embedder = dspy.Embedder("openai/text-embedding-3-small")
# Embed documents
docs = ["Document 1 text", "Document 2 text", ...]
doc_embeddings = embedder(docs)
# Embed query
query = "search query"
query_embedding = embedder([query])
# Compute similarity
from numpy import dot
from numpy.linalg import norm
def cosine_similarity(a, b):
return dot(a, b) / (norm(a) * norm(b))
similarities = [
cosine_similarity(query_embedding[0], doc_emb)
for doc_emb in doc_embeddings
]
# Get top-k documents
top_k = sorted(enumerate(similarities), key=lambda x: -x[1])[:5]
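To complete the retrieval step, the indices in top_k map back into docs; a small illustrative continuation:
for idx, score in top_k:
    print(f"{score:.3f}  {docs[idx]}")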