Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.
---
Memory and disk-based caching systems for model states, context, and computed results to improve inference performance and enable state persistence across sessions.
In-memory caching for fast access to frequently used model states and computations.
from typing import Optional, Tuple


class LlamaRAMCache:
    """In-memory cache for fast access to frequently used model states."""

    def __init__(self, capacity_bytes: int = 2 << 30):
        """
        Initialize RAM-based cache.

        Args:
            capacity_bytes: Maximum cache size in bytes (default: 2GB)
        """

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get cached item by key."""

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item in cache."""

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in cache."""

    def __len__(self) -> int:
        """Get number of cached items."""


# Alias for backward compatibility
LlamaCache = LlamaRAMCache

# Persistent disk-based caching for long-term storage of model states and
# precomputed results.
from typing import Optional, Tuple


class LlamaDiskCache:
    """Disk-based cache for long-term storage of model states."""

    def __init__(self, cache_dir: str = ".cache/llama_cpp"):
        """
        Initialize disk-based cache.

        Args:
            cache_dir: Directory path for cache storage
        """

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get cached item from disk."""

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item to disk cache."""

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in disk cache."""

    def __len__(self) -> int:
        """Get number of cached items on disk."""


# Abstract base class defining the caching interface for custom
# implementations.
from typing import Optional, Tuple


class BaseLlamaCache:
    """Abstract base class for cache implementations."""

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get item from cache."""
        raise NotImplementedError

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item in cache."""
        raise NotImplementedError

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in cache."""
        raise NotImplementedError

    def __len__(self) -> int:
        """Get number of cached items."""
        raise NotImplementedError


# Set and manage caching for Llama model instances.
from typing import Optional

# From Llama class
def set_cache(self, cache: Optional[BaseLlamaCache]) -> None:
    """
    Set caching implementation for the model.

    Args:
        cache: Cache instance (LlamaRAMCache, LlamaDiskCache, or custom)
            Use None to disable caching
    """

from llama_cpp import Llama, LlamaRAMCache
# Create RAM cache with 1GB capacity
cache = LlamaRAMCache(capacity_bytes=1 << 30)  # 1GB

# Initialize model with cache
llm = Llama(
    model_path="./models/llama-2-7b.gguf",
    n_ctx=2048,
)
llm.set_cache(cache)

# First completion (uncached)
response1 = llm.create_completion(
    prompt="The capital of France is",
    max_tokens=10,
)

# Second identical completion (cached, faster)
response2 = llm.create_completion(
    prompt="The capital of France is",
    max_tokens=10,
)

print(f"Cache size: {len(cache)} items")

from llama_cpp import Llama, LlamaDiskCache
# Create disk cache in custom directory
cache = LlamaDiskCache(cache_dir="./my_llama_cache")

llm = Llama(model_path="./models/llama-2-7b.gguf")
llm.set_cache(cache)

# Generate text with caching
for i in range(3):
    response = llm.create_completion(
        prompt=f"Write a fact about number {i}:",
        max_tokens=50,
    )
    print(f"Response {i}: {response['choices'][0]['text']}")

# Cache persists across program restarts
print(f"Disk cache contains {len(cache)} items")

from llama_cpp import Llama, LlamaRAMCache
# Initialize with monitoring
cache = LlamaRAMCache(capacity_bytes=512 << 20)  # 512MB
llm = Llama(model_path="./models/llama-2-7b.gguf")
llm.set_cache(cache)

prompts = [
    "What is machine learning?",
    "Explain neural networks.",
    "What is deep learning?",
    "Define artificial intelligence.",
    "What is machine learning?",  # Duplicate for cache hit
]

cache_stats = {"hits": 0, "misses": 0}
for i, prompt in enumerate(prompts):
    # A growing cache after a completion means this prompt was a miss.
    initial_size = len(cache)
    response = llm.create_completion(
        prompt=prompt,
        max_tokens=30,
    )
    final_size = len(cache)
    if final_size > initial_size:
        cache_stats["misses"] += 1
        print(f"Prompt {i+1}: CACHE MISS - New cache size: {final_size}")
    else:
        cache_stats["hits"] += 1
        print(f"Prompt {i+1}: CACHE HIT - Cache size: {final_size}")

print(f"Cache statistics: {cache_stats}")

from llama_cpp.llama_cache import BaseLlamaCache
import json
import hashlib
from pathlib import Path
from typing import Optional, Tuple


class JSONDiskCache(BaseLlamaCache):
    """Custom cache using JSON files for storage."""

    def __init__(self, cache_dir: str = ".json_cache"):
        self.cache_dir = Path(cache_dir)
        # parents=True so nested cache paths (e.g. "out/run1/cache") work.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _key_to_filename(self, key: Tuple[int, ...]) -> str:
        """Convert cache key to a stable filename."""
        key_str = str(key)
        # MD5 is used only to derive a filename, not for security.
        key_hash = hashlib.md5(key_str.encode()).hexdigest()
        return f"{key_hash}.json"

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Load a cached value from disk; None when the key is absent."""
        file_path = self.cache_dir / self._key_to_filename(key)
        if file_path.exists():
            with open(file_path, 'r') as f:
                return json.load(f)
        return None

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Serialize value to a JSON file keyed by the hashed cache key."""
        file_path = self.cache_dir / self._key_to_filename(key)
        with open(file_path, 'w') as f:
            json.dump(value, f)

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        file_path = self.cache_dir / self._key_to_filename(key)
        return file_path.exists()

    def __len__(self) -> int:
        return len(list(self.cache_dir.glob("*.json")))


# Use custom cache
custom_cache = JSONDiskCache("./custom_cache")
llm = Llama(model_path="./models/model.gguf")
llm.set_cache(custom_cache)

import time
from llama_cpp import Llama, LlamaRAMCache

# Test without cache
llm_no_cache = Llama(model_path="./models/llama-2-7b.gguf")
llm_no_cache.set_cache(None)  # Disable caching

# Test with cache
llm_with_cache = Llama(model_path="./models/llama-2-7b.gguf")
llm_with_cache.set_cache(LlamaRAMCache())

test_prompt = "Explain the concept of recursion in programming"


def time_completion(llm, label):
    """Run one completion on llm and print how long it took."""
    # perf_counter is the monotonic clock intended for interval timing;
    # time.time() can jump with system clock adjustments.
    start_time = time.perf_counter()
    response = llm.create_completion(
        prompt=test_prompt,
        max_tokens=100,
        temperature=0.7,
    )
    end_time = time.perf_counter()
    print(f"{label}: {end_time - start_time:.2f} seconds")
    return response


# First run (both will be similar - no cache benefit yet)
print("First run:")
time_completion(llm_no_cache, "No cache")
time_completion(llm_with_cache, "With cache")

print("\nSecond run (same prompt):")
# Second run (cached version should be faster)
time_completion(llm_no_cache, "No cache")
time_completion(llm_with_cache, "With cache (should be faster)")

import psutil
import os
from llama_cpp import Llama, LlamaRAMCache


def get_memory_usage():
    """Get current process memory usage in MB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024


# Monitor memory with different cache sizes
cache_sizes = [64 << 20, 256 << 20, 1 << 30]  # 64MB, 256MB, 1GB

for cache_size in cache_sizes:
    print(f"\nTesting cache size: {cache_size // (1024*1024)}MB")
    initial_memory = get_memory_usage()

    cache = LlamaRAMCache(capacity_bytes=cache_size)
    llm = Llama(model_path="./models/llama-2-7b.gguf")
    llm.set_cache(cache)

    # Generate several completions
    for i in range(10):
        llm.create_completion(
            prompt=f"Write about topic number {i}:",
            max_tokens=50,
        )

    final_memory = get_memory_usage()
    memory_increase = final_memory - initial_memory
    print(f"Memory increase: {memory_increase:.1f}MB")
    print(f"Cache items: {len(cache)}")

from llama_cpp import LlamaDiskCache
import os
import time
from llama_cpp import Llama  # needed by this snippet alongside LlamaDiskCache

# Create disk cache
cache = LlamaDiskCache(cache_dir="./temp_cache")

# Use cache
llm = Llama(model_path="./models/model.gguf")
llm.set_cache(cache)

# Generate some cached content
for i in range(5):
    llm.create_completion(
        prompt=f"Example prompt {i}",
        max_tokens=20,
    )

print(f"Cache item count: {len(cache)} items")

# Manual cache cleanup
cache_dir = cache.cache_dir
if os.path.exists(cache_dir):
    # Get cache directory size
    total_size = sum(
        os.path.getsize(os.path.join(cache_dir, f))
        for f in os.listdir(cache_dir)
    )
    print(f"Cache directory size: {total_size / 1024 / 1024:.2f}MB")

    # Clean up old files (example: older than 1 hour)
    current_time = time.time()
    for filename in os.listdir(cache_dir):
        file_path = os.path.join(cache_dir, filename)
        if os.path.getmtime(file_path) < current_time - 3600:  # 1 hour
            os.remove(file_path)
            print(f"Removed old cache file: {filename}")

# Install with Tessl CLI
npx tessl i tessl/pypi-llama-cpp-python