CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-llama-cpp-python

Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.

Pending
Overview
Eval results
Files

docs/caching.md

Caching

Memory and disk-based caching systems for model states, context, and computed results to improve inference performance and enable state persistence across sessions.

Capabilities

RAM Cache

In-memory caching for fast access to frequently used model states and computations.

class LlamaRAMCache:
    """In-memory (RAM) cache for llama.cpp model state.

    Keys are tuples of ints -- presumably the token ids of an evaluated
    prompt prefix (confirm against the llama_cpp implementation); values
    are opaque cached state objects. Entries beyond ``capacity_bytes``
    are evicted (eviction policy not shown here).
    """

    def __init__(self, capacity_bytes: int = 2 << 30):
        """
        Initialize RAM-based cache.
        
        Args:
            capacity_bytes: Maximum cache size in bytes (default: 2GB)
        """

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get cached item by key."""

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item in cache."""

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in cache."""

    def __len__(self) -> int:
        """Get number of cached items."""

# Alias for backward compatibility with older llama-cpp-python releases
LlamaCache = LlamaRAMCache

Disk Cache

Persistent disk-based caching for long-term storage of model states and precomputed results.

class LlamaDiskCache:
    """Persistent disk-backed cache for llama.cpp model state.

    Same mapping interface as LlamaRAMCache, but entries are stored
    under ``cache_dir`` and survive process restarts.
    """

    def __init__(self, cache_dir: str = ".cache/llama_cpp"):
        """
        Initialize disk-based cache.
        
        Args:
            cache_dir: Directory path for cache storage
        """

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get cached item from disk."""

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item to disk cache."""

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in disk cache."""

    def __len__(self) -> int:
        """Get number of cached items on disk."""

Base Cache Interface

Abstract base class defining the caching interface for custom implementations.

class BaseLlamaCache:
    """Interface that every llama.cpp cache backend must implement.

    Concrete subclasses (RAM, disk, or custom) provide dict-like access
    keyed by tuples of ints. Each method below raises
    ``NotImplementedError`` until overridden.
    """

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Look up *key*; subclasses return the cached value."""
        raise NotImplementedError

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Associate *value* with *key* in the cache."""
        raise NotImplementedError

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Report whether *key* has a cached entry."""
        raise NotImplementedError

    def __len__(self) -> int:
        """Report how many entries the cache currently holds."""
        raise NotImplementedError

Cache Integration

Set and manage caching for Llama model instances.

# From Llama class
def set_cache(self, cache: Optional[BaseLlamaCache]) -> None:
    """
    Set caching implementation for the model.

    Replaces any previously configured cache on this Llama instance.
    
    Args:
        cache: Cache instance (LlamaRAMCache, LlamaDiskCache, or custom
               BaseLlamaCache subclass).
               Use None to disable caching
    """

Usage Examples

Basic RAM Caching

from llama_cpp import Llama, LlamaRAMCache

# Create RAM cache with 1GB capacity
cache = LlamaRAMCache(capacity_bytes=1 << 30)  # 1GB

# Initialize model with cache
llm = Llama(
    model_path="./models/llama-2-7b.gguf",
    n_ctx=2048,
)
llm.set_cache(cache)

# First completion (uncached)
response1 = llm.create_completion(
    prompt="The capital of France is",
    max_tokens=10,
)

# Second identical completion (cached, faster)
# NOTE(review): the speedup presumably comes from reusing the evaluated
# prompt state keyed on the tokenized prefix -- confirm against the
# llama_cpp cache implementation.
response2 = llm.create_completion(
    prompt="The capital of France is",
    max_tokens=10,
)

print(f"Cache size: {len(cache)} items")

Persistent Disk Caching

from llama_cpp import Llama, LlamaDiskCache

# Create disk cache in custom directory
cache = LlamaDiskCache(cache_dir="./my_llama_cache")

llm = Llama(model_path="./models/llama-2-7b.gguf")
llm.set_cache(cache)

# Generate text with caching; each distinct prompt adds a cache entry
for i in range(3):
    response = llm.create_completion(
        prompt=f"Write a fact about number {i}:",
        max_tokens=50,
    )
    print(f"Response {i}: {response['choices'][0]['text']}")

# Cache persists across program restarts
print(f"Disk cache contains {len(cache)} items")

Cache Management

from llama_cpp import Llama, LlamaRAMCache

# Initialize with monitoring
cache = LlamaRAMCache(capacity_bytes=512 << 20)  # 512MB
llm = Llama(model_path="./models/llama-2-7b.gguf")
llm.set_cache(cache)

prompts = [
    "What is machine learning?",
    "Explain neural networks.",
    "What is deep learning?",
    "Define artificial intelligence.",
    "What is machine learning?",  # Duplicate for cache hit
]

cache_stats = {"hits": 0, "misses": 0}

for i, prompt in enumerate(prompts):
    initial_size = len(cache)
    
    response = llm.create_completion(
        prompt=prompt,
        max_tokens=30,
    )
    
    final_size = len(cache)
    
    # NOTE(review): hit/miss is inferred from cache growth. This
    # heuristic assumes no eviction happens mid-run; under memory
    # pressure a miss could evict an entry and be misreported as a hit.
    if final_size > initial_size:
        cache_stats["misses"] += 1
        print(f"Prompt {i+1}: CACHE MISS - New cache size: {final_size}")
    else:
        cache_stats["hits"] += 1
        print(f"Prompt {i+1}: CACHE HIT - Cache size: {final_size}")

print(f"Cache statistics: {cache_stats}")

Custom Cache Implementation

from llama_cpp import Llama
from llama_cpp.llama_cache import BaseLlamaCache
import json
import hashlib
from pathlib import Path
from typing import Optional, Tuple  # required: annotations below are evaluated at definition time

class JSONDiskCache(BaseLlamaCache):
    """Custom cache using JSON files for storage.

    Each key is hashed to a filename; values must be JSON-serializable.
    """
    
    def __init__(self, cache_dir: str = ".json_cache"):
        self.cache_dir = Path(cache_dir)
        # parents=True so nested cache paths (e.g. "runs/exp1/.json_cache") work.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
    
    def _key_to_filename(self, key: Tuple[int, ...]) -> str:
        """Convert cache key to a stable, filesystem-safe filename."""
        key_str = str(key)
        key_hash = hashlib.md5(key_str.encode()).hexdigest()
        return f"{key_hash}.json"
    
    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Return the cached value for *key*, or None when absent."""
        file_path = self.cache_dir / self._key_to_filename(key)
        if file_path.exists():
            with open(file_path, 'r') as f:
                return json.load(f)
        return None
    
    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Serialize *value* to the JSON file derived from *key*."""
        file_path = self.cache_dir / self._key_to_filename(key)
        with open(file_path, 'w') as f:
            json.dump(value, f)
    
    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """True when a JSON file exists for *key*."""
        file_path = self.cache_dir / self._key_to_filename(key)
        return file_path.exists()
    
    def __len__(self) -> int:
        """Count JSON files currently stored in the cache directory."""
        return len(list(self.cache_dir.glob("*.json")))

# Use custom cache
custom_cache = JSONDiskCache("./custom_cache")
llm = Llama(model_path="./models/model.gguf")
llm.set_cache(custom_cache)

Cache Performance Testing

import time
from llama_cpp import Llama, LlamaRAMCache

# Baseline: identical model with caching disabled
llm_no_cache = Llama(model_path="./models/llama-2-7b.gguf")
llm_no_cache.set_cache(None)  # Disable caching

# Candidate: same model with a RAM cache attached
llm_with_cache = Llama(model_path="./models/llama-2-7b.gguf")
llm_with_cache.set_cache(LlamaRAMCache())

test_prompt = "Explain the concept of recursion in programming"

def time_completion(llm, label):
    """Run one completion on *llm*, print its duration, return the response."""
    # perf_counter() is monotonic and high-resolution; time.time() can
    # jump backward/forward if the system clock is adjusted mid-benchmark.
    start_time = time.perf_counter()
    response = llm.create_completion(
        prompt=test_prompt,
        max_tokens=100,
        temperature=0.7,
    )
    end_time = time.perf_counter()
    print(f"{label}: {end_time - start_time:.2f} seconds")
    return response

# First run (both will be similar - no cache benefit yet)
print("First run:")
time_completion(llm_no_cache, "No cache")
time_completion(llm_with_cache, "With cache")

print("\nSecond run (same prompt):")
# Second run (cached version should be faster)
time_completion(llm_no_cache, "No cache")
time_completion(llm_with_cache, "With cache (should be faster)")

Memory Usage Monitoring

import psutil
import os
from llama_cpp import Llama, LlamaRAMCache

def get_memory_usage():
    """Get current process memory usage (RSS) in MB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024

# Monitor memory with different cache sizes
cache_sizes = [64 << 20, 256 << 20, 1 << 30]  # 64MB, 256MB, 1GB

for cache_size in cache_sizes:
    print(f"\nTesting cache size: {cache_size // (1024*1024)}MB")
    
    initial_memory = get_memory_usage()
    
    cache = LlamaRAMCache(capacity_bytes=cache_size)
    llm = Llama(model_path="./models/llama-2-7b.gguf")
    llm.set_cache(cache)
    
    # Generate several completions; each distinct prompt adds cache entries
    for i in range(10):
        llm.create_completion(
            prompt=f"Write about topic number {i}:",
            max_tokens=50,
        )
    
    final_memory = get_memory_usage()
    # NOTE(review): the delta includes model weights loaded by Llama()
    # inside this loop iteration, not just cache growth -- interpret
    # the number accordingly.
    memory_increase = final_memory - initial_memory
    
    print(f"Memory increase: {memory_increase:.1f}MB")
    print(f"Cache items: {len(cache)}")

Cache Cleanup and Maintenance

from llama_cpp import Llama, LlamaDiskCache  # Llama was used below but missing from the import
import os
import time

# Create disk cache
cache = LlamaDiskCache(cache_dir="./temp_cache")

# Use cache
llm = Llama(model_path="./models/model.gguf")
llm.set_cache(cache)

# Generate some cached content
for i in range(5):
    llm.create_completion(
        prompt=f"Example prompt {i}",
        max_tokens=20,
    )

print(f"Cache contains {len(cache)} items")

# Manual cache cleanup
cache_dir = cache.cache_dir
if os.path.exists(cache_dir):
    # Total on-disk footprint of the cache files
    total_size = sum(
        os.path.getsize(os.path.join(cache_dir, f)) 
        for f in os.listdir(cache_dir)
    )
    print(f"Cache directory size: {total_size / 1024 / 1024:.2f}MB")
    
    # Clean up old files (example: older than 1 hour)
    current_time = time.time()
    for filename in os.listdir(cache_dir):
        file_path = os.path.join(cache_dir, filename)
        if os.path.getmtime(file_path) < current_time - 3600:  # 1 hour
            os.remove(file_path)
            print(f"Removed old cache file: {filename}")

Install with Tessl CLI

npx tessl i tessl/pypi-llama-cpp-python

docs

caching.md

chat-completion.md

grammar.md

index.md

llama-model.md

low-level.md

server.md

tokenization.md

vision.md

tile.json