# Caching

Memory and disk-based caching systems for model states, context, and computed results to improve inference performance and enable state persistence across sessions.

## Capabilities

### RAM Cache

In-memory caching for fast access to frequently used model states and computations.

```python { .api }
class LlamaRAMCache:
    def __init__(self, capacity_bytes: int = 2 << 30):
        """
        Initialize RAM-based cache.

        Args:
            capacity_bytes: Maximum cache size in bytes (default: 2GB)
        """

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get cached item by key."""

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item in cache."""

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in cache."""

    def __len__(self) -> int:
        """Get number of cached items."""

# Alias for backward compatibility
LlamaCache = LlamaRAMCache
```

### Disk Cache

Persistent disk-based caching for long-term storage of model states and precomputed results.

```python { .api }
class LlamaDiskCache:
    def __init__(self, cache_dir: str = ".cache/llama_cpp"):
        """
        Initialize disk-based cache.

        Args:
            cache_dir: Directory path for cache storage
        """

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get cached item from disk."""

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item to disk cache."""

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in disk cache."""

    def __len__(self) -> int:
        """Get number of cached items on disk."""
```

### Base Cache Interface

Abstract base class defining the caching interface for custom implementations.

```python { .api }
class BaseLlamaCache:
    """Abstract base class for cache implementations."""

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get item from cache."""
        raise NotImplementedError

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item in cache."""
        raise NotImplementedError

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in cache."""
        raise NotImplementedError

    def __len__(self) -> int:
        """Get number of cached items."""
        raise NotImplementedError
```

### Cache Integration

Set and manage caching for Llama model instances.

```python { .api }
# From Llama class
def set_cache(self, cache: Optional[BaseLlamaCache]) -> None:
    """
    Set caching implementation for the model.

    Args:
        cache: Cache instance (LlamaRAMCache, LlamaDiskCache, or custom)
            Use None to disable caching
    """
```

## Usage Examples

### Basic RAM Caching

```python
from llama_cpp import Llama, LlamaRAMCache

# Create RAM cache with 1GB capacity
cache = LlamaRAMCache(capacity_bytes=1 << 30)  # 1GB

# Initialize model with cache
llm = Llama(
    model_path="./models/llama-2-7b.gguf",
    n_ctx=2048,
)
llm.set_cache(cache)

# First completion (uncached)
response1 = llm.create_completion(
    prompt="The capital of France is",
    max_tokens=10,
)

# Second identical completion (cached, faster)
response2 = llm.create_completion(
    prompt="The capital of France is",
    max_tokens=10,
)

print(f"Cache size: {len(cache)} items")
```

### Persistent Disk Caching

```python
from llama_cpp import Llama, LlamaDiskCache

# Create disk cache in custom directory
cache = LlamaDiskCache(cache_dir="./my_llama_cache")

llm = Llama(model_path="./models/llama-2-7b.gguf")
llm.set_cache(cache)

# Generate text with caching
for i in range(3):
    response = llm.create_completion(
        prompt=f"Write a fact about number {i}:",
        max_tokens=50,
    )
    print(f"Response {i}: {response['choices'][0]['text']}")

# Cache persists across program restarts
print(f"Disk cache contains {len(cache)} items")
```

158

159

### Cache Management

160

161

```python

162

from llama_cpp import Llama, LlamaRAMCache

163

164

# Initialize with monitoring

165

cache = LlamaRAMCache(capacity_bytes=512 << 20) # 512MB

166

llm = Llama(model_path="./models/llama-2-7b.gguf")

167

llm.set_cache(cache)

168

169

prompts = [

170

"What is machine learning?",

171

"Explain neural networks.",

172

"What is deep learning?",

173

"Define artificial intelligence.",

174

"What is machine learning?", # Duplicate for cache hit

175

]

176

177

cache_stats = {"hits": 0, "misses": 0}

178

179

for i, prompt in enumerate(prompts):

180

initial_size = len(cache)

181

182

response = llm.create_completion(

183

prompt=prompt,

184

max_tokens=30,

185

)

186

187

final_size = len(cache)

188

189

if final_size > initial_size:

190

cache_stats["misses"] += 1

191

print(f"Prompt {i+1}: CACHE MISS - New cache size: {final_size}")

192

else:

193

cache_stats["hits"] += 1

194

print(f"Prompt {i+1}: CACHE HIT - Cache size: {final_size}")

195

196

print(f"Cache statistics: {cache_stats}")

197

```

### Custom Cache Implementation

```python
from llama_cpp.llama_cache import BaseLlamaCache
import json
import hashlib
from pathlib import Path

class JSONDiskCache(BaseLlamaCache):
    """Custom cache using JSON files for storage."""

    def __init__(self, cache_dir: str = ".json_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

    def _key_to_filename(self, key: Tuple[int, ...]) -> str:
        """Convert cache key to filename."""
        key_str = str(key)
        key_hash = hashlib.md5(key_str.encode()).hexdigest()
        return f"{key_hash}.json"

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        file_path = self.cache_dir / self._key_to_filename(key)
        if file_path.exists():
            with open(file_path, 'r') as f:
                return json.load(f)
        return None

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        file_path = self.cache_dir / self._key_to_filename(key)
        with open(file_path, 'w') as f:
            json.dump(value, f)

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        file_path = self.cache_dir / self._key_to_filename(key)
        return file_path.exists()

    def __len__(self) -> int:
        return len(list(self.cache_dir.glob("*.json")))

# Use custom cache
custom_cache = JSONDiskCache("./custom_cache")
llm = Llama(model_path="./models/model.gguf")
llm.set_cache(custom_cache)
```

### Cache Performance Testing

```python
import time
from llama_cpp import Llama, LlamaRAMCache

# Test without cache
llm_no_cache = Llama(model_path="./models/llama-2-7b.gguf")
llm_no_cache.set_cache(None)  # Disable caching

# Test with cache
llm_with_cache = Llama(model_path="./models/llama-2-7b.gguf")
llm_with_cache.set_cache(LlamaRAMCache())

test_prompt = "Explain the concept of recursion in programming"

def time_completion(llm, label):
    start_time = time.time()
    response = llm.create_completion(
        prompt=test_prompt,
        max_tokens=100,
        temperature=0.7,
    )
    end_time = time.time()
    print(f"{label}: {end_time - start_time:.2f} seconds")
    return response

# First run (both will be similar - no cache benefit yet)
print("First run:")
time_completion(llm_no_cache, "No cache")
time_completion(llm_with_cache, "With cache")

print("\nSecond run (same prompt):")
# Second run (cached version should be faster)
time_completion(llm_no_cache, "No cache")
time_completion(llm_with_cache, "With cache (should be faster)")
```

### Memory Usage Monitoring

```python
import psutil
import os
from llama_cpp import Llama, LlamaRAMCache

def get_memory_usage():
    """Get current process memory usage in MB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024

# Monitor memory with different cache sizes
cache_sizes = [64 << 20, 256 << 20, 1 << 30]  # 64MB, 256MB, 1GB

for cache_size in cache_sizes:
    print(f"\nTesting cache size: {cache_size // (1024*1024)}MB")

    initial_memory = get_memory_usage()

    cache = LlamaRAMCache(capacity_bytes=cache_size)
    llm = Llama(model_path="./models/llama-2-7b.gguf")
    llm.set_cache(cache)

    # Generate several completions
    for i in range(10):
        llm.create_completion(
            prompt=f"Write about topic number {i}:",
            max_tokens=50,
        )

    final_memory = get_memory_usage()
    memory_increase = final_memory - initial_memory

    print(f"Memory increase: {memory_increase:.1f}MB")
    print(f"Cache items: {len(cache)}")
```

### Cache Cleanup and Maintenance

```python
from llama_cpp import LlamaDiskCache
import os
import time

# Create disk cache
cache = LlamaDiskCache(cache_dir="./temp_cache")

# Use cache
llm = Llama(model_path="./models/model.gguf")
llm.set_cache(cache)

# Generate some cached content
for i in range(5):
    llm.create_completion(
        prompt=f"Example prompt {i}",
        max_tokens=20,
    )

print(f"Cache directory size: {len(cache)} items")

# Manual cache cleanup
cache_dir = cache.cache_dir
if os.path.exists(cache_dir):
    # Get cache directory size
    total_size = sum(
        os.path.getsize(os.path.join(cache_dir, f))
        for f in os.listdir(cache_dir)
    )
    print(f"Cache directory size: {total_size / 1024 / 1024:.2f}MB")

    # Clean up old files (example: older than 1 hour)
    current_time = time.time()
    for filename in os.listdir(cache_dir):
        file_path = os.path.join(cache_dir, filename)
        if os.path.getmtime(file_path) < current_time - 3600:  # 1 hour
            os.remove(file_path)
            print(f"Removed old cache file: {filename}")
```