# Caching

Memory and disk-based caching systems for model states, context, and computed results to improve inference performance and enable state persistence across sessions.

## Capabilities

### RAM Cache

In-memory caching for fast access to frequently used model states and computations.

```python { .api }
class LlamaRAMCache:
    """In-memory LRU-style cache for model states."""

    def __init__(self, capacity_bytes: int = 2 << 30):
        """
        Initialize RAM-based cache.

        Args:
            capacity_bytes: Maximum cache size in bytes (default: 2GB)
        """

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get cached item by key."""

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item in cache."""

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in cache."""

    def __len__(self) -> int:
        """Get number of cached items."""

# Alias for backward compatibility
LlamaCache = LlamaRAMCache
```

### Disk Cache

Persistent disk-based caching for long-term storage of model states and precomputed results.

```python { .api }
class LlamaDiskCache:
    """Persistent on-disk cache for model states."""

    def __init__(self, cache_dir: str = ".cache/llama_cpp"):
        """
        Initialize disk-based cache.

        Args:
            cache_dir: Directory path for cache storage
        """

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get cached item from disk."""

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item to disk cache."""

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in disk cache."""

    def __len__(self) -> int:
        """Get number of cached items on disk."""
```

### Base Cache Interface

Abstract base class defining the caching interface for custom implementations.

```python { .api }
class BaseLlamaCache:
    """Abstract base class for cache implementations."""

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get item from cache."""
        raise NotImplementedError

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item in cache."""
        raise NotImplementedError

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in cache."""
        raise NotImplementedError

    def __len__(self) -> int:
        """Get number of cached items."""
        raise NotImplementedError
```

### Cache Integration

Set and manage caching for Llama model instances.

```python { .api }
# From Llama class
def set_cache(self, cache: Optional[BaseLlamaCache]) -> None:
    """
    Set caching implementation for the model.

    Args:
        cache: Cache instance (LlamaRAMCache, LlamaDiskCache, or custom)
               Use None to disable caching
    """
```

## Usage Examples

### Basic RAM Caching

```python
from llama_cpp import Llama, LlamaRAMCache

# Create RAM cache with 1GB capacity
cache = LlamaRAMCache(capacity_bytes=1 << 30)  # 1GB

# Initialize model with cache
llm = Llama(
    model_path="./models/llama-2-7b.gguf",
    n_ctx=2048,
)
llm.set_cache(cache)

# First completion (uncached)
response1 = llm.create_completion(
    prompt="The capital of France is",
    max_tokens=10,
)

# Second identical completion (cached, faster)
response2 = llm.create_completion(
    prompt="The capital of France is",
    max_tokens=10,
)

print(f"Cache size: {len(cache)} items")
```

### Persistent Disk Caching

```python
from llama_cpp import Llama, LlamaDiskCache

# Create disk cache in custom directory
cache = LlamaDiskCache(cache_dir="./my_llama_cache")

llm = Llama(model_path="./models/llama-2-7b.gguf")
llm.set_cache(cache)

# Generate text with caching
for i in range(3):
    response = llm.create_completion(
        prompt=f"Write a fact about number {i}:",
        max_tokens=50,
    )
    print(f"Response {i}: {response['choices'][0]['text']}")

# Cache persists across program restarts
print(f"Disk cache contains {len(cache)} items")
```

### Cache Management

```python
from llama_cpp import Llama, LlamaRAMCache

# Initialize with monitoring
cache = LlamaRAMCache(capacity_bytes=512 << 20)  # 512MB
llm = Llama(model_path="./models/llama-2-7b.gguf")
llm.set_cache(cache)

prompts = [
    "What is machine learning?",
    "Explain neural networks.",
    "What is deep learning?",
    "Define artificial intelligence.",
    "What is machine learning?",  # Duplicate for cache hit
]

# Infer hits/misses from cache growth: a miss adds a new entry, a hit does not
cache_stats = {"hits": 0, "misses": 0}

for i, prompt in enumerate(prompts):
    initial_size = len(cache)

    response = llm.create_completion(
        prompt=prompt,
        max_tokens=30,
    )

    final_size = len(cache)

    if final_size > initial_size:
        cache_stats["misses"] += 1
        print(f"Prompt {i+1}: CACHE MISS - New cache size: {final_size}")
    else:
        cache_stats["hits"] += 1
        print(f"Prompt {i+1}: CACHE HIT - Cache size: {final_size}")

print(f"Cache statistics: {cache_stats}")
```

### Custom Cache Implementation

```python
from llama_cpp import Llama
from llama_cpp.llama_cache import BaseLlamaCache
import json
import hashlib
from pathlib import Path
from typing import Optional, Tuple

class JSONDiskCache(BaseLlamaCache):
    """Custom cache using JSON files for storage."""

    def __init__(self, cache_dir: str = ".json_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

    def _key_to_filename(self, key: Tuple[int, ...]) -> str:
        """Convert cache key to filename."""
        key_str = str(key)
        # md5 is used only to derive a stable filename, not for security
        key_hash = hashlib.md5(key_str.encode()).hexdigest()
        return f"{key_hash}.json"

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        file_path = self.cache_dir / self._key_to_filename(key)
        if file_path.exists():
            with open(file_path, 'r') as f:
                return json.load(f)
        return None

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        file_path = self.cache_dir / self._key_to_filename(key)
        with open(file_path, 'w') as f:
            json.dump(value, f)

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        file_path = self.cache_dir / self._key_to_filename(key)
        return file_path.exists()

    def __len__(self) -> int:
        return len(list(self.cache_dir.glob("*.json")))

# Use custom cache
custom_cache = JSONDiskCache("./custom_cache")
llm = Llama(model_path="./models/model.gguf")
llm.set_cache(custom_cache)
```

### Cache Performance Testing

```python
import time
from llama_cpp import Llama, LlamaRAMCache

# Test without cache
llm_no_cache = Llama(model_path="./models/llama-2-7b.gguf")
llm_no_cache.set_cache(None)  # Disable caching

# Test with cache
llm_with_cache = Llama(model_path="./models/llama-2-7b.gguf")
llm_with_cache.set_cache(LlamaRAMCache())

test_prompt = "Explain the concept of recursion in programming"

def time_completion(llm, label):
    """Run one completion with the shared test prompt and print elapsed time."""
    start_time = time.time()
    response = llm.create_completion(
        prompt=test_prompt,
        max_tokens=100,
        temperature=0.7,
    )
    end_time = time.time()
    print(f"{label}: {end_time - start_time:.2f} seconds")
    return response

# First run (both will be similar - no cache benefit yet)
print("First run:")
time_completion(llm_no_cache, "No cache")
time_completion(llm_with_cache, "With cache")

print("\nSecond run (same prompt):")
# Second run (cached version should be faster)
time_completion(llm_no_cache, "No cache")
time_completion(llm_with_cache, "With cache (should be faster)")
```

### Memory Usage Monitoring

```python
import psutil
import os
from llama_cpp import Llama, LlamaRAMCache

def get_memory_usage():
    """Get current process memory usage in MB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024

# Monitor memory with different cache sizes
cache_sizes = [64 << 20, 256 << 20, 1 << 30]  # 64MB, 256MB, 1GB

for cache_size in cache_sizes:
    print(f"\nTesting cache size: {cache_size // (1024*1024)}MB")

    initial_memory = get_memory_usage()

    cache = LlamaRAMCache(capacity_bytes=cache_size)
    llm = Llama(model_path="./models/llama-2-7b.gguf")
    llm.set_cache(cache)

    # Generate several completions
    for i in range(10):
        llm.create_completion(
            prompt=f"Write about topic number {i}:",
            max_tokens=50,
        )

    final_memory = get_memory_usage()
    memory_increase = final_memory - initial_memory

    print(f"Memory increase: {memory_increase:.1f}MB")
    print(f"Cache items: {len(cache)}")
```

### Cache Cleanup and Maintenance

```python
from llama_cpp import Llama, LlamaDiskCache
import os
import time

# Create disk cache
cache = LlamaDiskCache(cache_dir="./temp_cache")

# Use cache
llm = Llama(model_path="./models/model.gguf")
llm.set_cache(cache)

# Generate some cached content
for i in range(5):
    llm.create_completion(
        prompt=f"Example prompt {i}",
        max_tokens=20,
    )

print(f"Cache directory size: {len(cache)} items")

# Manual cache cleanup
cache_dir = cache.cache_dir
if os.path.exists(cache_dir):
    # Get cache directory size
    total_size = sum(
        os.path.getsize(os.path.join(cache_dir, f))
        for f in os.listdir(cache_dir)
    )
    print(f"Cache directory size: {total_size / 1024 / 1024:.2f}MB")

    # Clean up old files (example: older than 1 hour)
    current_time = time.time()
    for filename in os.listdir(cache_dir):
        file_path = os.path.join(cache_dir, filename)
        if os.path.getmtime(file_path) < current_time - 3600:  # 1 hour
            os.remove(file_path)
            print(f"Removed old cache file: {filename}")
```