# Caching

Memory and disk-based caching systems for model states, context, and computed results to improve inference performance and enable state persistence across sessions.

## Capabilities

### RAM Cache

In-memory caching for fast access to frequently used model states and computations.

```python { .api }
class LlamaRAMCache:
    def __init__(self, capacity_bytes: int = 2 << 30):
        """
        Initialize RAM-based cache.

        Args:
            capacity_bytes: Maximum cache size in bytes (default: 2GB)
        """

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get cached item by key."""

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item in cache."""

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in cache."""

    def __len__(self) -> int:
        """Get number of cached items."""

# Alias for backward compatibility
LlamaCache = LlamaRAMCache
```

### Disk Cache

Persistent disk-based caching for long-term storage of model states and precomputed results.

```python { .api }
class LlamaDiskCache:
    def __init__(self, cache_dir: str = ".cache/llama_cpp"):
        """
        Initialize disk-based cache.

        Args:
            cache_dir: Directory path for cache storage
        """

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get cached item from disk."""

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item to disk cache."""

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in disk cache."""

    def __len__(self) -> int:
        """Get number of cached items on disk."""
```

### Base Cache Interface

Abstract base class defining the caching interface for custom implementations.

```python { .api }
class BaseLlamaCache:
    """Abstract base class for cache implementations."""

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        """Get item from cache."""
        raise NotImplementedError

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        """Store item in cache."""
        raise NotImplementedError

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        """Check if key exists in cache."""
        raise NotImplementedError

    def __len__(self) -> int:
        """Get number of cached items."""
        raise NotImplementedError
```

### Cache Integration

Set and manage caching for Llama model instances.

```python { .api }
# From Llama class
def set_cache(self, cache: Optional[BaseLlamaCache]) -> None:
    """
    Set caching implementation for the model.

    Args:
        cache: Cache instance (LlamaRAMCache, LlamaDiskCache, or custom)
            Use None to disable caching
    """
```

## Usage Examples

### Basic RAM Caching

```python
from llama_cpp import Llama, LlamaRAMCache

# Create RAM cache with 1GB capacity
cache = LlamaRAMCache(capacity_bytes=1 << 30)  # 1GB

# Initialize model with cache
llm = Llama(
    model_path="./models/llama-2-7b.gguf",
    n_ctx=2048,
)
llm.set_cache(cache)

# First completion (uncached)
response1 = llm.create_completion(
    prompt="The capital of France is",
    max_tokens=10,
)

# Second identical completion (cached, faster)
response2 = llm.create_completion(
    prompt="The capital of France is",
    max_tokens=10,
)

print(f"Cache size: {len(cache)} items")
```

### Persistent Disk Caching

```python
from llama_cpp import Llama, LlamaDiskCache

# Create disk cache in custom directory
cache = LlamaDiskCache(cache_dir="./my_llama_cache")

llm = Llama(model_path="./models/llama-2-7b.gguf")
llm.set_cache(cache)

# Generate text with caching
for i in range(3):
    response = llm.create_completion(
        prompt=f"Write a fact about number {i}:",
        max_tokens=50,
    )
    print(f"Response {i}: {response['choices'][0]['text']}")

# Cache persists across program restarts
print(f"Disk cache contains {len(cache)} items")
```

158

159

### Cache Management

160

161

```python

162

from llama_cpp import Llama, LlamaRAMCache

163

164

# Initialize with monitoring

165

cache = LlamaRAMCache(capacity_bytes=512 << 20) # 512MB

166

llm = Llama(model_path="./models/llama-2-7b.gguf")

167

llm.set_cache(cache)

168

169

prompts = [

170

"What is machine learning?",

171

"Explain neural networks.",

172

"What is deep learning?",

173

"Define artificial intelligence.",

174

"What is machine learning?", # Duplicate for cache hit

175

]

176

177

cache_stats = {"hits": 0, "misses": 0}

178

179

for i, prompt in enumerate(prompts):

180

initial_size = len(cache)

181

182

response = llm.create_completion(

183

prompt=prompt,

184

max_tokens=30,

185

)

186

187

final_size = len(cache)

188

189

if final_size > initial_size:

190

cache_stats["misses"] += 1

191

print(f"Prompt {i+1}: CACHE MISS - New cache size: {final_size}")

192

else:

193

cache_stats["hits"] += 1

194

print(f"Prompt {i+1}: CACHE HIT - Cache size: {final_size}")

195

196

print(f"Cache statistics: {cache_stats}")

197

```

### Custom Cache Implementation

```python
from llama_cpp.llama_cache import BaseLlamaCache
import json
import hashlib
from pathlib import Path

class JSONDiskCache(BaseLlamaCache):
    """Custom cache using JSON files for storage."""

    def __init__(self, cache_dir: str = ".json_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

    def _key_to_filename(self, key: Tuple[int, ...]) -> str:
        """Convert cache key to filename."""
        key_str = str(key)
        key_hash = hashlib.md5(key_str.encode()).hexdigest()
        return f"{key_hash}.json"

    def __getitem__(self, key: Tuple[int, ...]) -> Optional[object]:
        file_path = self.cache_dir / self._key_to_filename(key)
        if file_path.exists():
            with open(file_path, 'r') as f:
                return json.load(f)
        return None

    def __setitem__(self, key: Tuple[int, ...], value: object) -> None:
        file_path = self.cache_dir / self._key_to_filename(key)
        with open(file_path, 'w') as f:
            json.dump(value, f)

    def __contains__(self, key: Tuple[int, ...]) -> bool:
        file_path = self.cache_dir / self._key_to_filename(key)
        return file_path.exists()

    def __len__(self) -> int:
        return len(list(self.cache_dir.glob("*.json")))

# Use custom cache
custom_cache = JSONDiskCache("./custom_cache")
llm = Llama(model_path="./models/model.gguf")
llm.set_cache(custom_cache)
```

### Cache Performance Testing

```python
import time
from llama_cpp import Llama, LlamaRAMCache

# Test without cache
llm_no_cache = Llama(model_path="./models/llama-2-7b.gguf")
llm_no_cache.set_cache(None)  # Disable caching

# Test with cache
llm_with_cache = Llama(model_path="./models/llama-2-7b.gguf")
llm_with_cache.set_cache(LlamaRAMCache())

test_prompt = "Explain the concept of recursion in programming"

def time_completion(llm, label):
    start_time = time.time()
    response = llm.create_completion(
        prompt=test_prompt,
        max_tokens=100,
        temperature=0.7,
    )
    end_time = time.time()
    print(f"{label}: {end_time - start_time:.2f} seconds")
    return response

# First run (both will be similar - no cache benefit yet)
print("First run:")
time_completion(llm_no_cache, "No cache")
time_completion(llm_with_cache, "With cache")

print("\nSecond run (same prompt):")
# Second run (cached version should be faster)
time_completion(llm_no_cache, "No cache")
time_completion(llm_with_cache, "With cache (should be faster)")
```

### Memory Usage Monitoring

```python
import psutil
import os
from llama_cpp import Llama, LlamaRAMCache

def get_memory_usage():
    """Get current process memory usage in MB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024

# Monitor memory with different cache sizes
cache_sizes = [64 << 20, 256 << 20, 1 << 30]  # 64MB, 256MB, 1GB

for cache_size in cache_sizes:
    print(f"\nTesting cache size: {cache_size // (1024*1024)}MB")

    initial_memory = get_memory_usage()

    cache = LlamaRAMCache(capacity_bytes=cache_size)
    llm = Llama(model_path="./models/llama-2-7b.gguf")
    llm.set_cache(cache)

    # Generate several completions
    for i in range(10):
        llm.create_completion(
            prompt=f"Write about topic number {i}:",
            max_tokens=50,
        )

    final_memory = get_memory_usage()
    memory_increase = final_memory - initial_memory

    print(f"Memory increase: {memory_increase:.1f}MB")
    print(f"Cache items: {len(cache)}")
```

### Cache Cleanup and Maintenance

```python
from llama_cpp import LlamaDiskCache
import os
import time

# Create disk cache
cache = LlamaDiskCache(cache_dir="./temp_cache")

# Use cache
llm = Llama(model_path="./models/model.gguf")
llm.set_cache(cache)

# Generate some cached content
for i in range(5):
    llm.create_completion(
        prompt=f"Example prompt {i}",
        max_tokens=20,
    )

print(f"Cache directory size: {len(cache)} items")

# Manual cache cleanup
cache_dir = cache.cache_dir
if os.path.exists(cache_dir):
    # Get cache directory size
    total_size = sum(
        os.path.getsize(os.path.join(cache_dir, f))
        for f in os.listdir(cache_dir)
    )
    print(f"Cache directory size: {total_size / 1024 / 1024:.2f}MB")

    # Clean up old files (example: older than 1 hour)
    current_time = time.time()
    for filename in os.listdir(cache_dir):
        file_path = os.path.join(cache_dir, filename)
        if os.path.getmtime(file_path) < current_time - 3600:  # 1 hour
            os.remove(file_path)
            print(f"Removed old cache file: {filename}")
```