Cache expensive file processing results using SHA-256 content hashes — path-independent, auto-invalidating, with service layer separation.
Install with Tessl CLI
npx tessl i github:affaan-m/everything-claude-code --skill content-hash-cache-pattern83
Does it follow best practices?
If you maintain this skill, you can automatically optimize it using the tessl CLI to improve its score:
npx tessl skill review --optimize ./path/to/skillValidation for skill structure
Cache expensive file processing results (PDF parsing, text extraction, image analysis) using SHA-256 content hashes as cache keys. Unlike path-based caching, this approach survives file moves/renames and auto-invalidates when content changes.
--cache/--no-cache CLI optionUse file content (not path) as the cache key:
import hashlib
from pathlib import Path
_HASH_CHUNK_SIZE = 65536 # 64KB chunks for large files
def compute_file_hash(path: Path) -> str:
"""SHA-256 of file contents (chunked for large files)."""
if not path.is_file():
raise FileNotFoundError(f"File not found: {path}")
sha256 = hashlib.sha256()
with open(path, "rb") as f:
while True:
chunk = f.read(_HASH_CHUNK_SIZE)
if not chunk:
break
sha256.update(chunk)
return sha256.hexdigest()Why content hash? File rename/move = cache hit. Content change = automatic invalidation. No index file needed.
from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
class CacheEntry:
file_hash: str
source_path: str
document: ExtractedDocument # The cached resultEach cache entry is stored as {hash}.json — O(1) lookup by hash, no index file required.
import json
from typing import Any
def write_cache(cache_dir: Path, entry: CacheEntry) -> None:
cache_dir.mkdir(parents=True, exist_ok=True)
cache_file = cache_dir / f"{entry.file_hash}.json"
data = serialize_entry(entry)
cache_file.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
def read_cache(cache_dir: Path, file_hash: str) -> CacheEntry | None:
cache_file = cache_dir / f"{file_hash}.json"
if not cache_file.is_file():
return None
try:
raw = cache_file.read_text(encoding="utf-8")
data = json.loads(raw)
return deserialize_entry(data)
except (json.JSONDecodeError, ValueError, KeyError):
return None # Treat corruption as cache missKeep the processing function pure. Add caching as a separate service layer.
def extract_with_cache(
file_path: Path,
*,
cache_enabled: bool = True,
cache_dir: Path = Path(".cache"),
) -> ExtractedDocument:
"""Service layer: cache check -> extraction -> cache write."""
if not cache_enabled:
return extract_text(file_path) # Pure function, no cache knowledge
file_hash = compute_file_hash(file_path)
# Check cache
cached = read_cache(cache_dir, file_hash)
if cached is not None:
logger.info("Cache hit: %s (hash=%s)", file_path.name, file_hash[:12])
return cached.document
# Cache miss -> extract -> store
logger.info("Cache miss: %s (hash=%s)", file_path.name, file_hash[:12])
doc = extract_text(file_path)
entry = CacheEntry(file_hash=file_hash, source_path=str(file_path), document=doc)
write_cache(cache_dir, entry)
return doc| Decision | Rationale |
|---|---|
| SHA-256 content hash | Path-independent, auto-invalidates on content change |
{hash}.json file naming | O(1) lookup, no index file needed |
| Service layer wrapper | SRP: extraction stays pure, cache is a separate concern |
| Manual JSON serialization | Full control over frozen dataclass serialization |
Corruption returns None | Graceful degradation, re-processes on next run |
cache_dir.mkdir(parents=True) | Lazy directory creation on first write |
# BAD: Path-based caching (breaks on file move/rename)
cache = {"/path/to/file.pdf": result}
# BAD: Adding cache logic inside the processing function (SRP violation)
def extract_text(path, *, cache_enabled=False, cache_dir=None):
if cache_enabled: # Now this function has two responsibilities
...
# BAD: Using dataclasses.asdict() with nested frozen dataclasses
# (can cause issues with complex nested types)
data = dataclasses.asdict(entry) # Use manual serialization instead--cache/--no-cache options7713cee
If you maintain this skill, you can claim it as your own. Once claimed, you can manage eval scenarios, bundle related skills, attach documentation or rules, and ensure cross-agent compatibility.