Unified pythonic interface for diverse file systems and storage backends
---
Multiple caching strategies for optimizing filesystem access patterns, including memory mapping, block caching, read-ahead caching, and background prefetching. The caching system improves performance with remote storage by reducing network requests and providing intelligent data prefetching.
Abstract base class that defines the interface for all caching implementations.
class BaseCache:
"""Base class for caching implementations."""
def __init__(self, blocksize, fetcher, size, **kwargs):
"""
Initialize cache.
Parameters:
- blocksize: int, size of cache blocks
- fetcher: callable, function to fetch data
- size: int, total size of cached object
- **kwargs: additional cache-specific options
"""
def _fetch(self, start, end):
"""
Fetch data range.
Parameters:
- start: int, start byte offset
- end: int, end byte offset
Returns:
bytes, fetched data
"""
def _read_cache(self, start, end):
"""
Read from cache if available.
Parameters:
- start: int, start byte offset
- end: int, end byte offset
Returns:
bytes or None, cached data or None if not cached
"""

Uses memory mapping for efficient access to cached files, particularly useful for large files with random access patterns.
class MMapCache(BaseCache):
"""Memory-mapped file cache for efficient random access."""
def __init__(self, blocksize, fetcher, size, location=None, blocks=None):
"""
Initialize memory-mapped cache.
Parameters:
- blocksize: int, size of cache blocks
- fetcher: callable, function to fetch data
- size: int, total size of cached object
- location: str, local file path for memory mapping
- blocks: set, specific blocks to cache
"""

Implements a read-ahead caching strategy that prefetches data based on sequential access patterns.
class ReadAheadCache(BaseCache):
"""Read-ahead cache optimized for sequential access patterns."""
def __init__(self, blocksize, fetcher, size, maxblocks=32):
"""
Initialize read-ahead cache.
Parameters:
- blocksize: int, size of cache blocks
- fetcher: callable, function to fetch data
- size: int, total size of cached object
- maxblocks: int, maximum number of blocks to cache
"""

LRU-based block caching with a configurable cache size and eviction policy.
class BlockCache(BaseCache):
"""Block-based cache with LRU eviction policy."""
def __init__(self, blocksize, fetcher, size, maxblocks=32):
"""
Initialize block cache.
Parameters:
- blocksize: int, size of cache blocks
- fetcher: callable, function to fetch data
- size: int, total size of cached object
- maxblocks: int, maximum number of blocks to keep in cache
"""

A simple in-memory cache that stores entire file contents as bytes.
class BytesCache(BaseCache):
"""In-memory bytes cache for small files."""
def __init__(self, blocksize, fetcher, size, **kwargs):
"""
Initialize bytes cache.
Parameters:
- blocksize: int, size of cache blocks
- fetcher: callable, function to fetch data
- size: int, total size of cached object
"""

An advanced block cache with background prefetching for improved performance with predictable access patterns.
class BackgroundBlockCache(BaseCache):
"""Block cache with background prefetching capabilities."""
def __init__(self, blocksize, fetcher, size, maxblocks=32):
"""
Initialize background block cache.
Parameters:
- blocksize: int, size of cache blocks
- fetcher: callable, function to fetch data
- size: int, total size of cached object
- maxblocks: int, maximum number of blocks to cache
"""

A dictionary of the available cache implementations, selectable by name.
caches: dict
"""
Mapping of cache names to cache classes.
Available caches:
- 'mmap': MMapCache
- 'readahead': ReadAheadCache
- 'blockcache': BlockCache
- 'bytes': BytesCache
- 'background': BackgroundBlockCache
"""

# Use a specific cache type when opening files
with fsspec.open('s3://bucket/large-file.dat', cache_type='mmap') as f:
# File uses memory-mapped caching
data = f.read(1024)
# Use block cache with custom parameters
with fsspec.open('s3://bucket/file.dat',
cache_type='blockcache',
block_size=1024*1024,
maxblocks=64) as f:
    data = f.read()

# Sequential reading - use the read-ahead cache
with fsspec.open('s3://bucket/log-file.txt',
cache_type='readahead',
block_size=64*1024) as f:
for line in f:
process_line(line)
# Random access - use memory-mapped cache
with fsspec.open('s3://bucket/database.dat',
cache_type='mmap',
block_size=4096) as f:
# Jump to different positions efficiently
f.seek(1000000)
data1 = f.read(100)
f.seek(5000000)
data2 = f.read(100)
# Small files - use bytes cache
with fsspec.open('s3://bucket/config.json',
cache_type='bytes') as f:
    config = json.load(f)

# Use the background cache for predictable access patterns
with fsspec.open('s3://bucket/time-series.dat',
cache_type='background',
block_size=1024*1024,
maxblocks=16) as f:
# Cache will prefetch subsequent blocks in background
for i in range(0, file_size, chunk_size):
f.seek(i)
chunk = f.read(chunk_size)
        process_chunk(chunk)

# Configure caching at the filesystem level
s3 = fsspec.filesystem('s3',
key='ACCESS_KEY',
secret='SECRET_KEY',
default_cache_type='blockcache',
default_block_size=1024*1024)
# All files opened through this filesystem use the cache settings
with s3.open('bucket/file1.dat') as f:
data1 = f.read()
with s3.open('bucket/file2.dat') as f:
    data2 = f.read()

# Tune cache parameters for specific workloads
# Large files with sequential access
large_file_cache = {
'cache_type': 'readahead',
'block_size': 8 * 1024 * 1024, # 8MB blocks
'maxblocks': 4 # Keep 32MB in memory
}
# Database-like files with random access
random_access_cache = {
'cache_type': 'mmap',
'block_size': 64 * 1024, # 64KB blocks
'maxblocks': 256 # Keep 16MB in memory
}
# Many small files
small_files_cache = {
'cache_type': 'bytes' # Cache entire file
}
# Open files with appropriate cache settings
with fsspec.open('s3://bucket/large.dat', **large_file_cache) as f:
process_large_file(f)
with fsspec.open('s3://bucket/index.db', **random_access_cache) as f:
lookup_data(f)
with fsspec.open('s3://bucket/config.json', **small_files_cache) as f:
    config = json.load(f)

# Access cache statistics (implementation-dependent)
with fsspec.open('s3://bucket/file.dat', cache_type='blockcache') as f:
# Perform operations
data = f.read(1024*1024)
# Some cache implementations provide statistics
if hasattr(f.cache, 'hit_count'):
print(f"Cache hits: {f.cache.hit_count}")
print(f"Cache misses: {f.cache.miss_count}")
        print(f"Hit ratio: {f.cache.hit_count / (f.cache.hit_count + f.cache.miss_count)}")

# Caching works with compression
with fsspec.open('s3://bucket/data.csv.gz',
compression='gzip',
cache_type='readahead',
block_size=1024*1024) as f:
# Compressed data is cached, decompression happens after cache
    df = pd.read_csv(f)

# Control where cache files are stored (for persistent caches)
import tempfile
cache_dir = tempfile.mkdtemp()
with fsspec.open('s3://bucket/large-file.dat',
cache_type='mmap',
cache_storage=cache_dir) as f:
# Memory-mapped cache file stored in cache_dir
data = f.read()
# Cache files persist after closing
# Subsequent opens can reuse cached data# Clear caches when needed
fs = fsspec.filesystem('s3')
# Clear cache for specific file
fs.invalidate_cache('bucket/file.dat')
# Clear all cached data for this filesystem
fs.invalidate_cache()
# Clear all filesystem instances (nuclear option)
fsspec.AbstractFileSystem.clear_instance_cache()

Cache implementations at a glance:
- ReadAheadCache - prefetches the next blocks automatically
- MMapCache - efficient memory mapping for jumping around
- BlockCache - good general-purpose LRU cache
- BytesCache - simple, for small files read once
- BackgroundBlockCache - intelligent prefetching

Recommendations by workload:
- Small files: BytesCache - cache the entire file in memory
- General use: BlockCache or ReadAheadCache
- Large files: MMapCache for random access, ReadAheadCache for sequential access
- Predictable access patterns: BackgroundBlockCache for intelligent prefetching
- Unreliable networks: BlockCache with smaller blocks for retry resilience

Install with Tessl CLI
npx tessl i tessl/pypi-fsspec