Unified pythonic interface for diverse file systems and storage backends
---
Multiple caching strategies for optimizing filesystem access patterns, including memory mapping, block caching, read-ahead caching, and background prefetching. The caching system improves performance with remote storage by reducing network requests and providing intelligent data prefetching.
Abstract base class that defines the interface for all caching implementations.
class BaseCache:
"""Base class for caching implementations."""
def __init__(self, blocksize, fetcher, size, **kwargs):
"""
Initialize cache.
Parameters:
- blocksize: int, size of cache blocks
- fetcher: callable, function to fetch data
- size: int, total size of cached object
- **kwargs: additional cache-specific options
"""
def _fetch(self, start, end):
"""
Fetch data range.
Parameters:
- start: int, start byte offset
- end: int, end byte offset
Returns:
bytes, fetched data
"""
def _read_cache(self, start, end):
"""
Read from cache if available.
Parameters:
- start: int, start byte offset
- end: int, end byte offset
Returns:
bytes or None, cached data or None if not cached
"""

Uses memory mapping for efficient access to cached files, particularly useful for large files with random access patterns.
class MMapCache(BaseCache):
"""Memory-mapped file cache for efficient random access."""
def __init__(self, blocksize, fetcher, size, location=None, blocks=None):
"""
Initialize memory-mapped cache.
Parameters:
- blocksize: int, size of cache blocks
- fetcher: callable, function to fetch data
- size: int, total size of cached object
- location: str, local file path for memory mapping
- blocks: set, specific blocks to cache
"""

Implements a read-ahead caching strategy that prefetches data based on sequential access patterns.
class ReadAheadCache(BaseCache):
"""Read-ahead cache optimized for sequential access patterns."""
def __init__(self, blocksize, fetcher, size, maxblocks=32):
"""
Initialize read-ahead cache.
Parameters:
- blocksize: int, size of cache blocks
- fetcher: callable, function to fetch data
- size: int, total size of cached object
- maxblocks: int, maximum number of blocks to cache
"""

LRU-based block caching with a configurable cache size and eviction policy.
class BlockCache(BaseCache):
"""Block-based cache with LRU eviction policy."""
def __init__(self, blocksize, fetcher, size, maxblocks=32):
"""
Initialize block cache.
Parameters:
- blocksize: int, size of cache blocks
- fetcher: callable, function to fetch data
- size: int, total size of cached object
- maxblocks: int, maximum number of blocks to keep in cache
"""

A simple in-memory cache that stores entire file contents as bytes.
class BytesCache(BaseCache):
"""In-memory bytes cache for small files."""
def __init__(self, blocksize, fetcher, size, **kwargs):
"""
Initialize bytes cache.
Parameters:
- blocksize: int, size of cache blocks
- fetcher: callable, function to fetch data
- size: int, total size of cached object
"""

An advanced block cache with background prefetching for improved performance with predictable access patterns.
class BackgroundBlockCache(BaseCache):
"""Block cache with background prefetching capabilities."""
def __init__(self, blocksize, fetcher, size, maxblocks=32):
"""
Initialize background block cache.
Parameters:
- blocksize: int, size of cache blocks
- fetcher: callable, function to fetch data
- size: int, total size of cached object
- maxblocks: int, maximum number of blocks to cache
"""

A dictionary of the available cache implementations, selectable by name.
caches: dict
"""
Mapping of cache names to cache classes.
Available caches:
- 'mmap': MMapCache
- 'readahead': ReadAheadCache
- 'blockcache': BlockCache
- 'bytes': BytesCache
- 'background': BackgroundBlockCache
"""

# Use a specific cache type when opening files
with fsspec.open('s3://bucket/large-file.dat', cache_type='mmap') as f:
# File uses memory-mapped caching
data = f.read(1024)
# Use block cache with custom parameters
with fsspec.open('s3://bucket/file.dat',
cache_type='blockcache',
block_size=1024*1024,
maxblocks=64) as f:
    data = f.read()

# Sequential reading - use the read-ahead cache
with fsspec.open('s3://bucket/log-file.txt',
cache_type='readahead',
block_size=64*1024) as f:
for line in f:
process_line(line)
# Random access - use memory-mapped cache
with fsspec.open('s3://bucket/database.dat',
cache_type='mmap',
block_size=4096) as f:
# Jump to different positions efficiently
f.seek(1000000)
data1 = f.read(100)
f.seek(5000000)
data2 = f.read(100)
# Small files - use bytes cache
with fsspec.open('s3://bucket/config.json',
cache_type='bytes') as f:
    config = json.load(f)

# Use the background cache for predictable access patterns
with fsspec.open('s3://bucket/time-series.dat',
cache_type='background',
block_size=1024*1024,
maxblocks=16) as f:
# Cache will prefetch subsequent blocks in background
for i in range(0, file_size, chunk_size):
f.seek(i)
chunk = f.read(chunk_size)
        process_chunk(chunk)

# Configure caching at the filesystem level
s3 = fsspec.filesystem('s3',
key='ACCESS_KEY',
secret='SECRET_KEY',
default_cache_type='blockcache',
default_block_size=1024*1024)
# All files opened through this filesystem use the cache settings
with s3.open('bucket/file1.dat') as f:
data1 = f.read()
with s3.open('bucket/file2.dat') as f:
    data2 = f.read()

# Tune cache parameters for specific workloads
# Large files with sequential access
large_file_cache = {
'cache_type': 'readahead',
'block_size': 8 * 1024 * 1024, # 8MB blocks
'maxblocks': 4 # Keep 32MB in memory
}
# Database-like files with random access
random_access_cache = {
'cache_type': 'mmap',
'block_size': 64 * 1024, # 64KB blocks
'maxblocks': 256 # Keep 16MB in memory
}
# Many small files
small_files_cache = {
'cache_type': 'bytes' # Cache entire file
}
# Open files with appropriate cache settings
with fsspec.open('s3://bucket/large.dat', **large_file_cache) as f:
process_large_file(f)
with fsspec.open('s3://bucket/index.db', **random_access_cache) as f:
lookup_data(f)
with fsspec.open('s3://bucket/config.json', **small_files_cache) as f:
    config = json.load(f)

# Access cache statistics (implementation-dependent)
with fsspec.open('s3://bucket/file.dat', cache_type='blockcache') as f:
# Perform operations
data = f.read(1024*1024)
# Some cache implementations provide statistics
if hasattr(f.cache, 'hit_count'):
print(f"Cache hits: {f.cache.hit_count}")
print(f"Cache misses: {f.cache.miss_count}")
        print(f"Hit ratio: {f.cache.hit_count / (f.cache.hit_count + f.cache.miss_count)}")

# Caching works with compression
with fsspec.open('s3://bucket/data.csv.gz',
compression='gzip',
cache_type='readahead',
block_size=1024*1024) as f:
# Compressed data is cached, decompression happens after cache
    df = pd.read_csv(f)

# Control where cache files are stored (for persistent caches)
import tempfile
cache_dir = tempfile.mkdtemp()
with fsspec.open('s3://bucket/large-file.dat',
cache_type='mmap',
cache_storage=cache_dir) as f:
# Memory-mapped cache file stored in cache_dir
data = f.read()
# Cache files persist after closing
# Subsequent opens can reuse cached data# Clear caches when needed
fs = fsspec.filesystem('s3')
# Clear cache for specific file
fs.invalidate_cache('bucket/file.dat')
# Clear all cached data for this filesystem
fs.invalidate_cache()
# Clear all filesystem instances (nuclear option)
fsspec.AbstractFileSystem.clear_instance_cache()

Cache implementations at a glance:
- ReadAheadCache - prefetches the next blocks automatically
- MMapCache - efficient memory mapping for jumping around
- BlockCache - good general-purpose LRU cache
- BytesCache - simple, for small files read once
- BackgroundBlockCache - intelligent prefetching

Recommendations by workload:
- Small files: BytesCache - cache the entire file in memory
- General use: BlockCache or ReadAheadCache
- Large files: MMapCache for random access, ReadAheadCache for sequential access
- Predictable access patterns: BackgroundBlockCache for intelligent prefetching
- Unreliable networks: BlockCache with smaller blocks for retry resilience

Install with Tessl CLI
npx tessl i tessl/pypi-fsspec