Unified pythonic interface for diverse file systems and storage backends
Quality: Pending — best-practices review has not yet been performed.
Impact: Pending — no eval scenarios have been run.
Key-value store interface that presents filesystem paths as dictionary keys, enabling intuitive data access patterns and integration with mapping-based workflows. The FSMap class implements Python's MutableMapping interface to provide familiar dictionary operations on filesystem data.
Dictionary-like interface to filesystem that maps string keys to file contents as bytes values.
class FSMap:
"""Dictionary-like interface to filesystem paths."""
def __init__(self, root, fs, check=False, create=False, missing_exceptions=None):
"""
Initialize FSMap.
Parameters:
- root: str, root path for mapping
- fs: AbstractFileSystem, filesystem instance
- check: bool, check if root path exists
- create: bool, create root path if it doesn't exist
- missing_exceptions: tuple, exceptions to catch for missing files
"""Standard dictionary operations implemented through the MutableMapping interface.
def __getitem__(self, key):
"""
Get file contents by key.
Parameters:
- key: str, file key (relative to root)
Returns:
bytes, file contents
"""
def __setitem__(self, key, value):
"""
Set file contents by key.
Parameters:
- key: str, file key (relative to root)
- value: bytes, data to write
"""
def __delitem__(self, key):
"""
Delete file by key.
Parameters:
- key: str, file key (relative to root)
"""
def __iter__(self):
"""
Iterate over all keys.
Returns:
iterator, file keys
"""
def __len__(self):
"""
Get number of files.
Returns:
int, number of files in mapping
"""Efficient operations for working with multiple keys simultaneously.
def getitems(self, keys, on_error='raise'):
"""
Get multiple items by keys.
Parameters:
- keys: list, file keys to retrieve
- on_error: str, how to handle missing keys ('raise', 'omit', 'return_none')
Returns:
dict, mapping of keys to file contents
"""
def setitems(self, d):
"""
Set multiple items from dictionary.
Parameters:
- d: dict, mapping of keys to data
"""
def delitems(self, keys):
"""
Delete multiple items by keys.
Parameters:
- keys: list, file keys to delete
"""
def clear(self):
"""Remove all files from the mapping."""Additional properties and utility methods for working with the mapped filesystem.
@property
def dirfs(self):
"""
Get DirFileSystem for this mapping.
Returns:
DirFileSystem, filesystem view of the mapping directory
"""Convenience function for creating FSMap instances from URLs.
def get_mapper(url='', check=False, create=False, **kwargs):
"""
Create a key-value store interface to a filesystem.
Parameters:
- url: str, filesystem URL (default: current directory)
- check: bool, check if path exists
- create: bool, create path if it doesn't exist
- **kwargs: additional options passed to filesystem
Returns:
FSMap, dictionary-like interface
"""# Create mapper for S3 bucket
mapper = fsspec.get_mapper('s3://bucket/data/')
# Write data like a dictionary
mapper['file1.txt'] = b'Hello, world!'
mapper['subdir/file2.json'] = b'{"key": "value"}'
# Read data like a dictionary
content = mapper['file1.txt']
print(content.decode()) # Hello, world!
# Check if key exists
if 'file1.txt' in mapper:
print('File exists')
# Delete files
del mapper['file1.txt']
# Get all keys
keys = list(mapper.keys())
# Write multiple files at once
data = {
'file1.txt': b'Content 1',
'file2.txt': b'Content 2',
'file3.txt': b'Content 3'
}
mapper.setitems(data)
# Read multiple files
contents = mapper.getitems(['file1.txt', 'file2.txt'])
# Handle missing files gracefully
contents = mapper.getitems(['file1.txt', 'missing.txt'], on_error='omit')
# Delete multiple files
mapper.delitems(['file1.txt', 'file2.txt'])
import json
import pickle
# JSON data storage
mapper = fsspec.get_mapper('s3://bucket/json-data/')
# Store JSON data
data = {'name': 'example', 'values': [1, 2, 3]}
mapper['config.json'] = json.dumps(data).encode()
# Load JSON data
raw_data = mapper['config.json']
config = json.loads(raw_data.decode())
# Binary data storage
binary_mapper = fsspec.get_mapper('gcs://bucket/models/')
# Store pickled model
import pickle
model = {'weights': [1.0, 2.0, 3.0], 'bias': 0.5}
binary_mapper['model.pkl'] = pickle.dumps(model)
# Load pickled model
model_data = binary_mapper['model.pkl']
loaded_model = pickle.loads(model_data)
# Create mapper with nested directory structure
mapper = fsspec.get_mapper('local:///data/experiment/')
# Organize data hierarchically using key paths
mapper['inputs/train.csv'] = train_data
mapper['inputs/test.csv'] = test_data
mapper['models/v1/weights.pkl'] = model_weights
mapper['models/v1/config.json'] = model_config
mapper['results/metrics.json'] = evaluation_metrics
# List all keys to see structure
for key in mapper:
print(key)
# inputs/train.csv
# inputs/test.csv
# models/v1/weights.pkl
# models/v1/config.json
# results/metrics.json
mapper = fsspec.get_mapper('s3://bucket/data/')
try:
# This will raise KeyError if file doesn't exist
content = mapper['nonexistent.txt']
except KeyError:
print('File not found')
# Use getitems for graceful handling
result = mapper.getitems(['file1.txt', 'missing.txt'], on_error='omit')
# Only existing files are returned
# Check existence before access
if 'uncertain_file.txt' in mapper:
content = mapper['uncertain_file.txt']
# Use bulk operations for better performance
keys_to_read = ['file1.txt', 'file2.txt', 'file3.txt']
# Efficient: single bulk operation
contents = mapper.getitems(keys_to_read)
# Inefficient: multiple individual operations
contents = {}
for key in keys_to_read:
contents[key] = mapper[key]
# Efficient bulk write
data_batch = {
f'batch_{i}.txt': f'Data {i}'.encode()
for i in range(100)
}
mapper.setitems(data_batch)
import zarr
# Create mapper for Zarr store
store = fsspec.get_mapper('s3://bucket/zarr-data.zarr')
# Create Zarr array using fsspec mapper
z = zarr.zeros((1000, 1000), chunks=(100, 100), store=store)
# Write data to array
z[:100, :100] = 1.0
# The zarr metadata and chunks are stored as files in the mapper
print(list(store.keys()))
# ['.zarray', '0.0', '0.1', '1.0', '1.1', ...]
# Create cached mapper for better performance
cached_mapper = fsspec.get_mapper(
'simplecache::s3://bucket/data/',
s3={'key': 'ACCESS_KEY', 'secret': 'SECRET_KEY'},
cache_storage='/tmp/fsspec-cache'
)
# First access downloads and caches
data = cached_mapper['large_file.dat']
# Subsequent access reads from local cache
data = cached_mapper['large_file.dat']  # Much faster (served from local cache)
Install with Tessl CLI
npx tessl i tessl/pypi-fsspec