Unified pythonic interface for diverse file systems and storage backends
fsspec provides automatic compression and decompression support for multiple formats, enabling transparent handling of compressed files across all filesystem backends. When compression inference is requested (compression='infer'), fsspec detects the format from the file extension and handles it through Python's standard and optional third-party compression libraries.
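The extension-to-format mapping that drives this detection can be checked directly with fsspec.utils.infer_compression, which returns the compression name fsspec would use for a given file name (or None when no compression applies); a minimal sketch with illustrative file names:

from fsspec.utils import infer_compression

# Inspect which compression fsspec infers from a file extension
print(infer_compression('data.csv.gz'))   # 'gzip'
print(infer_compression('logs.txt.bz2'))  # 'bz2'
print(infer_compression('plain.csv'))     # None: no compression applied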
Register new compression formats and associate them with file extensions and compression handlers.
def register_compression(name, callback, extensions, force=False):
    """
    Register a compression format.

    Parameters:
    - name: str, compression format name
    - callback: callable, wraps an open file object and returns a file-like
      object that decompresses on read or compresses on write
    - extensions: list of str, file extensions (without the leading dot) associated with this format
    - force: bool, whether to overwrite an existing registration
    """

Query which compression formats are currently supported by the system.
def available_compressions():
    """
    List all available compression formats.

    Returns:
    list of str, compression format names
    """

Compression formats supported through Python's standard library:
# GZIP compression (.gz files)
'gzip': Uses gzip module for compression/decompression
# BZIP2 compression (.bz2 files)
'bz2': Uses bz2 module for compression/decompression
# LZMA compression (.lzma, .xz files)
'lzma': Uses lzma module for compression/decompression
# ZIP archive format (.zip files)
'zip': Uses zipfile module for archive access

Additional compression formats available when optional dependencies are installed:
# Snappy compression (requires python-snappy)
'snappy': Fast compression optimized for speed over ratio
# LZ4 compression (requires lz4)
'lz4': Ultra-fast compression with .lz4 extension
# Zstandard compression (requires zstandard)
'zstd': Modern compression with .zst extension, good speed/ratio balance

# With compression='infer', fsspec detects the compression format from the file extension
# Reading compressed files
import fsspec

with fsspec.open('data.csv.gz', 'rt', compression='infer') as f:
    # Automatically decompressed with gzip
    content = f.read()

with fsspec.open('logs.txt.bz2', 'rt', compression='infer') as f:
    for line in f:
        process_line(line)

with fsspec.open('archive.tar.xz', 'rb', compression='infer') as f:
    # xz decompression; data holds the raw bytes of the enclosed tar archive
    data = f.read()

# Force a specific compression format
with fsspec.open('data.csv', 'rt', compression='gzip') as f:
    content = f.read()

# Override extension-based detection
with fsspec.open('file.gz', 'rt', compression='bz2') as f:
    # Treats the .gz file as bz2 compressed
    content = f.read()

# Read a mis-named .gz file without decompression
with fsspec.open('not-compressed.gz', 'rt', compression=None) as f:
    # compression=None (the default) reads the raw file
    content = f.read()

# Write compressed data
with fsspec.open('output.csv.gz', 'wt', compression='infer') as f:
    # Automatically compressed with gzip on write
    f.write('column1,column2\n')
    f.write('value1,value2\n')

# Write with explicit compression
with fsspec.open('output.txt', 'wt', compression='bz2') as f:
    f.write('This will be bz2 compressed\n')

# S3 files with compression
with fsspec.open('s3://bucket/data.csv.gz', 'rt', compression='infer') as f:
    df = pd.read_csv(f)

# HTTP files with compression
with fsspec.open('https://example.com/data.json.gz', 'rt', compression='infer') as f:
    data = json.load(f)

# GCS files with compression
with fsspec.open('gcs://bucket/logs.txt.bz2', 'rt', compression='infer') as f:
    for line in f:
        process_log(line)

# Process files with different compression formats
files = [
    's3://bucket/data1.csv.gz',   # gzip
    's3://bucket/data2.csv.bz2',  # bzip2
    's3://bucket/data3.csv.xz',   # lzma
    's3://bucket/data4.csv'       # uncompressed
]
dataframes = []
for file_path in files:
    with fsspec.open(file_path, 'rt', compression='infer') as f:
        # Compression handled per file, based on each extension
        df = pd.read_csv(f)
        dataframes.append(df)
combined_df = pd.concat(dataframes)

# Access files within ZIP archives
with fsspec.open('zip://data.csv::archive.zip', 'rt') as f:
    # Reads data.csv from within archive.zip
    content = f.read()
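An archive can also be mounted as a filesystem to browse its members before reading one; a minimal sketch using the 'zip' protocol (assuming a local archive.zip that contains data.csv):

import fsspec

# Open the archive as a filesystem and list its members
fs = fsspec.filesystem('zip', fo='archive.zip')
print(fs.ls('/'))

# Read a single member from the archive
with fs.open('data.csv', 'rt') as f:
    content = f.read()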
# Remote ZIP archives
with fsspec.open('zip://data.csv::s3://bucket/archive.zip', 'rt') as f:
    content = f.read()

import fsspec
import my_compression_lib
def my_compression_opener(file, mode='rb'):
    """Custom compression opener: wrap a file object for reading or writing."""
    if 'r' in mode:
        return my_compression_lib.decompress_file(file)
    elif 'w' in mode:
        return my_compression_lib.compress_file(file)
    else:
        raise ValueError(f"Unsupported mode: {mode}")

# Register the custom compression format (extensions are given without the leading dot)
fsspec.compression.register_compression(
    name='myformat',
    callback=my_compression_opener,
    extensions=['mycomp', 'mc']
)

# Now use the custom compression via extension inference
with fsspec.open('data.txt.mycomp', 'rt', compression='infer') as f:
    content = f.read()

# Choose compression based on use case
# For speed-critical applications
with fsspec.open('data.csv.lz4', 'rt', compression='infer') as f:  # Fast decompression
    df = pd.read_csv(f)

# For space-critical applications
with fsspec.open('data.csv.xz', 'rt', compression='infer') as f:  # High compression ratio
    df = pd.read_csv(f)

# For general use
with fsspec.open('data.csv.gz', 'rt', compression='infer') as f:  # Good balance
    df = pd.read_csv(f)

# Compression works with caching layers
with fsspec.open('s3://bucket/large-data.csv.gz',
                 'rt',
                 compression='infer',
                 cache_type='blockcache',
                 block_size=1024 * 1024) as f:
    # Compressed data is cached; decompression happens after the cache
    df = pd.read_csv(f)

import concurrent.futures
def process_compressed_file(file_path):
    with fsspec.open(file_path, 'rt', compression='infer') as f:
        return len(f.read())

# Process multiple compressed files in parallel
compressed_files = [
    's3://bucket/file1.csv.gz',
    's3://bucket/file2.csv.bz2',
    's3://bucket/file3.csv.xz'
]
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    results = list(executor.map(process_compressed_file, compressed_files))

# Check what compression formats are available
available = fsspec.compression.available_compressions()
print("Available compression formats:", available)

# Check for a specific format
if 'lz4' in available:
    print("LZ4 compression is available")
    with fsspec.open('data.csv.lz4', 'rt', compression='infer') as f:
        content = f.read()
else:
    print("LZ4 not available, using gzip")
    with fsspec.open('data.csv.gz', 'rt', compression='infer') as f:
        content = f.read()

try:
    with fsspec.open('data.csv.gz', 'rt', compression='infer') as f:
        content = f.read()
except ImportError as e:
    print(f"Compression library not available: {e}")
except OSError as e:
    print(f"Compression error (possibly corrupted file): {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

Optional compression formats require additional packages:
- lz4 package (pip install lz4)
- python-snappy package
- zstandard package

# Read compressed CSV directly into pandas
with fsspec.open('s3://bucket/data.csv.gz', 'rt', compression='infer') as f:
    df = pd.read_csv(f)

# Write compressed CSV from pandas
with fsspec.open('output.csv.gz', 'wt', compression='infer') as f:
    df.to_csv(f, index=False)

# Read compressed JSON
with fsspec.open('config.json.gz', 'rt', compression='infer') as f:
    config = json.load(f)

# Write compressed JSON
with fsspec.open('output.json.bz2', 'wt', compression='infer') as f:
    json.dump(data, f, indent=2)

# Read compressed numpy array
with fsspec.open('array.npy.gz', 'rb', compression='infer') as f:
    array = np.load(f)

# Write compressed numpy array
with fsspec.open('output.npy.gz', 'wb', compression='infer') as f:
    np.save(f, array)

Install with Tessl CLI
npx tessl i tessl/pypi-fsspec