Zstandard bindings for Python providing high-performance compression and decompression operations
Training and using custom dictionaries to achieve better compression ratios on similar data sets by learning common patterns and structures.
Create custom compression dictionaries from sample data to improve compression ratios for similar data.
def train_dictionary(
dict_size: int,
samples: list[bytes],
k: int = 0,
d: int = 8,
f: int = 20,
split_point: float = 1.0,
accel: int = 1,
notifications: int = 0,
dict_id: int = 0,
level: int = 3,
steps: int = 4,
threads: int = 0
) -> ZstdCompressionDict:
"""
Train a compression dictionary from sample data.
Parameters:
- dict_size: int, target dictionary size in bytes
- samples: list[bytes], sample data for training
- k: int, segment size parameter (0 = auto)
- d: int, dmer size parameter (6-16, default 8)
- f: int, log frequency parameter (default 20)
- split_point: float, split point for training (0.0-1.0)
- accel: int, acceleration parameter (1-10)
- notifications: int, notification level (0=none, 1=basic, 2=verbose)
- dict_id: int, dictionary ID (0 = auto-generate)
- level: int, compression level for dictionary optimization
- steps: int, training steps (1-4)
- threads: int, number of threads (0 = auto)
Returns:
ZstdCompressionDict: Trained compression dictionary
"""Usage Example:
import zstandard as zstd
# Prepare sample data for training
samples = [
b'{"name": "John", "age": 30, "city": "New York"}',
b'{"name": "Jane", "age": 25, "city": "San Francisco"}',
b'{"name": "Bob", "age": 35, "city": "Chicago"}',
b'{"name": "Alice", "age": 28, "city": "Boston"}',
# ... more similar JSON documents
]
# Train dictionary
dictionary = zstd.train_dictionary(
dict_size=8192, # 8KB dictionary
samples=samples,
level=5,
threads=4
)
print(f"Dictionary size: {len(dictionary)} bytes")
print(f"Dictionary ID: {dictionary.dict_id()}")Container for compression dictionaries with metadata and optimization capabilities.
class ZstdCompressionDict:
def __init__(
self,
data: bytes,
dict_type: int = DICT_TYPE_AUTO,
k: int = 0,
d: int = 0
):
"""
Create a compression dictionary from raw dictionary data.
Parameters:
- data: bytes, raw dictionary data
- dict_type: int, dictionary type (DICT_TYPE_AUTO, DICT_TYPE_RAWCONTENT, DICT_TYPE_FULLDICT)
- k: int, segment size parameter
- d: int, dmer size parameter
"""
def __len__(self) -> int:
"""Get dictionary size in bytes."""
def dict_id(self) -> int:
"""
Get dictionary ID.
Returns:
int: Dictionary identifier
"""
def as_bytes(self) -> bytes:
"""
Get dictionary data as bytes.
Returns:
bytes: Raw dictionary data
"""
def precompute_compress(
self,
level: int = 3,
compression_params: ZstdCompressionParameters = None
):
"""
Precompute compression tables for better performance.
Parameters:
- level: int, compression level to optimize for
- compression_params: ZstdCompressionParameters, detailed parameters
"""
# Properties
k: int # Segment size parameter
d: int # Dmer size parameter

Usage Example:
import zstandard as zstd
# Load dictionary from file
with open('dictionary.zdict', 'rb') as f:
dict_data = f.read()
# Create dictionary object
dictionary = zstd.ZstdCompressionDict(dict_data)
# Optimize for specific compression level
dictionary.precompute_compress(level=9)
# Get dictionary information
print(f"Dictionary size: {len(dictionary)} bytes")
print(f"Dictionary ID: {dictionary.dict_id()}")
print(f"Parameters: k={dictionary.k}, d={dictionary.d}")
# Save optimized dictionary
optimized_data = dictionary.as_bytes()
with open('optimized_dictionary.zdict', 'wb') as f:
    f.write(optimized_data)

Using dictionaries with compressors to achieve better compression ratios.
class ZstdCompressor:
def __init__(
self,
level: int = 3,
dict_data: ZstdCompressionDict = None,
**kwargs
):
"""
Create compressor with dictionary support.
Parameters:
- level: int, compression level
- dict_data: ZstdCompressionDict, compression dictionary
- **kwargs: other compressor parameters
"""Usage Example:
import zstandard as zstd
# Train dictionary from samples
samples = [b"sample data 1", b"sample data 2", b"sample data 3"]
dictionary = zstd.train_dictionary(4096, samples)
# Create compressor with dictionary
compressor = zstd.ZstdCompressor(level=5, dict_data=dictionary)
# Compress new data (similar to training samples)
new_data = b"new sample data with similar patterns"
compressed = compressor.compress(new_data)
# Compare compression ratios
compressor_no_dict = zstd.ZstdCompressor(level=5)
compressed_no_dict = compressor_no_dict.compress(new_data)
print(f"With dictionary: {len(compressed)} bytes")
print(f"Without dictionary: {len(compressed_no_dict)} bytes")
print(f"Improvement: {(len(compressed_no_dict) - len(compressed)) / len(compressed_no_dict) * 100:.1f}%")Using dictionaries with decompressors to decompress dictionary-compressed data.
class ZstdDecompressor:
def __init__(
self,
dict_data: ZstdCompressionDict = None,
**kwargs
):
"""
Create decompressor with dictionary support.
Parameters:
- dict_data: ZstdCompressionDict, decompression dictionary
- **kwargs: other decompressor parameters
"""Usage Example:
import zstandard as zstd
# Load dictionary data (the same dictionary that was used for compression)
with open('dictionary.zdict', 'rb') as f:
    dict_data = f.read()
dictionary = zstd.ZstdCompressionDict(dict_data)
# Create decompressor with dictionary
decompressor = zstd.ZstdDecompressor(dict_data=dictionary)
# Decompress dictionary-compressed data
decompressed = decompressor.decompress(compressed_data)
print(f"Decompressed: {decompressed}")Different dictionary types for various use cases and compatibility requirements.
# Dictionary type constants
DICT_TYPE_AUTO: int # Auto-detect dictionary type
DICT_TYPE_RAWCONTENT: int # Raw content dictionary
DICT_TYPE_FULLDICT: int # Full dictionary with headers

Usage Example:
import zstandard as zstd
# Raw content dictionary (just the sample data)
raw_dict = zstd.ZstdCompressionDict(
sample_data,
dict_type=zstd.DICT_TYPE_RAWCONTENT
)
# Full dictionary (with zstd dictionary headers)
full_dict = zstd.ZstdCompressionDict(
trained_dict_data,
dict_type=zstd.DICT_TYPE_FULLDICT
)
# Auto-detect type
auto_dict = zstd.ZstdCompressionDict(
dict_data,
dict_type=zstd.DICT_TYPE_AUTO
)

Guidelines for effective dictionary usage:
Training Data Selection:
import zstandard as zstd
# Use representative samples
samples = collect_representative_data()
# Aim for 100-1000 samples, each 1KB-64KB
filtered_samples = [s for s in samples if 1024 <= len(s) <= 65536]
# Dictionary size: typically 64KB-1MB
dictionary = zstd.train_dictionary(
dict_size=min(65536, sum(len(s) for s in filtered_samples) // 100),
samples=filtered_samples
)

Performance Optimization:
import zstandard as zstd
# Precompute tables for target compression level
dictionary.precompute_compress(level=compression_level)
# Reuse compressor/decompressor objects
compressor = zstd.ZstdCompressor(level=5, dict_data=dictionary)
decompressor = zstd.ZstdDecompressor(dict_data=dictionary)
# Compress multiple items efficiently
for data in data_items:
compressed = compressor.compress(data)
    process_compressed(compressed)

Dictionary Persistence:
import zstandard as zstd
# Save dictionary for later use
dictionary_data = dictionary.as_bytes()
with open('model.zdict', 'wb') as f:
f.write(dictionary_data)
# Load dictionary
with open('model.zdict', 'rb') as f:
dict_data = f.read()
dictionary = zstd.ZstdCompressionDict(dict_data)

Install with Tessl CLI
npx tessl i tessl/pypi-zstandard