
tessl/pypi-zstandard

Zstandard bindings for Python providing high-performance compression and decompression operations

docs/buffer-operations.md

Buffer Operations

Advanced buffer management for zero-copy operations, efficient batch processing, and high-performance data handling in compression and decompression workflows.

Capabilities

Buffer Segments

Individual buffer segments that provide efficient access to portions of larger buffers without copying data.

class BufferSegment:
    @property
    def offset(self) -> int:
        """Offset of this segment within the parent buffer."""

    def __len__(self) -> int:
        """Get segment length in bytes."""

    def tobytes(self) -> bytes:
        """
        Convert segment to bytes.

        Returns:
        bytes: Copy of segment data
        """

Usage Example:

import zstandard as zstd

# Buffer segments are typically returned by compression operations
compressor = zstd.ZstdCompressor()
result = compressor.multi_compress_to_buffer([b"data1", b"data2", b"data3"])

# Access individual segments
for i, segment in enumerate(result):
    print(f"Segment {i}: offset={segment.offset}, length={len(segment)}")
    data = segment.tobytes()
    process_data(data)
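The offset/length pattern above can be emulated on any bytes-like object with a memoryview. This stdlib-only sketch (no zstandard objects involved) shows the idea behind segment views: slicing is free, and a copy only happens when bytes are materialized:

```python
# Simulate segment-style access over one concatenated buffer using
# only the standard library.
parent = b"data1" + b"data2" + b"data3"
view = memoryview(parent)

# (offset, length) pairs playing the role of BufferSegment metadata
segments = [(0, 5), (5, 5), (10, 5)]

for offset, length in segments:
    chunk = view[offset:offset + length]  # zero-copy slice of the parent
    data = chunk.tobytes()                # the only point where data is copied
    print(offset, len(chunk), data)
```

`BufferSegment` works analogously: `offset` and `len()` are metadata reads, while `tobytes()` is the copying step.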

Buffer Segment Collections

Collections of buffer segments that provide efficient iteration and access patterns.

class BufferSegments:
    def __len__(self) -> int:
        """Get number of segments in collection."""

    def __getitem__(self, i: int) -> BufferSegment:
        """
        Get segment by index.

        Parameters:
        - i: int, segment index

        Returns:
        BufferSegment: Segment at index
        """

Usage Example:

import zstandard as zstd

# BufferSegments instances are returned by BufferWithSegments.segments();
# the multi-compress result below supports the same access patterns
compressor = zstd.ZstdCompressor()
result = compressor.multi_compress_to_buffer([b"data1", b"data2"])

# Iterate over segments
for segment in result:
    data = segment.tobytes()
    print(f"Segment data: {len(data)} bytes")

# Access by index
first_segment = result[0]
second_segment = result[1]

Buffers with Segments

Buffers that contain multiple segments, providing both the raw data and segment boundary information.

class BufferWithSegments:
    @property
    def size(self) -> int:
        """Total buffer size in bytes."""

    def __init__(self, data: bytes, segments: bytes):
        """
        Create buffer with segment information.

        Parameters:
        - data: bytes, raw buffer data
        - segments: bytes, segment boundary information
        """

    def __len__(self) -> int:
        """Get number of segments."""

    def __getitem__(self, i: int) -> BufferSegment:
        """
        Get segment by index.

        Parameters:
        - i: int, segment index

        Returns:
        BufferSegment: Segment at index
        """

    def segments(self):
        """Return a BufferSegments instance describing this buffer's segments."""

    def tobytes(self) -> bytes:
        """
        Convert entire buffer to bytes.

        Returns:
        bytes: Complete buffer data
        """

Usage Example:

import zstandard as zstd

# Create buffer with segments manually (advanced usage)
data = b"concatenated data from multiple sources"
# segments holds packed (offset, length) boundary entries; see the
# python-zstandard documentation for the exact layout
segments = b"..."  # segment boundary data

buffer = zstd.BufferWithSegments(data, segments)

print(f"Buffer size: {buffer.size} bytes")
print(f"Number of segments: {len(buffer)}")

# Access segments
for i in range(len(buffer)):
    segment = buffer[i]
    segment_data = segment.tobytes()
    print(f"Segment {i}: {len(segment_data)} bytes")

# Get all data
all_data = buffer.tobytes()
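When building a `BufferWithSegments` yourself, the `segments` argument is an array of packed 64-bit (offset, length) pairs, one 16-byte entry per segment. The `"<QQ"` (little-endian) layout used here is an assumption to verify against the python-zstandard docs for your backend; the packing itself is plain `struct`:

```python
import struct

# Concatenate three records into one flat buffer
records = [b"alpha", b"bravo", b"charlie"]
data = b"".join(records)

# Pack one (offset, length) pair of unsigned 64-bit ints per record.
# "<QQ" byte order is an assumption; check your backend's expected layout.
segments = b""
offset = 0
for record in records:
    segments += struct.pack("<QQ", offset, len(record))
    offset += len(record)

print(len(segments))  # 16 bytes per record -> 48 for three records
```

The resulting `data` and `segments` pair is what the `BufferWithSegments(data, segments)` constructor above expects.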

Collections of Buffers with Segments

Collections of multiple buffers with segments, used for batch operations and efficient data management.

class BufferWithSegmentsCollection:
    def __init__(self, *args):
        """
        Create collection of buffers with segments.

        Parameters:
        - *args: BufferWithSegments objects
        """

    def __len__(self) -> int:
        """Get number of buffers in collection."""

    def __getitem__(self, i: int) -> BufferSegment:
        """
        Get segment by global index across all buffers.

        Parameters:
        - i: int, global segment index

        Returns: 
        BufferSegment: Segment at index
        """

    def size(self) -> int:
        """
        Get total size of all buffers.

        Returns:
        int: Total size in bytes
        """

Usage Example:

import zstandard as zstd

# Collections are returned by the batch (multi_*) operations
compressor = zstd.ZstdCompressor()
data_items = [b"item1", b"item2", b"item3", b"item4"]

# Multi-compress returns a collection
collection = compressor.multi_compress_to_buffer(data_items, threads=2)

print(f"Collection size: {collection.size()} bytes")
print(f"Number of items: {len(collection)}")

# Access compressed items
for i in range(len(collection)):
    segment = collection[i]
    compressed_data = segment.tobytes()
    print(f"Item {i}: {len(compressed_data)} bytes compressed")

Batch Compression with Buffers

Efficient batch compression that returns results in buffer collections for optimal memory usage.

class ZstdCompressor:
    def multi_compress_to_buffer(
        self,
        data,
        threads: int = 0
    ) -> BufferWithSegmentsCollection:
        """
        Compress multiple data items to buffer collection.

        Parameters:
        - data: list[bytes], BufferWithSegments, or BufferWithSegmentsCollection
        - threads: int, number of threads (0 = auto)

        Returns:
        BufferWithSegmentsCollection: Compressed data in buffer collection
        """

Usage Example:

import zstandard as zstd

compressor = zstd.ZstdCompressor(level=5)

# Prepare data for batch compression
documents = [
    b'{"id": 1, "text": "First document"}',
    b'{"id": 2, "text": "Second document"}',
    b'{"id": 3, "text": "Third document"}',
    b'{"id": 4, "text": "Fourth document"}'
]

# Compress in parallel
result = compressor.multi_compress_to_buffer(documents, threads=4)

# Process results efficiently
total_original = sum(len(doc) for doc in documents)
total_compressed = result.size()

print(f"Compressed {total_original} bytes to {total_compressed} bytes")
print(f"Compression ratio: {total_original/total_compressed:.2f}:1")

# Extract individual compressed documents
compressed_docs = []
for i in range(len(result)):
    segment = result[i]
    compressed_docs.append(segment.tobytes())

Batch Decompression with Buffers

Efficient batch decompression using buffer collections for high-throughput processing.

class ZstdDecompressor:
    def multi_decompress_to_buffer(
        self,
        frames,
        decompressed_sizes: bytes | None = None,
        threads: int = 0
    ) -> BufferWithSegmentsCollection:
        """
        Decompress multiple frames to buffer collection.

        Parameters:
        - frames: list[bytes], BufferWithSegments, or BufferWithSegmentsCollection
        - decompressed_sizes: bytes, packed 64-bit decompressed sizes, one per frame (optional optimization)
        - threads: int, number of threads (0 = auto)

        Returns:
        BufferWithSegmentsCollection: Decompressed data in buffer collection
        """

Usage Example:

import zstandard as zstd

decompressor = zstd.ZstdDecompressor()

# Compressed frames from previous example
compressed_frames = compressed_docs

# Decompress in parallel
result = decompressor.multi_decompress_to_buffer(compressed_frames, threads=4)

print(f"Decompressed {len(compressed_frames)} frames")
print(f"Total decompressed size: {result.size()} bytes")

# Extract decompressed data
decompressed_docs = []
for i in range(len(result)):
    segment = result[i]
    decompressed_docs.append(segment.tobytes())

# Verify round-trip
for i, (original, decompressed) in enumerate(zip(documents, decompressed_docs)):
    assert original == decompressed, f"Mismatch in document {i}"
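When the decompressed sizes are known up front, they can be passed via `decompressed_sizes` as an array of packed 64-bit integers, one per frame, sparing the decompressor a sizing pass. The packing below is a stdlib-only sketch; the `"<Q"` little-endian layout is an assumption to verify against your python-zstandard version:

```python
import struct

# Sizes of the original payloads, one per compressed frame
# (hypothetical values for four frames)
original_sizes = [36, 37, 36, 37]

# Pack one unsigned 64-bit integer per frame; "<Q" byte order
# is an assumption to check against your backend
decompressed_sizes = b"".join(struct.pack("<Q", n) for n in original_sizes)

print(len(decompressed_sizes))  # 8 bytes per frame -> 32
```

The packed bytes would then be passed as the `decompressed_sizes` argument to `multi_decompress_to_buffer`.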

Zero-Copy Operations

Advanced usage patterns that minimize memory copying for maximum performance.

Usage Example:

import zstandard as zstd

def process_large_dataset(data_items):
    """Process a large dataset with minimal memory copying.

    need_to_store(), store_data() and process_segment_in_place() are
    application-defined placeholders.
    """
    compressor = zstd.ZstdCompressor(level=3)
    
    # Compress in batches to bound peak memory usage
    batch_size = 1000
    all_results = []
    
    for i in range(0, len(data_items), batch_size):
        batch = data_items[i:i+batch_size]
        
        # multi_compress_to_buffer returns a BufferWithSegmentsCollection
        compressed_batch = compressor.multi_compress_to_buffer(batch, threads=4)
        all_results.append(compressed_batch)
        
        # Inspect segments without copying unless necessary
        for j in range(len(compressed_batch)):
            segment = compressed_batch[j]
            
            # Only copy if the data must be persisted
            if need_to_store(j):
                store_data(i + j, segment.tobytes())
            else:
                # Use the segment directly for temporary operations
                process_segment_in_place(segment)
    
    return all_results

def stream_compress_with_buffers(input_stream, output_stream):
    """Stream compression using buffers for efficiency."""
    compressor = zstd.ZstdCompressor()
    
    # Read chunks and compress in batches
    chunks = []
    chunk_size = 64 * 1024  # 64KB chunks
    
    while True:
        chunk = input_stream.read(chunk_size)
        if not chunk:
            break
            
        chunks.append(chunk)
        
        # Process in batches of 100 chunks
        if len(chunks) >= 100:
            result = compressor.multi_compress_to_buffer(chunks, threads=2)
            
            # Write compressed data
            for i in range(len(result)):
                segment = result[i]
                output_stream.write(segment.tobytes())
            
            chunks = []
    
    # Process remaining chunks
    if chunks:
        result = compressor.multi_compress_to_buffer(chunks, threads=2)
        for i in range(len(result)):
            segment = result[i]
            output_stream.write(segment.tobytes())

Memory Management

Buffer operations provide efficient memory usage patterns for high-performance applications.

Memory Usage Example:

import zstandard as zstd

def analyze_buffer_memory():
    """Analyze memory usage of buffer operations."""
    compressor = zstd.ZstdCompressor()
    
    # Large dataset
    data = [b"x" * 1024 for _ in range(1000)]  # 1000 x 1KB items
    
    print(f"Original data: {sum(len(item) for item in data)} bytes")
    print(f"Compressor memory: {compressor.memory_size()} bytes")
    
    # Compress to buffer collection
    result = compressor.multi_compress_to_buffer(data, threads=4)
    
    print(f"Compressed size: {result.size()} bytes")
    print(f"Number of segments: {len(result)}")
    
    # Efficient iteration without copying
    for i, segment in enumerate(result):
        # segment.tobytes() copies data - avoid if possible
        size = len(segment)  # No copy required
        offset = segment.offset  # No copy required
        
        if i < 5:  # Show first few
            print(f"Segment {i}: size={size}, offset={offset}")

Performance Considerations

  • Buffer operations minimize memory copying for better performance
  • Multi-threaded operations return buffer collections for efficient parallel processing
  • Segments provide zero-copy access to portions of larger buffers
  • Use tobytes() only when you need a copy of the data
  • Buffer collections enable efficient batch processing of large datasets
  • Memory usage is optimized for high-throughput scenarios
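The copy-versus-view distinction behind these points can be demonstrated with the standard library alone: a memoryview slice tracks the underlying buffer, while an explicit bytes copy does not (no zstandard objects involved):

```python
# Demonstrate view-versus-copy semantics with stdlib types only
buf = bytearray(b"hello world")

view = memoryview(buf)[0:5]   # zero-copy view of the first 5 bytes
copy = bytes(buf[0:5])        # independent copy

buf[0:5] = b"HELLO"           # mutate the underlying buffer in place

print(view.tobytes())  # b'HELLO' - the view reflects the change
print(copy)            # b'hello' - the copy does not
```

This is why `tobytes()` should be deferred until a persistent, independent copy is genuinely needed.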

Install with Tessl CLI

npx tessl i tessl/pypi-zstandard
