Quilt manages data like code — with packages, repositories, browsing, and revision history — for machine-learning and other data-driven domains.
—
Methods for accessing, deserializing, and working with data files within packages. Supports various data formats with caching and optimization features.
Access files within packages and retrieve their physical locations.
class PackageEntry:
def get(self) -> str:
"""
Returns the physical key of this PackageEntry.
Returns:
Physical path or S3 URI to the file
"""
def get_cached_path(self) -> Optional[str]:
"""
Returns a locally cached physical key, if available.
Returns:
Local file path if cached, otherwise None
"""
def fetch(self, dest: Optional[str] = None):
"""
Gets objects from entry and saves them to dest.
Parameters:
- dest: Destination path for the downloaded file
Returns:
Path to the fetched file
"""Retrieve raw bytes from package entries with caching support.
class PackageEntry:
def get_bytes(self, use_cache_if_available: bool = True) -> bytes:
"""
Returns the bytes of the object this entry corresponds to.
Parameters:
- use_cache_if_available: Whether to use cached version if available
Returns:
Raw bytes of the file contents
"""Access structured data formats like JSON and text files.
class PackageEntry:
def get_as_json(self, use_cache_if_available: bool = True) -> dict:
"""
Returns a JSON file as a dict. Assumes that the file is encoded using utf-8.
Parameters:
- use_cache_if_available: Whether to use cached version if available
Returns:
Parsed JSON data as a dictionary
Raises:
json.JSONDecodeError if the file is not valid JSON
"""
def get_as_string(self, use_cache_if_available: bool = True) -> str:
"""
Return the object as a string. Assumes that the file is encoded using utf-8.
Parameters:
- use_cache_if_available: Whether to use cached version if available
Returns:
File contents as a UTF-8 decoded string
"""Deserialize files using format-specific handlers and custom functions.
class PackageEntry:
def deserialize(self, func=None, **format_opts):
"""
Returns the object this entry corresponds to.
Parameters:
- func: Custom deserialization function
- **format_opts: Format-specific options
Returns:
Deserialized data object (format depends on file type and func)
Supported formats:
- CSV: Returns pandas DataFrame (requires pandas)
- Parquet: Returns pandas DataFrame (requires pandas, pyarrow)
- JSON: Returns parsed JSON object
- Custom: Uses provided func parameter
"""
def __call__(self, func=None, **kwargs):
"""
Shorthand for self.deserialize()
Parameters:
- func: Custom deserialization function
- **kwargs: Passed to deserialize method
Returns:
Deserialized data object
"""Access and modify entry metadata and properties.
class PackageEntry:
@property
def meta(self) -> dict:
"""
Get user metadata for this entry.
Returns:
Dictionary of user metadata
"""
def set_meta(self, meta: dict):
"""
Sets the user_meta for this PackageEntry.
Parameters:
- meta: Dictionary of metadata to set
"""
def set(self, path: Optional[str] = None, meta: Optional[dict] = None):
"""
Returns self with the physical key set to path.
Parameters:
- path: New physical path for the entry
- meta: New metadata for the entry
Returns:
This PackageEntry (self) with updated properties
"""
@property
def size(self) -> int:
"""Size of the entry in bytes."""
@property
def hash(self) -> dict:
"""Hash information for the entry."""
def as_dict(self) -> dict:
"""
Returns dict representation of entry.
Returns:
Dictionary containing entry metadata and properties
"""
def with_physical_key(self, key):
"""
Returns a new PackageEntry with a different physical key.
Parameters:
- key: New PhysicalKey for the entry
Returns:
New PackageEntry with updated physical key
"""String representation and equality comparison for entries.
class PackageEntry:
def __repr__(self) -> str:
"""String representation of the PackageEntry."""
def __eq__(self, other) -> bool:
"""
Equality comparison between PackageEntry objects.
Parameters:
- other: Another PackageEntry to compare with
Returns:
True if entries are equivalent (same size and hash)
"""import quilt3
# Browse a package
pkg = quilt3.Package.browse("my-username/my-dataset")
# Get a specific file entry
data_file = pkg["data/measurements.csv"]
# Get the physical location
file_path = data_file.get()
print(f"File location: {file_path}")
# Download file locally
local_path = data_file.fetch("./downloaded_measurements.csv")
print(f"Downloaded to: {local_path}")# JSON data access
config_entry = pkg["config/settings.json"]
config_data = config_entry.get_as_json()
print(f"Configuration: {config_data}")
# Text file access
readme_entry = pkg["README.txt"]
readme_content = readme_entry.get_as_string()
print(readme_content)
# Binary data access
image_entry = pkg["images/photo.jpg"]
image_bytes = image_entry.get_bytes()
print(f"Image size: {len(image_bytes)} bytes")# Deserialize CSV to pandas DataFrame (requires pandas)
csv_entry = pkg["data/measurements.csv"]
df = csv_entry.deserialize() # Automatically detects CSV format
print(df.head())
# Deserialize Parquet file (requires pandas and pyarrow)
parquet_entry = pkg["data/results.parquet"]
df = parquet_entry.deserialize()
print(f"DataFrame shape: {df.shape}")
# Custom deserialization function
def load_custom_format(file_path):
# Custom loading logic
return {"loaded_from": file_path}
custom_entry = pkg["data/custom.dat"]
custom_data = custom_entry.deserialize(func=load_custom_format)
print(custom_data)

# Access entry metadata
data_entry = pkg["data/experiment_1.csv"]
metadata = data_entry.meta
print(f"Entry metadata: {metadata}")
# Create new entry with metadata
new_entry = data_entry.set(meta={
"experiment": "exp_001",
"date": "2024-01-15",
"researcher": "Dr. Smith"
})
# Get entry properties
print(f"File size: {data_entry.size} bytes")
print(f"Hash info: {data_entry.hash}")
print(f"Entry dict: {data_entry.as_dict()}")# First access - downloads and caches
data = csv_entry.get_bytes(use_cache_if_available=True)
# Second access - uses cached version
data_cached = csv_entry.get_bytes(use_cache_if_available=True) # Faster
# Force fresh download
data_fresh = csv_entry.get_bytes(use_cache_if_available=False)
# Check if cached version exists
cached_path = csv_entry.get_cached_path()
if cached_path:
print(f"Cached at: {cached_path}")
else:
print("No cached version available")# Stream large files without loading entirely into memory
large_file = pkg["data/large_dataset.csv"]
# Get file handle for streaming
file_path = large_file.get()
# Use with context manager for efficient access
with open(file_path, 'r') as f:
for line_num, line in enumerate(f):
if line_num > 100: # Process first 100 lines
break
process_line(line)
# Or deserialize with chunking (for pandas)
for chunk in large_file.deserialize(chunksize=1000):
process_chunk(chunk)

Install with Tessl CLI
npx tessl i tessl/pypi-quilt3