Quilt manages data like code — with packages, repositories, browsing, and revision history — for machine-learning and other data-driven domains.
—
Methods for accessing, deserializing, and working with data files within packages. Supports various data formats with caching and optimization features.
Access files within packages and retrieve their physical locations.
class PackageEntry:
def get(self) -> str:
"""
Returns the physical key of this PackageEntry.
Returns:
Physical path or S3 URI to the file
"""
def get_cached_path(self) -> Optional[str]:
"""
Returns a locally cached physical key, if available.
Returns:
Local file path if cached, otherwise None
"""
def fetch(self, dest: Optional[str] = None):
"""
Gets objects from entry and saves them to dest.
Parameters:
- dest: Destination path for the downloaded file
Returns:
Path to the fetched file
"""Retrieve raw bytes from package entries with caching support.
class PackageEntry:
def get_bytes(self, use_cache_if_available: bool = True) -> bytes:
"""
Returns the bytes of the object this entry corresponds to.
Parameters:
- use_cache_if_available: Whether to use cached version if available
Returns:
Raw bytes of the file contents
"""Access structured data formats like JSON and text files.
class PackageEntry:
def get_as_json(self, use_cache_if_available: bool = True) -> dict:
"""
Returns a JSON file as a dict. Assumes that the file is encoded using utf-8.
Parameters:
- use_cache_if_available: Whether to use cached version if available
Returns:
Parsed JSON data as a dictionary
Raises:
json.JSONDecodeError if the file is not valid JSON
"""
def get_as_string(self, use_cache_if_available: bool = True) -> str:
"""
Return the object as a string. Assumes that the file is encoded using utf-8.
Parameters:
- use_cache_if_available: Whether to use cached version if available
Returns:
File contents as a UTF-8 decoded string
"""Deserialize files using format-specific handlers and custom functions.
class PackageEntry:
def deserialize(self, func=None, **format_opts):
"""
Returns the object this entry corresponds to.
Parameters:
- func: Custom deserialization function
- **format_opts: Format-specific options
Returns:
Deserialized data object (format depends on file type and func)
Supported formats:
- CSV: Returns pandas DataFrame (requires pandas)
- Parquet: Returns pandas DataFrame (requires pandas, pyarrow)
- JSON: Returns parsed JSON object
- Custom: Uses provided func parameter
"""
def __call__(self, func=None, **kwargs):
"""
Shorthand for self.deserialize()
Parameters:
- func: Custom deserialization function
- **kwargs: Passed to deserialize method
Returns:
Deserialized data object
"""Access and modify entry metadata and properties.
class PackageEntry:
@property
def meta(self) -> dict:
"""
Get user metadata for this entry.
Returns:
Dictionary of user metadata
"""
def set_meta(self, meta: dict):
"""
Sets the user_meta for this PackageEntry.
Parameters:
- meta: Dictionary of metadata to set
"""
def set(self, path: Optional[str] = None, meta: Optional[dict] = None):
"""
Returns self with the physical key set to path.
Parameters:
- path: New physical path for the entry
- meta: New metadata for the entry
Returns:
This PackageEntry (self) with updated properties
"""
@property
def size(self) -> int:
"""Size of the entry in bytes."""
@property
def hash(self) -> dict:
"""Hash information for the entry."""
def as_dict(self) -> dict:
"""
Returns dict representation of entry.
Returns:
Dictionary containing entry metadata and properties
"""
def with_physical_key(self, key):
"""
Returns a new PackageEntry with a different physical key.
Parameters:
- key: New PhysicalKey for the entry
Returns:
New PackageEntry with updated physical key
"""String representation and equality comparison for entries.
class PackageEntry:
def __repr__(self) -> str:
"""String representation of the PackageEntry."""
def __eq__(self, other) -> bool:
"""
Equality comparison between PackageEntry objects.
Parameters:
- other: Another PackageEntry to compare with
Returns:
True if entries are equivalent (same size and hash)
"""import quilt3
# Browse a package
pkg = quilt3.Package.browse("my-username/my-dataset")
# Get a specific file entry
data_file = pkg["data/measurements.csv"]
# Get the physical location
file_path = data_file.get()
print(f"File location: {file_path}")
# Download file locally
local_path = data_file.fetch("./downloaded_measurements.csv")
print(f"Downloaded to: {local_path}")# JSON data access
config_entry = pkg["config/settings.json"]
config_data = config_entry.get_as_json()
print(f"Configuration: {config_data}")
# Text file access
readme_entry = pkg["README.txt"]
readme_content = readme_entry.get_as_string()
print(readme_content)
# Binary data access
image_entry = pkg["images/photo.jpg"]
image_bytes = image_entry.get_bytes()
print(f"Image size: {len(image_bytes)} bytes")# Deserialize CSV to pandas DataFrame (requires pandas)
csv_entry = pkg["data/measurements.csv"]
df = csv_entry.deserialize() # Automatically detects CSV format
print(df.head())
# Deserialize Parquet file (requires pandas and pyarrow)
parquet_entry = pkg["data/results.parquet"]
df = parquet_entry.deserialize()
print(f"DataFrame shape: {df.shape}")
# Custom deserialization function
def load_custom_format(file_path):
# Custom loading logic
return {"loaded_from": file_path}
custom_entry = pkg["data/custom.dat"]
custom_data = custom_entry.deserialize(func=load_custom_format)
print(custom_data)

# Access entry metadata
data_entry = pkg["data/experiment_1.csv"]
metadata = data_entry.meta
print(f"Entry metadata: {metadata}")
# Create new entry with metadata
new_entry = data_entry.set(meta={
"experiment": "exp_001",
"date": "2024-01-15",
"researcher": "Dr. Smith"
})
# Get entry properties
print(f"File size: {data_entry.size} bytes")
print(f"Hash info: {data_entry.hash}")
print(f"Entry dict: {data_entry.as_dict()}")# First access - downloads and caches
data = csv_entry.get_bytes(use_cache_if_available=True)
# Second access - uses cached version
data_cached = csv_entry.get_bytes(use_cache_if_available=True) # Faster
# Force fresh download
data_fresh = csv_entry.get_bytes(use_cache_if_available=False)
# Check if cached version exists
cached_path = csv_entry.get_cached_path()
if cached_path:
print(f"Cached at: {cached_path}")
else:
print("No cached version available")# Stream large files without loading entirely into memory
large_file = pkg["data/large_dataset.csv"]
# Get file handle for streaming
file_path = large_file.get()
# Use with context manager for efficient access
with open(file_path, 'r') as f:
for line_num, line in enumerate(f):
if line_num > 100: # Process first 100 lines
break
process_line(line)
# Or deserialize with chunking (for pandas)
for chunk in large_file.deserialize(chunksize=1000):
process_chunk(chunk)

Install with Tessl CLI
npx tessl i tessl/pypi-quilt3