Quilt manages data like code with packages, repositories, browsing and revision history for machine learning and data-driven domains
—
Functions for working with package registries, including listing packages, searching, copying data, and package deletion.
from typing import Union

List packages and package versions from registries.
def list_packages(registry: str = None) -> list:
"""
Lists Packages in the registry.
Returns an iterable of all named packages in a registry.
If the registry is None, default to the local registry.
Parameters:
- registry: Location of registry to load package from
Returns:
An iterable of strings containing the names of the packages
"""
def list_package_versions(name: str, registry: str = None) -> list:
"""
Lists versions of a given package.
Returns an iterable of (latest_or_unix_ts, hash) of package revisions.
If the registry is None, default to the local registry.
Parameters:
- name: Name of the package
- registry: Location of registry to load package from
Returns:
An iterable of tuples containing the version and hash for the package
Raises:
QuiltException: If package name is invalid
"""

Search for packages and data across registries.
def search(query: Union[str, dict], limit: int = 10) -> list:
"""
Execute a search against the configured search endpoint.
Parameters:
- query: Query string if passed as str; Query DSL body if passed as dict
- limit: Maximum number of results to return. Defaults to 10
Returns:
List of search results
Query Syntax:
- String queries use Query String Query syntax
- Dict queries use Query DSL syntax
- Both follow Elasticsearch patterns
Note:
Forces a call to configure_from_default if no config exists
"""

Copy data between different storage locations.
def copy(src: str, dest: str):
"""
Copies src object from QUILT to dest.
Either of src and dest may be S3 paths (starting with s3://)
or local file paths (starting with file:///).
Parameters:
- src: A path to retrieve
- dest: A path to write to
Examples:
- copy('s3://bucket/file.csv', 'file:///local/path/file.csv')
- copy('file:///local/file.csv', 's3://bucket/uploaded.csv')
- copy('s3://source-bucket/data.json', 's3://dest-bucket/data.json')
"""

Delete packages or package versions from registries.
def delete_package(name: str, registry: str = None, top_hash: str = None):
"""
Delete a package. Deletes only the manifest entries and not the underlying files.
Parameters:
- name: Name of the package
- registry: The registry the package will be removed from
- top_hash: Optional. A package hash to delete, instead of the whole package
Note:
- If top_hash is None, deletes the entire package
- If top_hash is provided, deletes only that specific version
- Underlying data files are not deleted, only package manifests
Raises:
QuiltException: If package name is invalid
"""

import quilt3
# List all packages in default registry.
# list_packages() is documented as returning an iterable, so materialize it
# into a list before calling len() on it.
packages = list(quilt3.list_packages())
print(f"Found {len(packages)} packages:")
for package in packages:
    print(f" {package}")

# List packages in specific registry
remote_packages = list(quilt3.list_packages("s3://my-registry-bucket"))
print(f"Remote packages: {len(remote_packages)}")

# List versions of a specific package; each entry is a (version, hash) pair
versions = quilt3.list_package_versions("my-username/my-dataset")
print("Package versions:")
for timestamp, hash_val in versions:
    # Show only a short prefix of the hash to keep the listing readable
    print(f" {timestamp}: {hash_val[:8]}...")

import quilt3
# Simple text search: a str query uses query-string syntax
results = quilt3.search("machine learning dataset", limit=20)
print(f"Found {len(results)} results")
for entry in results:
    hit = entry['_source']
    package_name = hit.get('name', 'Unknown')
    package_description = hit.get('description', 'No description')
    print(f"Package: {package_name}")
    print(f"Description: {package_description}")
    print("---")

# Advanced search with Query DSL: a dict query is sent as the DSL body.
# Both clauses below must match ...
must_clauses = [
    {"match": {"description": "experiment"}},
    {"term": {"file_extensions": "csv"}},
]
# ... and hits are additionally filtered on a minimum "size" value.
size_filter = {"range": {"size": {"gte": 1000000}}}
advanced_query = {
    "query": {
        "bool": {
            "must": must_clauses,
            "filter": [size_filter],
        },
    },
    "sort": [{"modified": {"order": "desc"}}],
}
advanced_results = quilt3.search(advanced_query, limit=10)
print(f"Found {len(advanced_results)} large CSV experiment datasets")

import quilt3
# Copy from S3 to local
remote_measurements = "s3://source-bucket/data/measurements.csv"
local_measurements = "file:///tmp/local_measurements.csv"
quilt3.copy(src=remote_measurements, dest=local_measurements)

# Copy from local to S3
local_processed = "file:///home/user/processed_data.json"
remote_processed = "s3://dest-bucket/processed/data.json"
quilt3.copy(src=local_processed, dest=remote_processed)

# Copy between S3 buckets
raw_dataset = "s3://source-bucket/raw/dataset.parquet"
dataset_backup = "s3://backup-bucket/archives/dataset_backup.parquet"
quilt3.copy(src=raw_dataset, dest=dataset_backup)

# Batch copy operations: every file goes from s3://source/ to file:///local/
batch_names = ("file1.csv", "file2.json", "file3.parquet")
files_to_copy = [
    (f"s3://source/{name}", f"file:///local/{name}") for name in batch_names
]
for src, dest in files_to_copy:
    print(f"Copying {src} to {dest}")
    quilt3.copy(src, dest)
    print("✓ Complete")

import quilt3
# Get package information before deletion.
# list_package_versions() is documented as returning an iterable, so
# materialize it before calling len() on it.
package_name = "my-username/old-dataset"
versions = list(quilt3.list_package_versions(package_name))
print(f"Package {package_name} has {len(versions)} versions")

# Delete specific version (only the manifest entry; data files are kept)
specific_hash = "abc123def456..."
quilt3.delete_package(package_name, top_hash=specific_hash)
print(f"Deleted version {specific_hash[:8]}...")

# Delete entire package (all versions)
# WARNING: This removes all versions!
confirm = input(f"Delete entire package {package_name}? (yes/no): ")
if confirm.lower() == 'yes':
    quilt3.delete_package(package_name)
    print(f"Deleted all versions of {package_name}")
else:
    print("Package deletion cancelled")

# Verify deletion. list() forces the iterable to be consumed inside the
# try block, so a lazily raised lookup error is caught here as well.
try:
    remaining_versions = list(quilt3.list_package_versions(package_name))
    print(f"Remaining versions: {len(remaining_versions)}")
except Exception as e:
    print(f"Package no longer exists: {e}")

import quilt3
def audit_registry(registry_url=None):
    """Audit a registry: count its packages and the versions of each one.

    Prints a summary plus the ten packages with the most versions.

    Parameters:
    - registry_url: Registry to audit; None is passed through to quilt3,
      which documents None as "use the local registry".

    Returns:
    A list of dicts with keys 'name', 'version_count' and 'latest_hash',
    sorted by 'version_count' in descending order. Packages whose versions
    could not be listed are reported and left out.
    """
    # The listing functions are documented as returning iterables, so
    # materialize them before using len() or indexing.
    packages = list(quilt3.list_packages(registry_url))
    print(f"Registry audit for: {registry_url or 'default'}")
    print(f"Total packages: {len(packages)}")

    package_stats = []
    for package in packages:
        try:
            versions = list(quilt3.list_package_versions(package, registry_url))
            package_stats.append({
                'name': package,
                'version_count': len(versions),
                # NOTE(review): assumes the first entry is the latest
                # revision; the documented contract does not state the
                # ordering. TODO confirm.
                'latest_hash': versions[0][1] if versions else None,
            })
        except Exception as e:
            # Best effort: one unreadable package must not abort the audit.
            print(f"Error processing {package}: {e}")

    # Sort by version count
    package_stats.sort(key=lambda x: x['version_count'], reverse=True)
    print("\nTop packages by version count:")
    for stats in package_stats[:10]:
        print(f" {stats['name']}: {stats['version_count']} versions")
    return package_stats
# Run registry audit with no registry_url (reported as 'default')
stats = audit_registry()

import quilt3
def find_recent_packages(days=7):
    """Search for packages whose 'last_modified' is within the last `days` days.

    Prints one line per hit (newest first, at most 50) and returns the raw
    search results.
    """
    from datetime import datetime, timedelta

    # Lower bound of the range filter, as a POSIX timestamp.
    cutoff_timestamp = (datetime.now() - timedelta(days=days)).timestamp()

    range_filter = {"last_modified": {"gte": cutoff_timestamp}}
    newest_first = [{"last_modified": {"order": "desc"}}]
    results = quilt3.search(
        {"query": {"range": range_filter}, "sort": newest_first},
        limit=50,
    )

    print(f"Packages modified in last {days} days:")
    for hit in results:
        fields = hit['_source']
        # A missing 'last_modified' falls back to 0, i.e. the epoch.
        stamp = datetime.fromtimestamp(fields.get('last_modified', 0))
        label = fields.get('name', 'Unknown')
        when = stamp.strftime('%Y-%m-%d %H:%M')
        print(f" {label} - modified {when}")
    return results
def search_by_file_type(extension, limit=20):
    """Search for packages containing files with a specific extension.

    Parameters:
    - extension: File extension, with or without dots and in any case
      (e.g. 'csv', '.CSV'); it is lower-cased and stripped of dots.
    - limit: Maximum number of results to return. Defaults to 20

    Returns:
    The raw search results; one summary line per hit is printed as well.
    """
    # Normalize once so the header printed below shows exactly the value
    # that was searched for (previously '.csv' was printed as '..csv').
    normalized = extension.lower().replace('.', '')
    query = {
        "query": {
            "term": {
                "file_extensions": normalized
            }
        }
    }
    results = quilt3.search(query, limit=limit)
    print(f"Packages containing .{normalized} files:")
    for result in results:
        source = result['_source']
        name = source.get('name', 'Unknown')
        file_count = source.get('file_count', 0)
        print(f" {name} ({file_count} files)")
    return results
# Use search functions: packages modified in the last 30 days, then up to
# 15 packages with csv files and up to 10 with parquet files
recent = find_recent_packages(30)
csv_packages = search_by_file_type('csv', 15)
parquet_packages = search_by_file_type('parquet', 10)

import quilt3
from quilt3.util import QuiltException
def safe_registry_operation(operation_name, operation_func):
    """Run a zero-argument callable and report the outcome instead of raising.

    Parameters:
    - operation_name: Label used in the printed status line
    - operation_func: Callable invoked with no arguments

    Returns:
    Whatever operation_func returns, or None if it raised.
    """
    try:
        outcome = operation_func()
        print(f"✓ {operation_name} completed successfully")
    except QuiltException as err:
        # Errors raised by quilt3 itself are labelled separately ...
        print(f"✗ {operation_name} failed (Quilt error): {err}")
        outcome = None
    except Exception as err:
        # ... from anything else that went wrong.
        print(f"✗ {operation_name} failed (unexpected error): {err}")
        outcome = None
    return outcome
# Safe operations.
# list_packages() is documented as returning an iterable; materialize it
# inside the guarded call so the truthiness test and len() below are
# reliable, and so errors raised while listing are caught as well.
packages = safe_registry_operation(
    "List packages",
    lambda: list(quilt3.list_packages()),
)
if packages:
    print(f"Found {len(packages)} packages")
# Safe search with validation
def safe_search(query, limit=10):
    """Run quilt3.search, rejecting empty or whitespace-only string queries.

    Returns an empty list (after printing an error) for a blank string;
    any other query is passed straight through to quilt3.search.
    """
    is_blank_text = isinstance(query, str) and not query.strip()
    if not is_blank_text:
        return quilt3.search(query, limit)
    print("Error: Empty search query")
    return []
results = safe_registry_operation(
"Search packages",
lambda: safe_search("experiment data", 20)
)

Install with Tessl CLI:

npx tessl i tessl/pypi-quilt3