CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-quilt3

Quilt manages data like code with packages, repositories, browsing and revision history for machine learning and data-driven domains

Pending
Overview
Eval results
Files

docs/registry-operations.md

Package Registry Operations

Functions for working with package registries: listing packages and their versions, searching, copying data, and deleting packages.

Type Imports

from typing import Union

Capabilities

Package Listing

List packages and package versions from registries.

def list_packages(registry: str = None) -> list:
    """
    List the packages in a registry.

    Returns an iterable of all named packages in the registry.
    If registry is None, the local registry is used.

    Parameters:
    - registry: Location of the registry to list packages from;
      None selects the local registry

    Returns:
    An iterable of strings containing the names of the packages.
    NOTE(review): the annotation says ``list`` while this text says
    "iterable" -- confirm which one callers may rely on before using
    len() or indexing on the result.
    """

def list_package_versions(name: str, registry: str = None) -> list:
    """
    List the versions of a given package.

    Returns an iterable of (latest_or_unix_ts, hash) tuples, one per
    package revision. If registry is None, the local registry is used.

    Parameters:
    - name: Name of the package
    - registry: Location of the registry that holds the package;
      None selects the local registry

    Returns:
    An iterable of tuples containing the version and hash for the package

    Raises:
    QuiltException: If package name is invalid
    """

Package Search

Search for packages and data across registries.

def search(query: Union[str, dict], limit: int = 10) -> list:
    """
    Execute a search against the configured search endpoint.

    Parameters:
    - query: The search to run. A str is treated as a query string;
      a dict is treated as a DSL query body
    - limit: Maximum number of results to return. Defaults to 10

    Returns:
    List of search results

    Query Syntax:
    - String queries use Query String Query syntax
    - Dict queries use Query DSL syntax
    - Both follow Elasticsearch patterns

    Note:
    Forces a call to configure_from_default if no config exists
    """

Data Copying

Copy data between different storage locations.

def copy(src: str, dest: str):
    """
    Copy the object at src to dest.

    Either of src and dest may be S3 paths (starting with s3://)
    or local file paths (starting with file:///).

    Parameters:
    - src: A path to retrieve
    - dest: A path to write to

    Examples:
    - copy('s3://bucket/file.csv', 'file:///local/path/file.csv')
    - copy('file:///local/file.csv', 's3://bucket/uploaded.csv')
    - copy('s3://source-bucket/data.json', 's3://dest-bucket/data.json')
    """

Package Deletion

Delete packages or package versions from registries.

def delete_package(name: str, registry: str = None, top_hash: str = None):
    """
    Delete a package, or a single version of it, from a registry.

    Only the manifest entries are deleted; the underlying files are
    left in place.

    Parameters:
    - name: Name of the package
    - registry: The registry the package will be removed from
    - top_hash: Optional. Hash of a single package version to delete,
      instead of the whole package

    Note:
    - If top_hash is None, deletes the entire package
    - If top_hash is provided, deletes only that specific version
    - Underlying data files are not deleted, only package manifests

    Raises:
    QuiltException: If package name is invalid
    """

Usage Examples

Package Discovery

import quilt3

# List all packages in default registry.
# The listing is documented as an iterable, so materialize it before
# calling len() on it.
packages = list(quilt3.list_packages())
print(f"Found {len(packages)} packages:")
for package in packages:
    print(f"  {package}")

# List packages in specific registry
remote_packages = list(quilt3.list_packages("s3://my-registry-bucket"))
print(f"Remote packages: {len(remote_packages)}")

# List versions of a specific package; each item is documented as a
# (latest_or_unix_ts, hash) tuple
versions = quilt3.list_package_versions("my-username/my-dataset")
print("Package versions:")
for timestamp, hash_val in versions:
    print(f"  {timestamp}: {hash_val[:8]}...")

Package Search

import quilt3

# Plain-text search: a str query is sent as a query string
results = quilt3.search("machine learning dataset", limit=20)
print(f"Found {len(results)} results")

# Each hit is read through its '_source' mapping
for entry in results:
    document = entry['_source']
    package_name = document.get('name', 'Unknown')
    package_description = document.get('description', 'No description')
    print(f"Package: {package_name}")
    print(f"Description: {package_description}")
    print("---")

# Structured search: a dict query is sent as a Query DSL body.
# The clauses are named separately so the final query reads top-down.
must_clauses = [
    {"match": {"description": "experiment"}},
    {"term": {"file_extensions": "csv"}},
]
size_filters = [
    {"range": {"size": {"gte": 1000000}}},
]
advanced_query = {
    "query": {"bool": {"must": must_clauses, "filter": size_filters}},
    "sort": [{"modified": {"order": "desc"}}],
}

advanced_results = quilt3.search(advanced_query, limit=10)
print(f"Found {len(advanced_results)} large CSV experiment datasets")

Data Transfer Operations

import quilt3

# S3 object -> local file
quilt3.copy(
    "s3://source-bucket/data/measurements.csv",
    "file:///tmp/local_measurements.csv",
)

# Local file -> S3 object
quilt3.copy(
    "file:///home/user/processed_data.json",
    "s3://dest-bucket/processed/data.json",
)

# S3 object -> another S3 bucket
quilt3.copy(
    "s3://source-bucket/raw/dataset.parquet",
    "s3://backup-bucket/archives/dataset_backup.parquet",
)

# Batch copy: (source, destination) pairs processed in order
files_to_copy = [
    ("s3://source/file1.csv", "file:///local/file1.csv"),
    ("s3://source/file2.json", "file:///local/file2.json"),
    ("s3://source/file3.parquet", "file:///local/file3.parquet"),
]

for source_path, target_path in files_to_copy:
    print(f"Copying {source_path} to {target_path}")
    quilt3.copy(source_path, target_path)
    print("✓ Complete")

Package Management

import quilt3

# Get package information before deletion.
# list_package_versions is documented as returning an iterable, so
# materialize it before calling len() on it.
package_name = "my-username/old-dataset"
versions = list(quilt3.list_package_versions(package_name))
print(f"Package {package_name} has {len(versions)} versions")

# Delete specific version (placeholder value: substitute a real top hash)
specific_hash = "abc123def456..."
quilt3.delete_package(package_name, top_hash=specific_hash)
print(f"Deleted version {specific_hash[:8]}...")

# Delete entire package (all versions)
# WARNING: This removes all versions!
confirm = input(f"Delete entire package {package_name}? (yes/no): ")
if confirm.lower() == 'yes':
    quilt3.delete_package(package_name)
    print(f"Deleted all versions of {package_name}")
else:
    print("Package deletion cancelled")

# Verify deletion.
# list() is called inside the try so that an error raised while the
# listing is being consumed is also reported here.
try:
    remaining_versions = list(quilt3.list_package_versions(package_name))
    print(f"Remaining versions: {len(remaining_versions)}")
except Exception as e:
    print(f"Package no longer exists: {e}")

Registry Management

import quilt3

def audit_registry(registry_url=None):
    """Audit a registry: count its packages and the versions of each.

    Parameters:
    - registry_url: Registry location handed to quilt3.list_packages and
      quilt3.list_package_versions. None is passed through unchanged and
      shown as 'default' in the printed header.

    Returns:
    A list of dicts with the keys 'name', 'version_count' and
    'latest_hash', sorted by 'version_count' in descending order.
    Packages that raised while being processed are reported on stdout
    and left out of the list.
    """

    # The listing calls are documented as returning iterables, so they are
    # materialized before len() or indexing is applied to them.
    packages = list(quilt3.list_packages(registry_url))
    print(f"Registry audit for: {registry_url or 'default'}")
    print(f"Total packages: {len(packages)}")

    package_stats = []

    for package in packages:
        try:
            versions = list(quilt3.list_package_versions(package, registry_url))
            package_stats.append({
                'name': package,
                'version_count': len(versions),
                # NOTE(review): assumes the first entry is the latest
                # revision -- confirm the ordering of the listing.
                'latest_hash': versions[0][1] if versions else None
            })
        except Exception as e:
            # Best effort: one failing package must not abort the audit.
            print(f"Error processing {package}: {e}")

    # Sort by version count
    package_stats.sort(key=lambda x: x['version_count'], reverse=True)

    print("\nTop packages by version count:")
    for stats in package_stats[:10]:
        print(f"  {stats['name']}: {stats['version_count']} versions")

    return package_stats

# Run registry audit with the default registry_url (None); stats receives
# the list of per-package dicts returned by audit_registry
stats = audit_registry()

Search and Discovery Workflows

import quilt3

def find_recent_packages(days=7):
    """Search for packages whose 'last_modified' lies within the last N days.

    Parameters:
    - days: Size of the look-back window in days, counted back from now.

    Returns:
    The search results, after printing one line per result.
    """

    from datetime import datetime, timedelta

    # Lower bound of the window, expressed as a Unix timestamp
    window_start = (datetime.now() - timedelta(days=days)).timestamp()

    recency_filter = {"range": {"last_modified": {"gte": window_start}}}
    newest_first = [{"last_modified": {"order": "desc"}}]

    results = quilt3.search(
        {"query": recency_filter, "sort": newest_first},
        limit=50,
    )

    print(f"Packages modified in last {days} days:")
    for entry in results:
        record = entry['_source']
        label = record.get('name', 'Unknown')
        # A missing 'last_modified' falls back to 0 (the epoch)
        stamp = datetime.fromtimestamp(record.get('last_modified', 0))
        mod_date = stamp.strftime('%Y-%m-%d %H:%M')
        print(f"  {label} - modified {mod_date}")

    return results

def search_by_file_type(extension, limit=20):
    """Search for packages containing files with a specific extension.

    Parameters:
    - extension: File extension in any case, with or without dots
      (e.g. 'csv', '.CSV'). It is lower-cased and stripped of dots before
      being used as the 'file_extensions' search term.
    - limit: Maximum number of results requested from quilt3.search.

    Returns:
    The search results, after printing one line per result.
    """

    # Normalize once so the query term and the printed header agree;
    # printing the raw argument rendered '.csv' as '..csv'.
    normalized = extension.lower().replace('.', '')

    query = {
        "query": {
            "term": {
                "file_extensions": normalized
            }
        }
    }

    results = quilt3.search(query, limit=limit)

    print(f"Packages containing .{normalized} files:")
    for result in results:
        source = result['_source']
        name = source.get('name', 'Unknown')
        file_count = source.get('file_count', 0)
        print(f"  {name} ({file_count} files)")

    return results

# Use search functions: a 30-day recency window, then extension searches
# with explicit result limits (15 and 10)
recent = find_recent_packages(30)
csv_packages = search_by_file_type('csv', 15)
parquet_packages = search_by_file_type('parquet', 10)

Error Handling

import quilt3
from quilt3.util import QuiltException

def safe_registry_operation(operation_name, operation_func):
    """Run a zero-argument callable and report the outcome on stdout.

    Parameters:
    - operation_name: Label used in the printed success/failure message.
    - operation_func: Callable invoked with no arguments.

    Returns:
    Whatever operation_func returns, or None if it raised an exception.
    """

    try:
        outcome = operation_func()
        print(f"✓ {operation_name} completed successfully")
    except QuiltException as quilt_err:
        # Quilt-specific failures get their own label
        print(f"✗ {operation_name} failed (Quilt error): {quilt_err}")
        outcome = None
    except Exception as other_err:
        # Anything else is reported as unexpected rather than propagated
        print(f"✗ {operation_name} failed (unexpected error): {other_err}")
        outcome = None

    return outcome

# Safe operations.
# The listing is documented as an iterable, so it is materialized inside
# the guarded callable: len() becomes valid, an empty registry is falsy in
# the check below, and errors raised while listing stay inside the wrapper.
packages = safe_registry_operation(
    "List packages",
    lambda: list(quilt3.list_packages())
)

if packages:
    print(f"Found {len(packages)} packages")

# Search wrapper that rejects blank text queries
def safe_search(query, limit=10):
    """Run quilt3.search unless query is an empty or whitespace-only string.

    Parameters:
    - query: Search query; only str values are checked for blankness.
    - limit: Passed through to quilt3.search.

    Returns:
    [] for a blank string query, otherwise the result of quilt3.search.
    """
    is_blank_text = isinstance(query, str) and not query.strip()
    if is_blank_text:
        print("Error: Empty search query")
        return []

    return quilt3.search(query, limit)

# Run the validated search through the wrapper; results is None if the
# search raised
results = safe_registry_operation(
    "Search packages",
    lambda: safe_search("experiment data", 20)
)

Install with Tessl CLI

npx tessl i tessl/pypi-quilt3

docs

admin.md

bucket-operations.md

config-session.md

data-access.md

hooks.md

index.md

package-management.md

registry-operations.md

tile.json