CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-ubelt

A Python utility belt containing simple tools, a stdlib like feel, and extra batteries

Overview
Eval results
Files

docs/download-caching.md

Download and Caching

Download files with progress tracking and hash verification, and cache computations and data on disk.

Capabilities

File Downloads

Download files from URLs with progress tracking, hash verification, and caching support.

def download(url, fpath=None, hash_prefix=None, hasher='sha512', **kwargs):
    """
    Download file from URL with progress and verification.

    Args:
        url (str): URL to download from
        fpath (str|Path): Local file path (auto-generated if None)
        hash_prefix (str): Expected hash prefix for verification
        hasher (str): Hash algorithm ('sha512', 'sha256', 'md5')
        **kwargs: Additional options, passed by keyword:
            dpath (str): Directory to download into (alternative to fpath)
            verbose (int): Verbosity level
            chunksize (int): Download chunk size in bytes
            timeout (float): Connection timeout in seconds

    Returns:
        str: Path to downloaded file

    Raises:
        URLError: Download failed
        RuntimeError: Hash of the downloaded file does not match hash_prefix
    """

def grabdata(url, fpath=None, dpath=None, fname=None, **kwargs):
    """
    Fetch ``url`` into a local cache, working out the storage path as needed.

    Args:
        url (str): Address of the file to fetch
        fpath (str): Exact path to store the file at, when given
        dpath (str): Directory in which the cached file is kept
        fname (str): Name given to the cached file
        **kwargs: Extra options for the download

    Returns:
        str: Location of the cached file
    """

class DownloadManager:
    """
    Coordinates several download operations, with queuing and progress tracking.

    NOTE(review): confirm this method set and these signatures against the
    installed ubelt release before relying on them.
    """

    def __init__(self, max_workers=4):
        """
        Args:
            max_workers (int): presumably the cap on concurrent download
                workers - TODO confirm
        """

    def submit(self, url, fpath=None, **kwargs):
        """
        Queue ``url`` for download.

        Args:
            url: Address of the file to download
            fpath: presumably the destination path - TODO confirm
            **kwargs: Additional options
        """

    def download_all(self):
        """Presumably runs every submitted download - TODO confirm."""

    def __enter__(self):
        """Enter a ``with`` block."""

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the ``with`` block."""

Computation Caching

Cache expensive computations to disk with dependency tracking and automatic invalidation.

class Cacher:
    """
    Disk-backed cache that tracks dependencies.
    Cached data is invalidated automatically once its dependencies change.
    """

    def __init__(self, fname, depends=None, dpath=None, appname='ubelt', **kwargs):
        """
        Args:
            fname (str): Name of the cache file
            depends: Values the cache depends on; a change invalidates it
            dpath (str): Directory in which the cache is stored
            appname (str): Application name used to organize caches
            **kwargs: Further cache options
        """

    def tryload(self):
        """
        Attempt to read the cached result.

        Returns:
            object|None: The cached result, or None when the cache is
                missing or invalid
        """

    def save(self, data):
        """
        Write data into the cache.

        Args:
            data: The value to store
        """

    def clear(self):
        """Remove the cached data."""

    def exists(self):
        """
        Report whether a valid cache is present.

        Returns:
            bool: True when the cache exists and its dependencies are
                unchanged
        """

    def ensure(self, func, *args, **kwargs):
        """
        Return the cached result, computing it first when needed.

        Args:
            func: Callable invoked on a cache miss
            *args: Positional arguments forwarded to func
            **kwargs: Keyword arguments forwarded to func

        Returns:
            object: The cached or freshly computed result
        """

class CacheStamp:
    """
    A lightweight stamp for caching computations that produce files.
    Keeps track of whether outputs are newer than inputs.

    NOTE(review): confirm these method signatures against the installed
    ubelt release before relying on them.
    """

    def __init__(self, fname, dpath=None, **kwargs):
        """
        Args:
            fname: presumably the name of the stamp file - TODO confirm
            dpath: presumably the directory holding the stamp - TODO confirm
            **kwargs: Additional options
        """

    def expired(self, *depends):
        """
        Report whether the cache is out of date with respect to dependencies.

        Args:
            *depends: File paths or other dependencies

        Returns:
            bool: True when the cache has expired
        """

    def renew(self):
        """Refresh the cache timestamp."""

    def clear(self):
        """Delete the cache stamp."""

Usage Examples

File Downloads

import ubelt as ub

# Simple download: no fpath is given, so the local path is auto-generated
url = 'https://example.com/data.zip'
fpath = ub.download(url)
print(f"Downloaded to: {fpath}")

# Download with verification
url = 'https://example.com/important.tar.gz'
# Placeholder: replace with the first few characters of the file's real
# sha256 hex digest. The value must be plain hex (no trailing "...").
expected_hash = 'a1b2c3d4e5f6'
fpath = ub.download(url, hash_prefix=expected_hash, hasher='sha256')

# Download to specific location (the name matches the archive `url` points at)
local_path = './downloads/important.tar.gz'
ub.download(url, fpath=local_path, verbose=2)

# Download with caching (won't re-download if file exists)
cached_file = ub.grabdata(url, dpath='./cache')

Multiple Downloads

import ubelt as ub

# The files to fetch
urls = [
    'https://example.com/file1.zip',
    'https://example.com/file2.tar.gz',
    'https://example.com/file3.json'
]

# Sequential downloads: one call per URL, in order
files = [ub.download(url, dpath='./downloads') for url in urls]

# Parallel downloads with DownloadManager
with ub.DownloadManager(max_workers=3) as dm:
    # Queue every URL first ...
    futures = [dm.submit(url, dpath='./downloads') for url in urls]

    # ... then collect each result
    files = [job.result() for job in futures]

Computation Caching

import time

import ubelt as ub


def expensive_computation(n):
    """Simulate expensive computation"""
    print(f"Computing for n={n}...")
    time.sleep(2)  # Simulate work
    return n ** 2


# Basic caching. The input is listed as a dependency so that a different `n`
# does not silently reuse a result computed for an earlier value.
n = 100
cache = ub.Cacher('computation_cache', depends=[n])
result = cache.tryload()
if result is None:
    result = expensive_computation(n)
    cache.save(result)
print(f"Result: {result}")

# Dependency-based caching
input_file = 'input.txt'
with open(input_file, 'w') as f:
    f.write('some input data')

# Cache depends on the *contents* of the input file. Passing the bare path
# would only track the path string, so hash the file instead.
cache = ub.Cacher('file_processing', depends=[ub.hash_file(input_file)])
result = cache.tryload()
if result is None:
    # Process the file
    with open(input_file, 'r') as f:
        data = f.read()
    result = data.upper()  # Simple processing
    cache.save(result)

# A Cacher built after input.txt changes gets a new content hash, so the
# old cache entry is no longer used


# Using ensure for cleaner code
def process_data(filename):
    """Read the file and return its text upper-cased."""
    with open(filename, 'r') as f:
        return f.read().upper()


cache = ub.Cacher('processing', depends=[ub.hash_file(input_file)])
result = cache.ensure(process_data, input_file)

Cache Stamps for File Operations

import json

import ubelt as ub

# Stamp-based caching for file generation
input_files = ['input1.txt', 'input2.txt', 'config.json']
output_file = 'processed_output.json'

stamp = ub.CacheStamp('processing_stamp')

# NOTE(review): confirm that expired() accepts dependency paths positionally
# in the installed ubelt version.
if stamp.expired(*input_files, output_file):
    print("Processing files...")
    # Do expensive file processing
    processed_data = {'result': 'processed'}

    # Write output
    with open(output_file, 'w') as f:
        json.dump(processed_data, f)

    # Update stamp so the next run can skip the processing step
    stamp.renew()
else:
    print("Using cached output")

# Output file exists and is newer than inputs

Advanced Caching Patterns

import ubelt as ub


# Cache with custom dependencies
def get_data_hash():
    """Get hash of current data state"""
    return ub.hash_data({'version': '1.2', 'config': 'prod'})


# Cache that depends on data state, not just files
cache = ub.Cacher('model_cache', depends=[get_data_hash()])


def train_model():
    """Stand-in for an expensive training run; returns a small result dict."""
    print("Training model...")
    return {'accuracy': 0.95, 'model': 'trained_weights'}


# Calls train_model only on a cache miss
model = cache.ensure(train_model)

# Organized caching with app-specific directories
user_cache = ub.Cacher('user_prefs', appname='myapp')
model_cache = ub.Cacher('models', appname='myapp', dpath='./models')

# Clear caches when needed
need_fresh_data = False  # set to True to discard the cached model
if need_fresh_data:
    cache.clear()

# Check cache status
if cache.exists():
    print("Cache is valid")
    data = cache.tryload()
else:
    print("Cache expired or missing")

Install with Tessl CLI

npx tessl i tessl/pypi-ubelt

docs

dict-operations.md

download-caching.md

function-utilities.md

hashing-imports.md

index.md

list-operations.md

path-operations.md

progress-timing.md

system-integration.md

text-processing.md

tile.json