CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-ubelt

A Python utility belt containing simple tools, a stdlib like feel, and extra batteries

Overview
Eval results
Files

hashing-imports.mddocs/

Hashing and Import Utilities

Hash arbitrary data and files, plus dynamic module importing and path resolution utilities for data integrity and module management.

Capabilities

Data and File Hashing

Functions for computing secure hashes of arbitrary Python data structures and files.

def hash_data(data, hasher=NoParam, base=NoParam, types=False, convert=False, extensions=None):
    """
    Hash arbitrary Python data structures.

    Args:
        data: Any Python object (dict, list, str, etc.)
        hasher: Hash algorithm name (NoParam defaults to 'sha512')
        base: Output encoding of the digest (NoParam defaults to 'hex')
        types (bool): If True, include type information in the hash
        convert (bool): If True, convert data to a hashable format first
        extensions: Custom extensions for handling special types

    Returns:
        str: Hash digest as a string in the requested base

    Note:
        Data is normalized for consistent hashing across runs.
        Supports nested structures, numpy arrays, and custom objects.
        Dictionary hashes are content-based, so key insertion order does
        not affect the result.
        NoParam defaults: hasher='sha512', base='hex'.
        There is no truncation parameter; slice the returned string to
        shorten a digest.
    """

def hash_file(fpath, blocksize=1048576, stride=1, maxbytes=None, hasher=NoParam, base=NoParam):
    """
    Hash file contents efficiently by reading in blocks.

    Args:
        fpath (str|Path): Path to the file to hash
        blocksize (int): Read block size in bytes (default: 1MB)
        stride (int): Read every nth block (default: 1 = all blocks)
        maxbytes (int): Maximum bytes to read (None = entire file)
        hasher: Hash algorithm name (NoParam defaults to 'sha512')
        base: Output encoding of the digest (NoParam defaults to 'hex')

    Returns:
        str: File hash digest

    Raises:
        FileNotFoundError: File does not exist
        IOError: Cannot read file

    Note:
        NoParam defaults: hasher='sha512', base='hex'.
        With stride > 1 or maxbytes set, only part of the file contributes
        to the digest, producing a fast fingerprint rather than a
        full-content hash.
    """

Module Import Utilities

Dynamic module importing and path resolution for runtime module loading.

def import_module_from_name(name, **kwargs):
    """
    Import a module by its dotted name with error handling.

    Args:
        name (str): Module name (e.g., 'os.path', 'numpy')
        **kwargs: Additional import options

    Returns:
        module: The imported module object

    Raises:
        ImportError: Module cannot be imported
    """

def import_module_from_path(modpath, index=-1):
    """
    Import a module directly from a file path.

    Args:
        modpath (str|Path): Path to a Python file
        index (int): Module index for namespace packages

    Returns:
        module: The imported module object

    Raises:
        ImportError: Cannot import from path
        FileNotFoundError: File does not exist
    """

Module Path Utilities

Functions for converting between module names and file paths.

def modname_to_modpath(modname, **kwargs):
    """
    Convert a dotted module name to the path of its source file.

    Args:
        modname (str): Module name (e.g., 'os.path')
        **kwargs: Additional resolution options

    Returns:
        str|None: Path to the module file, or None if it cannot be found
    """

def modpath_to_modname(fpath, **kwargs):
    """
    Convert a source-file path back to its dotted module name.

    Args:
        fpath (str|Path): Path to a Python file
        **kwargs: Additional conversion options

    Returns:
        str: The dotted module name
    """

def split_modpath(fpath, **kwargs):
    """
    Split a module path into its components.

    Args:
        fpath (str|Path): Path to a Python file
        **kwargs: Additional options

    Returns:
        dict: Dictionary with path components

    Note:
        NOTE(review): upstream ``ubelt.split_modpath`` returns a
        ``(dpath, rel_modpath)`` tuple — confirm the dict description
        above against the installed version.
    """

Usage Examples

Data Hashing

import ubelt as ub

# Hash simple data
data = {'name': 'Alice', 'age': 30, 'scores': [95, 87, 92]}
hash_value = ub.hash_data(data)
print(f"Data hash: {hash_value}")

# Different hash algorithms
sha256_hash = ub.hash_data(data, hasher='sha256')
md5_hash = ub.hash_data(data, hasher='md5')
print(f"SHA256: {sha256_hash}")
print(f"MD5: {md5_hash}")

# Different output encodings
hex_hash = ub.hash_data(data, base='hex')
b64_hash = ub.hash_data(data, base='base64')
print(f"Hex: {hex_hash}")
print(f"Base64: {b64_hash}")

# Truncated hashes: slice the digest. `hash_data` has no `hashlen`
# parameter (see the signature above); passing one would be an error.
short_hash = ub.hash_data(data)[:8]
print(f"Short hash: {short_hash}")

Complex Data Hashing

import ubelt as ub
import numpy as np

# Nested containers and numpy arrays are hashed transparently.
complex_data = {
    'metadata': {
        'version': '1.0',
        'created': '2023-01-01',
    },
    'arrays': [
        np.array([1, 2, 3, 4]),
        np.array([[1, 2], [3, 4]]),
    ],
    'config': {
        'learning_rate': 0.001,
        'batch_size': 32,
        'layers': [128, 64, 32],
    },
}

hash_value = ub.hash_data(complex_data)
print(f"Complex data hash: {hash_value}")

# The hash is deterministic: re-hashing identical data yields the same digest.
hash2 = ub.hash_data(complex_data)
assert hash_value == hash2  # Same data produces same hash

# Dictionaries hash by content, not by key insertion order.
data1 = {'a': 1, 'b': 2}
data2 = {'b': 2, 'a': 1}
assert ub.hash_data(data1) == ub.hash_data(data2)  # Dict order doesn't matter

File Hashing

import ubelt as ub

# Hash file contents
file_path = 'example.txt'
with open(file_path, 'w') as f:
    f.write('Hello, World!')

file_hash = ub.hash_file(file_path)
print(f"File hash: {file_hash}")

# Hash large files efficiently (reads in chunks)
large_file_hash = ub.hash_file('large_file.bin', blocksize=65536)

# Verify file integrity
def verify_file(fpath, expected_hash):
    """Return True if the file's current hash matches the expected digest."""
    actual_hash = ub.hash_file(fpath)
    return actual_hash == expected_hash

is_valid = verify_file(file_path, file_hash)
print(f"File is valid: {is_valid}")

# Quick hash for caching: truncate by slicing. `hash_file` has no
# `hashlen` parameter (see the signature above).
cache_key = ub.hash_file('config.json')[:8]
print(f"Cache key: {cache_key}")

Dynamic Module Importing

import ubelt as ub

# Import a module given its dotted name.
os_module = ub.import_module_from_name('os')
print(f"OS name: {os_module.name}")

# Dotted submodule names resolve too.
path_module = ub.import_module_from_name('os.path')
print(f"Current dir: {path_module.abspath('.')}")

# Probe for optional dependencies without crashing.
try:
    numpy = ub.import_module_from_name('numpy')
except ImportError:
    print("NumPy not installed")
else:
    print("NumPy is available")

# Import straight from a file path.
script_path = 'my_script.py'
module_source = '''
def greet(name):
    return f"Hello, {name}!"

VERSION = "1.0"
'''
with open(script_path, 'w') as f:
    f.write(module_source)

my_module = ub.import_module_from_path(script_path)
print(my_module.greet("World"))
print(f"Version: {my_module.VERSION}")

Module Path Resolution

import ubelt as ub

# Map a module name to the file that defines it.
os_path = ub.modname_to_modpath('os')
print(f"OS module path: {os_path}")

json_path = ub.modname_to_modpath('json')
print(f"JSON module path: {json_path}")

# Round-trip: map the file back to its dotted module name.
if json_path:
    module_name = ub.modpath_to_modname(json_path)
    print(f"Module name: {module_name}")

# Break the path apart into its components.
if json_path:
    components = ub.split_modpath(json_path)
    print(f"Path components: {components}")

# Show where third-party packages live on this interpreter.
import sys
site_dir = next((p for p in sys.path if 'site-packages' in p), None)
if site_dir is not None:
    print(f"Site packages: {site_dir}")

Data Integrity and Caching

import ubelt as ub
import json

# Cache with data integrity
def cached_computation(data):
    """Cache an expensive computation, keyed by a hash of its input.

    The cache key is the first 16 characters of the data hash.
    `hash_data` has no `hashlen` parameter (see the signature above),
    so the digest is truncated by slicing.
    """
    data_hash = ub.hash_data(data)[:16]
    cache_file = f'cache_{data_hash}.json'

    # EAFP: try the cache first and fall back to computing on a miss.
    try:
        with open(cache_file, 'r') as f:
            cached_result = json.load(f)
    except FileNotFoundError:
        print("Computing new result")
        # Expensive computation
        result = sum(x**2 for x in data.get('values', []))

        # Cache the result for the next call with identical data
        with open(cache_file, 'w') as f:
            json.dump(result, f)

        return result
    else:
        print("Using cached result")
        return cached_result

# Test caching
data1 = {'values': [1, 2, 3, 4, 5], 'metadata': 'test'}
result1 = cached_computation(data1)  # Computes new
result2 = cached_computation(data1)  # Uses cache

# Different data gets different cache
data2 = {'values': [1, 2, 3, 4, 6], 'metadata': 'test'}  # Changed last value
result3 = cached_computation(data2)  # Computes new

print(f"Results: {result1}, {result2}, {result3}")

File Verification and Checksums

import ubelt as ub

# Create checksums for multiple files
files_to_check = ['file1.txt', 'file2.txt', 'file3.txt']

# Create test files
for i, fname in enumerate(files_to_check):
    with open(fname, 'w') as f:
        f.write(f'Content of file {i+1}')

# Generate checksums. `hash_file` has no `hashlen` parameter (see the
# signature above); truncate the digest by slicing instead.
checksums = {}
for fpath in files_to_check:
    checksums[fpath] = ub.hash_file(fpath, hasher='sha256')[:16]

print("File checksums:")
for fpath, checksum in checksums.items():
    print(f"{fpath}: {checksum}")

# Verify files later
def verify_files(expected_checksums):
    """Report whether each file still matches its recorded checksum."""
    for fpath, expected in expected_checksums.items():
        try:
            # Must truncate the same way the checksum was generated.
            actual = ub.hash_file(fpath, hasher='sha256')[:16]
            if actual == expected:
                print(f"✓ {fpath} is valid")
            else:
                print(f"✗ {fpath} has changed!")
        except FileNotFoundError:
            print(f"✗ {fpath} is missing!")

verify_files(checksums)

# Modify a file and check again
with open('file2.txt', 'a') as f:
    f.write(' - modified')

print("\nAfter modification:")
verify_files(checksums)

Install with Tessl CLI

npx tessl i tessl/pypi-ubelt

docs

dict-operations.md

download-caching.md

function-utilities.md

hashing-imports.md

index.md

list-operations.md

path-operations.md

progress-timing.md

system-integration.md

text-processing.md

tile.json