Pathlib-style classes for cloud storage services that provide seamless access to AWS S3, Google Cloud Storage, and Azure Blob Storage with familiar filesystem operations.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Monkey patching capabilities to make Python's built-in functions work transparently with cloud paths. These patches enable existing code to work with cloud storage without modification by extending standard library functions to recognize and handle CloudPath objects.
Functions to patch various parts of the Python standard library.
def patch_open(original_open=None) -> None:
"""
Patch builtin open() to work with CloudPaths.
Args:
original_open: Original open function to preserve (optional)
"""
def patch_os_functions() -> None:
"""
Patch os and os.path functions to work with CloudPaths.
Patches functions like os.listdir, os.stat, os.path.exists, etc.
"""
def patch_glob() -> None:
"""
Patch glob.glob() and glob.iglob() to work with CloudPaths.
"""
def patch_all_builtins() -> None:
"""
Apply all patches at once.
Equivalent to calling patch_open(), patch_os_functions(), and patch_glob().
"""

The following functions are modified to work with CloudPath objects:
# After patch_open()
def open(file, mode='r', **kwargs):
"""Enhanced open() that works with CloudPath objects."""

# After patch_os_functions()
def os.fspath(path): ...
def os.listdir(path): ...
def os.lstat(path): ...
def os.mkdir(path, mode=0o777, *, dir_fd=None): ...
def os.makedirs(name, mode=0o777, exist_ok=False): ...
def os.remove(path, *, dir_fd=None): ...
def os.removedirs(name): ...
def os.rename(src, dst, *, src_dir_fd=None, dst_dir_fd=None): ...
def os.renames(old, new): ...
def os.replace(src, dst, *, src_dir_fd=None, dst_dir_fd=None): ...
def os.rmdir(path, *, dir_fd=None): ...
def os.scandir(path='.'): ...
def os.stat(path, *, dir_fd=None, follow_symlinks=True): ...
def os.unlink(path, *, dir_fd=None): ...
def os.walk(top, topdown=True, onerror=None, followlinks=False): ...

# After patch_os_functions()
def os.path.basename(path): ...
def os.path.commonpath(paths): ...
def os.path.commonprefix(list): ...
def os.path.dirname(path): ...
def os.path.exists(path): ...
def os.path.getatime(path): ...
def os.path.getmtime(path): ...
def os.path.getctime(path): ...
def os.path.getsize(path): ...
def os.path.isfile(path): ...
def os.path.isdir(path): ...
def os.path.join(path, *paths): ...
def os.path.split(path): ...
def os.path.splitext(path): ...

# After patch_glob()
def glob.glob(pathname, *, recursive=False): ...
def glob.iglob(pathname, *, recursive=False): ...

from cloudpathlib import patch_all_builtins, CloudPath
# Apply all patches
patch_all_builtins()
# Now standard library functions work with CloudPath
cloud_file = CloudPath("s3://my-bucket/data.txt")
# Built-in open() now works with CloudPath
with open(cloud_file, 'r') as f:
content = f.read()
# os.path functions work with CloudPath
import os.path
print(os.path.exists(cloud_file)) # True/False
print(os.path.basename(cloud_file)) # "data.txt"
print(os.path.dirname(cloud_file)) # "s3://my-bucket"
print(os.path.getsize(cloud_file)) # File size in bytes
# glob works with CloudPath
import glob
csv_files = glob.glob("s3://my-bucket/*.csv")
all_files = glob.glob("s3://my-bucket/**/*", recursive=True)

from cloudpathlib import patch_open, patch_os_functions, patch_glob
# Apply patches selectively
patch_open() # Only patch open()
patch_os_functions() # Only patch os and os.path functions
patch_glob() # Only patch glob functions
# Or combine as needed
patch_open()
patch_glob() # Skip os functions if not needed

# Existing code that works with local files
def process_files(directory):
"""Legacy function that processes files in a directory."""
import os
import glob
# This code was written for local files
for filename in os.listdir(directory):
filepath = os.path.join(directory, filename)
if os.path.isfile(filepath):
size = os.path.getsize(filepath)
print(f"Processing {filename} ({size} bytes)")
with open(filepath, 'r') as f:
content = f.read()
# Process content...
# After patching, this works with cloud storage too!
from cloudpathlib import patch_all_builtins, CloudPath
patch_all_builtins()
# Same function now works with cloud paths
process_files("s3://my-bucket/data/") # Works!
process_files("/local/directory/") # Still works!
process_files("gs://bucket/files/") # Works!

import os
from cloudpathlib import patch_all_builtins
# CloudPathLib automatically applies patches based on environment variables
# Set these before importing cloudpathlib:
# CLOUDPATHLIB_PATCH_OPEN=1 - patches open()
# CLOUDPATHLIB_PATCH_OS=1 - patches os functions
# CLOUDPATHLIB_PATCH_GLOB=1 - patches glob functions
# CLOUDPATHLIB_PATCH_ALL=1 - patches everything
# Or apply patches programmatically
if os.environ.get("ENABLE_CLOUD_PATCHING"):
patch_all_builtins()
# Now existing code works with cloud paths
def backup_config():
config_path = os.environ.get("CONFIG_PATH", "./config.json")
backup_path = os.environ.get("BACKUP_PATH", "./config.backup.json")
# Works whether paths are local or cloud URIs
if os.path.exists(config_path):
with open(config_path, 'r') as f:
config_data = f.read()
with open(backup_path, 'w') as f:
f.write(config_data)
print(f"Backed up {config_path} to {backup_path}")
# Usage
# CONFIG_PATH=s3://config-bucket/prod-config.json
# BACKUP_PATH=s3://backup-bucket/config-backup.json
backup_config() # Works with cloud paths!

from cloudpathlib import patch_all_builtins
import os
import glob
import shutil
patch_all_builtins()
def data_pipeline(input_dir, output_dir, pattern="*.csv"):
"""Data processing pipeline that works with any storage."""
# Create output directory
os.makedirs(output_dir, exist_ok=True)
# Find all matching files
search_pattern = os.path.join(input_dir, pattern)
input_files = glob.glob(search_pattern)
print(f"Found {len(input_files)} files matching {pattern}")
for input_file in input_files:
# Get file info
filename = os.path.basename(input_file)
file_size = os.path.getsize(input_file)
print(f"Processing {filename} ({file_size} bytes)")
# Read and process
with open(input_file, 'r') as f:
data = f.read()
processed_data = data.upper() # Example processing
# Write output
output_file = os.path.join(output_dir, f"processed_{filename}")
with open(output_file, 'w') as f:
f.write(processed_data)
print(f"Wrote {output_file}")
# Works with any combination of local and cloud storage
data_pipeline(
input_dir="s3://raw-data-bucket/csv/",
output_dir="s3://processed-data-bucket/csv/",
pattern="*.csv"
)
data_pipeline(
input_dir="/local/input/",
output_dir="gs://output-bucket/processed/",
pattern="*.txt"
)

from cloudpathlib import patch_all_builtins
import os
patch_all_builtins()
def find_files_by_extension(root_dir, extension):
"""Find all files with given extension."""
found_files = []
# os.walk now works with cloud paths
for dirpath, dirnames, filenames in os.walk(root_dir):
for filename in filenames:
if filename.endswith(extension):
filepath = os.path.join(dirpath, filename)
file_size = os.path.getsize(filepath)
found_files.append({
'path': filepath,
'size': file_size,
'dir': dirpath
})
return found_files
# Works with cloud storage
python_files = find_files_by_extension("s3://code-bucket/", ".py")
log_files = find_files_by_extension("gs://logs-bucket/", ".log")
for file_info in python_files:
print(f"Python file: {file_info['path']} ({file_info['size']} bytes)")

from cloudpathlib import patch_all_builtins
import csv
import os
import glob
patch_all_builtins()
def process_csv_files(input_pattern, output_dir):
"""Process CSV files with standard library functions."""
# Find all CSV files
csv_files = glob.glob(input_pattern)
# Create output directory
os.makedirs(output_dir, exist_ok=True)
for csv_file in csv_files:
filename = os.path.basename(csv_file)
output_file = os.path.join(output_dir, f"summary_{filename}")
print(f"Processing {filename}")
# Read CSV
with open(csv_file, 'r', newline='') as infile:
reader = csv.DictReader(infile)
rows = list(reader)
# Generate summary
summary = {
'filename': filename,
'row_count': len(rows),
'columns': list(rows[0].keys()) if rows else [],
'file_size': os.path.getsize(csv_file)
}
# Write summary
with open(output_file, 'w', newline='') as outfile:
writer = csv.DictWriter(outfile, fieldnames=summary.keys())
writer.writeheader()
writer.writerow(summary)
print(f"Summary written to {output_file}")
# Works with cloud CSV files
process_csv_files(
input_pattern="s3://data-bucket/exports/*.csv",
output_dir="s3://reports-bucket/summaries/"
)

from cloudpathlib import patch_all_builtins
import json
import os
import glob
patch_all_builtins()
def merge_config_files(config_pattern, output_file):
"""Merge multiple JSON config files."""
config_files = glob.glob(config_pattern)
merged_config = {}
for config_file in config_files:
filename = os.path.basename(config_file)
print(f"Loading config from {filename}")
with open(config_file, 'r') as f:
config_data = json.load(f)
# Merge configuration
merged_config.update(config_data)
# Write merged configuration
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w') as f:
json.dump(merged_config, f, indent=2)
print(f"Merged configuration written to {output_file}")
return merged_config
# Merge cloud-based config files
merged = merge_config_files(
config_pattern="s3://config-bucket/environments/*.json",
output_file="s3://config-bucket/merged/production.json"
)

from cloudpathlib import patch_all_builtins
import os
import shutil
import glob
patch_all_builtins()
def organize_files_by_date(source_pattern, base_output_dir):
"""Organize files into date-based directories."""
files_to_organize = glob.glob(source_pattern)
for file_path in files_to_organize:
# Get file modification time
stat_info = os.stat(file_path)
mod_time = stat_info.st_mtime
# Create date-based directory structure
from datetime import datetime
date_str = datetime.fromtimestamp(mod_time).strftime("%Y/%m/%d")
output_dir = os.path.join(base_output_dir, date_str)
os.makedirs(output_dir, exist_ok=True)
filename = os.path.basename(file_path)
output_path = os.path.join(output_dir, filename)
# Move file (copy for cross-cloud operations)
print(f"Moving {filename} to {date_str}/")
with open(file_path, 'rb') as src, open(output_path, 'wb') as dst:
dst.write(src.read())
# Remove original (be careful with this!)
# os.remove(file_path)
# Organize cloud files by date
organize_files_by_date(
source_pattern="s3://uploads-bucket/incoming/*",
base_output_dir="s3://organized-bucket/by-date/"
)

from cloudpathlib import patch_all_builtins
import os
import glob
patch_all_builtins()
def safe_file_operations(file_pattern):
"""Demonstrate error handling with patched functions."""
try:
files = glob.glob(file_pattern)
print(f"Found {len(files)} files")
for file_path in files:
try:
# Check if file exists
if os.path.exists(file_path):
# Get file info
size = os.path.getsize(file_path)
print(f"File: {os.path.basename(file_path)} ({size} bytes)")
# Try to read file
with open(file_path, 'r') as f:
content = f.read(100) # Read first 100 chars
print(f"Content preview: {content[:50]}...")
except PermissionError:
print(f"Permission denied: {file_path}")
except UnicodeDecodeError:
print(f"Binary file (skipping): {file_path}")
except Exception as e:
print(f"Error processing {file_path}: {e}")
except Exception as e:
print(f"Error with pattern {file_pattern}: {e}")
# Handle errors gracefully
safe_file_operations("s3://my-bucket/**/*.txt")
safe_file_operations("/nonexistent/path/*")

Install with Tessl CLI
npx tessl i tessl/pypi-cloudpathlib