Pathlib-style classes for cloud storage services that provide seamless access to AWS S3, Google Cloud Storage, and Azure Blob Storage with familiar filesystem operations.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Monkey patching capabilities to make Python's built-in functions work transparently with cloud paths. These patches enable existing code to work with cloud storage without modification by extending standard library functions to recognize and handle CloudPath objects.
Functions to patch various parts of the Python standard library.
def patch_open(original_open=None) -> None:
"""
Patch builtin open() to work with CloudPaths.
Args:
original_open: Original open function to preserve (optional)
"""
def patch_os_functions() -> None:
"""
Patch os and os.path functions to work with CloudPaths.
Patches functions like os.listdir, os.stat, os.path.exists, etc.
"""
def patch_glob() -> None:
"""
Patch glob.glob() and glob.iglob() to work with CloudPaths.
"""
def patch_all_builtins() -> None:
"""
Apply all patches at once.
Equivalent to calling patch_open(), patch_os_functions(), and patch_glob().
"""

The following functions are modified to work with CloudPath objects:
# After patch_open()
def open(file, mode='r', **kwargs):
"""Enhanced open() that works with CloudPath objects."""

# After patch_os_functions()
def os.fspath(path): ...
def os.listdir(path): ...
def os.lstat(path): ...
def os.mkdir(path, mode=0o777, *, dir_fd=None): ...
def os.makedirs(name, mode=0o777, exist_ok=False): ...
def os.remove(path, *, dir_fd=None): ...
def os.removedirs(name): ...
def os.rename(src, dst, *, src_dir_fd=None, dst_dir_fd=None): ...
def os.renames(old, new): ...
def os.replace(src, dst, *, src_dir_fd=None, dst_dir_fd=None): ...
def os.rmdir(path, *, dir_fd=None): ...
def os.scandir(path='.'): ...
def os.stat(path, *, dir_fd=None, follow_symlinks=True): ...
def os.unlink(path, *, dir_fd=None): ...
def os.walk(top, topdown=True, onerror=None, followlinks=False): ...

# After patch_os_functions()
def os.path.basename(path): ...
def os.path.commonpath(paths): ...
def os.path.commonprefix(list): ...
def os.path.dirname(path): ...
def os.path.exists(path): ...
def os.path.getatime(path): ...
def os.path.getmtime(path): ...
def os.path.getctime(path): ...
def os.path.getsize(path): ...
def os.path.isfile(path): ...
def os.path.isdir(path): ...
def os.path.join(path, *paths): ...
def os.path.split(path): ...
def os.path.splitext(path): ...

# After patch_glob()
def glob.glob(pathname, *, recursive=False): ...
def glob.iglob(pathname, *, recursive=False): ...

from cloudpathlib import patch_all_builtins, CloudPath
# Apply all patches
patch_all_builtins()
# Now standard library functions work with CloudPath
cloud_file = CloudPath("s3://my-bucket/data.txt")
# Built-in open() now works with CloudPath
with open(cloud_file, 'r') as f:
content = f.read()
# os.path functions work with CloudPath
import os.path
print(os.path.exists(cloud_file)) # True/False
print(os.path.basename(cloud_file)) # "data.txt"
print(os.path.dirname(cloud_file)) # "s3://my-bucket"
print(os.path.getsize(cloud_file)) # File size in bytes
# glob works with CloudPath
import glob
csv_files = glob.glob("s3://my-bucket/*.csv")
all_files = glob.glob("s3://my-bucket/**/*", recursive=True)

from cloudpathlib import patch_open, patch_os_functions, patch_glob
# Apply patches selectively
patch_open() # Only patch open()
patch_os_functions() # Only patch os and os.path functions
patch_glob() # Only patch glob functions
# Or combine as needed
patch_open()
patch_glob() # Skip os functions if not needed

# Existing code that works with local files
def process_files(directory):
"""Legacy function that processes files in a directory."""
import os
import glob
# This code was written for local files
for filename in os.listdir(directory):
filepath = os.path.join(directory, filename)
if os.path.isfile(filepath):
size = os.path.getsize(filepath)
print(f"Processing {filename} ({size} bytes)")
with open(filepath, 'r') as f:
content = f.read()
# Process content...
# After patching, this works with cloud storage too!
from cloudpathlib import patch_all_builtins, CloudPath
patch_all_builtins()
# Same function now works with cloud paths
process_files("s3://my-bucket/data/") # Works!
process_files("/local/directory/") # Still works!
process_files("gs://bucket/files/") # Works!

import os
from cloudpathlib import patch_all_builtins
# CloudPathLib automatically applies patches based on environment variables
# Set these before importing cloudpathlib:
# CLOUDPATHLIB_PATCH_OPEN=1 - patches open()
# CLOUDPATHLIB_PATCH_OS=1 - patches os functions
# CLOUDPATHLIB_PATCH_GLOB=1 - patches glob functions
# CLOUDPATHLIB_PATCH_ALL=1 - patches everything
# Or apply patches programmatically
if os.environ.get("ENABLE_CLOUD_PATCHING"):
patch_all_builtins()
# Now existing code works with cloud paths
def backup_config():
config_path = os.environ.get("CONFIG_PATH", "./config.json")
backup_path = os.environ.get("BACKUP_PATH", "./config.backup.json")
# Works whether paths are local or cloud URIs
if os.path.exists(config_path):
with open(config_path, 'r') as f:
config_data = f.read()
with open(backup_path, 'w') as f:
f.write(config_data)
print(f"Backed up {config_path} to {backup_path}")
# Usage
# CONFIG_PATH=s3://config-bucket/prod-config.json
# BACKUP_PATH=s3://backup-bucket/config-backup.json
backup_config() # Works with cloud paths!

from cloudpathlib import patch_all_builtins
import os
import glob
import shutil
patch_all_builtins()
def data_pipeline(input_dir, output_dir, pattern="*.csv"):
"""Data processing pipeline that works with any storage."""
# Create output directory
os.makedirs(output_dir, exist_ok=True)
# Find all matching files
search_pattern = os.path.join(input_dir, pattern)
input_files = glob.glob(search_pattern)
print(f"Found {len(input_files)} files matching {pattern}")
for input_file in input_files:
# Get file info
filename = os.path.basename(input_file)
file_size = os.path.getsize(input_file)
print(f"Processing {filename} ({file_size} bytes)")
# Read and process
with open(input_file, 'r') as f:
data = f.read()
processed_data = data.upper() # Example processing
# Write output
output_file = os.path.join(output_dir, f"processed_{filename}")
with open(output_file, 'w') as f:
f.write(processed_data)
print(f"Wrote {output_file}")
# Works with any combination of local and cloud storage
data_pipeline(
input_dir="s3://raw-data-bucket/csv/",
output_dir="s3://processed-data-bucket/csv/",
pattern="*.csv"
)
data_pipeline(
input_dir="/local/input/",
output_dir="gs://output-bucket/processed/",
pattern="*.txt"
)

from cloudpathlib import patch_all_builtins
import os
patch_all_builtins()
def find_files_by_extension(root_dir, extension):
"""Find all files with given extension."""
found_files = []
# os.walk now works with cloud paths
for dirpath, dirnames, filenames in os.walk(root_dir):
for filename in filenames:
if filename.endswith(extension):
filepath = os.path.join(dirpath, filename)
file_size = os.path.getsize(filepath)
found_files.append({
'path': filepath,
'size': file_size,
'dir': dirpath
})
return found_files
# Works with cloud storage
python_files = find_files_by_extension("s3://code-bucket/", ".py")
log_files = find_files_by_extension("gs://logs-bucket/", ".log")
for file_info in python_files:
print(f"Python file: {file_info['path']} ({file_info['size']} bytes)")

from cloudpathlib import patch_all_builtins
import csv
import os
import glob
patch_all_builtins()
def process_csv_files(input_pattern, output_dir):
"""Process CSV files with standard library functions."""
# Find all CSV files
csv_files = glob.glob(input_pattern)
# Create output directory
os.makedirs(output_dir, exist_ok=True)
for csv_file in csv_files:
filename = os.path.basename(csv_file)
output_file = os.path.join(output_dir, f"summary_{filename}")
print(f"Processing {filename}")
# Read CSV
with open(csv_file, 'r', newline='') as infile:
reader = csv.DictReader(infile)
rows = list(reader)
# Generate summary
summary = {
'filename': filename,
'row_count': len(rows),
'columns': list(rows[0].keys()) if rows else [],
'file_size': os.path.getsize(csv_file)
}
# Write summary
with open(output_file, 'w', newline='') as outfile:
writer = csv.DictWriter(outfile, fieldnames=summary.keys())
writer.writeheader()
writer.writerow(summary)
print(f"Summary written to {output_file}")
# Works with cloud CSV files
process_csv_files(
input_pattern="s3://data-bucket/exports/*.csv",
output_dir="s3://reports-bucket/summaries/"
)

from cloudpathlib import patch_all_builtins
import json
import os
import glob
patch_all_builtins()
def merge_config_files(config_pattern, output_file):
"""Merge multiple JSON config files."""
config_files = glob.glob(config_pattern)
merged_config = {}
for config_file in config_files:
filename = os.path.basename(config_file)
print(f"Loading config from {filename}")
with open(config_file, 'r') as f:
config_data = json.load(f)
# Merge configuration
merged_config.update(config_data)
# Write merged configuration
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w') as f:
json.dump(merged_config, f, indent=2)
print(f"Merged configuration written to {output_file}")
return merged_config
# Merge cloud-based config files
merged = merge_config_files(
config_pattern="s3://config-bucket/environments/*.json",
output_file="s3://config-bucket/merged/production.json"
)

from cloudpathlib import patch_all_builtins
import os
import shutil
import glob
patch_all_builtins()
def organize_files_by_date(source_pattern, base_output_dir):
"""Organize files into date-based directories."""
files_to_organize = glob.glob(source_pattern)
for file_path in files_to_organize:
# Get file modification time
stat_info = os.stat(file_path)
mod_time = stat_info.st_mtime
# Create date-based directory structure
from datetime import datetime
date_str = datetime.fromtimestamp(mod_time).strftime("%Y/%m/%d")
output_dir = os.path.join(base_output_dir, date_str)
os.makedirs(output_dir, exist_ok=True)
filename = os.path.basename(file_path)
output_path = os.path.join(output_dir, filename)
# Move file (copy for cross-cloud operations)
print(f"Moving {filename} to {date_str}/")
with open(file_path, 'rb') as src, open(output_path, 'wb') as dst:
dst.write(src.read())
# Remove original (be careful with this!)
# os.remove(file_path)
# Organize cloud files by date
organize_files_by_date(
source_pattern="s3://uploads-bucket/incoming/*",
base_output_dir="s3://organized-bucket/by-date/"
)

from cloudpathlib import patch_all_builtins
import os
import glob
patch_all_builtins()
def safe_file_operations(file_pattern):
"""Demonstrate error handling with patched functions."""
try:
files = glob.glob(file_pattern)
print(f"Found {len(files)} files")
for file_path in files:
try:
# Check if file exists
if os.path.exists(file_path):
# Get file info
size = os.path.getsize(file_path)
print(f"File: {os.path.basename(file_path)} ({size} bytes)")
# Try to read file
with open(file_path, 'r') as f:
content = f.read(100) # Read first 100 chars
print(f"Content preview: {content[:50]}...")
except PermissionError:
print(f"Permission denied: {file_path}")
except UnicodeDecodeError:
print(f"Binary file (skipping): {file_path}")
except Exception as e:
print(f"Error processing {file_path}: {e}")
except Exception as e:
print(f"Error with pattern {file_pattern}: {e}")
# Handle errors gracefully
safe_file_operations("s3://my-bucket/**/*.txt")
safe_file_operations("/nonexistent/path/*")

Install with Tessl CLI
npx tessl i tessl/pypi-cloudpathlib