Pathlib-style classes for cloud storage services that provide seamless access to AWS S3, Google Cloud Storage, and Azure Blob Storage with familiar filesystem operations.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Configuration options for cache management, file handling modes, and other library settings that control behavior across all cloud providers. These settings allow fine-tuned control over caching, performance, and integration with existing systems.
Configuration enum for controlling how CloudPathLib manages local file caching.
class FileCacheMode(str, Enum):
    """File cache management strategies.

    Every member is also a ``str`` whose value equals its name, so a mode can
    be looked up from plain text with ``FileCacheMode("persistent")``.
    """

    persistent = "persistent"
    """
    Cache persists until manually cleared.
    Files remain cached across Python sessions.
    """

    tmp_dir = "tmp_dir"
    """
    Cache in temporary directory (default).
    Files cached in system temp directory, may be cleaned by OS.
    """

    cloudpath_object = "cloudpath_object"
    """
    Cache cleared when CloudPath object is deleted.
    Automatic cleanup when path objects go out of scope.
    """

    close_file = "close_file"
    """
    Cache cleared when file is closed.
    Immediate cleanup after file operations complete.
    """

    @classmethod
    def from_environment(cls) -> "FileCacheMode":
        """
        Parse cache mode from environment variable.

        Returns:
            FileCacheMode from CLOUDPATHLIB_FILE_CACHE_MODE env var
        """


# Global registry that tracks all available cloud provider implementations and their associated path and client classes.
# Annotation-only declaration: no value is assigned on this line.
# NOTE(review): presumably filled in as each provider module registers itself — confirm.
implementation_registry: typing.Dict[str, "CloudImplementation"]
"""
Global registry mapping cloud provider keys to their implementation metadata.
Keys: "s3", "gs", "azure", "http", "https"
"""
class CloudImplementation:
    """
    Metadata container for cloud provider implementations.

    Attributes:
        name (str): Provider identifier ("s3", "gs", "azure", etc.)
        dependencies_loaded (bool): Whether required dependencies are available
        _client_class (Type[Client]): Client class for this provider
        _path_class (Type[CloudPath]): Path class for this provider
    """

    name: str
    dependencies_loaded: bool = True
    _client_class: typing.Type["Client"]
    _path_class: typing.Type["CloudPath"]

    def validate_completeness(self) -> None:
        """Validate that implementation has all required components."""


from cloudpathlib import FileCacheMode, S3Client, CloudPath
# Configure client with specific cache mode
client = S3Client(
    file_cache_mode=FileCacheMode.persistent,
    local_cache_dir="/var/cache/cloudpathlib",
)

# Create paths with configured caching
path = CloudPath("s3://my-bucket/large-file.dat", client=client)

# File is cached persistently
content = path.read_bytes()  # Downloads and caches
content = path.read_bytes()  # Uses cached version (no download)
# Cache persists across Python sessions

# Was fused into the comment above, which silently disabled the import.
import os
import os

from cloudpathlib import FileCacheMode, S3Client

# Set environment variable (cloudpathlib reads CLOUDPATHLIB_FILE_CACHE_MODE)
os.environ["CLOUDPATHLIB_FILE_CACHE_MODE"] = "persistent"

# Parse from environment
cache_mode = FileCacheMode.from_environment()
print(f"Cache mode: {cache_mode}")  # FileCacheMode.persistent

# Use in client configuration
client = S3Client(file_cache_mode=cache_mode)

# Persistent caching - files stay cached until manually cleared
# One client per FileCacheMode member, so the four modes can be compared side by side.
persistent_client = S3Client(
file_cache_mode=FileCacheMode.persistent,
local_cache_dir="/persistent/cache"
)
# Temporary caching - system handles cleanup
temp_client = S3Client(
file_cache_mode=FileCacheMode.tmp_dir
)
# Object-scoped caching - cleared when path object is deleted
object_client = S3Client(
file_cache_mode=FileCacheMode.cloudpath_object
)
# File-scoped caching - cleared when file is closed
file_client = S3Client(
file_cache_mode=FileCacheMode.close_file
)
# Demonstrate different behaviors
# All four paths name the same object; only the client (and so the cache mode) differs.
path1 = CloudPath("s3://bucket/file.txt", client=persistent_client)
path2 = CloudPath("s3://bucket/file.txt", client=temp_client)
path3 = CloudPath("s3://bucket/file.txt", client=object_client)
path4 = CloudPath("s3://bucket/file.txt", client=file_client)
# Read files with different caching behaviors
content1 = path1.read_text() # Cached persistently
content2 = path2.read_text() # Cached in temp directory
content3 = path3.read_text() # Cached until path3 is deleted
content4 = path4.read_text()  # Cache cleared immediately after read


def configure_high_performance_client():
    """Configure client for high-performance scenarios.

    Returns:
        An ``S3Client`` using persistent caching under ``/fast/ssd/cache`` and
        a transfer configuration that switches to threaded multipart
        transfers for objects of 25 MB or more, in 25 MB parts.
    """
    # Imported locally: the example never imported boto3 although it used it.
    from boto3.s3.transfer import TransferConfig

    # TransferConfig sizes are in bytes. The previous ``1024 * 25`` was 25 KB,
    # not the 25 MB the comment promised.
    megabyte = 1024 * 1024

    return S3Client(
        file_cache_mode=FileCacheMode.persistent,
        local_cache_dir="/fast/ssd/cache",  # Use fast storage for cache
        boto3_transfer_config=TransferConfig(
            multipart_threshold=25 * megabyte,  # 25MB
            max_concurrency=10,
            multipart_chunksize=25 * megabyte,  # 25MB parts
            use_threads=True,
        ),
    )
def configure_memory_constrained_client():
    """Configure client for memory-constrained environments.

    Returns:
        An ``S3Client`` in ``close_file`` mode, caching under
        ``/tmp/cloudpath_cache``.
    """
    return S3Client(
        file_cache_mode=FileCacheMode.close_file,  # Immediate cleanup
        local_cache_dir="/tmp/cloudpath_cache",  # Use temp directory
    )
# Use appropriate configuration
high_perf_client = configure_high_performance_client()
memory_client = configure_memory_constrained_client()

# Was fused onto the previous statement, which made the line a syntax error.
import os
def get_cache_config():
    """Get cache configuration based on environment.

    Reads the ``ENVIRONMENT`` variable (default ``"development"``).

    Returns:
        dict: ``file_cache_mode`` and ``local_cache_dir`` keyword arguments
        for a client constructor. Any value other than ``"production"`` or
        ``"staging"`` gets the development settings.
    """
    environment = os.getenv("ENVIRONMENT", "development")

    if environment == "production":
        return {
            "file_cache_mode": FileCacheMode.persistent,
            "local_cache_dir": "/var/cache/app/cloudpathlib",
        }

    if environment == "staging":
        return {
            "file_cache_mode": FileCacheMode.tmp_dir,
            "local_cache_dir": "/tmp/staging_cache",
        }

    # development (also the fallback for unrecognised values)
    return {
        "file_cache_mode": FileCacheMode.cloudpath_object,
        "local_cache_dir": "./dev_cache",
    }
# Apply environment-specific configuration
cache_config = get_cache_config()
client = S3Client(**cache_config)

# Was fused onto the previous statement, which made the line a syntax error.
import tempfile
import shutil
from pathlib import Path
class ManagedCacheDirectory:
    """Context manager for temporary cache directories.

    Entering creates a fresh directory with ``tempfile.mkdtemp`` and yields
    its path as a string. Leaving removes the directory tree, whether or not
    the body raised.
    """

    def __init__(self, prefix="cloudpath_"):
        """Remember the name prefix; no directory exists until ``__enter__``."""
        self.prefix = prefix
        self.temp_dir = None

    def __enter__(self):
        """Create the directory and return its path as ``str``."""
        self.temp_dir = Path(tempfile.mkdtemp(prefix=self.prefix))
        return str(self.temp_dir)

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Delete the directory tree if it is still present.

        Returns ``None``, so an exception raised in the ``with`` body
        propagates to the caller.
        """
        if self.temp_dir and self.temp_dir.exists():
            shutil.rmtree(self.temp_dir)
# Use managed cache directory
with ManagedCacheDirectory() as cache_dir:
    client = S3Client(
        file_cache_mode=FileCacheMode.persistent,
        local_cache_dir=cache_dir,
    )
    path = CloudPath("s3://bucket/file.txt", client=client)
    content = path.read_text()  # Cached in managed directory
# Directory automatically cleaned up when exiting context

# Was fused into the comment above, which silently disabled the import.
import os
from pathlib import Path
def get_cache_stats(cache_dir):
    """Get statistics about cache directory.

    Args:
        cache_dir: Path (``str`` or path-like) of the cache directory.

    Returns:
        dict: ``{"exists": False}`` when the path does not exist. Otherwise
        ``exists``, ``total_files``, ``total_directories``,
        ``total_size_bytes``, ``total_size_mb`` and ``largest_file_bytes``
        (``0`` when there are no files), counted recursively.
    """
    cache_path = Path(cache_dir)
    if not cache_path.exists():
        return {"exists": False}

    # Single pass: classify each entry once instead of re-testing the whole
    # listing separately for files, directories and sizes.
    file_sizes = []
    directory_count = 0
    for entry in cache_path.rglob("*"):
        if entry.is_file():
            file_sizes.append(entry.stat().st_size)
        elif entry.is_dir():
            directory_count += 1

    total_size = sum(file_sizes)
    return {
        "exists": True,
        "total_files": len(file_sizes),
        "total_directories": directory_count,
        "total_size_bytes": total_size,
        "total_size_mb": total_size / (1024 * 1024),
        "largest_file_bytes": max(file_sizes) if file_sizes else 0,
    }
# Monitor cache usage
cache_dir = "/tmp/cloudpath_cache"
client = S3Client(
    file_cache_mode=FileCacheMode.persistent,
    local_cache_dir=cache_dir,
)

# Perform operations
path1 = CloudPath("s3://bucket/file1.txt", client=client)
path2 = CloudPath("s3://bucket/file2.txt", client=client)
content1 = path1.read_text()
content2 = path2.read_text()

# Check cache statistics
stats = get_cache_stats(cache_dir)
print(f"Cache stats: {stats}")

# Was fused onto the previous statement, which made the line a syntax error.
from pathlib import Path
def validate_cache_configuration(file_cache_mode, local_cache_dir):
    """Validate cache configuration settings.

    Note that this has a side effect: when ``local_cache_dir`` is given, the
    directory (and any missing parents) is created as part of the check.

    Args:
        file_cache_mode: Value expected to be a ``FileCacheMode`` member.
        local_cache_dir: Cache directory path; falsy values skip all
            directory checks.

    Returns:
        list[str]: Human-readable problems; empty when nothing was found.
    """
    # Local import: the surrounding example only imports ``Path``.
    import os

    issues = []

    # Validate cache mode
    if not isinstance(file_cache_mode, FileCacheMode):
        issues.append(f"Invalid cache mode: {file_cache_mode}")

    # Validate cache directory
    if local_cache_dir:
        cache_path = Path(local_cache_dir)

        # Check if parent directory exists
        if not cache_path.parent.exists():
            issues.append(f"Cache directory parent does not exist: {cache_path.parent}")

        # Check if we can create the directory. Catch OSError rather than
        # only PermissionError so that, for example, a regular file already
        # sitting at the path is reported as an issue instead of crashing
        # the validator with FileExistsError.
        try:
            cache_path.mkdir(parents=True, exist_ok=True)
        except OSError:
            issues.append(f"Cannot create cache directory: {cache_path}")

        # Check write permissions
        if cache_path.exists() and not os.access(cache_path, os.W_OK):
            issues.append(f"No write permission to cache directory: {cache_path}")

    return issues
# Validate configuration before using
cache_mode = FileCacheMode.persistent
cache_dir = "/tmp/my_cache"
issues = validate_cache_configuration(cache_mode, cache_dir)
if issues:
    print("Configuration issues:")
    for issue in issues:
        print(f" - {issue}")
else:
    print("Configuration is valid")
    # Only build the client once the configuration has passed validation.
    client = S3Client(
        file_cache_mode=cache_mode,
        local_cache_dir=cache_dir,
    )

# Was fused onto the closing parenthesis above, which made the line a syntax error.
import time
from datetime import datetime, timedelta
def cleanup_old_cache_files(cache_dir, max_age_days=7):
    """Remove cache files older than specified days.

    Args:
        cache_dir: Root of the cache tree; searched recursively.
        max_age_days: Files whose modification time is more than this many
            days in the past are deleted.

    Returns:
        int: Number of files this call deleted (``0`` if ``cache_dir`` does
        not exist). Directories are never removed.
    """
    cache_path = Path(cache_dir)
    if not cache_path.exists():
        return 0

    cutoff_time = time.time() - (max_age_days * 24 * 60 * 60)
    removed_count = 0
    for file_path in cache_path.rglob("*"):
        try:
            if file_path.is_file() and file_path.stat().st_mtime < cutoff_time:
                file_path.unlink()
                removed_count += 1
        except FileNotFoundError:
            # A cache is shared state: another process (or the library's own
            # cleanup) may delete the file between listing and stat/unlink.
            # That file is already gone, so keep sweeping.
            continue
    return removed_count
def cleanup_large_cache_files(cache_dir, max_size_mb=100):
    """Remove cache files larger than specified size.

    Args:
        cache_dir: Root of the cache tree; searched recursively.
        max_size_mb: Size limit in MiB (1024 * 1024 bytes). Only files
            strictly larger than the limit are deleted.

    Returns:
        int: Number of files this call deleted (``0`` if ``cache_dir`` does
        not exist). Directories are never removed.
    """
    cache_path = Path(cache_dir)
    if not cache_path.exists():
        return 0

    max_size_bytes = max_size_mb * 1024 * 1024
    removed_count = 0
    for file_path in cache_path.rglob("*"):
        try:
            if file_path.is_file() and file_path.stat().st_size > max_size_bytes:
                file_path.unlink()
                removed_count += 1
        except FileNotFoundError:
            # The file was removed by someone else between listing and
            # stat/unlink; it is already gone, so keep sweeping.
            continue
    return removed_count
# Usage
# Both helpers return the number of files they deleted.
cache_dir = "/tmp/cloudpath_cache"
# Clean up old files
old_files_removed = cleanup_old_cache_files(cache_dir, max_age_days=3)
print(f"Removed {old_files_removed} old cache files")
# Clean up large files
large_files_removed = cleanup_large_cache_files(cache_dir, max_size_mb=50)
print(f"Removed {large_files_removed} large cache files")


class CacheConfiguration:
    """Advanced cache configuration management.

    Keeps named sets of client keyword arguments and builds clients from
    them on demand.
    """

    def __init__(self):
        """Start with an empty registry of named configurations."""
        self.configurations = {}

    def register_config(self, name, **kwargs):
        """Register a named configuration.

        Args:
            name: Key under which the keyword arguments are stored; a later
                registration with the same name replaces the earlier one.
            **kwargs: Keyword arguments to pass to a client constructor.
        """
        self.configurations[name] = kwargs

    def get_client(self, config_name, client_class, **additional_args):
        """Create client with named configuration.

        Args:
            config_name: Name given to ``register_config``; an unknown name
                behaves like an empty configuration.
            client_class: Callable invoked with the merged keyword arguments.
            **additional_args: Per-call arguments; they override registered
                values with the same key.

        Returns:
            Whatever ``client_class(**merged)`` returns.
        """
        # Merge into a fresh dict. Calling ``update`` on the registered dict
        # would permanently add this call's extras (e.g. ``aws_profile``) to
        # the named configuration and leak them into every later client.
        config = {**self.configurations.get(config_name, {}), **additional_args}
        return client_class(**config)
# Set up configuration registry
cache_config = CacheConfiguration()

# Register different configurations
cache_config.register_config(
    "high_performance",
    file_cache_mode=FileCacheMode.persistent,
    local_cache_dir="/fast/cache",
)
cache_config.register_config(
    "low_memory",
    file_cache_mode=FileCacheMode.close_file,
    local_cache_dir="/tmp/cache",
)
cache_config.register_config(
    "development",
    file_cache_mode=FileCacheMode.cloudpath_object,
    local_cache_dir="./dev_cache",
)

# Create clients with named configurations
high_perf_s3 = cache_config.get_client(
    "high_performance",
    S3Client,
    aws_profile="production",
)
low_mem_gs = cache_config.get_client(
    "low_memory",
    GSClient,
    project="my-project",
)

# Was fused onto the closing parenthesis above, which made the line a syntax error.
import os
class EnvironmentConfiguration:
    """Configuration management using environment variables.

    Reads ``CLOUDPATHLIB_CACHE_MODE``, ``CLOUDPATHLIB_CACHE_DIR`` and
    ``CLOUDPATHLIB_DISABLE_CACHE`` each time a method is called; nothing is
    cached on the class.
    """

    @staticmethod
    def get_cache_mode():
        """Get cache mode from environment.

        Returns:
            The ``FileCacheMode`` named by ``CLOUDPATHLIB_CACHE_MODE``, or
            ``FileCacheMode.tmp_dir`` when the variable is unset or holds an
            unrecognised value (a notice is printed in the latter case).
        """
        mode_str = os.getenv("CLOUDPATHLIB_CACHE_MODE", "tmp_dir")
        try:
            return FileCacheMode(mode_str)
        except ValueError:
            print(f"Invalid cache mode '{mode_str}', using default")
            return FileCacheMode.tmp_dir

    @staticmethod
    def get_cache_dir():
        """Get cache directory from environment.

        Returns:
            The value of ``CLOUDPATHLIB_CACHE_DIR``, or ``None`` when unset.
        """
        return os.getenv("CLOUDPATHLIB_CACHE_DIR")

    @staticmethod
    def is_caching_enabled():
        """Check if caching is enabled.

        Returns:
            bool: ``False`` only when ``CLOUDPATHLIB_DISABLE_CACHE`` equals
            ``"true"`` (case-insensitive); ``True`` otherwise.
        """
        return os.getenv("CLOUDPATHLIB_DISABLE_CACHE", "").lower() != "true"

    @classmethod
    def create_s3_client(cls):
        """Create S3 client from environment configuration.

        When caching is disabled the client is built in ``close_file`` mode
        and the cache-mode and cache-directory variables are ignored.
        """
        if not cls.is_caching_enabled():
            # Disable caching
            return S3Client(file_cache_mode=FileCacheMode.close_file)

        return S3Client(
            file_cache_mode=cls.get_cache_mode(),
            local_cache_dir=cls.get_cache_dir(),
        )
# Usage with environment variables
# The bare string below is documentation only: it is evaluated and discarded at runtime.
"""
Environment setup:
export CLOUDPATHLIB_CACHE_MODE=persistent
export CLOUDPATHLIB_CACHE_DIR=/var/cache/myapp
export CLOUDPATHLIB_DISABLE_CACHE=false
"""
# These are the variables EnvironmentConfiguration reads when building the client.
env_client = EnvironmentConfiguration.create_s3_client()
path = CloudPath("s3://bucket/file.txt", client=env_client)


def print_configuration_help():
    """Print help for CloudPathLib configuration options.

    Writes a fixed block of text to standard output and returns ``None``.
    """
    # The text is kept flush-left inside the literal so the printed output
    # is unchanged.
    help_text = """
CloudPathLib Configuration Options
=================================
Environment Variables:
CLOUDPATHLIB_CACHE_MODE - Cache management mode
Values: persistent, tmp_dir, cloudpath_object, close_file
Default: tmp_dir
CLOUDPATHLIB_CACHE_DIR - Custom cache directory path
Default: System temp directory
CLOUDPATHLIB_DISABLE_CACHE - Disable all caching
Values: true, false
Default: false
CLOUDPATHLIB_PATCH_OPEN - Auto-patch open() function
Values: true, false
Default: false
CLOUDPATHLIB_PATCH_OS - Auto-patch os functions
Values: true, false
Default: false
CLOUDPATHLIB_PATCH_GLOB - Auto-patch glob functions
Values: true, false
Default: false
CLOUDPATHLIB_PATCH_ALL - Auto-patch all functions
Values: true, false
Default: false
Cache Modes:
persistent - Files cached until manually cleared
tmp_dir - Files cached in temp directory (default)
cloudpath_object - Cache cleared when CloudPath deleted
close_file - Cache cleared when file closed
Example Configuration:
export CLOUDPATHLIB_CACHE_MODE=persistent
export CLOUDPATHLIB_CACHE_DIR=/var/cache/myapp
export CLOUDPATHLIB_PATCH_ALL=true
"""
    print(help_text)
# Show configuration help
print_configuration_help()

Install with Tessl CLI
npx tessl i tessl/pypi-cloudpathlib