CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-cloudpathlib

Pathlib-style classes for cloud storage services that provide seamless access to AWS S3, Google Cloud Storage, and Azure Blob Storage with familiar filesystem operations.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/configuration.md

Configuration and Enums

Configuration options for cache management, file handling modes, and other library settings that control behavior across all cloud providers. These settings allow fine-tuned control over caching, performance, and integration with existing systems.

Capabilities

FileCacheMode Enum

Configuration enum for controlling how CloudPathLib manages local file caching.

class FileCacheMode(str, Enum):
    """File cache management strategies.

    Inherits from ``str`` as well as ``Enum``, so each member compares equal
    to its string value (e.g. ``FileCacheMode.persistent == "persistent"``),
    which lets string configuration values be used interchangeably.
    """
    
    persistent = "persistent"
    """
    Cache persists until manually cleared.
    Files remain cached across Python sessions.
    """
    
    tmp_dir = "tmp_dir"
    """
    Cache in temporary directory (default).
    Files cached in system temp directory, may be cleaned by OS.
    """
    
    cloudpath_object = "cloudpath_object"
    """
    Cache cleared when CloudPath object is deleted.
    Automatic cleanup when path objects go out of scope.
    """
    
    close_file = "close_file"
    """
    Cache cleared when file is closed.
    Immediate cleanup after file operations complete.
    """
    
    @classmethod
    def from_environment(cls) -> "FileCacheMode":
        """
        Parse cache mode from environment variable.
        
        Returns:
            FileCacheMode from CLOUDPATHLIB_CACHE_MODE env var

        NOTE(review): behavior when the variable is unset or holds an invalid
        value is not shown here — presumably falls back to ``tmp_dir``;
        confirm against the cloudpathlib implementation.
        """

Implementation Registry

Global registry that tracks all available cloud provider implementations and their associated path and client classes.

# Module-level registry; populated by each provider module on import.
implementation_registry: typing.Dict[str, "CloudImplementation"]
"""
Global registry mapping cloud provider keys to their implementation metadata.
Keys: "s3", "gs", "azure", "http", "https"
"""

class CloudImplementation:
    """
    Metadata container for cloud provider implementations.
    
    Attributes:
        name (str): Provider identifier ("s3", "gs", "azure", etc.)
        dependencies_loaded (bool): Whether required dependencies are available
        _client_class (Type[Client]): Client class for this provider  
        _path_class (Type[CloudPath]): Path class for this provider
    """
    name: str
    dependencies_loaded: bool = True
    _client_class: typing.Type["Client"]
    _path_class: typing.Type["CloudPath"]
    
    def validate_completeness(self) -> None:
        """Validate that implementation has all required components."""
        # NOTE(review): body not shown in this spec — presumably raises when
        # the client/path classes are missing or dependencies failed to load;
        # confirm against the cloudpathlib source.

Usage Examples

Basic Cache Mode Configuration

from cloudpathlib import FileCacheMode, S3Client, CloudPath

# Configure client with specific cache mode
client = S3Client(
    file_cache_mode=FileCacheMode.persistent,
    local_cache_dir="/var/cache/cloudpathlib"
)

# Create paths with configured caching
path = CloudPath("s3://my-bucket/large-file.dat", client=client)

# File is cached persistently
content = path.read_bytes()  # Downloads and caches
content = path.read_bytes()  # Uses cached version (no download)

# Cache persists across Python sessions

Environment-Based Configuration

import os
from cloudpathlib import FileCacheMode

# Set environment variable
os.environ["CLOUDPATHLIB_CACHE_MODE"] = "persistent"

# Parse from environment
cache_mode = FileCacheMode.from_environment()
print(f"Cache mode: {cache_mode}")  # FileCacheMode.persistent

# Use in client configuration
client = S3Client(file_cache_mode=cache_mode)

Different Cache Strategies

# Persistent caching - files stay cached until manually cleared
persistent_client = S3Client(
    file_cache_mode=FileCacheMode.persistent,
    local_cache_dir="/persistent/cache"
)

# Temporary caching - system handles cleanup
temp_client = S3Client(
    file_cache_mode=FileCacheMode.tmp_dir
)

# Object-scoped caching - cleared when path object is deleted
object_client = S3Client(
    file_cache_mode=FileCacheMode.cloudpath_object
)

# File-scoped caching - cleared when file is closed
file_client = S3Client(
    file_cache_mode=FileCacheMode.close_file
)

# Demonstrate different behaviors
path1 = CloudPath("s3://bucket/file.txt", client=persistent_client)
path2 = CloudPath("s3://bucket/file.txt", client=temp_client)
path3 = CloudPath("s3://bucket/file.txt", client=object_client)
path4 = CloudPath("s3://bucket/file.txt", client=file_client)

# Read files with different caching behaviors
content1 = path1.read_text()  # Cached persistently
content2 = path2.read_text()  # Cached in temp directory
content3 = path3.read_text()  # Cached until path3 is deleted
content4 = path4.read_text()  # Cache cleared immediately after read

Performance-Oriented Configuration

def configure_high_performance_client():
    """Configure client for high-performance scenarios.

    NOTE(review): this snippet references ``boto3`` without importing it —
    an ``import boto3`` is required for the example to run.
    """
    return S3Client(
        file_cache_mode=FileCacheMode.persistent,
        local_cache_dir="/fast/ssd/cache",  # Use fast storage for cache
        boto3_transfer_config=boto3.s3.transfer.TransferConfig(
            multipart_threshold=1024 * 25,  # 25 KiB — NOTE(review): original comment claimed 25MB; for 25 MB use 25 * 1024 * 1024
            max_concurrency=10,
            multipart_chunksize=1024 * 25,  # 25 KiB chunks (same caveat as above)
            use_threads=True
        )
    )

def configure_memory_constrained_client():
    """Configure client for memory-constrained environments.

    Uses ``close_file`` mode so each cached copy is deleted as soon as the
    file handle is closed, keeping local disk usage minimal.
    """
    return S3Client(
        file_cache_mode=FileCacheMode.close_file,  # Immediate cleanup
        local_cache_dir="/tmp/cloudpath_cache"     # Use temp directory
    )

# Use appropriate configuration
high_perf_client = configure_high_performance_client()
memory_client = configure_memory_constrained_client()

Development vs Production Configuration

import os

def get_cache_config():
    """Return cache settings appropriate for the current deployment stage.

    The stage is read from the ENVIRONMENT variable; any value other than
    "production" or "staging" falls back to the development settings.
    """
    settings_by_stage = {
        "production": {
            "file_cache_mode": FileCacheMode.persistent,
            "local_cache_dir": "/var/cache/app/cloudpathlib",
        },
        "staging": {
            "file_cache_mode": FileCacheMode.tmp_dir,
            "local_cache_dir": "/tmp/staging_cache",
        },
        "development": {
            "file_cache_mode": FileCacheMode.cloudpath_object,
            "local_cache_dir": "./dev_cache",
        },
    }
    stage = os.getenv("ENVIRONMENT", "development")
    # Return a fresh dict so callers can safely mutate the result.
    return dict(settings_by_stage.get(stage, settings_by_stage["development"]))

# Apply environment-specific configuration
cache_config = get_cache_config()
client = S3Client(**cache_config)

Cache Directory Management

import tempfile
import shutil
from pathlib import Path

class ManagedCacheDirectory:
    """Provide a throwaway cache directory for the duration of a ``with`` block.

    Entering the context creates a fresh temporary directory and yields its
    path as a string; exiting removes the whole directory tree again.
    """

    def __init__(self, prefix="cloudpath_"):
        # Prefix used for the generated temporary directory name.
        self.prefix = prefix
        # Path of the created directory; None until __enter__ runs.
        self.temp_dir = None

    def __enter__(self):
        created = tempfile.mkdtemp(prefix=self.prefix)
        self.temp_dir = Path(created)
        return str(self.temp_dir)

    def __exit__(self, exc_type, exc_val, exc_tb):
        directory = self.temp_dir
        if directory is not None and directory.exists():
            shutil.rmtree(directory)

# Use managed cache directory
with ManagedCacheDirectory() as cache_dir:
    client = S3Client(
        file_cache_mode=FileCacheMode.persistent,
        local_cache_dir=cache_dir
    )
    
    path = CloudPath("s3://bucket/file.txt", client=client)
    content = path.read_text()  # Cached in managed directory
    
    # Directory automatically cleaned up when exiting context

Cache Monitoring

import os
from pathlib import Path

def get_cache_stats(cache_dir):
    """Summarize the contents of a cache directory.

    Returns ``{"exists": False}`` when the directory is absent; otherwise a
    dict with file/directory counts and aggregate size information.
    """
    root = Path(cache_dir)

    if not root.exists():
        return {"exists": False}

    regular_files = []
    directories = []
    for entry in root.rglob("*"):
        if entry.is_file():
            regular_files.append(entry)
        elif entry.is_dir():
            directories.append(entry)

    sizes = [entry.stat().st_size for entry in regular_files]
    total_bytes = sum(sizes)

    return {
        "exists": True,
        "total_files": len(regular_files),
        "total_directories": len(directories),
        "total_size_bytes": total_bytes,
        "total_size_mb": total_bytes / (1024 * 1024),
        "largest_file_bytes": max(sizes) if sizes else 0,
    }

# Monitor cache usage
cache_dir = "/tmp/cloudpath_cache"
client = S3Client(
    file_cache_mode=FileCacheMode.persistent,
    local_cache_dir=cache_dir
)

# Perform operations
path1 = CloudPath("s3://bucket/file1.txt", client=client)
path2 = CloudPath("s3://bucket/file2.txt", client=client)

content1 = path1.read_text()
content2 = path2.read_text()

# Check cache statistics
stats = get_cache_stats(cache_dir)
print(f"Cache stats: {stats}")

Configuration Validation

from pathlib import Path

def validate_cache_configuration(file_cache_mode, local_cache_dir):
    """Validate cache configuration settings before building a client.

    Checks that ``file_cache_mode`` is a ``FileCacheMode`` member and that
    ``local_cache_dir`` (if given) can be created and written to.  Note this
    probes the filesystem: it attempts to create the cache directory.

    Args:
        file_cache_mode: Expected to be a ``FileCacheMode`` enum member.
        local_cache_dir: Cache directory path, or falsy to skip the check.

    Returns:
        list[str]: Human-readable problem descriptions; empty when valid.
    """
    import os  # local import so this example snippet is self-contained

    issues = []
    
    # Validate cache mode
    if not isinstance(file_cache_mode, FileCacheMode):
        issues.append(f"Invalid cache mode: {file_cache_mode}")
    
    # Validate cache directory
    if local_cache_dir:
        cache_path = Path(local_cache_dir)
        
        # Check if parent directory exists
        if not cache_path.parent.exists():
            issues.append(f"Cache directory parent does not exist: {cache_path.parent}")
        
        # Try to create the directory.  Catch OSError (not just
        # PermissionError): mkdir can also fail when a path component is a
        # regular file, the filesystem is read-only, etc.
        try:
            cache_path.mkdir(parents=True, exist_ok=True)
        except OSError:
            issues.append(f"Cannot create cache directory: {cache_path}")
        
        # Check write permissions
        if cache_path.exists() and not os.access(cache_path, os.W_OK):
            issues.append(f"No write permission to cache directory: {cache_path}")
    
    return issues

# Validate configuration before using
cache_mode = FileCacheMode.persistent
cache_dir = "/tmp/my_cache"

issues = validate_cache_configuration(cache_mode, cache_dir)
if issues:
    print("Configuration issues:")
    for issue in issues:
        print(f"  - {issue}")
else:
    print("Configuration is valid")
    client = S3Client(
        file_cache_mode=cache_mode,
        local_cache_dir=cache_dir
    )

Cache Cleanup Utilities

import time
from datetime import datetime, timedelta

def cleanup_old_cache_files(cache_dir, max_age_days=7):
    """Delete cached files whose mtime is older than ``max_age_days`` days.

    Directories themselves are left in place.  Returns the number of files
    removed; 0 when the cache directory does not exist.
    """
    root = Path(cache_dir)

    if not root.exists():
        return 0

    oldest_allowed = time.time() - max_age_days * 86400  # seconds per day
    removed = 0

    for entry in root.rglob("*"):
        if entry.is_file() and entry.stat().st_mtime < oldest_allowed:
            entry.unlink()
            removed += 1

    return removed

def cleanup_large_cache_files(cache_dir, max_size_mb=100):
    """Delete cached files larger than ``max_size_mb`` megabytes.

    Returns the number of files removed; 0 when the cache directory does
    not exist.
    """
    root = Path(cache_dir)

    if not root.exists():
        return 0

    size_limit = max_size_mb * 1024 * 1024
    removed = 0

    for entry in root.rglob("*"):
        if entry.is_file() and entry.stat().st_size > size_limit:
            entry.unlink()
            removed += 1

    return removed

# Usage
cache_dir = "/tmp/cloudpath_cache"

# Clean up old files
old_files_removed = cleanup_old_cache_files(cache_dir, max_age_days=3)
print(f"Removed {old_files_removed} old cache files")

# Clean up large files
large_files_removed = cleanup_large_cache_files(cache_dir, max_size_mb=50)
print(f"Removed {large_files_removed} large cache files")

Advanced Configuration Patterns

class CacheConfiguration:
    """Registry of named cache configurations for building clients.

    Configurations are stored by name via :meth:`register_config` and later
    expanded into keyword arguments for a client constructor.
    """
    
    def __init__(self):
        # Maps configuration name -> dict of client constructor kwargs.
        self.configurations = {}
    
    def register_config(self, name, **kwargs):
        """Register (or replace) a named configuration."""
        self.configurations[name] = kwargs
    
    def get_client(self, config_name, client_class, **additional_args):
        """Create a client with a named configuration.

        ``additional_args`` override/extend the stored configuration for this
        call only; an unknown name falls back to an empty configuration.

        Bug fix: copy the stored dict before merging — the original mutated
        the registered configuration in place, so per-call overrides leaked
        into every subsequent ``get_client`` call for the same name.
        """
        config = dict(self.configurations.get(config_name, {}))
        config.update(additional_args)
        return client_class(**config)

# Set up configuration registry
cache_config = CacheConfiguration()

# Register different configurations
cache_config.register_config(
    "high_performance",
    file_cache_mode=FileCacheMode.persistent,
    local_cache_dir="/fast/cache"
)

cache_config.register_config(
    "low_memory",
    file_cache_mode=FileCacheMode.close_file,
    local_cache_dir="/tmp/cache"
)

cache_config.register_config(
    "development",
    file_cache_mode=FileCacheMode.cloudpath_object,
    local_cache_dir="./dev_cache"
)

# Create clients with named configurations
high_perf_s3 = cache_config.get_client(
    "high_performance",
    S3Client,
    aws_profile="production"
)

low_mem_gs = cache_config.get_client(
    "low_memory",
    GSClient,
    project="my-project"
)

Environment Variable Integration

import os

class EnvironmentConfiguration:
    """Configuration management using environment variables.

    Settings are read lazily from the process environment:

    * ``CLOUDPATHLIB_CACHE_MODE``    - cache strategy name (default "tmp_dir")
    * ``CLOUDPATHLIB_CACHE_DIR``     - optional cache directory override
    * ``CLOUDPATHLIB_DISABLE_CACHE`` - "true" disables caching entirely
    """
    
    @staticmethod
    def get_cache_mode():
        """Return the FileCacheMode named by the environment, or tmp_dir."""
        mode_str = os.environ.get("CLOUDPATHLIB_CACHE_MODE", "tmp_dir")
        try:
            return FileCacheMode(mode_str)
        except ValueError:
            # Unknown value: warn and fall back to the library default.
            print(f"Invalid cache mode '{mode_str}', using default")
            return FileCacheMode.tmp_dir
    
    @staticmethod
    def get_cache_dir():
        """Return CLOUDPATHLIB_CACHE_DIR, or None when unset."""
        return os.environ.get("CLOUDPATHLIB_CACHE_DIR")
    
    @staticmethod
    def is_caching_enabled():
        """Caching is on unless CLOUDPATHLIB_DISABLE_CACHE equals "true" (any case)."""
        flag = os.environ.get("CLOUDPATHLIB_DISABLE_CACHE", "")
        return flag.lower() != "true"
    
    @classmethod
    def create_s3_client(cls):
        """Build an S3Client whose caching behavior reflects the environment."""
        if cls.is_caching_enabled():
            return S3Client(
                file_cache_mode=cls.get_cache_mode(),
                local_cache_dir=cls.get_cache_dir()
            )
        # Caching disabled: drop cached copies as soon as files are closed.
        return S3Client(file_cache_mode=FileCacheMode.close_file)

# Usage with environment variables
"""
Environment setup:
export CLOUDPATHLIB_CACHE_MODE=persistent
export CLOUDPATHLIB_CACHE_DIR=/var/cache/myapp
export CLOUDPATHLIB_DISABLE_CACHE=false
"""

env_client = EnvironmentConfiguration.create_s3_client()
path = CloudPath("s3://bucket/file.txt", client=env_client)

Configuration Documentation

def print_configuration_help():
    """Print help for CloudPathLib configuration options.

    Writes a static reference of the supported environment variables and
    cache modes to stdout.  Takes no arguments and returns None.
    """
    
    help_text = """
CloudPathLib Configuration Options
=================================

Environment Variables:
  CLOUDPATHLIB_CACHE_MODE     - Cache management mode
                                Values: persistent, tmp_dir, cloudpath_object, close_file
                                Default: tmp_dir
  
  CLOUDPATHLIB_CACHE_DIR      - Custom cache directory path
                                Default: System temp directory
  
  CLOUDPATHLIB_DISABLE_CACHE  - Disable all caching
                                Values: true, false
                                Default: false
  
  CLOUDPATHLIB_PATCH_OPEN     - Auto-patch open() function
                                Values: true, false
                                Default: false
  
  CLOUDPATHLIB_PATCH_OS       - Auto-patch os functions
                                Values: true, false
                                Default: false
  
  CLOUDPATHLIB_PATCH_GLOB     - Auto-patch glob functions
                                Values: true, false
                                Default: false
  
  CLOUDPATHLIB_PATCH_ALL      - Auto-patch all functions
                                Values: true, false
                                Default: false

Cache Modes:
  persistent        - Files cached until manually cleared
  tmp_dir          - Files cached in temp directory (default)
  cloudpath_object - Cache cleared when CloudPath deleted
  close_file       - Cache cleared when file closed

Example Configuration:
  export CLOUDPATHLIB_CACHE_MODE=persistent
  export CLOUDPATHLIB_CACHE_DIR=/var/cache/myapp
  export CLOUDPATHLIB_PATCH_ALL=true
"""
    
    print(help_text)

# Show configuration help
print_configuration_help()

Install with Tessl CLI

npx tessl i tessl/pypi-cloudpathlib

docs

anypath.md

azure-integration.md

client-management.md

cloud-operations.md

configuration.md

core-operations.md

directory-operations.md

exceptions.md

file-io.md

gcs-integration.md

http-support.md

index.md

patching.md

s3-integration.md

tile.json