CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-cloudpathlib

Pathlib-style classes for cloud storage services that provide seamless access to AWS S3, Google Cloud Storage, and Azure Blob Storage with familiar filesystem operations.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/anypath.md

Universal Path Handling

AnyPath provides intelligent dispatching between cloud paths and local filesystem paths, enabling code that works seamlessly with both local and cloud storage. This universal interface allows you to write path-agnostic code that automatically handles different storage backends.

Capabilities

AnyPath Class

Polymorphic constructor that automatically dispatches to the appropriate path type.

class AnyPath:
    """Universal path constructor.

    Polymorphic entry point: instantiating AnyPath never produces an
    AnyPath object — ``__new__`` returns either a CloudPath subclass
    (for cloud URIs such as ``s3://``, ``gs://``, ``az://``) or a
    plain ``pathlib.Path`` for local filesystem paths.
    """
    
    def __new__(
        cls,
        *args,
        **kwargs
    ) -> typing.Union[CloudPath, "pathlib.Path"]:
        """
        Create appropriate path type based on input.
        
        Args:
            *args: Path arguments, as accepted by CloudPath or pathlib.Path
            **kwargs: Additional arguments forwarded to the chosen constructor
            
        Returns:
            CloudPath instance for cloud URIs, pathlib.Path for local paths
        """
    
    @classmethod
    def validate(cls, v):
        """
        Pydantic validator for AnyPath instances.
        
        Allows AnyPath to be used as a field annotation in pydantic models.
        
        Args:
            v: Value to validate (string, PathLike, or existing path object)
            
        Returns:
            Validated path object
        """

Helper Functions

Utility functions for path conversion and handling.

def to_anypath(
    s: typing.Union[str, "os.PathLike"]
) -> typing.Union[CloudPath, "pathlib.Path"]:
    """
    Convert string or PathLike to appropriate path type.
    
    Module-level helper mirroring the dispatch performed by AnyPath.
    
    Args:
        s: String or path-like object
        
    Returns:
        CloudPath for cloud URIs, pathlib.Path for local paths
    """

Usage Examples

Basic AnyPath Usage

from cloudpathlib import AnyPath

# Automatically dispatches to appropriate path type based on the URI scheme
cloud_path = AnyPath("s3://my-bucket/file.txt")
print(type(cloud_path))  # <class 'cloudpathlib.s3.S3Path'>

local_path = AnyPath("/home/user/file.txt")
print(type(local_path))  # <class 'pathlib.PosixPath'> (on POSIX systems)

windows_path = AnyPath("C:\\Users\\user\\file.txt")
print(type(windows_path))  # <class 'pathlib.WindowsPath'> (on Windows)

# Works with different cloud providers — each scheme maps to its own path class
gcs_path = AnyPath("gs://my-bucket/file.txt")
azure_path = AnyPath("az://my-container/file.txt")
http_path = AnyPath("https://example.com/file.txt")  # HTTP(S) URLs dispatch too

Path-Agnostic Functions

def process_file(path_str):
    """Process a file regardless of where it is stored.

    Dispatches to the right path type via AnyPath, uppercases the
    file's text content, and writes the result alongside the input
    with a "_processed" suffix appended to the stem.

    Args:
        path_str: Local path or cloud URI string.

    Returns:
        The path object of the written output file.

    Raises:
        FileNotFoundError: If the input path does not exist.
    """
    source = AnyPath(path_str)

    # Guard clause: fail fast on a missing input.
    if not source.exists():
        raise FileNotFoundError(f"File not found: {source}")

    # The same read/write API works for both local and cloud paths.
    transformed = source.read_text().upper()

    destination = source.with_stem(source.stem + "_processed")
    destination.write_text(transformed)
    return destination

# Works with any path type — the same function handles local and cloud files
local_result = process_file("/tmp/local_file.txt")
s3_result = process_file("s3://bucket/cloud_file.txt")
gcs_result = process_file("gs://bucket/gcs_file.txt")

Configuration-Driven Path Handling

import os
from pathlib import Path

def get_data_path(filename):
    """Resolve the data path for *filename* from environment configuration.

    Reads STORAGE_TYPE ("local", "s3", or "gcs") and the matching
    location variables to build the base path.

    Args:
        filename: Name of the file under the configured data directory.

    Returns:
        Path object for the file on the configured backend.

    Raises:
        ValueError: If STORAGE_TYPE is not a recognized value.
    """
    storage_type = os.getenv("STORAGE_TYPE", "local")

    # Early-return dispatch per storage backend.
    if storage_type == "local":
        return AnyPath(os.getenv("LOCAL_DATA_DIR", "./data")) / filename
    if storage_type == "s3":
        return AnyPath(f"s3://{os.getenv('S3_BUCKET', 'default-bucket')}/data") / filename
    if storage_type == "gcs":
        return AnyPath(f"gs://{os.getenv('GCS_BUCKET', 'default-bucket')}/data") / filename

    raise ValueError(f"Unknown storage type: {storage_type}")

# Usage - works with any configured storage backend (local, s3, or gcs)
data_file = get_data_path("dataset.csv")
print(f"Using: {data_file}")

# Read/write operations work the same regardless of backend
if data_file.exists():
    data = data_file.read_text()
else:
    # Seed the file with a small CSV header + one row when absent
    data_file.write_text("id,name,value\n1,test,100")

Batch Processing with Mixed Paths

def process_file_list(file_paths):
    """Collect size/mtime metadata for files across storage backends.

    Args:
        file_paths: Iterable of path strings (local paths or cloud URIs).

    Returns:
        List of dicts with 'path', 'type', 'size', and 'modified' for
        each existing file; non-existent files are skipped with a message.
    """
    results = []

    for path_str in file_paths:
        path = AnyPath(path_str)

        print(f"Processing {path} (type: {type(path).__name__})")

        if path.exists():
            # Fetch metadata once: for cloud paths, each stat() call can
            # trigger a separate network request, so avoid calling it twice.
            info = path.stat()

            results.append({
                'path': str(path),
                'type': type(path).__name__,
                'size': info.st_size,
                'modified': info.st_mtime,
            })
        else:
            print(f"Skipping non-existent file: {path}")

    return results

# Mix of local and cloud paths
# Mix of local and cloud paths — POSIX, S3, GCS, Azure, and Windows styles
mixed_paths = [
    "/home/user/local_file.txt",
    "s3://my-bucket/s3_file.txt",
    "gs://my-bucket/gcs_file.txt",
    "az://my-container/azure_file.txt",
    "C:\\Users\\user\\windows_file.txt"
]

results = process_file_list(mixed_paths)
# Each result records which concrete path class handled the file
for result in results:
    print(f"{result['type']}: {result['path']} ({result['size']} bytes)")

Data Pipeline with Flexible Storage

class DataPipeline:
    """Storage-agnostic data pipeline: read, transform, write.

    Input, output, and optional temp locations may each live on a
    different backend (local, S3, GCS, Azure); AnyPath dispatches
    every location to the appropriate concrete path type.
    """

    def __init__(self, input_path, output_path, temp_dir=None):
        # Each location is independently resolved to a cloud or local path.
        self.input_path = AnyPath(input_path)
        self.output_path = AnyPath(output_path)
        self.temp_dir = AnyPath(temp_dir) if temp_dir else None

    def process(self):
        """Run the pipeline and return the output path."""
        print(f"Input: {self.input_path} ({type(self.input_path).__name__})")
        print(f"Output: {self.output_path} ({type(self.output_path).__name__})")

        # Read, transform, then (optionally) stage and write.
        transformed = self.transform_data(self.input_path.read_text())

        if self.temp_dir:
            staging_file = self.temp_dir / f"temp_{self.input_path.name}"
            staging_file.parent.mkdir(parents=True, exist_ok=True)
            staging_file.write_text(transformed)
            print(f"Temp file: {staging_file}")

        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        self.output_path.write_text(transformed)

        return self.output_path

    def transform_data(self, data):
        """Prefix every line of *data* with "PROCESSED: " (example transform)."""
        return '\n'.join(
            f"PROCESSED: {line}" for line in data.strip().split('\n')
        )

# Works with any combination of storage types
# Pipeline 1: S3 input, local output, GCS staging area
pipeline1 = DataPipeline(
    input_path="s3://source-bucket/raw_data.txt",
    output_path="/tmp/processed_data.txt",
    temp_dir="gs://temp-bucket/pipeline-temp/"
)

# Pipeline 2: local input, Azure output, no staging
pipeline2 = DataPipeline(
    input_path="/home/user/input.txt",
    output_path="az://output-container/result.txt"
)

# Same interface, different storage backends
result1 = pipeline1.process()
result2 = pipeline2.process()

Dynamic Path Resolution

def resolve_path(path_spec):
    """Resolve a path specification to a concrete path object.

    Args:
        path_spec: Either a direct path (string / PathLike) or a dict
            describing the storage location, one of:
              {'type': 'local', 'base_dir': ..., 'filename': ...}
              {'type': 's3',    'bucket': ...,   'key': ...}
              {'type': 'gcs',   'bucket': ...,   'blob': ...}
              {'type': 'azure', 'container': ..., 'blob': ...}

    Returns:
        CloudPath or pathlib.Path, as produced by AnyPath.

    Raises:
        ValueError: If a dict spec has an unrecognized 'type'.
        KeyError: If a dict spec is missing a required field.
    """
    if isinstance(path_spec, dict):
        # Dynamic path specification
        storage_type = path_spec.get('type', 'local')

        if storage_type == 'local':
            base_dir = path_spec.get('base_dir', '.')
            filename = path_spec['filename']
            return AnyPath(base_dir) / filename

        if storage_type == 's3':
            return AnyPath(f"s3://{path_spec['bucket']}/{path_spec['key']}")

        if storage_type == 'gcs':
            return AnyPath(f"gs://{path_spec['bucket']}/{path_spec['blob']}")

        if storage_type == 'azure':
            return AnyPath(f"az://{path_spec['container']}/{path_spec['blob']}")

        # Previously an unknown type fell off the end of the if/elif chain
        # and returned None, surfacing later as a confusing AttributeError.
        raise ValueError(f"Unknown storage type: {storage_type}")

    # Direct path specification (string or PathLike)
    return AnyPath(path_spec)

# Example path specifications: direct strings and structured dict specs
path_specs = [
    "/direct/local/path.txt",
    "s3://direct-bucket/file.txt",
    {
        'type': 'local',
        'base_dir': '/home/user/data',
        'filename': 'config.json'
    },
    {
        'type': 's3',
        'bucket': 'my-data-bucket',
        'key': 'processed/results.csv'
    },
    {
        'type': 'gcs',
        'bucket': 'analytics-bucket',
        'blob': 'reports/monthly.pdf'
    }
]

# Resolve all specifications and show which path class each one got
resolved_paths = [resolve_path(spec) for spec in path_specs]
for original, resolved in zip(path_specs, resolved_paths):
    print(f"{original} -> {resolved} ({type(resolved).__name__})")

Testing with Path Abstraction

import tempfile
import pytest

class TestDataProcessor:
    """Exercise the file processor against multiple storage backends."""

    def setup_test_data(self, storage_type="local"):
        """Create a small test file on the requested backend and return its path."""
        # Flattened dispatch: one branch per supported backend.
        if storage_type == "local":
            test_file = AnyPath(tempfile.mkdtemp()) / "test_data.txt"
        elif storage_type == "s3":
            # Cloud targets rely on environment-configured credentials.
            test_file = AnyPath("s3://test-bucket/test_data.txt")
        elif storage_type == "gcs":
            test_file = AnyPath("gs://test-bucket/test_data.txt")
        else:
            pytest.skip(f"Storage type {storage_type} not configured for testing")

        # Identical setup code regardless of backend.
        test_file.write_text("line1\nline2\nline3")
        return test_file

    @pytest.mark.parametrize("storage_type", ["local", "s3", "gcs"])
    def test_file_processing(self, storage_type):
        """Processing behaves identically across storage backends."""
        test_file = self.setup_test_data(storage_type)

        result = process_file(str(test_file))

        # Output exists and content was uppercased by process_file.
        assert result.exists()
        assert "LINE1" in result.read_text()

        # Only local artifacts need explicit cleanup here.
        if storage_type == "local":
            result.unlink()
            test_file.unlink()

# Usage in configuration management
def load_config(config_path_spec):
    """Load configuration from various sources."""
    config_path = AnyPath(config_path_spec)
    
    if config_path.exists():
        return json.loads(config_path.read_text())
    else:
        # Return default config
        return {"default": True}

# Works with any path type — local file, S3 object, or GCS blob
local_config = load_config("./config.json")
s3_config = load_config("s3://config-bucket/prod-config.json")
gcs_config = load_config("gs://config-bucket/staging-config.json")

Helper Function Usage

from cloudpathlib import to_anypath

# Convert various inputs to appropriate path types
# (strings, existing pathlib.Path objects, cloud URIs, HTTP(S) URLs)
paths = [
    "/local/file.txt",
    "s3://bucket/file.txt",
    Path("/another/local/file.txt"),
    "gs://bucket/data.json",
    "https://example.com/api/data"
]

converted_paths = [to_anypath(p) for p in paths]

# Show which concrete path class each input was dispatched to
for original, converted in zip(paths, converted_paths):
    print(f"{original} -> {type(converted).__name__}")

# Use in functions that accept string or path objects
def safe_read_file(path_input):
    """Safely read file from string or path object."""
    path = to_anypath(path_input)
    
    try:
        return path.read_text()
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return None

# Works with any input type: plain string, cloud URI, or pathlib.Path
content1 = safe_read_file("/tmp/file.txt")
content2 = safe_read_file("s3://bucket/file.txt")
content3 = safe_read_file(Path("/home/user/file.txt"))

Pydantic Integration

from typing import Optional

from pydantic import BaseModel
from cloudpathlib import AnyPath

class DataConfig(BaseModel):
    """Configuration model with path validation.

    Paths may point at local or cloud storage; AnyPath dispatches
    each value to the appropriate concrete path type.
    """

    input_path: AnyPath   # source data location (required)
    output_path: AnyPath  # destination for processed data (required)
    # Explicitly Optional: a bare `AnyPath = None` default is rejected by
    # pydantic v2 and only implicitly tolerated by v1.
    temp_dir: Optional[AnyPath] = None

    class Config:
        # AnyPath is not a pydantic-native type, so arbitrary types
        # must be allowed for validation to accept it.
        arbitrary_types_allowed = True

# Validation works with any path type — each field may use a different backend
config_data = {
    "input_path": "s3://source-bucket/data.csv",
    "output_path": "/tmp/processed.csv",
    "temp_dir": "gs://temp-bucket/workspace/"
}

config = DataConfig(**config_data)
print(f"Input: {config.input_path} ({type(config.input_path).__name__})")
print(f"Output: {config.output_path} ({type(config.output_path).__name__})")
print(f"Temp: {config.temp_dir} ({type(config.temp_dir).__name__})")

# Use validated paths directly — the path API is uniform across backends
if config.input_path.exists():
    data = config.input_path.read_text()
    config.output_path.write_text(data.upper())

Install with Tessl CLI

npx tessl i tessl/pypi-cloudpathlib

docs

anypath.md

azure-integration.md

client-management.md

cloud-operations.md

configuration.md

core-operations.md

directory-operations.md

exceptions.md

file-io.md

gcs-integration.md

http-support.md

index.md

patching.md

s3-integration.md

tile.json