CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-pystow

Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.

Pending
Overview
Eval results
Files

cloud-storage.mddocs/

Cloud Storage Integration

PyStow provides built-in support for downloading files from major cloud storage services, including AWS S3 and Google Drive. This enables seamless integration with cloud-hosted datasets and files.

AWS S3 Support

S3 File Download

def ensure_from_s3(key: str, *subkeys: str, s3_bucket: str, s3_key: str | Sequence[str], name: str | None = None, force: bool = False, **kwargs: Any) -> Path:
    """Ensure a file is downloaded from AWS S3.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        s3_bucket: The S3 bucket name
        s3_key: The S3 key name, given either as a single string or as a sequence
            of key parts to be joined into one key.
        name: Overrides the name of the file at the end of the S3 key, if given.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        kwargs: Remaining kwargs to forward to Module.ensure_from_s3.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """

Google Drive Support

Google Drive File Download

def ensure_from_google(key: str, *subkeys: str, name: str, file_id: str, force: bool = False) -> Path:
    """Ensure a file is downloaded from Google Drive.

    Args:
        key: The module name. Avoid special characters. Before falling back to the
            default home directory, the environment variable <KEY>_HOME (the key,
            uppercased) is consulted.
        subkeys: Extra path components joined beneath the module directory; with
            none given, the module directory itself is used.
        name: The filename to store the download under.
        file_id: The Google file identifier. Given a share link such as
            https://drive.google.com/file/d/1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z/view, then
            your file ID is 1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z.
        force: If true, download again even when the target path already exists.
            Defaults to false.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """

Usage Examples

AWS S3 Downloads

import pystow

# Download file from S3 bucket
# The file is cached under the myapp/datasets module directory; the download
# is skipped when the file already exists locally.
path = pystow.ensure_from_s3(
    "myapp", "datasets",
    s3_bucket="my-data-bucket",
    s3_key="datasets/v1/train.csv",
    name="training_data.csv"
)

# Download with nested S3 key
# The key may be passed as a sequence of parts to be joined into one key.
path = pystow.ensure_from_s3(
    "myapp", "models",
    s3_bucket="ml-models",
    s3_key=["experiments", "model_v2", "checkpoint.pkl"],
    name="model_checkpoint.pkl"
)

# Use custom name
# `name` overrides the last component of the S3 key for local storage.
path = pystow.ensure_from_s3(
    "myapp", "resources",
    s3_bucket="public-datasets",
    s3_key="data/raw/file_with_complex_name.csv",
    name="simple_name.csv"  # Rename for local storage
)

Google Drive Downloads

import pystow

# Download from Google Drive using file ID
# The file ID is the token between /d/ and /view in a Drive share link.
path = pystow.ensure_from_google(
    "myapp", "datasets",
    name="dataset.zip",
    file_id="1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z"
)

# Force re-download
# force=True refetches the file even when a cached copy already exists.
path = pystow.ensure_from_google(
    "myapp", "models",
    name="pretrained_model.pkl",
    file_id="1BcDfG2hIjKlMnOpQrStUvWxYz3456789",
    force=True
)

Module-Based Cloud Downloads

import pystow

# Create module for project
# A module object fixes the base key so repeated calls omit the key argument.
module = pystow.module("myproject")

# Download from S3 using module
# Only subkeys are passed here; the "myproject" key comes from the module.
s3_path = module.ensure_from_s3(
    "data", "raw",
    s3_bucket="research-data",
    s3_key="experiments/dataset_v3.csv"
)

# Download from Google Drive using module
gdrive_path = module.ensure_from_google(
    "models", "pretrained",
    name="bert_model.tar.gz",
    file_id="1ExAmPlE_fIlE_iD_123456789"
)

AWS S3 Configuration

import pystow
import boto3

# Download with custom boto3 client configuration
# NOTE(review): never hardcode real credentials in source files — prefer the
# AWS CLI config, environment variables, or IAM roles. The values below are
# placeholders for illustration only.
path = pystow.ensure_from_s3(
    "myapp", "secure_data",
    s3_bucket="private-bucket",
    s3_key="sensitive/data.json",
    client_kwargs={
        "region_name": "us-west-2",
        "aws_access_key_id": "your_access_key",
        "aws_secret_access_key": "your_secret_key"
    }
)

# Using existing boto3 client
# Reuse a pre-configured client when several downloads share the same setup.
s3_client = boto3.client('s3', region_name='eu-west-1')
path = pystow.ensure_from_s3(
    "myapp", "eu_data",
    s3_bucket="eu-data-bucket",
    s3_key="regional/dataset.csv",
    client=s3_client
)

Advanced S3 Downloads

import pystow
from boto3.s3.transfer import TransferConfig

# Download with additional S3 transfer options.
# boto3's download_file expects ``Config`` to be a TransferConfig instance —
# passing a plain dict raises at runtime.
path = pystow.ensure_from_s3(
    "myapp", "large_files",
    s3_bucket="big-data-bucket",
    s3_key="large_dataset/data.parquet",
    download_file_kwargs={
        "Config": TransferConfig(
            multipart_threshold=1024 * 25,  # 25MB
            max_concurrency=10,
            multipart_chunksize=1024 * 25,
            use_threads=True,
        )
    }
)

# Download and force refresh
path = pystow.ensure_from_s3(
    "myapp", "live_data",
    s3_bucket="streaming-data",
    s3_key="current/metrics.json",
    force=True  # Always fetch latest version
)

Error Handling and Authentication

import pystow
from botocore.exceptions import NoCredentialsError, ClientError

# Handle the common S3 failure modes: missing local credentials and
# service-side errors reported through ClientError.
try:
    # Download from S3
    path = pystow.ensure_from_s3(
        "myapp", "datasets",
        s3_bucket="secure-bucket",
        s3_key="protected/data.csv"
    )
    print(f"Downloaded to: {path}")

except NoCredentialsError:
    # boto3 found no credentials in any of its configured sources
    print("AWS credentials not found. Please configure AWS CLI or set environment variables.")

except ClientError as e:
    # Inspect the structured error code returned by the S3 API
    error_code = e.response['Error']['Code']
    if error_code == 'NoSuchBucket':
        print("S3 bucket does not exist")
    elif error_code == 'NoSuchKey':
        print("S3 key does not exist")
    elif error_code == 'AccessDenied':
        print("Access denied to S3 resource")
    else:
        print(f"S3 error: {e}")

Cloud-Based Data Processing Workflows

import pystow
import pandas as pd

def process_s3_dataset(bucket: str, key: str, output_name: str) -> pd.DataFrame:
    """Download S3 dataset, process it, and save locally.

    Args:
        bucket: Name of the S3 bucket that holds the raw CSV file.
        key: S3 object key of the raw CSV file.
        output_name: Filename used when saving the processed CSV via pystow.

    Returns:
        The processed dataframe: per-category mean of 'value' and sum of 'count'.
    """

    # Download raw data from S3 (cached; skipped if already present locally)
    raw_path = pystow.ensure_from_s3(
        "myapp", "raw_data",
        s3_bucket=bucket,
        s3_key=key
    )

    # Load and process data
    # assumes the CSV has 'category', 'value', and 'count' columns — TODO confirm
    df = pd.read_csv(raw_path)
    processed_df = df.groupby('category').agg({
        'value': 'mean',
        'count': 'sum'
    }).reset_index()

    # Save processed data locally
    pystow.dump_df(
        "myapp", "processed",
        name=output_name,
        obj=processed_df
    )

    return processed_df

# Use the function
result = process_s3_dataset(
    bucket="analytics-data",
    key="daily_reports/2023/report_2023_12_01.csv",
    output_name="daily_summary.csv"
)

Multi-Source Data Integration

import pystow
import pandas as pd

def integrate_cloud_datasets() -> pd.DataFrame:
    """Integrate datasets from multiple cloud sources.

    Downloads one CSV from S3 and one from Google Drive, inner-joins them
    on 'id', saves the merged result via pystow, and returns it.

    Returns:
        The merged dataframe.
    """

    # Download from S3
    s3_data_path = pystow.ensure_from_s3(
        "myapp", "sources", "s3",
        s3_bucket="primary-data",
        s3_key="exports/dataset_a.csv"
    )

    # Download from Google Drive
    gdrive_data_path = pystow.ensure_from_google(
        "myapp", "sources", "gdrive",
        name="dataset_b.csv",
        file_id="1ExAmPlE_gDrIvE_fIlE_iD"
    )

    # Load both datasets
    df_a = pd.read_csv(s3_data_path)
    df_b = pd.read_csv(gdrive_data_path)

    # Merge datasets
    # assumes both CSVs share an 'id' column — TODO confirm
    merged_df = pd.merge(df_a, df_b, on='id', how='inner')

    # Save integrated dataset
    pystow.dump_df(
        "myapp", "integrated",
        name="combined_dataset.csv",
        obj=merged_df
    )

    return merged_df

# Integrate data from multiple sources
combined_data = integrate_cloud_datasets()

Authentication Setup

AWS S3 Authentication

PyStow uses boto3 for S3 access, which supports multiple authentication methods:

  1. AWS CLI Configuration:

    aws configure
  2. Environment Variables:

    export AWS_ACCESS_KEY_ID=your_access_key
    export AWS_SECRET_ACCESS_KEY=your_secret_key
    export AWS_DEFAULT_REGION=us-east-1
  3. IAM Roles (when running on AWS infrastructure)

  4. Programmatic Configuration:

    path = pystow.ensure_from_s3(
        "myapp", "data",
        s3_bucket="my-bucket",
        s3_key="data.csv",
        client_kwargs={
            "aws_access_key_id": "your_key",
            "aws_secret_access_key": "your_secret",
            "region_name": "us-west-2"
        }
    )

Google Drive Authentication

Google Drive downloads work with publicly shared files using the file ID from the share URL. For private files, additional authentication setup may be required through the Google API.

Install with Tessl CLI

npx tessl i tessl/pypi-pystow

docs

archives.md

cloud-storage.md

configuration.md

data-formats.md

directory-management.md

file-operations.md

index.md

module-class.md

nltk-integration.md

web-scraping.md

tile.json