Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.
—
PyStow provides built-in support for downloading files from major cloud storage services, including AWS S3 and Google Drive. This enables seamless integration with cloud-hosted datasets and files.
def ensure_from_s3(key: str, *subkeys: str, s3_bucket: str, s3_key: str | Sequence[str], name: str | None = None, force: bool = False, **kwargs: Any) -> Path:
"""Ensure a file is downloaded from AWS S3.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME where
key is uppercased is checked first before using the default home directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
s3_bucket: The S3 bucket name
s3_key: The S3 key name
name: Overrides the name of the file at the end of the S3 key, if given.
force: Should the download be done again, even if the path already exists?
Defaults to false.
kwargs: Remaining kwargs to forward to Module.ensure_from_s3.
Returns:
The path of the file that has been downloaded (or already exists)
"""def ensure_from_google(key: str, *subkeys: str, name: str, file_id: str, force: bool = False) -> Path:
"""Ensure a file is downloaded from Google Drive.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME where
key is uppercased is checked first before using the default home directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
name: The name of the file
file_id: The file identifier of the Google file. If your share link is
https://drive.google.com/file/d/1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z/view, then
your file ID is 1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z.
force: Should the download be done again, even if the path already exists?
Defaults to false.
Returns:
The path of the file that has been downloaded (or already exists)
"""import pystow
import pystow

# Download file from S3 bucket
path = pystow.ensure_from_s3(
    "myapp", "datasets",
    s3_bucket="my-data-bucket",
    s3_key="datasets/v1/train.csv",
    name="training_data.csv",
)

# Download with nested S3 key
path = pystow.ensure_from_s3(
    "myapp", "models",
    s3_bucket="ml-models",
    s3_key=["experiments", "model_v2", "checkpoint.pkl"],
    name="model_checkpoint.pkl",
)

# Use custom name
path = pystow.ensure_from_s3(
    "myapp", "resources",
    s3_bucket="public-datasets",
    s3_key="data/raw/file_with_complex_name.csv",
    name="simple_name.csv",  # Rename for local storage
)
import pystow

# Download from Google Drive using file ID
path = pystow.ensure_from_google(
    "myapp", "datasets",
    name="dataset.zip",
    file_id="1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z",
)

# Force re-download
path = pystow.ensure_from_google(
    "myapp", "models",
    name="pretrained_model.pkl",
    file_id="1BcDfG2hIjKlMnOpQrStUvWxYz3456789",
    force=True,
)
import pystow

# Create module for project
module = pystow.module("myproject")

# Download from S3 using module
s3_path = module.ensure_from_s3(
    "data", "raw",
    s3_bucket="research-data",
    s3_key="experiments/dataset_v3.csv",
)

# Download from Google Drive using module
gdrive_path = module.ensure_from_google(
    "models", "pretrained",
    name="bert_model.tar.gz",
    file_id="1ExAmPlE_fIlE_iD_123456789",
)
import pystow
import boto3

# Download with custom boto3 client configuration
path = pystow.ensure_from_s3(
    "myapp", "secure_data",
    s3_bucket="private-bucket",
    s3_key="sensitive/data.json",
    client_kwargs={
        "region_name": "us-west-2",
        "aws_access_key_id": "your_access_key",
        "aws_secret_access_key": "your_secret_key",
    },
)

# Using existing boto3 client
s3_client = boto3.client('s3', region_name='eu-west-1')
path = pystow.ensure_from_s3(
    "myapp", "eu_data",
    s3_bucket="eu-data-bucket",
    s3_key="regional/dataset.csv",
    client=s3_client,
)
import pystow

# Download with additional S3 transfer options
path = pystow.ensure_from_s3(
    "myapp", "large_files",
    s3_bucket="big-data-bucket",
    s3_key="large_dataset/data.parquet",
    download_file_kwargs={
        # NOTE(review): boto3's download_file expects Config to be a
        # boto3.s3.transfer.TransferConfig instance, not a plain dict — verify
        # whether pystow converts this mapping before forwarding.
        "Config": {
            "multipart_threshold": 1024 * 25,  # 25MB
            "max_concurrency": 10,
            "multipart_chunksize": 1024 * 25,
            "use_threads": True,
        }
    },
)

# Download and force refresh
path = pystow.ensure_from_s3(
    "myapp", "live_data",
    s3_bucket="streaming-data",
    s3_key="current/metrics.json",
    force=True,  # Always fetch latest version
)
import pystow
from botocore.exceptions import NoCredentialsError, ClientError

try:
    # Download from S3
    path = pystow.ensure_from_s3(
        "myapp", "datasets",
        s3_bucket="secure-bucket",
        s3_key="protected/data.csv",
    )
    print(f"Downloaded to: {path}")
except NoCredentialsError:
    print("AWS credentials not found. Please configure AWS CLI or set environment variables.")
except ClientError as e:
    # botocore packs the S3 error details into e.response['Error']
    error_code = e.response['Error']['Code']
    if error_code == 'NoSuchBucket':
        print("S3 bucket does not exist")
    elif error_code == 'NoSuchKey':
        print("S3 key does not exist")
    elif error_code == 'AccessDenied':
        print("Access denied to S3 resource")
    else:
        print(f"S3 error: {e}")
import pystow
import pandas as pd


def process_s3_dataset(bucket, key, output_name):
    """Download an S3 dataset, process it, and save the result locally.

    Args:
        bucket: The S3 bucket to download from.
        key: The S3 key of the raw CSV file.
        output_name: The local file name for the processed output.

    Returns:
        The processed dataframe (per-category mean of 'value' and sum of 'count').
    """
    # Download raw data from S3 (cached unless force is used)
    raw_path = pystow.ensure_from_s3(
        "myapp", "raw_data",
        s3_bucket=bucket,
        s3_key=key,
    )
    # Load and process data
    df = pd.read_csv(raw_path)
    processed_df = df.groupby('category').agg({
        'value': 'mean',
        'count': 'sum',
    }).reset_index()
    # Save processed data locally under the module's directory
    pystow.dump_df(
        "myapp", "processed",
        name=output_name,
        obj=processed_df,
    )
    return processed_df


# Use the function
result = process_s3_dataset(
    bucket="analytics-data",
    key="daily_reports/2023/report_2023_12_01.csv",
    output_name="daily_summary.csv",
)
import pystow
import pandas as pd


def integrate_cloud_datasets():
    """Integrate datasets downloaded from multiple cloud sources.

    Downloads one dataset from S3 and one from Google Drive, inner-joins them
    on 'id', stores the merged result locally, and returns it.

    Returns:
        The merged dataframe.
    """
    # Download from S3
    s3_data_path = pystow.ensure_from_s3(
        "myapp", "sources", "s3",
        s3_bucket="primary-data",
        s3_key="exports/dataset_a.csv",
    )
    # Download from Google Drive
    gdrive_data_path = pystow.ensure_from_google(
        "myapp", "sources", "gdrive",
        name="dataset_b.csv",
        file_id="1ExAmPlE_gDrIvE_fIlE_iD",
    )
    # Load both datasets
    df_a = pd.read_csv(s3_data_path)
    df_b = pd.read_csv(gdrive_data_path)
    # Merge datasets on the shared 'id' column
    merged_df = pd.merge(df_a, df_b, on='id', how='inner')
    # Save integrated dataset
    pystow.dump_df(
        "myapp", "integrated",
        name="combined_dataset.csv",
        obj=merged_df,
    )
    return merged_df


# Integrate data from multiple sources
combined_data = integrate_cloud_datasets()

PyStow uses boto3 for S3 access, which supports multiple authentication methods:
AWS CLI Configuration:
aws configure

Environment Variables:
export AWS_ACCESS_KEY_ID=your_access_key
export AWS_SECRET_ACCESS_KEY=your_secret_key
export AWS_DEFAULT_REGION=us-east-1

IAM Roles (when running on AWS infrastructure)
Programmatic Configuration:
# Supply AWS credentials programmatically via kwargs forwarded to boto3.client
path = pystow.ensure_from_s3(
"myapp", "data",
s3_bucket="my-bucket",
s3_key="data.csv",
client_kwargs={
"aws_access_key_id": "your_key",
"aws_secret_access_key": "your_secret",
"region_name": "us-west-2"
}
)

Google Drive downloads work with publicly shared files using the file ID from the share URL. For private files, additional authentication setup may be required through the Google API.
Install with Tessl CLI
npx tessl i tessl/pypi-pystow