Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.
—
The Module class provides an object-oriented interface for PyStow's directory management and file operations. It encapsulates all functionality within a specific directory context, making it ideal for organizing data within applications.
class Module:
    """The class wrapping the directory lookup implementation."""

    def __init__(self, base: str | Path, ensure_exists: bool = True) -> None:
        """Initialize the module.

        Args:
            base: The base directory for the module.
            ensure_exists: Should the base directory be created automatically?
                Defaults to true.
        """
@classmethod
def from_key(cls, key: str, *subkeys: str, ensure_exists: bool = True) -> Module:
    """Get a module for the given directory or one of its subdirectories.

    Args:
        key: The name of the module. No funny characters. The envvar ``<key>_HOME``
            where key is uppercased is checked first before using the default home
            directory.
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        ensure_exists: Should all directories be created automatically? Defaults
            to true.

    Returns:
        A module
    """

def module(self, *subkeys: str, ensure_exists: bool = True) -> Module:
    """Get a module for a subdirectory of the current module.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        ensure_exists: Should all directories be created automatically? Defaults
            to true.

    Returns:
        A module representing the subdirectory based on the given subkeys.
    """
def join(self, *subkeys: str, name: str | None = None, ensure_exists: bool = True, version: VersionHint = None) -> Path:
    """Get a subdirectory of the current module.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        ensure_exists: Should all directories be created automatically? Defaults
            to true.
        name: The name of the file (optional) inside the folder.
        version: The optional version, or no-argument callable that returns an
            optional version. This is prepended before the subkeys.

    Returns:
        The path of the directory or subdirectory for the given module.
    """
def joinpath_sqlite(self, *subkeys: str, name: str) -> str:
    """Get an SQLite database connection string.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        name: The name of the database file.

    Returns:
        A SQLite path string.
    """

def ensure(self, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a file is downloaded.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        version: The optional version, or no-argument callable that returns an
            optional version. This is prepended before the subkeys.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """
def ensure_custom(self, *subkeys: str, name: str, force: bool = False, provider: Provider, **kwargs: Any) -> Path:
    """Ensure a file is present, and run a custom create function otherwise.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        name: The file name.
        force: Should the file be re-created, even if the path already exists?
        provider: The file provider. Will be run with the path as the first
            positional argument, if the file needs to be generated.
        kwargs: Additional keyword-based parameters passed to the provider.

    Returns:
        The path of the file that has been created (or already exists)

    Raises:
        ValueError: If the provider was called but the file was not created by it.
    """

def ensure_untar(self, *subkeys: str, url: str, name: str | None = None, directory: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, extract_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a tar file is downloaded and unarchived.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        directory: Overrides the name of the directory into which the tar archive
            is extracted. If none given, will use the stem of the file name that gets
            downloaded.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.
        extract_kwargs: Keyword arguments to pass to :meth:`tarfile.TarFile.extractall`.

    Returns:
        The path of the directory where the file that has been downloaded gets
        extracted to
    """
def ensure_gunzip(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, autoclean: bool = True, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a tar.gz file is downloaded and unarchived.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        autoclean: Should the zipped file be deleted?
        download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.

    Returns:
        The path of the directory where the file that has been downloaded gets
        extracted to
    """

The Module class provides all the same context manager methods as the functional API:

open() - Open files with various modes
open_gz() - Open gzipped files
ensure_open() - Download and open files
ensure_open_zip() - Download zip and open inner files
ensure_open_lzma() - Download and open LZMA files
ensure_open_tarfile() - Download tar and open inner files
ensure_open_gz() - Download and open gzipped files
ensure_open_bz2() - Download and open BZ2 files

The Module class provides all data format methods:

ensure_csv() - Download CSV as DataFrame
load_df() - Load existing CSV as DataFrame
dump_df() - Save DataFrame to file
ensure_excel() - Download Excel as DataFrame
ensure_tar_df() - Extract CSV from TAR archive
ensure_zip_df() - Extract CSV from ZIP archive
ensure_json() - Download and parse JSON
ensure_json_bz2() - Download compressed JSON
load_json() - Load existing JSON file
dump_json() - Save object as JSON
ensure_xml() - Download and parse XML
ensure_tar_xml() - Extract XML from TAR archive
load_xml() - Load existing XML file
dump_xml() - Save XML ElementTree
ensure_rdf() - Download and parse RDF with caching
load_rdf() - Load existing RDF file
dump_rdf() - Save RDF graph
ensure_pickle() - Download and load pickle
ensure_pickle_gz() - Download compressed pickle
load_pickle() - Load existing pickle
load_pickle_gz() - Load compressed pickle
dump_pickle() - Save object as pickle
ensure_zip_np() - Load NumPy array from ZIP

def ensure_from_s3(self, *subkeys: str, s3_bucket: str, s3_key: str | Sequence[str], name: str | None = None, client: botocore.client.BaseClient | None = None, client_kwargs: Mapping[str, Any] | None = None, download_file_kwargs: Mapping[str, Any] | None = None, force: bool = False) -> Path:
    """Ensure a file is downloaded from AWS S3.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        s3_bucket: The S3 bucket name.
        s3_key: The S3 key name.
        name: Overrides the name of the file at the end of the S3 key, if given.
        client: A botocore client. If none given, one will be created
            automatically.
        client_kwargs: Keyword arguments to be passed to the client on
            instantiation.
        download_file_kwargs: Keyword arguments to be passed to
            :meth:`boto3.s3.transfer.S3Transfer.download_file`.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """
def ensure_from_google(self, *subkeys: str, name: str, file_id: str, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a file is downloaded from Google Drive.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        name: The name of the file.
        file_id: The file identifier of the Google file. If your share link is
            https://drive.google.com/file/d/1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z/view, then
            your file ID is 1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to
            :func:`pystow.utils.download_from_google`.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """

@contextmanager
def ensure_open_sqlite(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Generator[sqlite3.Connection, None, None]:
    """Ensure and connect to a SQLite database.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.

    Yields:
        An instance of :class:`sqlite3.Connection` from :func:`sqlite3.connect`
    """
@contextmanager
def ensure_open_sqlite_gz(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Generator[sqlite3.Connection, None, None]:
    """Ensure and connect to a SQLite database that's gzipped.

    Unfortunately, it's a paid feature to directly read gzipped sqlite files, so
    this automatically gunzips it first.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.

    Yields:
        An instance of :class:`sqlite3.Connection` from :func:`sqlite3.connect`
    """

import pystow
# Create a module for your application
module = pystow.module("myapp")

# Get subdirectories
data_module = module.module("datasets")
config_module = module.module("config")

# Get file paths
data_file = data_module.join(name="data.csv")
config_file = config_module.join(name="settings.json")

import pystow

# Create module
module = pystow.module("myproject")

# Download files
dataset_path = module.ensure(
    "datasets", "raw",
    url="https://example.com/data.csv"
)

# Work with compressed archives
extracted_dir = module.ensure_untar(
    "archives",
    url="https://example.com/dataset.tar.gz",
    directory="dataset_v1"
)

# Custom file creation
processed_path = module.ensure_custom(
    "processed",
    name="summary.txt",
    provider=lambda path: path.write_text("Processing complete"),
    force=False
)

import pystow
import pandas as pd

# Create module
module = pystow.module("analytics")

# Work with DataFrames
df = module.ensure_csv(
    "raw_data",
    url="https://example.com/sales.csv"
)

# Process and save
summary_df = df.groupby('region').sum()
module.dump_df(
    "processed",
    name="regional_summary.csv",
    obj=summary_df
)

# Work with JSON
config = module.ensure_json(
    "config",
    url="https://api.example.com/settings.json"
)

# Save processed config
module.dump_json(
    "processed_config",
    name="app_config.json",
    obj=config,
    json_dump_kwargs={"indent": 2}
)

import pystow
# Create module
module = pystow.module("research")

# Download from S3
s3_data = module.ensure_from_s3(
    "datasets", "external",
    s3_bucket="public-datasets",
    s3_key="research/dataset_v2.csv"
)

# Download from Google Drive
gdrive_model = module.ensure_from_google(
    "models", "pretrained",
    name="bert_model.tar.gz",
    file_id="1ExAmPlE_fIlE_iD_123456789"
)

import pystow
import pandas as pd
class DataPipeline:
    """Organize a small ML workflow on disk via PyStow modules.

    One top-level module is created for the project, with dedicated
    submodules for raw data, processed data, models, and outputs.
    """

    def __init__(self, project_name):
        root = pystow.module(project_name)
        self.module = root
        # One submodule per pipeline stage.
        self.raw_data = root.module("raw_data")
        self.processed = root.module("processed")
        self.models = root.module("models")
        self.outputs = root.module("outputs")

    def download_data(self, url, name):
        """Fetch a raw file into the raw_data submodule and return its path."""
        return self.raw_data.ensure(url=url, name=name)

    def process_data(self, raw_file, output_name):
        """Aggregate the raw CSV by category and persist the result."""
        frame = pd.read_csv(raw_file)
        # Processing logic here
        aggregated = (
            frame.groupby('category')
            .agg({'value': 'mean', 'count': 'sum'})
            .reset_index()
        )
        # Save processed data
        self.processed.dump_df(name=output_name, obj=aggregated)
        return self.processed.join(name=output_name)

    def save_model(self, model, name):
        """Pickle a trained model into the models submodule."""
        self.models.dump_pickle(name=name, obj=model)

    def load_model(self, name):
        """Unpickle a previously saved model."""
        return self.models.load_pickle(name=name)
# Usage
pipeline = DataPipeline("my_ml_project")
# Download data
raw_path = pipeline.download_data(
url="https://example.com/training_data.csv",
name="training.csv"
)
# Process data
processed_path = pipeline.process_data(raw_path, "processed_training.csv")
# The module automatically organizes everything:
# ~/.data/my_ml_project/
# ├── raw_data/
# │ └── training.csv
# ├── processed/
# │ └── processed_training.csv
# ├── models/
# └── outputs/import pystow
from contextlib import contextmanager
class ConfigurableModule:
    """PyStow module whose downloads are driven by stored configuration."""

    def __init__(self, name, config_module="config"):
        self.module = pystow.module(name)
        self.config_module = config_module

    def get_base_url(self):
        """Look up the configured base URL."""
        return pystow.get_config(self.config_module, "base_url")

    def get_api_key(self):
        """Look up the configured API key."""
        return pystow.get_config(self.config_module, "api_key")

    def download_with_auth(self, endpoint, name):
        """Download ``endpoint`` relative to the base URL with a bearer token."""
        root = self.get_base_url()
        token = self.get_api_key()
        headers = {"Authorization": f"Bearer {token}"}
        return self.module.ensure(
            url=f"{root}/{endpoint}",
            name=name,
            download_kwargs={"headers": headers},
        )

    @contextmanager
    def temp_file(self, name):
        """Yield a scratch path under ``temp/`` and delete it on exit."""
        scratch = self.module.join("temp", name=name)
        try:
            yield scratch
        finally:
            if scratch.exists():
                scratch.unlink()
# Usage
app_module = ConfigurableModule("myapp")

# Download with authentication
data_path = app_module.download_with_auth("data/latest.csv", "current_data.csv")

# Use temporary file
with app_module.temp_file("temp_processing.csv") as temp_path:
    # Process data using temp file
    df = pd.read_csv(data_path)
    df.to_csv(temp_path)
# temp_path is automatically cleaned up

Install with Tessl CLI
npx tessl i tessl/pypi-pystow