Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.
npx @tessl/cli install tessl/pypi-pystow@0.7.0
PyStow is a Python library that provides a standardized and configurable way to manage data directories for Python applications. It offers a simple API for creating and accessing application-specific data directories in a user's file system, with support for nested directory structures, automatic directory creation, and environment variable-based configuration.
The library enables developers to easily download, cache, and manage files from the internet, with built-in support for various data formats including CSV, RDF, Excel, and compressed archives (ZIP, TAR, LZMA, GZ). Files are downloaded only once and cached locally; tabular data is handled through pandas integration and RDF data through rdflib integration; and storage locations are configurable, respecting both traditional home directory patterns and the XDG Base Directory specification.
pip install pystow
import pystow
# Most common usage patterns
module = pystow.module("myapp")
path = pystow.join("myapp", "data")
data = pystow.ensure_csv("myapp", url="https://example.com/data.csv")
import pystow
# Get a module for your application
module = pystow.module("myapp")
# Create nested directories and get paths
data_dir = module.join("datasets", "version1")
config_path = module.join("config", name="settings.json")
# Using functional API
path = pystow.join("myapp", "data", name="file.txt")
import pystow
# Download and cache a file
path = pystow.ensure(
"myapp", "data",
url="https://example.com/dataset.csv",
name="dataset.csv"
)
# File is automatically cached - subsequent calls return the cached version
# Use force=True to re-download
path = pystow.ensure(
"myapp", "data",
url="https://example.com/dataset.csv",
name="dataset.csv",
force=True
)
import pystow
import pandas as pd
# Download and load CSV as DataFrame
df = pystow.ensure_csv(
"myapp", "datasets",
url="https://example.com/data.csv"
)
# Download and parse JSON
data = pystow.ensure_json(
"myapp", "config",
url="https://api.example.com/config.json"
)
# Work with compressed files
graph = pystow.ensure_rdf(
"myapp", "ontologies",
url="https://example.com/ontology.rdf.gz",
parse_kwargs={"format": "xml"}
)
PyStow is built around a modular architecture with two main usage patterns:
- A functional API for one-off calls (e.g., pystow.ensure(), pystow.join())
- An object-oriented API centered on Module objects (e.g., pystow.module())
The core Module class manages directory structures and provides methods for file operations, while the functional API provides convenient shortcuts for common tasks (both patterns are sketched below). All operations support:
- Automatic directory creation, controlled by the ensure_exists parameter
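The snippet below is a minimal sketch showing that the two patterns resolve to the same directory; the "myapp" module name and "data" subkey are placeholders.
import pystow
# Functional API: pass the module key on every call
functional_path = pystow.join("myapp", "data")
# Object-oriented API: obtain a Module once and reuse it
module = pystow.module("myapp")
oo_path = module.join("data")
assert functional_path == oo_path  # both point at the same directory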
Core functionality for creating and managing application data directories with configurable storage locations and automatic directory creation.
def module(key: str, *subkeys: str, ensure_exists: bool = True) -> Module:
"""Return a module for the application.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME where
key is uppercased is checked first before using the default home directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
ensure_exists: Should all directories be created automatically? Defaults to true.
Returns:
The module object that manages getting paths and ensuring files
"""
def join(key: str, *subkeys: str, name: str | None = None, ensure_exists: bool = True, version: VersionHint = None) -> Path:
"""Return the home data directory for the given module.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME where
key is uppercased is checked first before using the default home directory.
subkeys: A sequence of additional strings to join
name: The name of the file (optional) inside the folder
ensure_exists: Should all directories be created automatically? Defaults to true.
version: The optional version, or no-argument callable that returns an
optional version. This is prepended before the subkeys.
Returns:
The path of the directory or subdirectory for the given module.
"""Comprehensive file download system with caching, compression support, and cloud storage integration.
Comprehensive file download system with caching, compression support, and cloud storage integration.
def ensure(key: str, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Path:
"""Ensure a file is downloaded.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME where
key is uppercased is checked first before using the default home directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
version: The optional version, or no-argument callable that returns an
optional version. This is prepended before the subkeys.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
Returns:
The path of the file that has been downloaded (or already exists)
"""Built-in support for common data formats including CSV, JSON, XML, RDF, Excel, and Python objects with pandas and specialized library integration.
Built-in support for common data formats including CSV, JSON, XML, RDF, Excel, and Python objects with pandas and specialized library integration.
def ensure_csv(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
"""Download a CSV and open as a dataframe with pandas.
Args:
key: The module name
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
Returns:
A pandas DataFrame
"""
def ensure_json(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, open_kwargs: Mapping[str, Any] | None = None, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:
"""Download JSON and open with json.
Args:
key: The module name
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
open_kwargs: Additional keyword arguments passed to open
json_load_kwargs: Keyword arguments to pass through to json.load.
Returns:
A JSON object (list, dict, etc.)
"""HTML parsing and web content extraction with BeautifulSoup integration for downloading and parsing web pages.
def ensure_soup(key: str, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, beautiful_soup_kwargs: Mapping[str, Any] | None = None) -> bs4.BeautifulSoup:
"""Ensure a webpage is downloaded and parsed with BeautifulSoup.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME where
key is uppercased is checked first before using the default home directory.
subkeys: A sequence of additional strings to join. If none are given,
returns the directory for this module.
url: The URL to download.
name: Overrides the name of the file at the end of the URL, if given.
Also useful for URLs that don't have proper filenames with extensions.
version: The optional version, or no-argument callable that returns an
optional version. This is prepended before the subkeys.
force: Should the download be done again, even if the path already
exists? Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
beautiful_soup_kwargs: Additional keyword arguments passed to BeautifulSoup
Returns:
A BeautifulSoup object
"""Support for compressed archives including ZIP, TAR, GZIP, LZMA, and BZ2 with automatic extraction and content access.
Support for compressed archives including ZIP, TAR, GZIP, LZMA, and BZ2 with automatic extraction and content access.
def ensure_untar(key: str, *subkeys: str, url: str, name: str | None = None, directory: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, extract_kwargs: Mapping[str, Any] | None = None) -> Path:
"""Ensure a file is downloaded and untarred.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME where
key is uppercased is checked first before using the default home directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
directory: Overrides the name of the directory into which the tar archive is
extracted. If none given, will use the stem of the file name that gets
downloaded.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
extract_kwargs: Keyword arguments to pass to tarfile.TarFile.extractall.
Returns:
The path of the directory into which the downloaded archive is extracted
"""Download files from cloud storage services including AWS S3 and Google Drive with authentication support.
Download files from cloud storage services including AWS S3 and Google Drive with authentication support.
def ensure_from_s3(key: str, *subkeys: str, s3_bucket: str, s3_key: str | Sequence[str], name: str | None = None, force: bool = False, **kwargs: Any) -> Path:
"""Ensure a file is downloaded from AWS S3.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME where
key is uppercased is checked first before using the default home directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
s3_bucket: The S3 bucket name
s3_key: The S3 key name
name: Overrides the name of the file at the end of the S3 key, if given.
force: Should the download be done again, even if the path already exists?
Defaults to false.
kwargs: Remaining kwargs to forward to Module.ensure_from_s3.
Returns:
The path of the file that has been downloaded (or already exists)
"""Environment variable and INI file-based configuration system for storing API keys, URLs, and other settings.
Environment variable and INI file-based configuration system for storing API keys, URLs, and other settings.
def get_config(module: str, key: str, *, passthrough: X | None = None, default: X | None = None, dtype: type[X] | None = None, raise_on_missing: bool = False) -> Any:
"""Get a configuration value.
Args:
module: Name of the module (e.g., pybel) to get configuration for
key: Name of the key (e.g., connection)
passthrough: If this is not None, it is returned directly
default: If the environment and configuration files don't contain anything,
this is returned.
dtype: The datatype to parse out. Can either be int, float,
bool, or str. If none, defaults to str.
raise_on_missing: If true, will raise a value error if no data is found and
no default is given
Returns:
The config value or the default.
Raises:
ConfigError: If raise_on_missing conditions are met
"""
def write_config(module: str, key: str, value: str) -> None:
"""Write a configuration value.
Args:
module: The name of the app (e.g., indra)
key: The key of the configuration in the app
value: The value of the configuration in the app
"""Integration with NLTK (Natural Language Toolkit) for managing linguistic data resources.
Integration with NLTK (Natural Language Toolkit) for managing linguistic data resources.
def ensure_nltk(resource: str = "stopwords") -> tuple[Path, bool]:
"""Ensure NLTK data is downloaded in a standard way.
Args:
resource: Name of the resource to download, e.g., stopwords
Returns:
A pair of the NLTK cache directory and a boolean that says if download was successful
"""The core Module class that provides object-oriented interface for data directory management with all file operations as methods.
The core Module class provides an object-oriented interface for data directory management, with all file operations available as methods.
class Module:
"""The class wrapping the directory lookup implementation."""
def __init__(self, base: str | Path, ensure_exists: bool = True) -> None:
"""Initialize the module.
Args:
base: The base directory for the module
ensure_exists: Should the base directory be created automatically?
Defaults to true.
"""
@classmethod
def from_key(cls, key: str, *subkeys: str, ensure_exists: bool = True) -> Module:
"""Get a module for the given directory or one of its subdirectories.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME
where key is uppercased is checked first before using the default home
directory.
subkeys: A sequence of additional strings to join. If none are given,
returns the directory for this module.
ensure_exists: Should all directories be created automatically? Defaults
to true.
Returns:
A module
"""from typing import Union, Optional, Callable, Any
Type aliases referenced in the signatures above:
from typing import Union, Optional, Callable, Any
from pathlib import Path
# Version specification type
VersionHint = Union[None, str, Callable[[], Optional[str]]]
# JSON data type
JSON = Any
# File provider function type
Provider = Callable[..., None]
# HTTP timeout specification
TimeoutHint = Union[int, float, None, tuple[Union[float, int], Union[float, int]]]
class ConfigError(ValueError):
"""Raised when configuration can not be looked up."""
def __init__(self, module: str, key: str):
"""Initialize the configuration error.
Args:
module: Name of the module, e.g., bioportal
key: Name of the key inside the module, e.g., api_key
"""