Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.
—
PyStow provides comprehensive support for compressed archives and files, including ZIP, TAR, GZIP, LZMA, and BZ2 formats. It can automatically extract archives, access files within archives, and handle various compression formats transparently.
def ensure_untar(key: str, *subkeys: str, url: str, name: str | None = None, directory: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, extract_kwargs: Mapping[str, Any] | None = None) -> Path:
"""Ensure a file is downloaded and untarred.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME where
key is uppercased is checked first before using the default home directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
directory: Overrides the name of the directory into which the tar archive is
extracted. If none given, will use the stem of the file name that gets
downloaded.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
extract_kwargs: Keyword arguments to pass to tarfile.TarFile.extractall.
Returns:
The path of the directory where the file that has been downloaded gets
extracted to
"""

def ensure_gunzip(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, autoclean: bool = True, download_kwargs: Mapping[str, Any] | None = None) -> Path:
"""Ensure a file is downloaded and gunzipped.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME where
key is uppercased is checked first before using the default home directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
force: Should the download be done again, even if the path already exists?
Defaults to false.
autoclean: Should the gzipped (.gz) file be deleted after decompression?
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
Returns:
The path of the decompressed (gunzipped) file
"""

@contextmanager
def ensure_open_zip(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: str = "r", open_kwargs: Mapping[str, Any] | None = None) -> BytesOpener:
"""Ensure a file is downloaded then open it with zipfile.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME
where key is uppercased is checked first before using the default home
directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
inner_path: The relative path to the file inside the archive
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
mode: The read mode, passed to zipfile.open
open_kwargs: Additional keyword arguments passed to zipfile.open
Yields:
An open file object
"""

@contextmanager
def ensure_open_tarfile(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: str = "r", open_kwargs: Mapping[str, Any] | None = None) -> BytesOpener:
"""Ensure a tar file is downloaded and open a file inside it.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME
where key is uppercased is checked first before using the default home
directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
inner_path: The relative path to the file inside the archive
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
mode: The read mode, passed to tarfile.open
open_kwargs: Additional keyword arguments passed to tarfile.open
Yields:
An open file object
"""

@contextmanager
def ensure_open_gz(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["r", "rb", "w", "wb", "rt", "wt"] = "rb", open_kwargs: Mapping[str, Any] | None = None) -> Generator[StringIO | BytesIO, None, None]:
"""Ensure a gzipped file is downloaded and open a file inside it.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME
where key is uppercased is checked first before using the default home
directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
mode: The read mode, passed to gzip.open
open_kwargs: Additional keyword arguments passed to gzip.open
Yields:
An open file object
"""

@contextmanager
def ensure_open_lzma(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["r", "rb", "w", "wb", "rt", "wt"] = "rt", open_kwargs: Mapping[str, Any] | None = None) -> Generator[lzma.LZMAFile | io.TextIOWrapper[lzma.LZMAFile], None, None]:
"""Ensure a LZMA-compressed file is downloaded and open a file inside it.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME
where key is uppercased is checked first before using the default home
directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
mode: The read mode, passed to lzma.open
open_kwargs: Additional keyword arguments passed to lzma.open
Yields:
An open file object
"""

@contextmanager
def ensure_open_bz2(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None) -> Generator[bz2.BZ2File, None, None]:
"""Ensure a BZ2-compressed file is downloaded and open a file inside it.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME
where key is uppercased is checked first before using the default home
directory.
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
mode: The read mode, passed to bz2.open
open_kwargs: Additional keyword arguments passed to bz2.open
Yields:
An open file object
"""

def ensure_zip_df(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
"""Download a zip file and open an inner file as a dataframe with pandas.
Args:
key: The module name
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
inner_path: The relative path to the file inside the archive
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
Returns:
A pandas DataFrame
"""
def ensure_tar_df(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
"""Download a tar file and open an inner file as a dataframe with pandas.
Args:
key: The module name
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
inner_path: The relative path to the file inside the archive
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
Returns:
A dataframe
"""

def ensure_tar_xml(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:
"""Download a tar file and open an inner file as an XML with lxml.
Args:
key: The module name
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
inner_path: The relative path to the file inside the archive
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.
Returns:
An ElementTree object
"""

def ensure_zip_np(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, load_kwargs: Mapping[str, Any] | None = None) -> numpy.typing.ArrayLike:
"""Download a zip file and open an inner file as an array-like with numpy.
Args:
key: The module name
subkeys: A sequence of additional strings to join. If none are given, returns
the directory for this module.
url: The URL to download.
inner_path: The relative path to the file inside the archive
name: Overrides the name of the file at the end of the URL, if given. Also
useful for URLs that don't have proper filenames with extensions.
force: Should the download be done again, even if the path already exists?
Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
load_kwargs: Additional keyword arguments that are passed through to
read_zip_np and transitively to numpy.load.
Returns:
An array-like object
"""

import pystow
# Download and extract tar archive
extracted_dir = pystow.ensure_untar(
"myapp", "datasets",
url="https://example.com/dataset.tar.gz",
directory="dataset_v1" # Custom extraction directory name
)
# Access extracted files
data_file = extracted_dir / "data" / "train.csv"

import pystow
# Download and decompress gzipped file
decompressed_file = pystow.ensure_gunzip(
"myapp", "data",
url="https://example.com/large_file.txt.gz",
autoclean=True # Remove .gz file after decompression
)
# Read decompressed content
content = decompressed_file.read_text()

import pystow
# Access file inside ZIP archive without extraction
with pystow.ensure_open_zip(
"myapp", "archives",
url="https://example.com/data.zip",
inner_path="data/file.txt"
) as file:
content = file.read()
# Extract DataFrame from CSV inside ZIP
df = pystow.ensure_zip_df(
"myapp", "datasets",
url="https://example.com/dataset.zip",
inner_path="dataset/train.csv",
read_csv_kwargs={"sep": ","}
)
# Load NumPy array from ZIP
array = pystow.ensure_zip_np(
"myapp", "arrays",
url="https://example.com/arrays.zip",
inner_path="data.npy"
)

import pystow
# Access file inside TAR archive
with pystow.ensure_open_tarfile(
"myapp", "archives",
url="https://example.com/data.tar.gz",
inner_path="data/config.json"
) as file:
import json
config = json.load(file)
# Extract DataFrame from TAR
df = pystow.ensure_tar_df(
"myapp", "datasets",
url="https://example.com/dataset.tar.bz2",
inner_path="dataset/data.csv"
)
# Parse XML from TAR
tree = pystow.ensure_tar_xml(
"myapp", "documents",
url="https://example.com/docs.tar.gz",
inner_path="docs/schema.xml"
)

import pystow
# Work with GZIP files
with pystow.ensure_open_gz(
"myapp", "logs",
url="https://example.com/logfile.log.gz",
mode="rt" # Text mode
) as file:
lines = file.readlines()
# Work with LZMA/XZ files
with pystow.ensure_open_lzma(
"myapp", "compressed",
url="https://example.com/data.txt.xz",
mode="rt"
) as file:
data = file.read()
# Work with BZ2 files
with pystow.ensure_open_bz2(
"myapp", "compressed",
url="https://example.com/data.bz2",
mode="rb"
) as file:
binary_data = file.read()

import pystow
# Load gzipped pickle
model = pystow.ensure_pickle_gz(
"myapp", "models",
url="https://example.com/model.pkl.gz"
)
# Load BZ2-compressed JSON
data = pystow.ensure_json_bz2(
"myapp", "data",
url="https://api.example.com/large_dataset.json.bz2"
)
# Save a pickle (stored uncompressed)
pystow.module("myapp").dump_pickle(
"cache",
name="processed_data.pkl",
obj=large_data_structure
)
# Then manually compress if needed

import pystow
import pandas as pd
# Download archive, extract specific file, process data
def process_archive_data(archive_url: str, inner_file: str) -> pd.DataFrame:
    """Download a ZIP archive, summarize one inner TSV file, and save the summary.

    Args:
        archive_url: The URL of the ZIP archive to download.
        inner_file: The relative path to the tab-separated file inside the
            archive.

    Returns:
        The per-category summary, indexed by ``category``, holding the summed
        ``value`` and the mean ``count``.
    """
    # Extract DataFrame from archive
    df = pystow.ensure_zip_df(
        "myapp", "raw_data",
        url=archive_url,
        inner_path=inner_file,
        read_csv_kwargs={"sep": "\t"},
    )
    # Process data
    processed_df = df.groupby("category").agg({
        "value": "sum",
        "count": "mean",
    })
    # Save processed data. After groupby() the category labels live in the
    # index, and pystow.dump_df defaults to sep="\t" and index=False, so both
    # are set explicitly: otherwise "summary.csv" would be tab-separated and
    # would silently lose the category column.
    pystow.dump_df(
        "myapp", "processed",
        name="summary.csv",
        obj=processed_df,
        sep=",",
        index=True,
    )
    return processed_df
# Use the function
result = process_archive_data(
"https://example.com/dataset.zip",
"raw/data.tsv"
)

Install with Tessl CLI
npx tessl i tessl/pypi-pystow