tessl/pypi-pystow

Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.

—

Pending

Overview

Eval results

Files

Archive and Compression Support

Name: tessl/pypi-pystow
Author: tessl

PyStow provides comprehensive support for compressed archives and files, including ZIP, TAR, GZIP, LZMA, and BZ2 formats. It can automatically extract archives, access files within archives, and handle various compression formats transparently.

Archive Extraction

TAR Archive Extraction

def ensure_untar(key: str, *subkeys: str, url: str, name: str | None = None, directory: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, extract_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a tar archive is downloaded and extracted.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given, the
            directory for this module is used.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        directory: Overrides the name of the directory into which the tar archive is
            extracted. If none given, will use the stem of the file name that gets
            downloaded.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        extract_kwargs: Keyword arguments to pass to tarfile.TarFile.extractall.

    Returns:
        The path of the directory into which the downloaded archive is extracted.
    """

GZIP Decompression

def ensure_gunzip(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, autoclean: bool = True, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a file is downloaded and gunzipped.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given, the
            directory for this module is used.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        autoclean: Should the gzipped (.gz) file be deleted after it has been
            decompressed? Defaults to true.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.

    Returns:
        The path of the decompressed (gunzipped) file.
    """

Compressed Archive Access

ZIP File Access

@contextmanager
def ensure_open_zip(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: str = "r", open_kwargs: Mapping[str, Any] | None = None) -> BytesOpener:
    """Ensure a zip file is downloaded, then open a file inside it with zipfile.

    This is a context manager; use it in a ``with`` statement.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME
            where key is uppercased is checked first before using the default home
            directory.
        subkeys: A sequence of additional strings to join. If none are given, the
            directory for this module is used.
        url: The URL to download.
        inner_path: The relative path to the file inside the archive
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        mode: The mode, passed to zipfile.ZipFile.open. Defaults to "r".
        open_kwargs: Additional keyword arguments passed to zipfile.ZipFile.open

    Yields:
        An open file object for the file at inner_path
    """

TAR File Access

@contextmanager
def ensure_open_tarfile(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: str = "r", open_kwargs: Mapping[str, Any] | None = None) -> BytesOpener:
    """Ensure a tar file is downloaded and open a file inside it.

    This is a context manager; use it in a ``with`` statement.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME
            where key is uppercased is checked first before using the default home
            directory.
        subkeys: A sequence of additional strings to join. If none are given, the
            directory for this module is used.
        url: The URL to download.
        inner_path: The relative path to the file inside the archive
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        mode: The read mode, passed to tarfile.open. Defaults to "r".
        open_kwargs: Additional keyword arguments passed to tarfile.open

    Yields:
        An open file object for the file at inner_path
    """

Compression Format Support

GZIP Files

@contextmanager
def ensure_open_gz(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["r", "rb", "w", "wb", "rt", "wt"] = "rb", open_kwargs: Mapping[str, Any] | None = None) -> Generator[StringIO | BytesIO, None, None]:
    """Ensure a gzipped file is downloaded and open it with gzip.

    This is a context manager; use it in a ``with`` statement.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME
            where key is uppercased is checked first before using the default home
            directory.
        subkeys: A sequence of additional strings to join. If none are given, the
            directory for this module is used.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        mode: The file mode, passed to gzip.open. Defaults to "rb" (binary); use
            "rt" to read text.
        open_kwargs: Additional keyword arguments passed to gzip.open

    Yields:
        An open file object (text or binary, depending on mode)
    """

LZMA Files

@contextmanager
def ensure_open_lzma(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["r", "rb", "w", "wb", "rt", "wt"] = "rt", open_kwargs: Mapping[str, Any] | None = None) -> Generator[lzma.LZMAFile | io.TextIOWrapper[lzma.LZMAFile], None, None]:
    """Ensure a LZMA-compressed file is downloaded and open it with lzma.

    This is a context manager; use it in a ``with`` statement.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME
            where key is uppercased is checked first before using the default home
            directory.
        subkeys: A sequence of additional strings to join. If none are given, the
            directory for this module is used.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        mode: The file mode, passed to lzma.open. Defaults to "rt" (text).
        open_kwargs: Additional keyword arguments passed to lzma.open

    Yields:
        An open file object (text or binary, depending on mode)
    """

BZ2 Files

@contextmanager
def ensure_open_bz2(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None) -> Generator[bz2.BZ2File, None, None]:
    """Ensure a BZ2-compressed file is downloaded and open it with bz2.

    This is a context manager; use it in a ``with`` statement.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME
            where key is uppercased is checked first before using the default home
            directory.
        subkeys: A sequence of additional strings to join. If none are given, the
            directory for this module is used.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        mode: The file mode, passed to bz2.open. The type annotation only admits
            "rb" (binary read), which is also the default.
        open_kwargs: Additional keyword arguments passed to bz2.open

    Yields:
        An open binary file object
    """

Archive Data Format Support

CSV from Archives

def ensure_zip_df(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
    """Download a zip file and open an inner file as a dataframe with pandas.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME
            where key is uppercased is checked first before using the default home
            directory.
        subkeys: A sequence of additional strings to join. If none are given, the
            directory for this module is used.
        url: The URL to download.
        inner_path: The relative path to the file inside the archive
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv
            (for example the column separator).

    Returns:
        A pandas DataFrame
    """

def ensure_tar_df(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
    """Download a tar file and open an inner file as a dataframe with pandas.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME
            where key is uppercased is checked first before using the default home
            directory.
        subkeys: A sequence of additional strings to join. If none are given, the
            directory for this module is used.
        url: The URL to download.
        inner_path: The relative path to the file inside the archive
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv
            (for example the column separator).

    Returns:
        A pandas DataFrame
    """

XML from Archives

def ensure_tar_xml(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:
    """Download a tar file and parse an inner file as XML with lxml.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME
            where key is uppercased is checked first before using the default home
            directory.
        subkeys: A sequence of additional strings to join. If none are given, the
            directory for this module is used.
        url: The URL to download.
        inner_path: The relative path to the file inside the archive
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.

    Returns:
        An ElementTree object
    """

NumPy Arrays from Archives

def ensure_zip_np(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, load_kwargs: Mapping[str, Any] | None = None) -> numpy.typing.ArrayLike:
    """Download a zip file and open an inner file as an array-like with numpy.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME
            where key is uppercased is checked first before using the default home
            directory.
        subkeys: A sequence of additional strings to join. If none are given, the
            directory for this module is used.
        url: The URL to download.
        inner_path: The relative path to the file inside the archive
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        load_kwargs: Additional keyword arguments that are passed through to
            read_zip_np and transitively to numpy.load.

    Returns:
        An array-like object
    """

Usage Examples

TAR Archive Extraction

import pystow

# Fetch the tarball (if needed) and unpack it into a custom-named directory
dataset_url = "https://example.com/dataset.tar.gz"
extracted_dir = pystow.ensure_untar(
    "myapp",
    "datasets",
    url=dataset_url,
    directory="dataset_v1",  # overrides the default extraction directory name
)

# Build the path to one of the unpacked files
data_file = extracted_dir.joinpath("data", "train.csv")

GZIP Decompression

import pystow

# Fetch the gzipped file (if needed) and decompress it
gz_url = "https://example.com/large_file.txt.gz"
decompressed_file = pystow.ensure_gunzip(
    "myapp",
    "data",
    url=gz_url,
    autoclean=True,  # delete the .gz file once it has been decompressed
)

# The returned path points at the decompressed file, so it can be read directly
content = decompressed_file.read_text()

Working with ZIP Archives

import pystow

# Read a single member of a ZIP archive without extracting the archive
with pystow.ensure_open_zip(
    "myapp",
    "archives",
    url="https://example.com/data.zip",
    inner_path="data/file.txt",
) as file:
    content = file.read()

# Load a CSV stored inside a ZIP archive as a DataFrame
csv_options = {"sep": ","}
df = pystow.ensure_zip_df(
    "myapp",
    "datasets",
    url="https://example.com/dataset.zip",
    inner_path="dataset/train.csv",
    read_csv_kwargs=csv_options,
)

# Load a NumPy array stored inside a ZIP archive
array = pystow.ensure_zip_np(
    "myapp",
    "arrays",
    url="https://example.com/arrays.zip",
    inner_path="data.npy",
)

Working with TAR Archives

import pystow

import json

# Read a single member of a TAR archive and parse it as JSON
with pystow.ensure_open_tarfile(
    "myapp",
    "archives",
    url="https://example.com/data.tar.gz",
    inner_path="data/config.json",
) as file:
    config = json.load(file)

# Load a table stored inside a TAR archive as a DataFrame
df = pystow.ensure_tar_df(
    "myapp",
    "datasets",
    url="https://example.com/dataset.tar.bz2",
    inner_path="dataset/data.csv",
)

# Parse an XML document stored inside a TAR archive
tree = pystow.ensure_tar_xml(
    "myapp",
    "documents",
    url="https://example.com/docs.tar.gz",
    inner_path="docs/schema.xml",
)

Compressed File Formats

import pystow

# GZIP: open in text mode and collect the lines
with pystow.ensure_open_gz(
    "myapp",
    "logs",
    url="https://example.com/logfile.log.gz",
    mode="rt",  # text mode
) as file:
    lines = file.readlines()

# LZMA/XZ: open in text mode and read everything
with pystow.ensure_open_lzma(
    "myapp",
    "compressed",
    url="https://example.com/data.txt.xz",
    mode="rt",
) as file:
    data = file.read()

# BZ2: open in binary mode and read the raw bytes
with pystow.ensure_open_bz2(
    "myapp",
    "compressed",
    url="https://example.com/data.bz2",
    mode="rb",
) as file:
    binary_data = file.read()

Compressed Data Formats

import pystow

# Download (if needed) a gzipped pickle and load the object it contains
model = pystow.ensure_pickle_gz(
    "myapp",
    "models",
    url="https://example.com/model.pkl.gz",
)

# Download (if needed) a BZ2-compressed JSON document and parse it
data = pystow.ensure_json_bz2(
    "myapp",
    "data",
    url="https://api.example.com/large_dataset.json.bz2",
)

# Save a pickle through the module object. NOTE: this call writes a plain,
# uncompressed .pkl file; compress it yourself afterwards if you need to.
myapp_module = pystow.module("myapp")
myapp_module.dump_pickle(
    "cache",
    name="processed_data.pkl",
    obj=large_data_structure,
)

Complex Archive Workflows

import pystow
import pandas as pd

# Download archive, extract specific file, process data
def process_archive_data(archive_url, inner_file):
    """Load a tab-separated table from a ZIP archive, summarize it, and save the summary.

    Args:
        archive_url: The URL of the ZIP archive to download.
        inner_file: The relative path, inside the archive, of the tab-separated
            file to read.

    Returns:
        A DataFrame indexed by ``category`` holding the summed ``value`` and the
        mean ``count`` of each group.
    """
    # Read the inner file straight out of the archive as tab-separated data
    df = pystow.ensure_zip_df(
        "myapp", "raw_data",
        url=archive_url,
        inner_path=inner_file,
        read_csv_kwargs={"sep": "\t"},
    )

    # One row per category; groupby() moves "category" into the index
    processed_df = df.groupby("category").agg({
        "value": "sum",
        "count": "mean",
    })

    # pystow.dump_df does not write the index unless asked to, which would drop
    # the "category" labels from the saved file, and it separates columns with
    # tabs by default. Write the index and use commas so that the content of
    # summary.csv matches its name.
    pystow.dump_df(
        "myapp", "processed",
        name="summary.csv",
        obj=processed_df,
        sep=",",
        index=True,
    )

    return processed_df

# Summarize the TSV stored inside the example archive
result = process_archive_data(
    archive_url="https://example.com/dataset.zip",
    inner_file="raw/data.tsv",
)

Install with Tessl CLI