Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.
—
The Module class provides an object-oriented interface for PyStow's directory management and file operations. It encapsulates all functionality within a specific directory context, making it ideal for organizing data within applications.
class Module:
    """The class wrapping the directory lookup implementation."""

    def __init__(self, base: str | Path, ensure_exists: bool = True) -> None:
        """Initialize the module.

        Args:
            base: The base directory for the module.
            ensure_exists: Should the base directory be created automatically?
                Defaults to true.
        """
@classmethod
def from_key(cls, key: str, *subkeys: str, ensure_exists: bool = True) -> Module:
    """Get a module for the given directory or one of its subdirectories.

    Args:
        key: The name of the module. No funny characters. The envvar ``<key>_HOME``
            where key is uppercased is checked first before using the default home
            directory.
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        ensure_exists: Should all directories be created automatically? Defaults
            to true.

    Returns:
        A module
    """

def module(self, *subkeys: str, ensure_exists: bool = True) -> Module:
    """Get a module for a subdirectory of the current module.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        ensure_exists: Should all directories be created automatically? Defaults
            to true.

    Returns:
        A module representing the subdirectory based on the given subkeys.
    """
def join(self, *subkeys: str, name: str | None = None, ensure_exists: bool = True, version: VersionHint = None) -> Path:
    """Get a subdirectory of the current module.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        ensure_exists: Should all directories be created automatically? Defaults
            to true.
        name: The name of the file (optional) inside the folder.
        version: The optional version, or no-argument callable that returns an
            optional version. This is prepended before the subkeys.

    Returns:
        The path of the directory or subdirectory for the given module.
    """
def joinpath_sqlite(self, *subkeys: str, name: str) -> str:
    """Get an SQLite database connection string.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        name: The name of the database file.

    Returns:
        A SQLite path string.
    """

def ensure(self, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a file is downloaded.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        version: The optional version, or no-argument callable that returns an
            optional version. This is prepended before the subkeys.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """
def ensure_custom(self, *subkeys: str, name: str, force: bool = False, provider: Provider, **kwargs: Any) -> Path:
    """Ensure a file is present, and run a custom create function otherwise.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        name: The file name.
        force: Should the file be re-created, even if the path already exists?
        provider: The file provider. Will be run with the path as the first
            positional argument, if the file needs to be generated.
        kwargs: Additional keyword-based parameters passed to the provider.

    Returns:
        The path of the file that has been created (or already exists)

    Raises:
        ValueError: If the provider was called but the file was not created by it.
    """

def ensure_untar(self, *subkeys: str, url: str, name: str | None = None, directory: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, extract_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a tar file is downloaded and unarchived.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        directory: Overrides the name of the directory into which the tar archive
            is extracted. If none given, will use the stem of the file name that gets
            downloaded.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.
        extract_kwargs: Keyword arguments to pass to :meth:`tarfile.TarFile.extractall`.

    Returns:
        The path of the directory where the file that has been downloaded gets
        extracted to
    """
def ensure_gunzip(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, autoclean: bool = True, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a tar.gz file is downloaded and unarchived.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        autoclean: Should the zipped file be deleted?
        download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.

    Returns:
        The path of the directory where the file that has been downloaded gets
        extracted to
    """

The Module class provides all the same context manager methods as the functional API:

open() - Open files with various modes
open_gz() - Open gzipped files
ensure_open() - Download and open files
ensure_open_zip() - Download zip and open inner files
ensure_open_lzma() - Download and open LZMA files
ensure_open_tarfile() - Download tar and open inner files
ensure_open_gz() - Download and open gzipped files
ensure_open_bz2() - Download and open BZ2 files

The Module class provides all data format methods:

ensure_csv() - Download CSV as DataFrame
load_df() - Load existing CSV as DataFrame
dump_df() - Save DataFrame to file
ensure_excel() - Download Excel as DataFrame
ensure_tar_df() - Extract CSV from TAR archive
ensure_zip_df() - Extract CSV from ZIP archive
ensure_json() - Download and parse JSON
ensure_json_bz2() - Download compressed JSON
load_json() - Load existing JSON file
dump_json() - Save object as JSON
ensure_xml() - Download and parse XML
ensure_tar_xml() - Extract XML from TAR archive
load_xml() - Load existing XML file
dump_xml() - Save XML ElementTree
ensure_rdf() - Download and parse RDF with caching
load_rdf() - Load existing RDF file
dump_rdf() - Save RDF graph
ensure_pickle() - Download and load pickle
ensure_pickle_gz() - Download compressed pickle
load_pickle() - Load existing pickle
load_pickle_gz() - Load compressed pickle
dump_pickle() - Save object as pickle
ensure_zip_np() - Load NumPy array from ZIP

def ensure_from_s3(self, *subkeys: str, s3_bucket: str, s3_key: str | Sequence[str], name: str | None = None, client: botocore.client.BaseClient | None = None, client_kwargs: Mapping[str, Any] | None = None, download_file_kwargs: Mapping[str, Any] | None = None, force: bool = False) -> Path:
    """Ensure a file is downloaded from AWS S3.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        s3_bucket: The S3 bucket name.
        s3_key: The S3 key name.
        name: Overrides the name of the file at the end of the S3 key, if given.
        client: A botocore client. If none given, one will be created
            automatically.
        client_kwargs: Keyword arguments to be passed to the client on
            instantiation.
        download_file_kwargs: Keyword arguments to be passed to
            :meth:`boto3.s3.transfer.S3Transfer.download_file`.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """
def ensure_from_google(self, *subkeys: str, name: str, file_id: str, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a file is downloaded from Google Drive.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        name: The name of the file.
        file_id: The file identifier of the Google file. If your share link is
            https://drive.google.com/file/d/1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z/view, then
            your file ID is 1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to
            :func:`pystow.utils.download_from_google`.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """

@contextmanager
def ensure_open_sqlite(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Generator[sqlite3.Connection, None, None]:
    """Ensure and connect to a SQLite database.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.

    Yields:
        An instance of :class:`sqlite3.Connection` from :func:`sqlite3.connect`
    """
@contextmanager
def ensure_open_sqlite_gz(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Generator[sqlite3.Connection, None, None]:
    """Ensure and connect to a SQLite database that's gzipped.

    Unfortunately, it's a paid feature to directly read gzipped sqlite files, so
    this automatically gunzips it first.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.

    Yields:
        An instance of :class:`sqlite3.Connection` from :func:`sqlite3.connect`
    """

import pystow
# Create a module for your application
module = pystow.module("myapp")

# Get subdirectories
data_module = module.module("datasets")
config_module = module.module("config")

# Get file paths
data_file = data_module.join(name="data.csv")
config_file = config_module.join(name="settings.json")

import pystow

# Create module
module = pystow.module("myproject")

# Download files
dataset_path = module.ensure(
    "datasets", "raw",
    url="https://example.com/data.csv"
)

# Work with compressed archives
extracted_dir = module.ensure_untar(
    "archives",
    url="https://example.com/dataset.tar.gz",
    directory="dataset_v1"
)

# Custom file creation
processed_path = module.ensure_custom(
    "processed",
    name="summary.txt",
    provider=lambda path: path.write_text("Processing complete"),
    force=False
)

import pystow
import pandas as pd

# Create module
module = pystow.module("analytics")

# Work with DataFrames
df = module.ensure_csv(
    "raw_data",
    url="https://example.com/sales.csv"
)

# Process and save
summary_df = df.groupby('region').sum()
module.dump_df(
    "processed",
    name="regional_summary.csv",
    obj=summary_df
)

# Work with JSON
config = module.ensure_json(
    "config",
    url="https://api.example.com/settings.json"
)

# Save processed config
module.dump_json(
    "processed_config",
    name="app_config.json",
    obj=config,
    json_dump_kwargs={"indent": 2}
)

import pystow
# Create module
module = pystow.module("research")

# Download from S3
s3_data = module.ensure_from_s3(
    "datasets", "external",
    s3_bucket="public-datasets",
    s3_key="research/dataset_v2.csv"
)

# Download from Google Drive
gdrive_model = module.ensure_from_google(
    "models", "pretrained",
    name="bert_model.tar.gz",
    file_id="1ExAmPlE_fIlE_iD_123456789"
)

import pystow
import pandas as pd
class DataPipeline:
    """Organize a small ML workflow on disk via PyStow modules.

    One top-level module is created for the project, with dedicated
    submodules for raw data, processed data, models, and outputs.
    """

    def __init__(self, project_name):
        root = pystow.module(project_name)
        self.module = root
        # One submodule per pipeline stage.
        self.raw_data = root.module("raw_data")
        self.processed = root.module("processed")
        self.models = root.module("models")
        self.outputs = root.module("outputs")

    def download_data(self, url, name):
        """Fetch a raw file into the raw_data submodule and return its path."""
        return self.raw_data.ensure(url=url, name=name)

    def process_data(self, raw_file, output_name):
        """Aggregate the raw CSV by category and persist the result."""
        frame = pd.read_csv(raw_file)
        # Processing logic here
        aggregated = (
            frame.groupby('category')
            .agg({'value': 'mean', 'count': 'sum'})
            .reset_index()
        )
        # Save processed data
        self.processed.dump_df(name=output_name, obj=aggregated)
        return self.processed.join(name=output_name)

    def save_model(self, model, name):
        """Pickle a trained model into the models submodule."""
        self.models.dump_pickle(name=name, obj=model)

    def load_model(self, name):
        """Unpickle a previously saved model."""
        return self.models.load_pickle(name=name)
# Usage
pipeline = DataPipeline("my_ml_project")
# Download data
raw_path = pipeline.download_data(
url="https://example.com/training_data.csv",
name="training.csv"
)
# Process data
processed_path = pipeline.process_data(raw_path, "processed_training.csv")
# The module automatically organizes everything:
# ~/.data/my_ml_project/
# ├── raw_data/
# │ └── training.csv
# ├── processed/
# │ └── processed_training.csv
# ├── models/
# └── outputs/import pystow
from contextlib import contextmanager
class ConfigurableModule:
    """PyStow module whose downloads are driven by stored configuration."""

    def __init__(self, name, config_module="config"):
        self.module = pystow.module(name)
        self.config_module = config_module

    def get_base_url(self):
        """Look up the configured base URL."""
        return pystow.get_config(self.config_module, "base_url")

    def get_api_key(self):
        """Look up the configured API key."""
        return pystow.get_config(self.config_module, "api_key")

    def download_with_auth(self, endpoint, name):
        """Download ``endpoint`` relative to the base URL with a bearer token."""
        root = self.get_base_url()
        token = self.get_api_key()
        headers = {"Authorization": f"Bearer {token}"}
        return self.module.ensure(
            url=f"{root}/{endpoint}",
            name=name,
            download_kwargs={"headers": headers},
        )

    @contextmanager
    def temp_file(self, name):
        """Yield a scratch path under ``temp/`` and delete it on exit."""
        scratch = self.module.join("temp", name=name)
        try:
            yield scratch
        finally:
            if scratch.exists():
                scratch.unlink()
# Usage
app_module = ConfigurableModule("myapp")

# Download with authentication
data_path = app_module.download_with_auth("data/latest.csv", "current_data.csv")

# Use temporary file
with app_module.temp_file("temp_processing.csv") as temp_path:
    # Process data using temp file
    df = pd.read_csv(data_path)
    df.to_csv(temp_path)
# temp_path is automatically cleaned up

Install with Tessl CLI
npx tessl i tessl/pypi-pystow