CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-pystow

Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.

Pending
Overview
Eval results
Files

docs/file-operations.md

File Download and Caching

PyStow provides a comprehensive file download and caching system that automatically manages file retrieval, storage, and cache validation. Files are downloaded once and reused from cache on subsequent requests.

Core Download Functions

Basic File Download

def ensure(key: str, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a file is downloaded.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join into the sub-directory
            below the module's directory. If none are given, the file is stored
            directly in the module's directory.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        version: The optional version, or no-argument callable that returns an
            optional version. This is prepended before the subkeys.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """

Custom File Creation

def ensure_custom(key: str, *subkeys: str, name: str, force: bool = False, provider: Provider, **kwargs: Any) -> Path:
    """Ensure a file is present, and run a custom create function otherwise.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join into the sub-directory
            below the module's directory. If none are given, the file is located
            directly in the module's directory.
        name: The file name.
        force: Should the file be re-created, even if the path already exists?
        provider: The file provider. Will be run with the path as the first
            positional argument, if the file needs to be generated.
        kwargs: Additional keyword-based parameters passed to the provider.

    Returns:
        The path of the file that has been created (or already exists)
    """

File I/O Context Managers

Basic File Opening

@contextmanager
def open(key: str, *subkeys: str, name: str, mode: Literal["r", "rb", "rt", "w", "wb", "wt"] = "r", open_kwargs: Mapping[str, Any] | None = None, ensure_exists: bool = False) -> Generator[StringIO | BytesIO, None, None]:
    """Open a file.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join into the sub-directory
            below the module's directory. If none are given, the file is located
            directly in the module's directory.
        name: The name of the file to open
        mode: The read or write mode, passed to open
        open_kwargs: Additional keyword arguments passed to open
        ensure_exists: Should the directory the file is in be made? Set to true on
            write operations.

    Yields:
        An open file object
    """

Gzipped File Opening

@contextmanager
def open_gz(key: str, *subkeys: str, name: str, mode: Literal["r", "w", "rt", "wt", "rb", "wb"] = "rb", open_kwargs: Mapping[str, Any] | None = None, ensure_exists: bool = False) -> Generator[StringIO | BytesIO, None, None]:
    """Open a gzipped file that exists already.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join into the sub-directory
            below the module's directory. If none are given, the file is located
            directly in the module's directory.
        name: The name of the file to open
        mode: The read or write mode, passed to gzip.open
        open_kwargs: Additional keyword arguments passed to gzip.open
        ensure_exists: Should the directory the file is in be made? Set to true on
            write operations.

    Yields:
        An open file object
    """

Download and Open

@contextmanager
def ensure_open(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["r", "rt", "w", "wt"] | Literal["rb", "wb"] = "r", open_kwargs: Mapping[str, Any] | None = None) -> Generator[StringIO | BytesIO, None, None]:
    """Ensure a file is downloaded and open it.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME
            where key is uppercased is checked first before using the default home
            directory.
        subkeys: A sequence of additional strings to join into the sub-directory
            below the module's directory. If none are given, the file is stored
            directly in the module's directory.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        mode: The read or write mode, passed to open
        open_kwargs: Additional keyword arguments passed to open

    Yields:
        An open file object
    """

Gzipped File Operations

@contextmanager
def open_gz(key: str, *subkeys: str, name: str, mode: Literal["r", "w", "rt", "wt", "rb", "wb"] = "rb", open_kwargs: Mapping[str, Any] | None = None, ensure_exists: bool = False) -> Generator[StringIO | BytesIO, None, None]:
    """Open a gzipped file that exists already.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join into the sub-directory
            below the module's directory. If none are given, the file is located
            directly in the module's directory.
        name: The name of the file to open
        mode: The read or write mode, passed to gzip.open
        open_kwargs: Additional keyword arguments passed to gzip.open
        ensure_exists: Should the directory the file is in be made? Set to true on
            write operations.

    Yields:
        An open file object
    """

Usage Examples

Basic File Download

import pystow

# The first call fetches the file from the URL and stores it in the cache
path = pystow.ensure("myapp", "datasets", url="https://example.com/data.csv", name="dataset.csv")

# The file now exists locally, so the same call returns the cached path right away
path = pystow.ensure("myapp", "datasets", url="https://example.com/data.csv", name="dataset.csv")

# force=True downloads the file again even though a cached copy exists
path = pystow.ensure(
    "myapp",
    "datasets",
    url="https://example.com/data.csv",
    name="dataset.csv",
    force=True,
)

Download with Versioning

import pystow
import requests

def get_data_version():
    """Return the current data version reported by the version API.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # A timeout keeps a stalled API from hanging the download indefinitely
    response = requests.get("https://api.example.com/version", timeout=10)
    # Fail loudly instead of trying to parse an error page as JSON
    response.raise_for_status()
    return response.json()["version"]

# Version-aware download: the no-argument callable supplies the version
path = pystow.ensure(
    "myapp", "datasets",
    url="https://example.com/data.csv",
    version=get_data_version
)
# The version is prepended before the subkeys, so if the API reports "v1.2.3"
# the file is stored in: ~/.data/myapp/v1.2.3/datasets/data.csv

Custom File Generation

import pystow
import pandas as pd

def create_processed_data(path, raw_data_url):
    """Build the aggregated CSV at ``path`` from the raw data at ``raw_data_url``."""
    # Fetch the raw input (or reuse the copy that is already stored)
    source_csv = pystow.ensure("myapp", "raw", url=raw_data_url)

    # Sum the columns per category and write the result to the target path
    totals = pd.read_csv(source_csv).groupby('category').sum()
    totals.to_csv(path)

# The provider only runs when the file needs to be generated; extra keyword
# arguments such as raw_data_url are passed through to it
processed_path = pystow.ensure_custom(
    "myapp",
    "processed",
    name="aggregated_data.csv",
    provider=create_processed_data,
    raw_data_url="https://example.com/raw_data.csv",
)

File I/O Operations

import pystow

# Read a file that is already present in the module's directory
with pystow.open("myapp", "config", name="settings.txt", mode="r") as settings_file:
    config = settings_file.read()

# Write a file; ensure_exists=True makes the containing directory if needed
with pystow.open(
    "myapp", "logs", name="app.log", mode="w", ensure_exists=True
) as log_file:
    log_file.write("Application started\n")

# Download the file if necessary, then open it for reading
with pystow.ensure_open("myapp", "data", url="https://example.com/data.txt") as remote_file:
    content = remote_file.read()

# Read a gzip-compressed file as text
with pystow.open_gz("myapp", "compressed", name="data.gz", mode="rt") as gz_file:
    data = gz_file.read()

Download Configuration

import pystow

# Configure download behavior. The options below are requests-style keyword
# arguments, so select the "requests" backend of pystow.utils.download; the
# default "urllib" backend forwards extra keywords to urllib's urlretrieve,
# which does not accept them.
# NOTE(review): "stream" was dropped because the requests backend is believed to
# stream the response itself - confirm against pystow.utils.download.
path = pystow.ensure(
    "myapp", "data",
    url="https://example.com/large_file.zip",
    download_kwargs={
        "backend": "requests",    # Required for the requests-style options below
        "timeout": 300,           # 5 minute timeout
        "verify": True,           # Verify SSL certificates
        "headers": {              # Custom headers
            "User-Agent": "MyApp/1.0"
        }
    }
)

Module-Based File Operations

import json

import pystow

# Create module instance
module = pystow.module("myapp")

# Download files using module
data_path = module.ensure(
    "datasets",
    url="https://example.com/data.csv"
)

# Open files using module (json must be imported for json.load)
with module.open("config", name="settings.json", mode="r") as file:
    config = json.load(file)

# Custom file creation with module; the provider is called with the target
# path as its first positional argument when the file needs to be generated
processed_path = module.ensure_custom(
    "processed",
    name="summary.txt",
    provider=lambda path: path.write_text("Summary complete")
)

Install with Tessl CLI

npx tessl i tessl/pypi-pystow

docs

archives.md

cloud-storage.md

configuration.md

data-formats.md

directory-management.md

file-operations.md

index.md

module-class.md

nltk-integration.md

web-scraping.md

tile.json