CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-pystow

Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.

Pending
Overview
Eval results
Files

data-formats.mddocs/

Data Format Support

PyStow provides built-in support for common data formats with automatic parsing and serialization. It integrates with popular libraries like pandas, lxml, and rdflib to handle CSV, JSON, XML, RDF, Excel, and Python objects seamlessly.

CSV and DataFrames

CSV Download and Parsing

def ensure_csv(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
    """Download a CSV and open as a dataframe with pandas.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
    
    Returns:
        A pandas DataFrame
    """

Excel Support

def ensure_excel(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_excel_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
    """Download an excel file and open as a dataframe with pandas.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        read_excel_kwargs: Keyword arguments to pass through to pandas.read_excel.
    
    Returns:
        A pandas DataFrame
    """

DataFrame Operations

def load_df(key: str, *subkeys: str, name: str, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
    """Open a pre-existing CSV as a dataframe with pandas.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file to open
        read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
    
    Returns:
        A pandas DataFrame
    """

def dump_df(key: str, *subkeys: str, name: str, obj: pd.DataFrame, sep: str = "\t", index: bool = False, to_csv_kwargs: Mapping[str, Any] | None = None) -> None:
    """Dump a dataframe to a TSV file with pandas.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file to open
        obj: The dataframe to dump
        sep: The separator to use, defaults to a tab
        index: Should the index be dumped? Defaults to false.
        to_csv_kwargs: Keyword arguments to pass through to pandas.DataFrame.to_csv.
    """

JSON Format

JSON Download and Parsing

def ensure_json(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, open_kwargs: Mapping[str, Any] | None = None, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:
    """Download JSON and open with json.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        open_kwargs: Additional keyword arguments passed to open
        json_load_kwargs: Keyword arguments to pass through to json.load.
    
    Returns:
        A JSON object (list, dict, etc.)
    """

Compressed JSON

def ensure_json_bz2(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, open_kwargs: Mapping[str, Any] | None = None, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:
    """Download BZ2-compressed JSON and open with json.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        open_kwargs: Additional keyword arguments passed to bz2.open
        json_load_kwargs: Keyword arguments to pass through to json.load.
    
    Returns:
        A JSON object (list, dict, etc.)
    """

JSON Operations

def load_json(key: str, *subkeys: str, name: str, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:
    """Open a JSON file json.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file to open
        json_load_kwargs: Keyword arguments to pass through to json.load.
    
    Returns:
        A JSON object (list, dict, etc.)
    """

def dump_json(key: str, *subkeys: str, name: str, obj: JSON, open_kwargs: Mapping[str, Any] | None = None, json_dump_kwargs: Mapping[str, Any] | None = None) -> None:
    """Dump an object to a file with json.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file to open
        obj: The object to dump
        open_kwargs: Additional keyword arguments passed to open
        json_dump_kwargs: Keyword arguments to pass through to json.dump.
    """

XML Format

XML Download and Parsing

def ensure_xml(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:
    """Download an XML file and open it with lxml.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.
    
    Returns:
        An ElementTree object
    """

XML Operations

def load_xml(key: str, *subkeys: str, name: str, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:
    """Load an XML file with lxml.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file to open
        parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.
    
    Returns:
        An ElementTree object
    """

def dump_xml(key: str, *subkeys: str, name: str, obj: lxml.etree.ElementTree, open_kwargs: Mapping[str, Any] | None = None, write_kwargs: Mapping[str, Any] | None = None) -> None:
    """Dump an XML element tree to a file with lxml.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file to open
        obj: The object to dump
        open_kwargs: Additional keyword arguments passed to open
        write_kwargs: Keyword arguments to pass through to lxml.etree.ElementTree.write.
    """

RDF Format

RDF Download and Parsing

def ensure_rdf(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, precache: bool = True, parse_kwargs: Mapping[str, Any] | None = None) -> rdflib.Graph:
    """Download a RDF file and open with rdflib.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        precache: Should the parsed rdflib.Graph be stored as a pickle for
            fast loading?
        parse_kwargs: Keyword arguments to pass through to pystow.utils.read_rdf
            and transitively to rdflib.Graph.parse.
    
    Returns:
        An RDF graph
    """

RDF Operations

def load_rdf(key: str, *subkeys: str, name: str | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> rdflib.Graph:
    """Open an RDF file with rdflib.
    
    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file to open
        parse_kwargs: Keyword arguments to pass through to pystow.utils.read_rdf
            and transitively to rdflib.Graph.parse.
    
    Returns:
        An RDF graph
    """

def dump_rdf(key: str, *subkeys: str, name: str, obj: rdflib.Graph, format: str = "turtle", serialize_kwargs: Mapping[str, Any] | None = None) -> None:
    """Dump an RDF graph to a file with rdflib.
    
    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file to open
        obj: The object to dump
        format: The format to dump in
        serialize_kwargs: Keyword arguments to pass through to rdflib.Graph.serialize.
    """

Pickle Format

Pickle Operations

def ensure_pickle(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None, pickle_load_kwargs: Mapping[str, Any] | None = None) -> Any:
    """Download a pickle file and open with pickle.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given. Also
            useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already exists?
            Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        mode: The read mode, passed to open
        open_kwargs: Additional keyword arguments passed to open
        pickle_load_kwargs: Keyword arguments to pass through to pickle.load.
    
    Returns:
        Any object
    """

def load_pickle(key: str, *subkeys: str, name: str, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None, pickle_load_kwargs: Mapping[str, Any] | None = None) -> Any:
    """Open a pickle file with pickle.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file to open
        mode: The read mode, passed to open
        open_kwargs: Additional keyword arguments passed to open
        pickle_load_kwargs: Keyword arguments to pass through to pickle.load.
    
    Returns:
        Any object
    """

def dump_pickle(key: str, *subkeys: str, name: str, obj: Any, mode: Literal["wb"] = "wb", open_kwargs: Mapping[str, Any] | None = None, pickle_dump_kwargs: Mapping[str, Any] | None = None) -> None:
    """Dump an object to a file with pickle.
    
    Args:
        key: The module name
        subkeys: A sequence of additional strings to join. If none are given, returns
            the directory for this module.
        name: The name of the file to open
        obj: The object to dump
        mode: The write mode, passed to open
        open_kwargs: Additional keyword arguments passed to open
        pickle_dump_kwargs: Keyword arguments to pass through to pickle.dump.
    """

Usage Examples

CSV and DataFrames

import pystow
import pandas as pd

# Download and parse CSV
df = pystow.ensure_csv(
    "myapp", "datasets",
    url="https://example.com/data.csv",
    read_csv_kwargs={"sep": ",", "header": 0}
)

# Load existing CSV
df = pystow.load_df("myapp", "processed", name="clean_data.csv")

# Save DataFrame
pystow.dump_df(
    "myapp", "outputs",
    name="results.tsv",
    obj=df,
    sep="\t"
)

# Excel files
excel_df = pystow.ensure_excel(
    "myapp", "reports",
    url="https://example.com/report.xlsx",
    read_excel_kwargs={"sheet_name": "Summary"}
)

JSON Data

import pystow

# Download and parse JSON
config = pystow.ensure_json(
    "myapp", "config",
    url="https://api.example.com/config.json"
)

# Load existing JSON
data = pystow.load_json("myapp", "cache", name="api_response.json")

# Save JSON data
pystow.dump_json(
    "myapp", "outputs",
    name="results.json",
    obj={"status": "complete", "count": 42},
    json_dump_kwargs={"indent": 2}
)

# Compressed JSON
large_data = pystow.ensure_json_bz2(
    "myapp", "datasets",
    url="https://example.com/large_dataset.json.bz2"
)

XML Processing

import pystow
from lxml import etree

# Download and parse XML
tree = pystow.ensure_xml(
    "myapp", "schemas",
    url="https://example.com/schema.xml"
)

# Access elements
root = tree.getroot()
elements = root.xpath("//element[@type='important']")

# Load existing XML
local_tree = pystow.load_xml("myapp", "data", name="document.xml")

# Save XML
pystow.dump_xml(
    "myapp", "outputs",
    name="modified.xml",
    obj=tree
)

RDF Data

import pystow
import rdflib

# Download and parse RDF with caching
graph = pystow.ensure_rdf(
    "myapp", "ontologies",
    url="https://example.com/ontology.rdf.gz",
    parse_kwargs={"format": "xml"},
    precache=True  # Cache parsed graph as pickle for speed
)

# Query the graph
results = graph.query("""
    SELECT ?subject ?predicate ?object
    WHERE { ?subject ?predicate ?object }
    LIMIT 10
""")

# Save RDF in different format
pystow.dump_rdf(
    "myapp", "outputs",
    name="data.ttl",
    obj=graph,
    format="turtle"
)

Python Objects

import pystow

# Download and load pickled object
model = pystow.ensure_pickle(
    "myapp", "models",
    url="https://example.com/trained_model.pkl"
)

# Save Python object
data_structure = {"key": "value", "list": [1, 2, 3]}
pystow.dump_pickle(
    "myapp", "cache",
    name="data.pkl",
    obj=data_structure
)

# Load existing pickle
cached_data = pystow.load_pickle("myapp", "cache", name="data.pkl")

Install with Tessl CLI

npx tessl i tessl/pypi-pystow

docs

archives.md

cloud-storage.md

configuration.md

data-formats.md

directory-management.md

file-operations.md

index.md

module-class.md

nltk-integration.md

web-scraping.md

tile.json