A friend to fetch your data files
—
Primary functionality for downloading and caching individual files or managing collections of data files with version control and hash verification. These functions form the foundation of Pooch's data management capabilities.
Downloads and caches individual files with hash verification, supporting custom processors and downloaders.
def retrieve(
url: str,
known_hash: str | None,
fname: str | None = None,
path: str | None = None,
processor: callable | None = None,
downloader: callable | None = None,
progressbar: bool = False
) -> str:
"""
Download and cache a single file locally.
Parameters:
- url: The URL to the file that is to be downloaded
- known_hash: A known hash (checksum) of the file. Will be used to verify the download. By default, assumes SHA256. To specify a different algorithm, prepend the hash with 'algorithm:', e.g., 'md5:pw9co2iun29juoh'. If None, will NOT check the hash
- fname: The name that will be used to save the file. If None, will create a unique file name
- path: The location of the cache folder on disk. If None, will save to a pooch folder in the default cache location
- processor: If not None, then a function that will be called before returning the full path and after the file has been downloaded
- downloader: If not None, then a function that will be called to download a given URL to a provided local file name
- progressbar: If True, will print a progress bar of the download. Requires tqdm to be installed
Returns:
The absolute path (including the file name) of the file in the local storage
"""
Creates a Pooch instance with sensible defaults for managing multiple data files with versioning support.
def create(
path: str | list | tuple,
base_url: str,
version: str | None = None,
version_dev: str = "master",
env: str | None = None,
registry: dict | None = None,
urls: dict | None = None,
retry_if_failed: int = 0,
allow_updates: bool = True
) -> Pooch:
"""
Create a Pooch with sensible defaults to fetch data files.
Parameters:
- path: The path to the local data storage folder. If this is a list or tuple, will join the parts. The version will be appended to the end of this path
- base_url: Base URL for the remote data source. Should have a {version} formatting mark in it
- version: The version string for your project. Should be PEP440 compatible. If None, will not attempt to format base_url and no subfolder will be appended to path
- version_dev: The name used for the development version of a project. If your data is hosted on GitHub, then "master" is a good choice
- env: An environment variable that can be used to overwrite path
- registry: A record of the files that are managed by this Pooch. Keys should be the file names and the values should be their hashes
- urls: Custom URLs for downloading individual files in the registry
- retry_if_failed: Retry a file download the specified number of times if it fails
- allow_updates: Whether existing files in local storage that have a hash mismatch with the registry are allowed to update from the remote URL
Returns:
A Pooch instance configured with the given parameters
"""
Manager for local data storage that can fetch from remote sources with registry-based file management.
class Pooch:
"""
Manager for a local data storage that can fetch from a remote source.
Avoid creating Pooch instances directly. Use pooch.create instead.
"""
def __init__(
self,
path: str,
base_url: str,
registry: dict | None = None,
urls: dict | None = None,
retry_if_failed: int = 0,
allow_updates: bool = True
):
"""
Parameters:
- path: The path to the local data storage folder
- base_url: Base URL for the remote data source. All requests will be made relative to this URL
- registry: A record of the files that are managed by this Pooch. Keys should be the file names and values should be their hashes
- urls: Custom URLs for downloading individual files in the registry
- retry_if_failed: Retry a file download the specified number of times if it fails
- allow_updates: Whether existing files in local storage that have a hash mismatch with the registry are allowed to update from the remote URL
"""
@property
def abspath(self) -> Path:
"""Absolute path to the local storage."""
@property
def registry_files(self) -> list[str]:
"""List of file names on the registry."""
def fetch(
self,
fname: str,
processor: callable | None = None,
downloader: callable | None = None,
progressbar: bool = False
) -> str:
"""
Get the absolute path to a file in the local storage.
Parameters:
- fname: The file name (relative to the base_url of the remote data storage) of the file in the registry
- processor: If not None, then a function that will be called before returning the full path and after the file has been downloaded
- downloader: If not None, then a function that will be called to download a given URL to a provided local file name
- progressbar: If True, will print a progress bar of the download
Returns:
The absolute path to the file in the local storage
"""
def get_url(self, fname: str) -> str:
"""
Get the download URL for the given file.
Parameters:
- fname: The file name (relative to the base_url) in the registry
Returns:
The download URL for the file
"""
def load_registry(self, fname: str | object) -> None:
"""
Load entries from a file and add them to the registry.
Each line should contain file name and hash separated by a space.
Hash can specify algorithm using 'alg:hash' format. Custom URLs
can be specified as a third element. Line comments start with '#'.
Parameters:
- fname: Path to the registry file or an open file object
"""
def load_registry_from_doi(self) -> None:
"""
Populate the registry using the data repository API.
Fill the registry with all files available in the data repository,
along with their hashes. Makes a request to the repository API to
retrieve this information. No files are downloaded during this process.
Requires that the Pooch was created with a DOI base_url.
"""
def is_available(self, fname: str, downloader: callable | None = None) -> bool:
"""
Check if a file is available for download from the remote storage.
Parameters:
- fname: The file name (relative to the base_url) in the registry
- downloader: If not None, then a function that will be called to check if the file is available
Returns:
True if the file is available, False otherwise
"""
import pooch
# Download a single file with hash verification
fname = pooch.retrieve(
url="https://github.com/fatiando/pooch/raw/v1.8.2/data/tiny-data.txt",
known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e",
)
with open(fname) as f:
data = f.read()
import pooch
# Create a data manager for your project
data_manager = pooch.create(
path=pooch.os_cache("myproject"),
base_url="https://github.com/myproject/data/raw/{version}/",
version="v1.0.0",
registry={
"temperature.csv": "md5:ab12cd34ef56...",
"pressure.dat": "sha256:12345abc...",
"readme.txt": "md5:987fde65...",
}
)
# Fetch files from the registry
temp_data = data_manager.fetch("temperature.csv")
pressure_data = data_manager.fetch("pressure.dat")
# Check what files are available
print(data_manager.registry_files)
import pooch
# Create registry from directory
pooch.make_registry("data/", "registry.txt", recursive=True)
# Load registry from file
data_manager = pooch.create(
path=pooch.os_cache("myproject"),
base_url="https://example.com/data/",
)
data_manager.load_registry("registry.txt")
Install with Tessl CLI
npx tessl i tessl/pypi-pooch