A friend to fetch your data files
—
Post-download processors for automatic decompression, archive extraction, and custom file transformations. These processors execute after successful downloads to prepare files for use.
Automatically decompresses files compressed with gzip, bzip2, or xz/lzma algorithms.
class Decompress:
"""Processor to decompress files after download."""
def __init__(self, method: str = "auto", name: str | None = None):
"""
Parameters:
- method: The decompression method. Can be 'auto' (default), 'gzip', 'bzip2', 'lzma', or 'xz'. If 'auto', will determine the method from the file extension ('.gz', '.bz2', or '.xz')
- name: The name that will be used for the decompressed file. If None, the decompressed file is named after the downloaded file with '.decomp' appended
"""
def __call__(self, fname: str, action: str, pooch: object) -> str:
"""
Decompress the given file.
Parameters:
- fname: Full path to the downloaded file
- action: Either 'download' or 'update' depending on the action taken by the Pooch
- pooch: The Pooch instance that is calling this processor
Returns:
The full path to the decompressed file
"""

Extracts ZIP archives and returns paths to all extracted files.
class Unzip:
"""Processor to unzip downloaded ZIP files."""
def __init__(
self,
members: list[str] | None = None,
extract_dir: str | None = None,
password: bytes | None = None
):
"""
Parameters:
- members: List of archive members to extract. If None, will extract all members
- extract_dir: Directory where the members will be extracted. If None, will extract to a folder next to the ZIP file, named after the archive with '.unzip' appended
- password: Password to use for encrypted ZIP files
"""
def __call__(self, fname: str, action: str, pooch: object) -> list[str]:
"""
Extract the given ZIP file.
Parameters:
- fname: Full path to the downloaded ZIP file
- action: Either 'download' or 'update' depending on the action taken by the Pooch
- pooch: The Pooch instance that is calling this processor
Returns:
A list with the full paths to all extracted files
"""

Extracts TAR archives (including compressed variants like .tar.gz, .tar.bz2, .tar.xz).
class Untar:
"""Processor to untar downloaded TAR files."""
def __init__(
self,
members: list[str] | None = None,
extract_dir: str | None = None
):
"""
Parameters:
- members: List of archive members to extract. If None, will extract all members
- extract_dir: Directory where the members will be extracted. If None, will extract to a folder next to the TAR file, named after the archive with '.untar' appended
"""
def __call__(self, fname: str, action: str, pooch: object) -> list[str]:
"""
Extract the given TAR file.
Parameters:
- fname: Full path to the downloaded TAR file
- action: Either 'download' or 'update' depending on the action taken by the Pooch
- pooch: The Pooch instance that is calling this processor
Returns:
A list with the full paths to all extracted files
"""

import pooch
# Download and automatically decompress a gzipped file
fname = pooch.retrieve(
"https://example.com/data.txt.gz",
known_hash="md5:abc123...",
processor=pooch.Decompress()
)
# Returns path to decompressed data.txt file

import pooch
# Explicitly specify decompression method and output name
fname = pooch.retrieve(
"https://example.com/dataset.xz",
known_hash="sha256:def456...",
processor=pooch.Decompress(method="xz", name="dataset.csv")
)
# Returns path to dataset.csv

import pooch
# Extract all files from a ZIP archive
files = pooch.retrieve(
"https://example.com/data.zip",
known_hash="md5:ghi789...",
processor=pooch.Unzip()
)
# Returns list of paths to all extracted files
# Extract specific files only
files = pooch.retrieve(
"https://example.com/data.zip",
known_hash="md5:ghi789...",
processor=pooch.Unzip(members=["data.csv", "readme.txt"])
)
# Returns list with paths to data.csv and readme.txt only

import pooch
# Extract a tar.gz archive
files = pooch.retrieve(
"https://example.com/dataset.tar.gz",
known_hash="sha256:jkl012...",
processor=pooch.Untar()
)
# Returns list of paths to all extracted files
# Extract to specific directory
files = pooch.retrieve(
"https://example.com/dataset.tar.bz2",
known_hash="sha256:mno345...",
processor=pooch.Untar(extract_dir="./extracted_data")
)

import pooch
# Create data manager with processors
data_manager = pooch.create(
path=pooch.os_cache("myproject"),
base_url="https://example.com/data/",
registry={
"dataset.csv.gz": "md5:abc123...",
"images.zip": "sha256:def456...",
"archive.tar.xz": "sha256:ghi789...",
}
)
# Fetch and decompress
csv_file = data_manager.fetch("dataset.csv.gz", processor=pooch.Decompress())
# Fetch and extract archive
image_files = data_manager.fetch("images.zip", processor=pooch.Unzip())
# Fetch and extract compressed tar
archive_files = data_manager.fetch("archive.tar.xz", processor=pooch.Untar())

import pooch
import os
class CustomProcessor:
    """
    Custom processor example: write an upper-cased copy of a downloaded text file.

    Parameters:
    - suffix: Text inserted between the file's base name and its extension to build the output file name
    """

    def __init__(self, suffix="_processed"):
        self.suffix = suffix

    def __call__(self, fname, action, pooch):
        """
        Process the downloaded file.

        Parameters:
        - fname: Full path to the downloaded file
        - action: Action reported by the caller; not used by this processor
        - pooch: The object calling this processor; not used by this processor

        Returns:
        The full path to the upper-cased copy of the file
        """
        # Insert the suffix in front of the final extension only. A plain
        # str.replace('.txt', ...) would also rewrite '.txt' inside directory
        # names, and would leave the name unchanged for non-.txt files.
        base, ext = os.path.splitext(fname)
        output_name = f"{base}{self.suffix}{ext}"
        # Read the whole input before opening the output: if both names are
        # the same (empty suffix), opening for writing first would truncate
        # the input and lose its contents.
        with open(fname, "r") as infile:
            text = infile.read()
        with open(output_name, "w") as outfile:
            outfile.write(text.upper())
        return output_name
# Use custom processor
fname = pooch.retrieve(
"https://example.com/data.txt",
known_hash="md5:abc123...",
processor=CustomProcessor(suffix="_uppercase")
)

Install with Tessl CLI
npx tessl i tessl/pypi-pooch