CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-pooch

A friend to fetch your data files

Pending
Overview
Eval results
Files

docs/file-processing.md

File Processing

Post-download processors for automatic decompression, archive extraction, and custom file transformations. These processors execute after successful downloads to prepare files for use.

Capabilities

File Decompression

Automatically decompresses files compressed with gzip, bzip2, or xz/lzma algorithms.

class Decompress:
    """
    Processor to decompress files after download.

    An instance is passed as the ``processor`` argument of a download call
    and is invoked with the path of the downloaded file; it returns the path
    of the decompressed file.
    """

    def __init__(self, method: str = "auto", name: str | None = None):
        """
        Configure how the downloaded file will be decompressed.

        Parameters:
        - method: The decompression method. Can be 'auto' (default), 'gzip', 'xz', or 'bz2'. If 'auto', will determine the method from the file extension
          NOTE(review): the upstream pooch API reference lists the accepted
          values as "auto", "lzma", "xz", "bzip2" and "gzip"; 'bz2' here looks
          like it should be 'bzip2' — confirm against the installed version.
        - name: The name that will be used for the decompressed file. If None, will remove the compression extension from the downloaded file name
          NOTE(review): upstream documents the default output name as
          "{fname}.decomp" rather than the name with the extension removed —
          confirm before relying on the resulting file name.
        """

    def __call__(self, fname: str, action: str, pooch: object) -> str:
        """
        Decompress the given file.

        Parameters:
        - fname: Full path to the downloaded file
        - action: Either 'download' or 'update' depending on the action taken by the Pooch
          NOTE(review): upstream also documents a third value, 'fetch' (the
          local file was already up to date and nothing was downloaded) —
          confirm.
        - pooch: The Pooch instance that is calling this processor

        Returns:
        The full path to the decompressed file
        """

ZIP Archive Extraction

Extracts ZIP archives and returns paths to all extracted files.

class Unzip:
    """
    Processor to unzip downloaded ZIP files.

    An instance is passed as the ``processor`` argument of a download call
    and is invoked with the path of the downloaded archive; it returns the
    paths of the extracted files.
    """

    def __init__(
        self, 
        members: list[str] | None = None,
        extract_dir: str | None = None,
        password: bytes | None = None
    ):
        """
        Configure which members are extracted and where they are written.

        Parameters:
        - members: List of archive members to extract. If None, will extract all members
        - extract_dir: Directory where the members will be extracted. If None, will extract to the directory of the ZIP file
          NOTE(review): upstream documents the default location as a folder
          next to the archive named after it with ".unzip" appended, and a
          given extract_dir as relative to the cache location — confirm.
        - password: Password to use for encrypted ZIP files
          NOTE(review): the upstream pooch.Unzip signature does not appear to
          include a `password` parameter — verify against the installed
          version before passing it.
        """

    def __call__(self, fname: str, action: str, pooch: object) -> list[str]:
        """
        Extract the given ZIP file.

        Parameters:
        - fname: Full path to the downloaded ZIP file
        - action: Either 'download' or 'update' depending on the action taken by the Pooch
          NOTE(review): upstream also documents a third value, 'fetch' (the
          local file was already up to date and nothing was downloaded) —
          confirm.
        - pooch: The Pooch instance that is calling this processor

        Returns:
        A list with the full paths to all extracted files
        """

TAR Archive Extraction

Extracts TAR archives (including compressed variants like .tar.gz, .tar.bz2, .tar.xz).

class Untar:
    """
    Processor to untar downloaded TAR files.

    An instance is passed as the ``processor`` argument of a download call
    and is invoked with the path of the downloaded archive; it returns the
    paths of the extracted files.
    """

    def __init__(
        self,
        members: list[str] | None = None,
        extract_dir: str | None = None
    ):
        """
        Configure which members are extracted and where they are written.

        Parameters:
        - members: List of archive members to extract. If None, will extract all members
        - extract_dir: Directory where the members will be extracted. If None, will extract to the directory of the TAR file
          NOTE(review): upstream documents the default location as a folder
          next to the archive named after it with ".untar" appended, and a
          given extract_dir as relative to the cache location — confirm.
        """

    def __call__(self, fname: str, action: str, pooch: object) -> list[str]:
        """
        Extract the given TAR file.

        Parameters:
        - fname: Full path to the downloaded TAR file
        - action: Either 'download' or 'update' depending on the action taken by the Pooch
          NOTE(review): upstream also documents a third value, 'fetch' (the
          local file was already up to date and nothing was downloaded) —
          confirm.
        - pooch: The Pooch instance that is calling this processor

        Returns:
        A list with the full paths to all extracted files
        """

Usage Examples

Automatic Decompression

import pooch

# Download and automatically decompress a gzipped file.
# Decompress() is built with no arguments, so it uses the default
# method="auto" and the default output name.
fname = pooch.retrieve(
    "https://example.com/data.txt.gz",
    known_hash="md5:abc123...",  # illustrative placeholder; use the file's real hash
    processor=pooch.Decompress()
)
# Returns path to decompressed data.txt file
# NOTE(review): upstream pooch documents the default decompressed name as
# "<downloaded name>.decomp"; pass name="data.txt" to Decompress if that exact
# file name is required — confirm against the installed version.

Specific Decompression Method

import pooch

# Explicitly specify decompression method and output name.
# The URL ends in ".xz", and method="xz" names the method directly instead of
# relying on "auto"; name= sets the decompressed file's name.
fname = pooch.retrieve(
    "https://example.com/dataset.xz",
    known_hash="sha256:def456...",  # illustrative placeholder; use the file's real hash
    processor=pooch.Decompress(method="xz", name="dataset.csv")
)
# Returns path to dataset.csv

ZIP File Extraction

import pooch

# Extract all files from a ZIP archive.
# Unzip() is built with no arguments, so members is None and every member of
# the archive is extracted.
files = pooch.retrieve(
    "https://example.com/data.zip",
    known_hash="md5:ghi789...",  # illustrative placeholder; use the file's real hash
    processor=pooch.Unzip()
)
# Returns list of paths to all extracted files

# Extract specific files only.
# members lists the archive members to extract; everything else in the
# archive is left unextracted.
files = pooch.retrieve(
    "https://example.com/data.zip",
    known_hash="md5:ghi789...",  # same archive as above, so the same hash
    processor=pooch.Unzip(members=["data.csv", "readme.txt"])
)
# Returns list with paths to data.csv and readme.txt only

TAR Archive Extraction

import pooch

# Extract a tar.gz archive.
# Untar() is built with no arguments, so all members are extracted to the
# default location.
files = pooch.retrieve(
    "https://example.com/dataset.tar.gz",
    known_hash="sha256:jkl012...",  # illustrative placeholder; use the file's real hash
    processor=pooch.Untar()
)
# Returns list of paths to all extracted files

# Extract to specific directory.
# NOTE(review): upstream documents extract_dir as relative to the download
# cache location rather than the current working directory — confirm where
# "./extracted_data" ends up before relying on it.
files = pooch.retrieve(
    "https://example.com/dataset.tar.bz2",
    known_hash="sha256:mno345...",  # illustrative placeholder; use the file's real hash
    processor=pooch.Untar(extract_dir="./extracted_data")
)

Using with Pooch Manager

import pooch

# Create data manager with processors.
# The registry keys are the file names later passed to fetch(); the values
# are placeholder hashes and must be replaced with each file's real hash.
data_manager = pooch.create(
    path=pooch.os_cache("myproject"),
    base_url="https://example.com/data/",
    registry={
        "dataset.csv.gz": "md5:abc123...",
        "images.zip": "sha256:def456...",
        "archive.tar.xz": "sha256:ghi789...",
    }
)

# Fetch and decompress.
# The processor is chosen per fetch() call, so each registry entry can be
# post-processed differently. Decompress returns a single path.
csv_file = data_manager.fetch("dataset.csv.gz", processor=pooch.Decompress())

# Fetch and extract archive (Unzip returns a list of paths).
image_files = data_manager.fetch("images.zip", processor=pooch.Unzip())

# Fetch and extract compressed tar (Untar returns a list of paths).
archive_files = data_manager.fetch("archive.tar.xz", processor=pooch.Untar())

Custom Processors

import pooch
import os

class CustomProcessor:
    """
    Example processor that writes an upper-cased copy of a downloaded file.

    The copy is written next to the original; its name is the original name
    with ``suffix`` inserted before the file extension (``data.txt`` becomes
    ``data_processed.txt`` with the default suffix). The downloaded file
    itself is never modified.
    """

    def __init__(self, suffix="_processed"):
        """
        Parameters:
        - suffix: Text inserted between the file's base name and its extension to form the output file name
        """
        self.suffix = suffix

    def __call__(self, fname, action, pooch):
        """
        Process the downloaded file.

        Parameters:
        - fname: Full path to the downloaded file
        - action: The action reported by the caller; 'download' and 'update' force the output to be regenerated
        - pooch: The Pooch instance that is calling this processor (unused)

        Returns:
        The full path to the processed file

        Raises:
        ValueError if the output path would be the same as the input path
        """
        # Split off only the final extension. A plain str.replace('.txt', ...)
        # would also rewrite '.txt' inside directory names, and would leave
        # the path unchanged for files that do not end in '.txt'.
        root, ext = os.path.splitext(fname)
        output_name = f"{root}{self.suffix}{ext}"

        # Opening the same path for reading and writing truncates it before
        # it is read, destroying the download. Refuse instead.
        if output_name == fname:
            raise ValueError(
                "suffix must not be empty: the processed file would "
                f"overwrite the downloaded file {fname!r}"
            )

        # Redo the work when the source was (re)downloaded or the output is
        # missing; for any other action reuse the existing output.
        if action in ("download", "update") or not os.path.exists(output_name):
            # Custom processing logic here
            with open(fname, 'r') as infile, open(output_name, 'w') as outfile:
                outfile.write(infile.read().upper())

        return output_name

# Use custom processor.
# Any callable taking (fname, action, pooch) can be passed as processor=;
# here the instance is configured so the output file name gets "_uppercase"
# inserted. The value returned by the processor's __call__ is presumably what
# retrieve() returns in fname — confirm against the pooch documentation.
fname = pooch.retrieve(
    "https://example.com/data.txt",
    known_hash="md5:abc123...",  # illustrative placeholder; use the file's real hash
    processor=CustomProcessor(suffix="_uppercase")
)

Install with Tessl CLI

npx tessl i tessl/pypi-pooch

docs

core-data-management.md

download-protocols.md

file-processing.md

index.md

utilities-helpers.md

tile.json