A friend to fetch your data files
—
Specialized downloader classes for different protocols and authentication methods. These downloaders handle specific protocols and provide customization options for authentication, headers, and connection parameters.
Automatically chooses the appropriate downloader based on URL protocol.
def choose_downloader(url: str, progressbar: bool = False) -> callable:
"""
Choose the appropriate downloader for the given URL.
Parameters:
- url: The URL for which to choose a downloader
- progressbar: If True, will use a downloader that displays a progress bar
Returns:
A downloader function appropriate for the URL's protocol
"""

Downloads files over HTTP/HTTPS with support for authentication, custom headers, and progress bars.
class HTTPDownloader:
"""Download files over HTTP/HTTPS with optional authentication."""
def __init__(
self,
progressbar: bool = False,
chunk_size: int = 1024,
**kwargs
):
"""
Parameters:
- progressbar: If True, will display a progress bar during download. Requires tqdm
- chunk_size: Files are streamed/downloaded in chunks of this size (in bytes)
- **kwargs: Extra keyword arguments to forward to requests.get
"""
def __call__(self, url: str, output_file: str, pooch: object) -> None:
"""
Download the given URL to the given output file.
Parameters:
- url: The URL to the file that will be downloaded
- output_file: Path (and file name) to which the file will be downloaded
- pooch: The Pooch instance that is calling this method
"""

Downloads files over FTP with optional authentication.
class FTPDownloader:
"""Download files over FTP with optional authentication."""
def __init__(
self,
port: int = 21,
username: str = "anonymous",
password: str = "",
account: str = "",
timeout: float | None = None,
progressbar: bool = False,
chunk_size: int = 1024
):
"""
Parameters:
- port: Port used by the FTP server. Defaults to 21
- username: The username used to login to the FTP server. Defaults to 'anonymous'
- password: The password used to login to the FTP server. Defaults to empty string
- account: Account information for the FTP server. Usually not required
- timeout: Timeout in seconds for blocking operations
- progressbar: If True, will display a progress bar during download. Requires tqdm
- chunk_size: Files are streamed/downloaded in chunks of this size (in bytes)
"""
def __call__(self, url: str, output_file: str, pooch: object) -> None:
"""
Download the given URL to the given output file.
Parameters:
- url: The URL to the file that will be downloaded
- output_file: Path (and file name) to which the file will be downloaded
- pooch: The Pooch instance that is calling this method
"""

Downloads files over SFTP (SSH File Transfer Protocol) with authentication.
class SFTPDownloader:
"""Download files over SFTP (SSH File Transfer Protocol)."""
def __init__(
self,
port: int = 22,
username: str = "anonymous",
password: str = "",
account: str = "",
timeout: float | None = None,
progressbar: bool = False
):
"""
Parameters:
- port: Port used by the SFTP server. Defaults to 22
- username: The username used to login to the SFTP server. Defaults to 'anonymous'
- password: The password used to login to the SFTP server. Defaults to empty string
- account: Account information for the SFTP server. Usually not required
- timeout: Timeout in seconds for the connection
- progressbar: If True, will display a progress bar during download. Requires tqdm
"""
def __call__(self, url: str, output_file: str, pooch: object) -> None:
"""
Download the given URL to the given output file.
Parameters:
- url: The URL to the file that will be downloaded
- output_file: Path (and file name) to which the file will be downloaded
- pooch: The Pooch instance that is calling this method
"""

Downloads files from data repositories (Zenodo, Figshare, Dataverse) using DOI identifiers. Uses repository APIs to resolve DOI URLs to actual HTTP download links.
class DOIDownloader:
"""
Download files from data repositories using DOI identifiers.
Supported repositories:
- figshare (www.figshare.com)
- Zenodo (www.zenodo.org)
- Dataverse instances (dataverse.org)
DOI URL format: doi:{DOI}/{filename}
Example: doi:10.5281/zenodo.3939050/data.csv
"""
def __init__(
self,
progressbar: bool = False,
chunk_size: int = 1024,
**kwargs
):
"""
Parameters:
- progressbar: If True, will display a progress bar during download. Requires tqdm
- chunk_size: Files are streamed/downloaded in chunks of this size (in bytes)
- **kwargs: Extra keyword arguments to forward to requests.get for HTTP requests
"""
def __call__(self, url: str, output_file: str, pooch: object) -> None:
"""
Download the given DOI URL to the given output file.
Parameters:
- url: The DOI URL in format 'doi:{DOI}/{filename}' pointing to a file in a supported repository
- output_file: Path (and file name) to which the file will be downloaded
- pooch: The Pooch instance that is calling this method
"""

Utility functions for working with DOI-based downloads.
def doi_to_url(doi: str) -> str:
"""
Follow a DOI link to resolve the URL of the archive.
Parameters:
- doi: The DOI of the archive
Returns:
The URL of the archive in the data repository
"""
def doi_to_repository(doi: str) -> object:
"""
Instantiate a data repository instance from a given DOI.
Parameters:
- doi: The DOI of the archive
Returns:
The data repository object for the DOI
"""

import pooch
# Create HTTP downloader with custom headers
downloader = pooch.HTTPDownloader(
progressbar=True,
auth=("username", "password"),
headers={"User-Agent": "MyApp/1.0"}
)
# Use with retrieve
fname = pooch.retrieve(
"https://example.com/protected/data.csv",
known_hash="md5:abc123...",
downloader=downloader
)

import pooch
# Create FTP downloader with authentication
downloader = pooch.FTPDownloader(
port=21,
username="myuser",
password="mypassword",
progressbar=True
)
# Use with retrieve
fname = pooch.retrieve(
"ftp://ftp.example.com/data/dataset.zip",
known_hash="sha256:def456...",
downloader=downloader
)

import pooch
# Create SFTP downloader
downloader = pooch.SFTPDownloader(
port=22,
username="myuser",
password="mypassword",
progressbar=True
)
# Use with retrieve
fname = pooch.retrieve(
"sftp://secure.example.com/data/dataset.tar.gz",
known_hash="sha256:ghi789...",
downloader=downloader
)

import pooch
# Create DOI downloader
downloader = pooch.DOIDownloader(progressbar=True)
# Download from Zenodo using DOI
fname = pooch.retrieve(
"doi:10.5281/zenodo.3939050/tiny-data.txt",
known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e",
downloader=downloader
)
# Or use automatic downloader selection
fname = pooch.retrieve(
"doi:10.5281/zenodo.3939050/tiny-data.txt",
known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e",
# Automatically chooses DOIDownloader for doi: URLs
)

import pooch
def my_custom_downloader(url, output_file, pooch):
"""Custom downloader function."""
# Implement custom download logic
pass
# Use custom downloader
fname = pooch.retrieve(
"custom://example.com/data.txt",
known_hash="sha256:abc123...",
downloader=my_custom_downloader
)

Install with Tessl CLI
npx tessl i tessl/pypi-pooch