Parameterize and run Jupyter and nteract Notebooks
—
Support for multiple storage systems including local filesystem, cloud storage (S3, Azure, GCS), distributed filesystems (HDFS), and remote repositories (GitHub). Papermill's modular I/O system enables seamless notebook execution across different storage environments.
Primary functions for loading and saving notebooks across different storage backends.
def load_notebook_node(notebook_path: str) -> nbformat.NotebookNode:
"""
Loads notebook from various sources (local, S3, etc.).
Parameters:
- notebook_path: Path to notebook (supports local paths, S3 URLs, etc.)
Returns:
nbformat.NotebookNode: Loaded notebook object
Raises:
FileNotFoundError: If notebook doesn't exist
"""
def write_ipynb(nb: nbformat.NotebookNode, path: str) -> None:
"""
Writes notebook to specified location.
Parameters:
- nb: Notebook to write
- path: Destination path (supports local paths, S3 URLs, etc.)
"""
def list_notebook_files(path: str) -> list[str]:
"""
Lists notebook files in directory.
Parameters:
- path: Directory path to list
Returns:
list[str]: List of notebook file paths
"""Utilities for working with paths and managing I/O operations.
def get_pretty_path(path: str) -> str:
"""
Formats paths for display.
Parameters:
- path: Path to format
Returns:
str: Formatted path string
"""
def local_file_io_cwd(path: str | None = None):
"""
Context manager for local file operations.
Parameters:
- path: Working directory path (optional)
Returns:
Context manager that temporarily changes working directory
"""
def read_yaml_file(path: str) -> dict:
"""
Reads YAML configuration files.
Parameters:
- path: Path to YAML file
Returns:
dict: Parsed YAML content
"""class PapermillIO:
"""
Central I/O handler that delegates to specific storage handlers.
Automatically routes requests based on path/URL schemes.
"""
def read(self, path: str) -> str: ...
def write(self, buf: str, path: str) -> None: ...
def listdir(self, path: str) -> list[str]: ...
def pretty_path(self, path: str) -> str: ...

class LocalHandler:
"""Handler for local filesystem operations."""
def read(self, path: str) -> str: ...
def write(self, buf: str, path: str) -> None: ...
def listdir(self, path: str) -> list[str]: ...

class S3Handler:
"""Handler for Amazon S3 storage operations."""
def read(self, path: str) -> str: ...
def write(self, buf: str, path: str) -> None: ...
def listdir(self, path: str) -> list[str]: ...
class ADLHandler:
"""Handler for Azure Data Lake storage operations."""
def read(self, path: str) -> str: ...
def write(self, buf: str, path: str) -> None: ...
def listdir(self, path: str) -> list[str]: ...
class ABSHandler:
"""Handler for Azure Blob Storage operations."""
def read(self, path: str) -> str: ...
def write(self, buf: str, path: str) -> None: ...
def listdir(self, path: str) -> list[str]: ...
class GCSHandler:
"""Handler for Google Cloud Storage operations."""
def read(self, path: str) -> str: ...
def write(self, buf: str, path: str) -> None: ...
def listdir(self, path: str) -> list[str]: ...

class HDFSHandler:
"""Handler for Hadoop Distributed File System operations."""
def read(self, path: str) -> str: ...
def write(self, buf: str, path: str) -> None: ...
def listdir(self, path: str) -> list[str]: ...
class GithubHandler:
"""Handler for GitHub repository operations."""
def read(self, path: str) -> str: ...
def listdir(self, path: str) -> list[str]: ...
class HttpHandler:
"""Handler for HTTP/HTTPS operations."""
def read(self, path: str) -> str: ...
class StreamHandler:
"""Handler for stream I/O operations."""
def read(self, path: str) -> str: ...
def write(self, buf: str, path: str) -> None: ...

class S3:
"""S3 client for interacting with Amazon S3."""
def __init__(self, **kwargs): ...
def read(self, key: str) -> str: ...
def write(self, buf: str, key: str) -> None: ...
def listdir(self, prefix: str) -> list[str]: ...
class Bucket:
"""S3 bucket representation."""
def __init__(self, name: str, service: str = None): ...
class Prefix:
"""S3 prefix representation."""
def __init__(self, bucket: Bucket, name: str): ...
class Key:
"""S3 key representation."""
def __init__(self, prefix: Prefix, name: str): ...

class ADL:
"""Azure Data Lake client."""
def __init__(self, **kwargs): ...
def read(self, path: str) -> str: ...
def write(self, buf: str, path: str) -> None: ...
def listdir(self, path: str) -> list[str]: ...
class AzureBlobStore:
"""Azure Blob Storage client."""
def __init__(self, **kwargs): ...
def read(self, path: str) -> str: ...
def write(self, buf: str, path: str) -> None: ...
def listdir(self, path: str) -> list[str]: ...

import papermill as pm
# Execute with local paths
pm.execute_notebook(
'/path/to/input.ipynb',
'/path/to/output.ipynb',
parameters={'data_file': '/data/input.csv'}
)

import papermill as pm
# Execute with S3 paths
pm.execute_notebook(
's3://my-bucket/notebooks/analysis.ipynb',
's3://my-bucket/results/output.ipynb',
parameters={'dataset': 's3://my-bucket/data/sales.csv'}
)
# Mixed local and S3
pm.execute_notebook(
'local_template.ipynb',
's3://my-bucket/results/report.ipynb',
parameters={'config': 'production'}
)

import papermill as pm
# Azure Data Lake
pm.execute_notebook(
'adl://mydatalake.azuredatalakestore.net/notebooks/analysis.ipynb',
'adl://mydatalake.azuredatalakestore.net/results/output.ipynb'
)
# Azure Blob Storage
pm.execute_notebook(
'abs://myaccount.blob.core.windows.net/container/notebook.ipynb',
'abs://myaccount.blob.core.windows.net/container/result.ipynb'
)

import papermill as pm
# Execute with GCS paths
pm.execute_notebook(
'gs://my-bucket/notebooks/analysis.ipynb',
'gs://my-bucket/results/output.ipynb',
parameters={'project_id': 'my-gcp-project'}
)

import papermill as pm
# Execute with HDFS paths
pm.execute_notebook(
'hdfs://namenode:port/notebooks/analysis.ipynb',
'hdfs://namenode:port/results/output.ipynb'
)

import papermill as pm
# Execute notebook directly from GitHub
pm.execute_notebook(
'https://raw.githubusercontent.com/user/repo/main/notebook.ipynb',
'output.ipynb',
parameters={'branch': 'main'}
)

import papermill as pm
# Execute notebook from HTTP URL
pm.execute_notebook(
'https://example.com/notebooks/analysis.ipynb',
'local_output.ipynb'
)

from papermill.iorw import papermill_io
# Register custom handler
class CustomHandler:
def read(self, path):
# Custom read logic
pass
def write(self, buf, path):
# Custom write logic
pass
# Register with papermill
papermill_io.register("custom://", CustomHandler())import os
# S3 authentication via environment variables
os.environ['AWS_ACCESS_KEY_ID'] = 'your-access-key'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'your-secret-key'
# Azure authentication
os.environ['AZURE_STORAGE_ACCOUNT'] = 'your-account'
os.environ['AZURE_STORAGE_KEY'] = 'your-key'
# GCS authentication
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/path/to/service-account.json'

import papermill as pm
# Execute with progress tracking for large notebooks
pm.execute_notebook(
's3://large-bucket/big-notebook.ipynb',
's3://large-bucket/results/output.ipynb',
progress_bar=True,
log_output=True,
start_timeout=300 # Extended timeout for large files
)

Install with Tessl CLI
npx tessl i tessl/pypi-papermill