Pythonic Pandoc filters library for programmatic document manipulation and transformation
—
CLI tools for running panflute as a Pandoc filter with automatic filter discovery and execution capabilities. These tools enable panflute to be used as a standalone filter processor or as part of complex document processing pipelines.
Run panflute as a standard Pandoc filter that processes stdin/stdout.
def main():
"""
Entry point for panflute CLI when used as a Pandoc filter.
Reads JSON from stdin, processes with filters specified in metadata,
and writes results to stdout. Filters are specified in document
metadata under 'panflute-filters' key.
Example usage:
pandoc document.md --filter panflute -o output.html
Document metadata can specify filters:
---
panflute-filters:
- emphasis_filter
- link_processor
panflute-path:
- ./filters
- ~/.pandoc/filters
---
"""Run panflute with command-line arguments for complex filter pipelines.
def panfl():
"""
Advanced CLI for panflute with command-line filter specification.
Supports multiple modes:
1. Pandoc filter mode (single filter, no arguments)
2. Pipeline mode (multiple filters with --to option)
Command-line options:
- filters: Filter names/paths (positional arguments)
- --to/-t: Output format for pipeline mode
- --dir/-d: Additional search directories (multiple allowed)
- --data-dir: Include default Pandoc data directories
- --no-sys-path: Exclude Python sys.path from search
Example usage:
# Pipeline mode
pandoc -t json | panfl filter1 filter2 --to html | pandoc -f json
# Pandoc filter mode
pandoc document.md --filter panfl -o output.html
# Custom search directories
panfl --dir ./custom-filters --data-dir filter1 filter2 --to latex
"""Low-level document processing with filter execution.
def stdio(filters=None,
search_dirs=None,
data_dir=True,
sys_path=True,
panfl_=False,
input_stream=None,
output_stream=None):
"""
Process document from stdin with automatic filter discovery and execution.
Parameters:
- filters: list of filter names/paths (None = read from metadata)
- search_dirs: list of directories to search for filters (None = read from metadata)
- data_dir: include Pandoc data directories in search (default: True)
- sys_path: include Python sys.path in search (default: True)
- panfl_: use panfl behavior vs standard panflute (default: False)
- input_stream: input source (default: stdin)
- output_stream: output destination (default: stdout)
The function reads document metadata for configuration:
- 'panflute-filters': list/string of filter names
- 'panflute-path': list/string of search directories
- 'panflute-verbose': enable debug output
- 'panflute-echo': display debug message
Special metadata values:
- '--data-dir' in panflute-path: enable data_dir
- '--no-sys-path' in panflute-path: disable sys_path
Example usage in filter:
import panflute as pf
# Custom processing with specific filters
pf.stdio(
filters=['my_filter', 'cleanup_filter'],
search_dirs=['./filters', '~/.pandoc/filters'],
input_stream=open('input.json'),
output_stream=open('output.json', 'w')
)
"""Find and manage filter files in the filesystem.
def get_filter_dirs(hardcoded=True) -> list:
"""
Get directories where panflute searches for filters.
Parameters:
- hardcoded: use predefined paths vs querying Pandoc (default: True)
Returns:
list: Directory paths to search for filters
Default search locations:
- Linux/macOS: ~/.local/share/pandoc/filters, ~/.pandoc/filters (older Pandoc)
- Windows: %APPDATA%/pandoc/filters
Example:
import panflute as pf
# Get default filter directories
dirs = pf.get_filter_dirs()
print("Searching for filters in:", dirs)
# Query Pandoc directly for current directories
current_dirs = pf.get_filter_dirs(hardcoded=False)
"""Configure filters through document frontmatter:
---
title: "My Document"
author: "John Doe"
# Panflute configuration
panflute-filters:
- emphasis_processor
- link_enhancer
- bibliography_generator
panflute-path:
- ./my-filters
- ~/.local/share/pandoc/filters
- --data-dir
panflute-verbose: true
panflute-echo: "Processing with custom filters..."
---
# Document Content
This document will be processed by the specified filters.
# Basic filter usage
pandoc document.md --filter panflute -o output.html
# Advanced pipeline with multiple filters
pandoc document.md -t json | \
panfl filter1.py custom_processor --to json | \
pandoc -f json -o output.pdf
# Using custom search directories
panfl --dir ./project-filters --dir ~/.pandoc/filters \
emphasis_filter link_processor --to html
# Filter development and testing
echo '{"pandoc-api-version":[1,23],"meta":{},"blocks":[]}' | \
panfl --data-dir my_test_filter --to json
Create a custom filter execution environment:
import panflute as pf
import sys
import io
def run_custom_pipeline():
    """Run a fixed sequence of panflute filters over a Pandoc JSON document.

    The document is read from ``sys.stdin`` and the result written to
    ``sys.stdout`` by ``pf.stdio``. If anything raises, the error is
    reported through ``pf.debug`` and the process exits with status 1.
    """
    # Filters are handed to pf.stdio in this order.
    filters = [
        'preprocessing_filter',
        'content_enhancer',
        'formatting_cleaner',
    ]

    # Directories searched for the filters named above.
    search_dirs = [
        './filters',
        '~/.local/share/pandoc/filters',
        '/usr/local/share/pandoc/filters',
    ]

    try:
        # pf.stdio() loads the document from ``input_stream`` itself.
        # Calling pf.load() first would consume stdin, leaving nothing for
        # pf.stdio() to parse, and attributes set on that separate Doc
        # object would never reach the filters.
        pf.stdio(
            filters=filters,
            search_dirs=search_dirs,
            data_dir=True,
            sys_path=True,
            input_stream=sys.stdin,
            output_stream=sys.stdout,
        )
    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller
        # (e.g. Pandoc) through a non-zero exit status.
        pf.debug(f"Filter pipeline failed: {e}")
        sys.exit(1)
if __name__ == '__main__':
run_custom_pipeline()
import panflute as pf
import os
def list_available_filters(search_dirs=None):
    """Discover Python files that look like panflute filters.

    A file is reported when its name ends in ``.py`` and its text contains
    both ``def main(`` and ``panflute``. This is a textual heuristic; the
    file is never imported or executed.

    Parameters:
    - search_dirs: iterable of directory paths to scan
      (None = use ``pf.get_filter_dirs()``)

    Returns:
    list: one dict per match with keys ``'name'`` (file name without
    ``.py``), ``'path'`` and ``'directory'``. Entries follow the order of
    ``search_dirs``; within a directory they are sorted by file name.
    """
    if search_dirs is None:
        search_dirs = pf.get_filter_dirs()

    available_filters = []
    for directory in search_dirs:
        # EAFP: a missing path, a path that is not a directory, or an
        # unreadable directory is skipped instead of aborting the scan.
        try:
            filenames = sorted(os.listdir(directory))
        except OSError:
            continue

        for filename in filenames:
            if not filename.endswith('.py'):
                continue
            filter_path = os.path.join(directory, filename)
            try:
                # Only substring checks follow, so undecodable bytes are
                # replaced rather than treated as an error.
                with open(filter_path, 'r', encoding='utf-8',
                          errors='replace') as f:
                    content = f.read()
            except OSError:
                # Unreadable entry (permissions, a directory named *.py, ...).
                continue

            if 'def main(' in content and 'panflute' in content:
                available_filters.append({
                    'name': filename[:-3],  # strip the ".py" extension
                    'path': filter_path,
                    'directory': directory,
                })
    return available_filters
def validate_filter_environment():
    """Check that Pandoc can be run and report the filter search directories.

    All output goes through ``pf.debug``.

    Returns:
    bool: False when ``pf.run_pandoc(args=['--version'])`` raises,
    True otherwise.
    """
    # Check Pandoc installation
    try:
        version_info = pf.run_pandoc(args=['--version'])
    except Exception as e:
        pf.debug(f"Pandoc not found: {e}")
        return False

    # Parsed outside the try block so that unexpected output is not
    # misreported as "Pandoc not found".
    # NOTE(review): assumes the output starts with "pandoc <version>" - confirm.
    tokens = version_info.split()
    version = tokens[1] if len(tokens) > 1 else 'unknown'
    pf.debug(f"Pandoc version: {version}")

    # Check filter directories
    dirs = pf.get_filter_dirs()
    pf.debug(f"Filter search directories: {dirs}")
    for directory in dirs:
        if os.path.isdir(directory):
            filter_count = sum(
                name.endswith('.py') for name in os.listdir(directory)
            )
            pf.debug(f" {directory}: {filter_count} Python files")
        elif os.path.exists(directory):
            # A regular file at this path would make os.listdir() raise.
            pf.debug(f" {directory}: not a directory")
        else:
            pf.debug(f" {directory}: does not exist")
    return True
# Usage example
if __name__ == '__main__':
if validate_filter_environment():
filters = list_available_filters()
pf.debug(f"Found {len(filters)} available filters:")
for filter_info in filters:
pf.debug(f" {filter_info['name']} ({filter_info['path']})")
import panflute as pf
import subprocess
import tempfile
import os
def pandoc_filter_chain():
    """Run a three-stage panflute filter chain via ``pf.run_filters``.

    Stages, in order:
    1. ``preprocess_filter``: collapse whitespace inside ``Str`` elements.
    2. ``content_filter``: collect the text of every level-1 header.
    3. ``postprocess_filter``: mark external links to open in a new tab.

    After the stages have run, the collected header titles (if any) are
    stored in the document metadata under ``'generated-toc'``.
    """

    def init_doc(doc):
        """Create the per-document list that ``content_filter`` appends to."""
        doc.sections = []

    def preprocess_filter(elem, doc):
        """First stage: clean up input."""
        if isinstance(elem, pf.Str):
            # Normalize whitespace
            elem.text = ' '.join(elem.text.split())
        return elem

    def content_filter(elem, doc):
        """Second stage: record document structure."""
        if isinstance(elem, pf.Header) and elem.level == 1:
            doc.sections.append(pf.stringify(elem))
        return elem

    def postprocess_filter(elem, doc):
        """Third stage: final formatting."""
        if isinstance(elem, pf.Link):
            # Match real http(s) URLs only; a bare 'http' prefix would also
            # catch relative targets such as 'httpd-guide.html'.
            if elem.url.lower().startswith(('http://', 'https://')):
                elem.attributes['target'] = '_blank'
        return elem

    def finalize_doc(doc):
        """Add document metadata after processing."""
        # Only emit the key when at least one level-1 header was found.
        if doc.sections:
            doc.metadata['generated-toc'] = pf.MetaList(
                *[pf.MetaString(section) for section in doc.sections]
            )

    # Run the filter chain
    pf.run_filters(
        [preprocess_filter, content_filter, postprocess_filter],
        prepare=init_doc,
        finalize=finalize_doc,
    )
if __name__ == '__main__':
pandoc_filter_chain()
Install with Tessl CLI
npx tessl i tessl/pypi-panflute