Pythonic Pandoc filters library for programmatic document manipulation and transformation
—
Utility functions for text extraction, document conversion, YAML processing, and external tool integration. These tools provide powerful capabilities for document transformation, content analysis, and integration with external systems like Pandoc and shell commands.
Extract plain text content from document elements while preserving formatting context.
def stringify(element, newlines=True) -> str:
"""
Return the raw text version of an element and its children.
Parameters:
- element: Element to extract text from
- newlines: add newlines after paragraphs (default: True)
Returns:
str: Plain text representation of the element
Example:
import panflute as pf
# Extract text from complex elements
elem = pf.Para(
pf.Str('Hello '),
pf.Emph(pf.Str('beautiful')),
pf.Str(' world!')
)
text = pf.stringify(elem) # "Hello beautiful world!\n\n"
# Extract without paragraph newlines
text = pf.stringify(elem, newlines=False) # "Hello beautiful world!"
# Extract text from entire document
doc = pf.load()
full_text = pf.stringify(doc)
"""

Convert text between different formats using Pandoc's conversion capabilities.
def convert_text(text,
input_format='markdown',
output_format='panflute',
standalone=False,
extra_args=None,
pandoc_path=None):
"""
Convert formatted text using Pandoc internally.
Parameters:
- text: text to convert (str, Element, or list of Elements)
- input_format: source format ('markdown', 'latex', 'html', 'panflute', etc.)
- output_format: target format ('panflute', 'html', 'latex', 'markdown', etc.)
- standalone: create standalone document (default: False)
- extra_args: additional Pandoc arguments (list)
- pandoc_path: path to pandoc executable (optional)
Returns:
list|Doc|str: Converted content (type depends on output_format)
Example:
import panflute as pf
# Convert Markdown to panflute elements
md_text = "This is *emphasized* text with **bold** formatting."
elements = pf.convert_text(md_text, input_format='markdown')
# Returns: [Para(Str(This) Space Str(is) Space Emph(Str(emphasized)) ...)]
# Convert panflute elements to HTML
para = pf.Para(pf.Str('Hello '), pf.Strong(pf.Str('world')))
html = pf.convert_text(para, input_format='panflute', output_format='html')
# Returns: "<p>Hello <strong>world</strong></p>"
# Create standalone document
doc = pf.convert_text(md_text, standalone=True)
# Returns: Doc object with metadata and proper structure
# Use custom Pandoc arguments
latex = pf.convert_text(
md_text,
output_format='latex',
extra_args=['--template=custom.tex', '--variable=fontsize:12pt']
)
"""

Parse and process code blocks with YAML frontmatter for dynamic content generation.
def yaml_filter(element, doc, tag=None, function=None, tags=None, strict_yaml=False):
"""
Convenience function for parsing code blocks with YAML options.
Parameters:
- element: current element being processed
- doc: document being filtered
- tag: class name to match (str)
- function: function to call for matching blocks
- tags: dict mapping class names to functions
- strict_yaml: require explicit YAML delimiters (default: False)
The function parameter receives (options, data, element, doc):
- options: parsed YAML dict
- data: remaining code content after YAML
- element: original CodeBlock element
- doc: document being processed
Example:
import panflute as pf
def process_chart(options, data, element, doc):
chart_type = options.get('type', 'bar')
title = options.get('title', 'Chart')
# Generate chart based on options and data
return pf.Para(pf.Str(f"Generated {chart_type} chart: {title}"))
def filter_func(elem, doc):
return pf.yaml_filter(elem, doc, tag='chart', function=process_chart)
if __name__ == '__main__':
pf.run_filter(filter_func)
# Processes code blocks like:
# ```chart
# type: line
# title: Sales Data
# ---
# January: 100
# February: 150
# March: 120
# ```
"""

Execute external commands and shell scripts from within filters.
def shell(args, wait=True, msg=None):
"""
Execute external command and get its output.
Parameters:
- args: command and arguments (str or list)
- wait: wait for command completion (default: True)
- msg: input message to send to command (bytes, optional)
Returns:
bytes: command output (if wait=True)
Raises:
IOError: if command fails (non-zero exit code)
Example:
import panflute as pf
# Run a simple command
output = pf.shell(['ls', '-la'])
# Run with input
result = pf.shell('grep -i python', msg=b'This is Python code\\nThis is Java code\\n')
# Run command with string (automatically parsed)
output = pf.shell('pandoc --version')
# Run without waiting (fire and forget)
pf.shell(['notify-send', 'Filter completed'], wait=False)
"""
def run_pandoc(text='', args=None, pandoc_path=None) -> str:
"""
Low-level function to call Pandoc with input text and arguments.
Parameters:
- text: input text to process (str)
- args: Pandoc command-line arguments (list)
- pandoc_path: path to pandoc executable (optional)
Returns:
str: Pandoc output
Example:
import panflute as pf
# Get Pandoc version
version = pf.run_pandoc(args=['--version'])
# Convert markdown to HTML
html = pf.run_pandoc(
'# Hello\\n\\nThis is **markdown**.',
args=['--from=markdown', '--to=html']
)
# Use specific Pandoc installation
output = pf.run_pandoc(
'Some text',
args=['--to=latex'],
pandoc_path='/usr/local/bin/pandoc'
)
"""

Retrieve configuration options from multiple sources with fallback logic.
def get_option(options=None, local_tag=None, doc=None, doc_tag=None, default=None, error_on_none=True):
"""
Fetch option from element attributes, document metadata, or default value.
Parameters:
- options: element attributes dict (local level)
- local_tag: attribute key to look for (str)
- doc: document object (for metadata access)
- doc_tag: metadata key to look for (str, supports dot notation)
- default: fallback value if not found
- error_on_none: raise ValueError if no value found (default: True)
Returns:
any: Retrieved option value
The search order is: local > document > default
Example:
import panflute as pf
def process_div(elem, doc):
if isinstance(elem, pf.Div):
# Get style from div attributes, fallback to document metadata
style = pf.get_option(
elem.attributes, 'style',
doc, 'default-div-style',
default='bordered'
)
# Get nested metadata with dot notation
font_size = pf.get_option(
None, None,
doc, 'formatting.font.size',
default='12pt'
)
elem.attributes['data-style'] = style
elem.attributes['data-font-size'] = font_size
if __name__ == '__main__':
pf.run_filter(process_div)
"""
def meta2builtin(meta):
"""
Convert MetaValue elements to Python builtin types.
Parameters:
- meta: MetaValue element to convert
Returns:
any: Python builtin equivalent (str, bool, list, dict, etc.)
Conversion rules:
- MetaBool -> bool
- MetaString -> str
- MetaList -> list (recursively converted)
- MetaMap -> dict (recursively converted)
- MetaInlines/MetaBlocks -> str (via stringify)
Example:
import panflute as pf
# Convert metadata to Python types
doc = pf.load()
# Convert MetaBool to bool
show_toc = pf.meta2builtin(doc.metadata.get('show-toc')) # True/False
# Convert MetaList to list
authors = pf.meta2builtin(doc.metadata.get('authors')) # ['John', 'Jane']
# Convert MetaMap to dict
settings = pf.meta2builtin(doc.metadata.get('settings')) # {'key': 'value'}
"""

Access runtime Pandoc version and configuration information.
class PandocVersion:
"""
Get runtime Pandoc version and configuration.
Use PandocVersion().version for comparing versions.
Lazily calls pandoc --version only once.
Methods:
- __str__(): return version string (e.g., "2.19.2")
- __repr__(): return full pandoc --version output
Properties:
- version: tuple of version numbers for comparison
- data_dir: list of Pandoc data directories (with /filters appended)
Example:
import panflute as pf
pv = pf.PandocVersion()
print(str(pv)) # "2.19.2"
print(pv.version) # (2, 19, 2)
print(pv.data_dir) # ['/home/user/.local/share/pandoc/filters', ...]
# Version comparison
if pv.version >= (2, 17):
# Use newer Pandoc features
pass
"""
def __init__(self): ...
def __str__(self) -> str: ...
def __repr__(self) -> str: ...
@property
def version(self) -> tuple: ...
@property
def data_dir(self) -> list: ...
# Global instance for convenient access
pandoc_version: PandocVersion

Print debug messages to stderr without interfering with Pandoc processing.
def debug(*args, **kwargs):
"""
Same as print, but prints to stderr (which is not intercepted by Pandoc).
Parameters:
- *args: arguments to print (same as print())
- **kwargs: keyword arguments (same as print())
Example:
import panflute as pf
def my_filter(elem, doc):
if isinstance(elem, pf.Header):
pf.debug(f"Processing header: {pf.stringify(elem)}")
pf.debug("Header level:", elem.level)
return elem
if __name__ == '__main__':
pf.run_filter(my_filter)
"""

Replace specific text strings with element structures throughout documents.
# Method added to Element class
def replace_keyword(self, keyword: str, replacement, count=0):
"""
Replace keyword strings with replacement elements.
Parameters:
- keyword: exact text string to find and replace
- replacement: Element to substitute (Inline or Block)
- count: maximum replacements (0 = unlimited)
Returns:
Element: modified element tree
Example:
import panflute as pf
# Replace text with styled elements
doc = pf.load()
doc.replace_keyword('TODO', pf.Strong(pf.Str('⚠️ TODO')))
# Replace with block elements (replaces parent if needed)
doc.replace_keyword('PAGEBREAK', pf.RawBlock('\\newpage', 'latex'))
# Limited replacements
doc.replace_keyword('DRAFT', pf.Emph(pf.Str('DRAFT')), count=3)
pf.dump(doc)
"""

import panflute as pf
import re
def process_special_syntax(elem, doc):
    """
    Expand @mention and [[wikilink]] shorthand found in text elements.

    Parameters:
    - elem: element currently being visited; only pf.Str elements are rewritten
    - doc: document being filtered (not used by this action)

    Returns:
    list|None: inline elements that replace elem when its text contained
    shorthand; None (leave elem unchanged) in every other case
    """
    if not isinstance(elem, pf.Str):
        return None

    # Rewrite both shorthands as Markdown link syntax so that Pandoc can
    # build the actual link elements for us.
    text = re.sub(
        r'@(\w+)',
        lambda m: f'[@{m.group(1)}](https://github.com/{m.group(1)})',
        elem.text,
    )
    text = re.sub(
        r'\[\[([^\]]+)\]\]',
        lambda m: f'[{m.group(1)}](wiki/{m.group(1).replace(" ", "_")})',
        text,
    )
    if text == elem.text:
        return None

    # convert_text() hands back block-level elements (e.g. [Para(...)]).
    # A Str sits among inline elements, so only the inline content of those
    # blocks can take its place.
    blocks = pf.convert_text(text, input_format='markdown')
    if not blocks or not all(isinstance(b, (pf.Para, pf.Plain)) for b in blocks):
        # Nothing usable came back; keep the original text rather than
        # inserting a block where an inline is expected.
        return None

    # NOTE(review): a plain list is returned on purpose so the walker can
    # splice the items in where the Str stood -- confirm against panflute's
    # walk() handling of list return values.
    inlines = []
    for block in blocks:
        inlines.extend(block.content)
    return inlines
def generate_bibliography(elem, doc):
    """
    Collect the id of every citation seen while the document is walked.

    Parameters:
    - elem: element currently being visited; only pf.Cite elements are read
    - doc: document being filtered; ids accumulate in the set doc.citations,
      which is created on first use

    Returns:
    None: the element is never replaced
    """
    if not isinstance(elem, pf.Cite):
        return

    try:
        seen_ids = doc.citations
    except AttributeError:
        # First Cite encountered: start the collection on the document.
        seen_ids = doc.citations = set()

    seen_ids.update(entry.id for entry in elem.citations)
def finalize_document(doc):
    """
    Append a "References" section listing every collected citation id.

    Parameters:
    - doc: document being finalized; read from doc.citations (if present)
      and extended in place through doc.content

    Returns:
    None: the document is left untouched when no citation ids were collected
    """
    cited_ids = getattr(doc, 'citations', None)
    if not cited_ids:
        return

    heading = pf.Header(pf.Str('References'), level=2)
    entries = pf.BulletList()
    for ref_id in sorted(cited_ids):
        # Placeholder text; real details would normally come from a
        # database or file.
        label = pf.Str(f'Reference for {ref_id}')
        entries.content.append(pf.ListItem(pf.Plain(label)))

    doc.content.extend([heading, entries])
# Script entry point: run both actions in order, then add the bibliography
# once the whole document has been walked.
if __name__ == '__main__':
    pf.run_filters(
        [process_special_syntax, generate_bibliography],
        finalize=finalize_document,
    )


import panflute as pf
import json
from datetime import datetime
def process_data_blocks(options, data, element, doc):
    """
    Generate a table or a summary paragraph from YAML options + data lines.

    Parameters:
    - options: parsed YAML dict; 'type' selects the output ('table' by
      default, or 'summary') and 'title' labels it ('Data' by default)
    - data: remaining code content, read as one "key: value" pair per line;
      lines without a colon are ignored
    - element: original CodeBlock element
    - doc: document being processed (not used here)

    Returns:
    Element: a pf.Table for 'table', a pf.Para for 'summary', or the
    original element unchanged for any other type
    """
    options = options or {}
    chart_type = options.get('type', 'table')
    # YAML may yield a non-string title (e.g. a number); pf.Str needs text.
    title = str(options.get('title', 'Data'))

    # Values that parse as numbers are stored as float, everything else as
    # the stripped text.
    entries = {}
    for raw_line in (data or '').split('\n'):
        key, colon, value = raw_line.strip().partition(':')
        if not colon:
            continue
        key, value = key.strip(), value.strip()
        try:
            entries[key] = float(value)
        except ValueError:
            entries[key] = value

    if chart_type == 'table':
        header_row = pf.TableRow(
            pf.TableCell(pf.Plain(pf.Str('Item'))),
            pf.TableCell(pf.Plain(pf.Str('Value'))),
        )
        body_rows = [
            pf.TableRow(
                pf.TableCell(pf.Plain(pf.Str(key))),
                pf.TableCell(pf.Plain(pf.Str(str(value)))),
            )
            for key, value in entries.items()
        ]
        # Positional arguments of Table are its bodies; the head has to be
        # supplied through the head= keyword.
        return pf.Table(
            pf.TableBody(*body_rows),
            head=pf.TableHead(header_row),
            caption=pf.Caption(pf.Plain(pf.Str(title))),
        )

    if chart_type == 'summary':
        numeric = [v for v in entries.values() if isinstance(v, (int, float))]
        total = sum(numeric)
        # Average over the numeric values only, since text entries do not
        # contribute to the total.
        average = total / len(numeric) if numeric else 0
        return pf.Para(
            pf.Strong(pf.Str(f'{title}: ')),
            pf.Str(
                f'{len(entries)} items, total: {total:.2f}, '
                f'average: {average:.2f}'
            ),
        )

    return element  # Unknown type: leave the code block as it is
def data_filter(elem, doc):
    """
    Route code blocks tagged 'data' through process_data_blocks.

    Parameters:
    - elem: current element being processed
    - doc: document being filtered

    Returns:
    whatever pf.yaml_filter returns for this element
    """
    replacement = pf.yaml_filter(
        elem,
        doc,
        tag='data',
        function=process_data_blocks,
    )
    return replacement
if __name__ == '__main__':
    pf.run_filter(data_filter)

Install with Tessl CLI
npx tessl i tessl/pypi-panflute