CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-panflute

Pythonic Pandoc filters library for programmatic document manipulation and transformation

Pending
Overview
Eval results
Files

docs/text-processing.md

Text Processing Tools

Utility functions for text extraction, document conversion, YAML processing, and external tool integration. These tools provide powerful capabilities for document transformation, content analysis, and integration with external systems like Pandoc and shell commands.

Capabilities

Text Extraction

Extract the plain-text content of a document element and its children, optionally adding newlines after paragraphs.

def stringify(element, newlines=True) -> str:
    """
    Return the raw text version of an element and its children.

    Parameters:
    - element: Element to extract text from (the examples below pass both a
      single Para and a whole document)
    - newlines: add newlines after paragraphs (default: True)

    Returns:
    str: Plain text representation of the element

    Example:
    import panflute as pf

    # Extract text from complex elements; the Emph wrapper is dropped and
    # only its text is kept
    elem = pf.Para(
        pf.Str('Hello '),
        pf.Emph(pf.Str('beautiful')),
        pf.Str(' world!')
    )
    text = pf.stringify(elem)  # "Hello beautiful world!\\n\\n"

    # Extract without paragraph newlines
    text = pf.stringify(elem, newlines=False)  # "Hello beautiful world!"

    # Extract text from entire document
    doc = pf.load()
    full_text = pf.stringify(doc)
    """

Document Format Conversion

Convert text between different formats using Pandoc's conversion capabilities.

def convert_text(text,
                 input_format='markdown',
                 output_format='panflute',
                 standalone=False,
                 extra_args=None,
                 pandoc_path=None):
    """
    Convert formatted text using Pandoc internally.

    Parameters:
    - text: text to convert (str, Element, or list of Elements)
    - input_format: source format ('markdown', 'latex', 'html', 'panflute', etc.)
    - output_format: target format ('panflute', 'html', 'latex', 'markdown', etc.)
    - standalone: create standalone document (default: False)
    - extra_args: additional Pandoc arguments (list)
    - pandoc_path: path to pandoc executable (optional)

    Returns:
    list|Doc|str: Converted content; the type depends on the arguments.
    As the examples below show:
    - output_format='panflute' (the default): a list of elements
    - standalone=True with the default output format: a Doc
    - a text output format such as 'html': a str

    Example:
    import panflute as pf

    # Convert Markdown to panflute elements
    md_text = "This is *emphasized* text with **bold** formatting."
    elements = pf.convert_text(md_text, input_format='markdown')
    # Returns: [Para(Str(This) Space Str(is) Space Emph(Str(emphasized)) ...)]

    # Convert panflute elements to HTML
    para = pf.Para(pf.Str('Hello '), pf.Strong(pf.Str('world')))
    html = pf.convert_text(para, input_format='panflute', output_format='html')
    # Returns: "<p>Hello <strong>world</strong></p>"

    # Create standalone document
    doc = pf.convert_text(md_text, standalone=True)
    # Returns: Doc object with metadata and proper structure

    # Use custom Pandoc arguments
    latex = pf.convert_text(
        md_text,
        output_format='latex',
        extra_args=['--template=custom.tex', '--variable=fontsize:12pt']
    )
    """

YAML Code Block Processing

Parse and process code blocks with YAML frontmatter for dynamic content generation.

def yaml_filter(element, doc, tag=None, function=None, tags=None, strict_yaml=False):
    """
    Convenience function for parsing code blocks with YAML options.

    Intended to be called from a filter action (see the example), passing the
    action's own (element, doc) arguments straight through.

    Parameters:
    - element: current element being processed
    - doc: document being filtered
    - tag: class name to match (str)
    - function: function to call for matching blocks
    - tags: dict mapping class names to functions
      (presumably an alternative to tag/function for several classes at once;
      confirm against the panflute documentation)
    - strict_yaml: require explicit YAML delimiters (default: False)

    The function parameter receives (options, data, element, doc):
    - options: parsed YAML dict
    - data: remaining code content after YAML
    - element: original CodeBlock element
    - doc: document being processed

    Example:
    import panflute as pf

    def process_chart(options, data, element, doc):
        chart_type = options.get('type', 'bar')
        title = options.get('title', 'Chart')

        # Generate chart based on options and data
        return pf.Para(pf.Str(f"Generated {chart_type} chart: {title}"))

    def filter_func(elem, doc):
        return pf.yaml_filter(elem, doc, tag='chart', function=process_chart)

    if __name__ == '__main__':
        pf.run_filter(filter_func)

    # Processes code blocks like the one below: the lines above "---" are the
    # YAML options, the lines after it are passed to the function as data.
    # ```chart
    # type: line
    # title: Sales Data
    # ---
    # January: 100
    # February: 150
    # March: 120
    # ```
    """

External Command Execution

Execute external commands, including Pandoc itself, from within filters.

def shell(args, wait=True, msg=None):
    """
    Execute external command and get its output.

    Parameters:
    - args: command and arguments (str or list); a str is parsed into
      arguments automatically (see the 'pandoc --version' example)
    - wait: wait for command completion (default: True)
    - msg: input message to send to command (bytes, optional)

    Returns:
    bytes: command output (if wait=True)
    NOTE(review): the return value for wait=False is not documented here;
    presumably None. Confirm before relying on it.

    Raises:
    IOError: if command fails (non-zero exit code)

    Example:
    import panflute as pf

    # Run a simple command
    output = pf.shell(['ls', '-la'])

    # Run with input (msg must be bytes, not str)
    result = pf.shell('grep -i python', msg=b'This is Python code\\nThis is Java code\\n')

    # Run command with string (automatically parsed)
    output = pf.shell('pandoc --version')

    # Run without waiting (fire and forget)
    pf.shell(['notify-send', 'Filter completed'], wait=False)
    """

def run_pandoc(text='', args=None, pandoc_path=None) -> str:
    """
    Low-level function to call Pandoc with input text and arguments.

    Parameters:
    - text: input text to process (str); may be left empty when the
      arguments need no input, as in the '--version' example
    - args: Pandoc command-line arguments (list)
    - pandoc_path: path to pandoc executable (optional)

    Returns:
    str: Pandoc output

    Example:
    import panflute as pf

    # Get Pandoc version
    version = pf.run_pandoc(args=['--version'])

    # Convert markdown to HTML
    html = pf.run_pandoc(
        '# Hello\\n\\nThis is **markdown**.',
        args=['--from=markdown', '--to=html']
    )

    # Use specific Pandoc installation
    output = pf.run_pandoc(
        'Some text',
        args=['--to=latex'],
        pandoc_path='/usr/local/bin/pandoc'
    )
    """

Metadata and Options Handling

Retrieve configuration options from multiple sources with fallback logic.

def get_option(options=None, local_tag=None, doc=None, doc_tag=None, default=None, error_on_none=True):
    """
    Fetch option from element attributes, document metadata, or default value.

    The search order is: local > document > default

    Parameters:
    - options: element attributes dict (local level)
    - local_tag: attribute key to look for (str)
    - doc: document object (for metadata access)
    - doc_tag: metadata key to look for (str, supports dot notation)
    - default: fallback value if not found
    - error_on_none: raise ValueError if no value found (default: True)

    Returns:
    any: Retrieved option value

    Raises:
    ValueError: if no value is found and error_on_none is True

    Example:
    import panflute as pf

    def process_div(elem, doc):
        if isinstance(elem, pf.Div):
            # Get style from div attributes, fallback to document metadata
            style = pf.get_option(
                elem.attributes, 'style',
                doc, 'default-div-style',
                default='bordered'
            )

            # Get nested metadata with dot notation; passing None for the
            # local arguments skips the element-level lookup
            font_size = pf.get_option(
                None, None,
                doc, 'formatting.font.size',
                default='12pt'
            )

            elem.attributes['data-style'] = style
            elem.attributes['data-font-size'] = font_size

    if __name__ == '__main__':
        pf.run_filter(process_div)
    """

def meta2builtin(meta):
    """
    Convert MetaValue elements to Python builtin types.

    Parameters:
    - meta: MetaValue element to convert

    Returns:
    any: Python builtin equivalent (str, bool, list, dict, etc.)

    Conversion rules:
    - MetaBool -> bool
    - MetaString -> str
    - MetaList -> list (recursively converted)
    - MetaMap -> dict (recursively converted)
    - MetaInlines/MetaBlocks -> str (via stringify)

    Example:
    import panflute as pf

    # Convert metadata to Python types
    doc = pf.load()

    # NOTE(review): doc.metadata.get() yields None for a missing key; the
    # behaviour of meta2builtin(None) is not documented here, so confirm it
    # before relying on these calls for optional keys.

    # Convert MetaBool to bool
    show_toc = pf.meta2builtin(doc.metadata.get('show-toc'))  # True/False

    # Convert MetaList to list
    authors = pf.meta2builtin(doc.metadata.get('authors'))  # ['John', 'Jane']

    # Convert MetaMap to dict
    settings = pf.meta2builtin(doc.metadata.get('settings'))  # {'key': 'value'}
    """

Pandoc Version Information

Access runtime Pandoc version and configuration information.

class PandocVersion:
    """
    Get runtime Pandoc version and configuration.

    Use PandocVersion().version for comparing versions.
    Lazily calls pandoc --version only once.

    Methods:
    - __str__(): return version string (e.g., "2.19.2")
    - __repr__(): return full pandoc --version output

    Properties:
    - version: tuple of version numbers for comparison
    - data_dir: list of Pandoc data directories (with /filters appended)

    Example:
    import panflute as pf

    pv = pf.PandocVersion()
    print(str(pv))  # "2.19.2"
    print(pv.version)  # (2, 19, 2)
    print(pv.data_dir)  # ['/home/user/.local/share/pandoc/filters', ...]

    # Version comparison: compare against a tuple, not a string, so that
    # e.g. (2, 9) sorts before (2, 17)
    if pv.version >= (2, 17):
        # Use newer Pandoc features
        pass
    """

    def __init__(self): ...
    def __str__(self) -> str: ...  # version string, e.g. "2.19.2"
    def __repr__(self) -> str: ...  # full `pandoc --version` output

    # Tuple of version numbers, e.g. (2, 19, 2)
    @property
    def version(self) -> tuple: ...

    # List of Pandoc data directories, each with /filters appended
    @property
    def data_dir(self) -> list: ...

# Global instance for convenient access
pandoc_version: PandocVersion

Debug Output

Print debug messages to stderr without interfering with Pandoc processing.

def debug(*args, **kwargs):
    """
    Same as print, but prints to stderr (which is not intercepted by Pandoc).

    Use this instead of print() inside filters so that diagnostic output
    does not interfere with Pandoc processing.

    Parameters:
    - *args: arguments to print (same as print())
    - **kwargs: keyword arguments (same as print())

    Example:
    import panflute as pf

    def my_filter(elem, doc):
        if isinstance(elem, pf.Header):
            pf.debug(f"Processing header: {pf.stringify(elem)}")
            pf.debug("Header level:", elem.level)
        return elem

    if __name__ == '__main__':
        pf.run_filter(my_filter)
    """

Element Keyword Replacement

Replace specific text strings with element structures throughout documents.

# Method added to Element class
def replace_keyword(self, keyword: str, replacement, count=0):
    """
    Replace keyword strings with replacement elements.

    Parameters:
    - keyword: exact text string to find and replace
    - replacement: Element to substitute (Inline or Block)
    - count: maximum replacements (0 = unlimited)

    Returns:
    Element: modified element tree

    Example:
    import panflute as pf

    # Replace text with styled elements
    doc = pf.load()
    doc.replace_keyword('TODO', pf.Strong(pf.Str('⚠️ TODO')))

    # Replace with block elements (replaces parent if needed).
    # A raw string keeps the backslash literal; '\\newpage' without the r
    # prefix would start with a newline character.
    doc.replace_keyword('PAGEBREAK', pf.RawBlock(r'\\newpage', 'latex'))

    # Limited replacements
    doc.replace_keyword('DRAFT', pf.Emph(pf.Str('DRAFT')), count=3)

    pf.dump(doc)
    """

Usage Examples

Advanced Text Processing Pipeline

import panflute as pf
import re

def process_special_syntax(elem, doc):
    """Turn @mentions and [[wikilinks]] inside a Str into real links.

    The text of the Str is rewritten into Markdown link syntax and then
    re-parsed with ``pf.convert_text`` so the result is made of proper
    inline elements.

    Parameters:
    - elem: element currently visited by the filter
    - doc: document being filtered (unused)

    Returns:
    list | None: a plain list of inline elements that replaces ``elem``, or
    None (leave the element untouched) when ``elem`` is not a Str, nothing
    matched, or the re-parsed text is not a single paragraph.

    NOTE(review): the whole Str text is re-parsed as Markdown, so other
    Markdown-significant characters in the same Str may be reinterpreted as
    well. A Str normally holds a single word, so wikilinks containing spaces
    will presumably not be seen by this action; confirm against your input.
    """
    if not isinstance(elem, pf.Str):
        return None

    # Convert @mentions to links
    text = re.sub(
        r'@(\w+)',
        lambda m: f'[@{m.group(1)}](https://github.com/{m.group(1)})',
        elem.text
    )

    # Convert [[wikilinks]] to proper links (spaces in the target become "_")
    text = re.sub(
        r'\[\[([^\]]+)\]\]',
        lambda m: f'[{m.group(1)}](wiki/{m.group(1).replace(" ", "_")})',
        text
    )

    if text == elem.text:
        return None

    # convert_text returns block elements, but an inline Str can only be
    # replaced by inline elements.
    blocks = pf.convert_text(text, input_format='markdown')

    # Splice only when the result is exactly one paragraph-like block. An
    # empty result or several blocks cannot stand in for an inline element,
    # so the original Str is kept in those cases.
    if len(blocks) == 1 and isinstance(blocks[0], (pf.Para, pf.Plain)):
        # Return a plain list: panflute flattens only ``list`` return values
        # into the parent, not its own ListContainer type.
        return list(blocks[0].content)
    return None

def generate_bibliography(elem, doc):
    """Record the id of every citation found in Cite elements.

    The ids are accumulated in a set stored on ``doc.citations``, which is
    created the first time a Cite element is encountered. Nothing is
    returned, so the visited element is left as it is.
    """
    if not isinstance(elem, pf.Cite):
        return

    try:
        seen_ids = doc.citations
    except AttributeError:
        # First Cite encountered: start the collection on the document
        seen_ids = doc.citations = set()

    seen_ids.update(entry.id for entry in elem.citations)

def finalize_document(doc):
    """Append a "References" section built from the collected citation ids.

    Does nothing when ``doc`` has no ``citations`` attribute or when the
    collection is empty.
    """
    collected = getattr(doc, 'citations', None)
    if not collected:
        return

    entries = pf.BulletList()
    for ref_id in sorted(collected):
        # Placeholder text; citation details would normally come from a database/file
        label = pf.Str(f'Reference for {ref_id}')
        entries.content.append(pf.ListItem(pf.Plain(label)))

    heading = pf.Header(pf.Str('References'), level=2)
    doc.content.extend([heading, entries])

if __name__ == '__main__':
    # Both element-level actions are handed to run_filters as one list;
    # finalize_document is registered as the finalize hook.
    element_actions = [process_special_syntax, generate_bibliography]
    pf.run_filters(element_actions, finalize=finalize_document)

Dynamic Content Generation

import panflute as pf
import json
from datetime import datetime

def process_data_blocks(options, data, element, doc):
    """Generate a table or a summary paragraph from YAML options plus data.

    Parameters:
    - options: parsed YAML dict; 'type' selects the output ('table' by
      default, or 'summary') and 'title' supplies the caption/label
      ('Data' by default)
    - data: raw text after the YAML part, one "key: value" pair per line;
      lines without a colon are ignored
    - element: original code block element, returned unchanged for
      unknown types
    - doc: document being processed (unused)

    Returns:
    - a pf.Table with an Item/Value header and one row per data entry
      when type is 'table'
    - a pf.Para with item count, total and average when type is 'summary';
      total and average are computed over the numeric values only
    - ``element`` itself for any other type
    """
    chart_type = options.get('type', 'table')
    # YAML may parse the title as a non-string (e.g. a number); pf.Str needs str
    title = str(options.get('title', 'Data'))

    # Parse data section: values that look numeric are stored as float,
    # everything else is kept as the stripped string
    data_dict = {}
    for line in data.split('\n'):
        key, sep, value = line.strip().partition(':')
        if not sep:
            continue  # blank line or no "key: value" separator
        key, value = key.strip(), value.strip()
        try:
            data_dict[key] = float(value)
        except ValueError:
            data_dict[key] = value

    if chart_type == 'table':
        header_row = pf.TableRow(
            pf.TableCell(pf.Plain(pf.Str('Item'))),
            pf.TableCell(pf.Plain(pf.Str('Value')))
        )

        data_rows = [
            pf.TableRow(
                pf.TableCell(pf.Plain(pf.Str(key))),
                pf.TableCell(pf.Plain(pf.Str(str(value))))
            )
            for key, value in data_dict.items()
        ]

        # Positional arguments of pf.Table are table bodies; the head has
        # to be passed through the ``head`` keyword.
        return pf.Table(
            pf.TableBody(*data_rows),
            head=pf.TableHead(header_row),
            caption=pf.Caption(pf.Plain(pf.Str(title)))
        )

    if chart_type == 'summary':
        # Only numeric entries take part in the total and the average, so
        # non-numeric values no longer drag the average down.
        numeric = [v for v in data_dict.values() if isinstance(v, (int, float))]
        total = sum(numeric)
        count = len(data_dict)
        avg = total / len(numeric) if numeric else 0

        return pf.Para(
            pf.Strong(pf.Str(f'{title}: ')),
            pf.Str(f'{count} items, total: {total:.2f}, average: {avg:.2f}')
        )

    return element  # Fallback: unknown type, leave the code block as it is

def data_filter(elem, doc):
    """Route code blocks tagged 'data' through process_data_blocks."""
    outcome = pf.yaml_filter(
        elem,
        doc,
        function=process_data_blocks,
        tag='data',
    )
    return outcome

if __name__ == '__main__':
    # Script entry point: hand the YAML-aware action to panflute
    action = data_filter
    pf.run_filter(action)

Install with Tessl CLI

npx tessl i tessl/pypi-panflute

docs

cli.md

document-elements.md

document-io.md

index.md

text-processing.md

tile.json