Tessl Tile for pypi/panflute@2.3.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

cli.md document-elements.md document-io.md index.md text-processing.md

docs/text-processing.md

0
# Text Processing Tools
1

2
Utility functions for text extraction, document conversion, YAML processing, and external tool integration. These tools provide powerful capabilities for document transformation, content analysis, and integration with external systems like Pandoc and shell commands.
3

4
## Capabilities
5

6
### Text Extraction
7

8
Extract the plain-text content of document elements; inline formatting such as emphasis is dropped, and paragraph breaks can optionally be kept as newlines.
9

10
```python { .api }
11
def stringify(element, newlines=True) -> str:
12
    """
13
    Return the raw text version of an element and its children.
14

15
    Parameters:
16
    - element: Element to extract text from
17
    - newlines: add newlines after paragraphs (default: True)
18

19
    Returns:
20
    str: Plain text representation of the element
21

22
    Example:
23
    import panflute as pf
24
    
25
    # Extract text from complex elements
26
    elem = pf.Para(
27
        pf.Str('Hello '),
28
        pf.Emph(pf.Str('beautiful')),
29
        pf.Str(' world!')
30
    )
31
    text = pf.stringify(elem)  # "Hello beautiful world!\n\n"
32
    
33
    # Extract without paragraph newlines
34
    text = pf.stringify(elem, newlines=False)  # "Hello beautiful world!"
35
    
36
    # Extract text from entire document
37
    doc = pf.load()
38
    full_text = pf.stringify(doc)
39
    """
40
```
41

42
### Document Format Conversion
43

44
Convert text between different formats using Pandoc's conversion capabilities.
45

46
```python { .api }
47
def convert_text(text,
48
                 input_format='markdown',
49
                 output_format='panflute',
50
                 standalone=False,
51
                 extra_args=None,
52
                 pandoc_path=None):
53
    """
54
    Convert formatted text using Pandoc internally.
55

56
    Parameters:
57
    - text: text to convert (str, Element, or list of Elements)
58
    - input_format: source format ('markdown', 'latex', 'html', 'panflute', etc.)
59
    - output_format: target format ('panflute', 'html', 'latex', 'markdown', etc.)
60
    - standalone: create standalone document (default: False)
61
    - extra_args: additional Pandoc arguments (list)
62
    - pandoc_path: path to pandoc executable (optional)
63

64
    Returns:
65
    list|Doc|str: Converted content (type depends on output_format)
66

67
    Example:
68
    import panflute as pf
69
    
70
    # Convert Markdown to panflute elements
71
    md_text = "This is *emphasized* text with **bold** formatting."
72
    elements = pf.convert_text(md_text, input_format='markdown')
73
    # Returns: [Para(Str(This) Space Str(is) Space Emph(Str(emphasized)) ...)]
74
    
75
    # Convert panflute elements to HTML
76
    para = pf.Para(pf.Str('Hello '), pf.Strong(pf.Str('world')))
77
    html = pf.convert_text(para, input_format='panflute', output_format='html')
78
    # Returns: "<p>Hello <strong>world</strong></p>"
79
    
80
    # Create standalone document
81
    doc = pf.convert_text(md_text, standalone=True)
82
    # Returns: Doc object with metadata and proper structure
83
    
84
    # Use custom Pandoc arguments
85
    latex = pf.convert_text(
86
        md_text,
87
        output_format='latex',
88
        extra_args=['--template=custom.tex', '--variable=fontsize:12pt']
89
    )
90
    """
91
```
92

93
### YAML Code Block Processing
94

95
Parse and process code blocks with YAML frontmatter for dynamic content generation.
96

97
```python { .api }
98
def yaml_filter(element, doc, tag=None, function=None, tags=None, strict_yaml=False):
99
    """
100
    Convenience function for parsing code blocks with YAML options.
101

102
    Parameters:
103
    - element: current element being processed
104
    - doc: document being filtered
105
    - tag: class name to match (str)
106
    - function: function to call for matching blocks
107
    - tags: dict mapping class names to functions
108
    - strict_yaml: require explicit YAML delimiters (default: False)
109

110
    The function parameter receives (options, data, element, doc):
111
    - options: parsed YAML dict
112
    - data: remaining code content after YAML
113
    - element: original CodeBlock element
114
    - doc: document being processed
115

116
    Example:
117
    import panflute as pf
118
    
119
    def process_chart(options, data, element, doc):
120
        chart_type = options.get('type', 'bar')
121
        title = options.get('title', 'Chart')
122
        
123
        # Generate chart based on options and data
124
        return pf.Para(pf.Str(f"Generated {chart_type} chart: {title}"))
125
    
126
    def filter_func(elem, doc):
127
        return pf.yaml_filter(elem, doc, tag='chart', function=process_chart)
128
    
129
    if __name__ == '__main__':
130
        pf.run_filter(filter_func)
131
    
132
    # Processes code blocks like:
133
    # ```chart
134
    # type: line
135
    # title: Sales Data
136
    # ---
137
    # January: 100
138
    # February: 150
139
    # March: 120
140
    # ```
141
    """
142
```
143

144
### External Command Execution
145

146
Execute external commands and shell scripts from within filters.
147

148
```python { .api }
149
def shell(args, wait=True, msg=None):
150
    """
151
    Execute external command and get its output.
152

153
    Parameters:
154
    - args: command and arguments (str or list)
155
    - wait: wait for command completion (default: True)
156
    - msg: input message to send to command (bytes, optional)
157

158
    Returns:
159
    bytes: command output (if wait=True)
160

161
    Raises:
162
    IOError: if command fails (non-zero exit code)
163

164
    Example:
165
    import panflute as pf
166
    
167
    # Run a simple command
168
    output = pf.shell(['ls', '-la'])
169
    
170
    # Run with input
171
    result = pf.shell('grep -i python', msg=b'This is Python code\\nThis is Java code\\n')
172
    
173
    # Run command with string (automatically parsed)
174
    output = pf.shell('pandoc --version')
175
    
176
    # Run without waiting (fire and forget)
177
    pf.shell(['notify-send', 'Filter completed'], wait=False)
178
    """
179

180
def run_pandoc(text='', args=None, pandoc_path=None) -> str:
181
    """
182
    Low-level function to call Pandoc with input text and arguments.
183

184
    Parameters:
185
    - text: input text to process (str)
186
    - args: Pandoc command-line arguments (list)
187
    - pandoc_path: path to pandoc executable (optional)
188

189
    Returns:
190
    str: Pandoc output
191

192
    Example:
193
    import panflute as pf
194
    
195
    # Get Pandoc version
196
    version = pf.run_pandoc(args=['--version'])
197
    
198
    # Convert markdown to HTML
199
    html = pf.run_pandoc(
200
        '# Hello\\n\\nThis is **markdown**.',
201
        args=['--from=markdown', '--to=html']
202
    )
203
    
204
    # Use specific Pandoc installation
205
    output = pf.run_pandoc(
206
        'Some text',
207
        args=['--to=latex'],
208
        pandoc_path='/usr/local/bin/pandoc'
209
    )
210
    """
211
```
212

213
### Metadata and Options Handling
214

215
Retrieve configuration options from multiple sources with fallback logic.
216

217
```python { .api }
218
def get_option(options=None, local_tag=None, doc=None, doc_tag=None, default=None, error_on_none=True):
219
    """
220
    Fetch option from element attributes, document metadata, or default value.
221

222
    Parameters:
223
    - options: element attributes dict (local level)
224
    - local_tag: attribute key to look for (str)
225
    - doc: document object (for metadata access)
226
    - doc_tag: metadata key to look for (str, supports dot notation)
227
    - default: fallback value if not found
228
    - error_on_none: raise ValueError if no value found (default: True)
229

230
    Returns:
231
    any: Retrieved option value
232

233
    The search order is: local > document > default
234

235
    Example:
236
    import panflute as pf
237
    
238
    def process_div(elem, doc):
239
        if isinstance(elem, pf.Div):
240
            # Get style from div attributes, fallback to document metadata
241
            style = pf.get_option(
242
                elem.attributes, 'style',
243
                doc, 'default-div-style',
244
                default='bordered'
245
            )
246
            
247
            # Get nested metadata with dot notation
248
            font_size = pf.get_option(
249
                None, None,
250
                doc, 'formatting.font.size',
251
                default='12pt'
252
            )
253
            
254
            elem.attributes['data-style'] = style
255
            elem.attributes['data-font-size'] = font_size
256
    
257
    if __name__ == '__main__':
258
        pf.run_filter(process_div)
259
    """
260

261
def meta2builtin(meta):
262
    """
263
    Convert MetaValue elements to Python builtin types.
264

265
    Parameters:
266
    - meta: MetaValue element to convert
267

268
    Returns:
269
    any: Python builtin equivalent (str, bool, list, dict, etc.)
270

271
    Conversion rules:
272
    - MetaBool -> bool
273
    - MetaString -> str
274
    - MetaList -> list (recursively converted)
275
    - MetaMap -> dict (recursively converted)
276
    - MetaInlines/MetaBlocks -> str (via stringify)
277

278
    Example:
279
    import panflute as pf
280
    
281
    # Convert metadata to Python types
282
    doc = pf.load()
283
    
284
    # Convert MetaBool to bool
285
    show_toc = pf.meta2builtin(doc.metadata.get('show-toc'))  # True/False
286
    
287
    # Convert MetaList to list
288
    authors = pf.meta2builtin(doc.metadata.get('authors'))  # ['John', 'Jane']
289
    
290
    # Convert MetaMap to dict
291
    settings = pf.meta2builtin(doc.metadata.get('settings'))  # {'key': 'value'}
292
    """
293
```
294

295
### Pandoc Version Information
296

297
Access runtime Pandoc version and configuration information.
298

299
```python { .api }
300
class PandocVersion:
301
    """
302
    Get runtime Pandoc version and configuration.
303
    
304
    Use PandocVersion().version for comparing versions.
305
    Lazily calls pandoc --version only once.
306

307
    Methods:
308
    - __str__(): return version string (e.g., "2.19.2")
309
    - __repr__(): return full pandoc --version output
310

311
    Properties:
312
    - version: tuple of version numbers for comparison
313
    - data_dir: list of Pandoc data directories (with /filters appended)
314

315
    Example:
316
    import panflute as pf
317
    
318
    pv = pf.PandocVersion()
319
    print(str(pv))  # "2.19.2"
320
    print(pv.version)  # (2, 19, 2)
321
    print(pv.data_dir)  # ['/home/user/.local/share/pandoc/filters', ...]
322
    
323
    # Version comparison
324
    if pv.version >= (2, 17):
325
        # Use newer Pandoc features
326
        pass
327
    """
328
    
329
    def __init__(self): ...
330
    def __str__(self) -> str: ...
331
    def __repr__(self) -> str: ...
332
    
333
    @property
334
    def version(self) -> tuple: ...
335
    
336
    @property
337
    def data_dir(self) -> list: ...
338

339
# Global instance for convenient access
340
pandoc_version: PandocVersion
341
```
342

343
### Debug Output
344

345
Print debug messages to stderr without interfering with Pandoc processing.
346

347
```python { .api }
348
def debug(*args, **kwargs):
349
    """
350
    Same as print, but prints to stderr (which is not intercepted by Pandoc).
351

352
    Parameters:
353
    - *args: arguments to print (same as print())
354
    - **kwargs: keyword arguments (same as print())
355

356
    Example:
357
    import panflute as pf
358
    
359
    def my_filter(elem, doc):
360
        if isinstance(elem, pf.Header):
361
            pf.debug(f"Processing header: {pf.stringify(elem)}")
362
            pf.debug("Header level:", elem.level)
363
        return elem
364
    
365
    if __name__ == '__main__':
366
        pf.run_filter(my_filter)
367
    """
368
```
369

370
### Element Keyword Replacement
371

372
Replace specific text strings with element structures throughout documents.
373

374
```python { .api }
375
# Method added to Element class
376
def replace_keyword(self, keyword: str, replacement, count=0):
377
    """
378
    Replace keyword strings with replacement elements.
379

380
    Parameters:
381
    - keyword: exact text string to find and replace
382
    - replacement: Element to substitute (Inline or Block)
383
    - count: maximum replacements (0 = unlimited)
384

385
    Returns:
386
    Element: modified element tree
387

388
    Example:
389
    import panflute as pf
390
    
391
    # Replace text with styled elements
392
    doc = pf.load()
393
    doc.replace_keyword('TODO', pf.Strong(pf.Str('⚠️ TODO')))
394
    
395
    # Replace with block elements (replaces parent if needed)
396
    doc.replace_keyword('PAGEBREAK', pf.RawBlock('\\newpage', 'latex'))
397
    
398
    # Limited replacements
399
    doc.replace_keyword('DRAFT', pf.Emph(pf.Str('DRAFT')), count=3)
400
    
401
    pf.dump(doc)
402
    """
403
```
404

405
## Usage Examples
406

407
### Advanced Text Processing Pipeline
408

409
```python
410
import panflute as pf
411
import re
412

413
def process_special_syntax(elem, doc):
414
    """Process custom syntax in text elements."""
415
    if isinstance(elem, pf.Str):
416
        text = elem.text
417
        
418
        # Convert @mentions to links
419
        text = re.sub(
420
            r'@(\w+)',
421
            lambda m: f'[@{m.group(1)}](https://github.com/{m.group(1)})',
422
            text
423
        )
424
        
425
        # Convert [[wikilinks]] to proper links
426
        text = re.sub(
427
            r'\[\[([^\]]+)\]\]',
428
            lambda m: f'[{m.group(1)}](wiki/{m.group(1).replace(" ", "_")})',
429
            text
430
        )
431
        
432
        if text != elem.text:
433
            # Convert back to elements if changed
434
            elements = pf.convert_text(text, input_format='markdown')
435
            return elements if len(elements) > 1 else elements[0].content
436

437
def generate_bibliography(elem, doc):
438
    """Generate bibliography from citations."""
439
    if isinstance(elem, pf.Cite):
440
        if not hasattr(doc, 'citations'):
441
            doc.citations = set()
442
        
443
        for citation in elem.citations:
444
            doc.citations.add(citation.id)
445

446
def finalize_document(doc):
447
    """Add bibliography section to document."""
448
    if hasattr(doc, 'citations') and doc.citations:
449
        bib_header = pf.Header(pf.Str('References'), level=2)
450
        bib_list = pf.BulletList()
451
        
452
        for citation_id in sorted(doc.citations):
453
            # Load citation details (would normally come from database/file)
454
            bib_item = pf.ListItem(pf.Plain(pf.Str(f'Reference for {citation_id}')))
455
            bib_list.content.append(bib_item)
456
        
457
        doc.content.extend([bib_header, bib_list])
458

459
if __name__ == '__main__':
460
    pf.run_filters(
461
        [process_special_syntax, generate_bibliography],
462
        finalize=finalize_document
463
    )
464
```
465

466
### Dynamic Content Generation
467

468
```python
469
import panflute as pf
470
import json
471
from datetime import datetime
472

473
def process_data_blocks(options, data, element, doc):
474
    """Generate charts and tables from YAML + data."""
475
    chart_type = options.get('type', 'table')
476
    title = options.get('title', 'Data')
477
    
478
    # Parse data section
479
    lines = [line.strip() for line in data.split('\n') if line.strip()]
480
    data_dict = {}
481
    
482
    for line in lines:
483
        if ':' in line:
484
            key, value = line.split(':', 1)
485
            try:
486
                data_dict[key.strip()] = float(value.strip())
487
            except ValueError:
488
                data_dict[key.strip()] = value.strip()
489
    
490
    if chart_type == 'table':
491
        # Generate table
492
        header_row = pf.TableRow(
493
            pf.TableCell(pf.Plain(pf.Str('Item'))),
494
            pf.TableCell(pf.Plain(pf.Str('Value')))
495
        )
496
        
497
        data_rows = []
498
        for key, value in data_dict.items():
499
            row = pf.TableRow(
500
                pf.TableCell(pf.Plain(pf.Str(key))),
501
                pf.TableCell(pf.Plain(pf.Str(str(value))))
502
            )
503
            data_rows.append(row)
504
        
505
        table = pf.Table(
506
            pf.TableBody(*data_rows),
507
            head=pf.TableHead(header_row),
508
            caption=pf.Caption(pf.Plain(pf.Str(title)))
509
        )
510
        return table
511
    
512
    elif chart_type == 'summary':
513
        # Generate summary paragraph
514
        total = sum(v for v in data_dict.values() if isinstance(v, (int, float)))
515
        count = len(data_dict)
516
        avg = total / count if count > 0 else 0
517
        
518
        summary = pf.Para(
519
            pf.Strong(pf.Str(f'{title}: ')),
520
            pf.Str(f'{count} items, total: {total:.2f}, average: {avg:.2f}')
521
        )
522
        return summary
523
    
524
    return element  # Fallback
525

526
def data_filter(elem, doc):
527
    """Apply YAML filter to data blocks."""
528
    return pf.yaml_filter(elem, doc, tag='data', function=process_data_blocks)
529

530
if __name__ == '__main__':
531
    pf.run_filter(data_filter)
532
```

Version

Tile

Files

docs/text-processing.md

docs/text-processing.md