or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cli.md document-elements.md document-io.md index.md text-processing.md

text-processing.md docs/

0

# Text Processing Tools

1

2

Utility functions for text extraction, document conversion, YAML processing, and external tool integration. These tools provide powerful capabilities for document transformation, content analysis, and integration with external systems like Pandoc and shell commands.

3

4

## Capabilities

5

6

### Text Extraction

7

8

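Extract the plain-text content of an element and all of its children. Inline formatting such as emphasis is dropped; paragraph breaks are kept as trailing newlines unless `newlines=False` is passed.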

9

10

```python { .api }

11

def stringify(element, newlines=True) -> str:

12

"""

13

Return the raw text version of an element and its children.

14

15

Parameters:

16

- element: Element to extract text from

17

- newlines: add newlines after paragraphs (default: True)

18

19

Returns:

20

str: Plain text representation of the element

21

22

Example:

23

import panflute as pf

24

25

# Extract text from complex elements

26

elem = pf.Para(

27

pf.Str('Hello '),

28

pf.Emph(pf.Str('beautiful')),

29

pf.Str(' world!')

30

)

31

text = pf.stringify(elem) # "Hello beautiful world!\n\n"

32

33

# Extract without paragraph newlines

34

text = pf.stringify(elem, newlines=False) # "Hello beautiful world!"

35

36

# Extract text from entire document

37

doc = pf.load()

38

full_text = pf.stringify(doc)

39

"""

40

```

41

42

### Document Format Conversion

43

44

Convert text between different formats using Pandoc's conversion capabilities.

45

46

```python { .api }

47

def convert_text(text,

48

input_format='markdown',

49

output_format='panflute',

50

standalone=False,

51

extra_args=None,

52

pandoc_path=None):

53

"""

54

Convert formatted text using Pandoc internally.

55

56

Parameters:

57

- text: text to convert (str, Element, or list of Elements)

58

- input_format: source format ('markdown', 'latex', 'html', 'panflute', etc.)

59

- output_format: target format ('panflute', 'html', 'latex', 'markdown', etc.)

60

- standalone: create standalone document (default: False)

61

- extra_args: additional Pandoc arguments (list)

62

- pandoc_path: path to pandoc executable (optional)

63

64

Returns:

65

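list|Doc|str: Converted content. The type depends on output_format and standalone: a list of elements for output_format='panflute', a Doc for output_format='panflute' with standalone=True, and a str for any other output format.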

66

67

Example:

68

import panflute as pf

69

70

# Convert Markdown to panflute elements

71

md_text = "This is *emphasized* text with **bold** formatting."

72

elements = pf.convert_text(md_text, input_format='markdown')

73

# Returns: [Para(Str(This) Space Str(is) Space Emph(Str(emphasized)) ...)]

74

75

# Convert panflute elements to HTML

76

para = pf.Para(pf.Str('Hello '), pf.Strong(pf.Str('world')))

77

html = pf.convert_text(para, input_format='panflute', output_format='html')

78

# Returns: "<p>Hello <strong>world</strong></p>"

79

80

# Create standalone document

81

doc = pf.convert_text(md_text, standalone=True)

82

# Returns: Doc object with metadata and proper structure

83

84

# Use custom Pandoc arguments

85

latex = pf.convert_text(

86

md_text,

87

output_format='latex',

88

extra_args=['--template=custom.tex', '--variable=fontsize:12pt']

89

)

90

"""

91

```

92

93

### YAML Code Block Processing

94

95

Parse and process code blocks with YAML frontmatter for dynamic content generation.

96

97

```python { .api }

98

def yaml_filter(element, doc, tag=None, function=None, tags=None, strict_yaml=False):

99

"""

100

Convenience function for parsing code blocks with YAML options.

101

102

Parameters:

103

- element: current element being processed

104

- doc: document being filtered

105

- tag: class name to match (str)

106

- function: function to call for matching blocks

107

- tags: dict mapping class names to functions

108

- strict_yaml: require explicit YAML delimiters (default: False)

109

110

The function parameter receives (options, data, element, doc):

111

- options: parsed YAML dict

112

- data: remaining code content after YAML

113

- element: original CodeBlock element

114

- doc: document being processed

115

116

Example:

117

import panflute as pf

118

119

def process_chart(options, data, element, doc):

120

chart_type = options.get('type', 'bar')

121

title = options.get('title', 'Chart')

122

123

# Generate chart based on options and data

124

return pf.Para(pf.Str(f"Generated {chart_type} chart: {title}"))

125

126

def filter_func(elem, doc):

127

return pf.yaml_filter(elem, doc, tag='chart', function=process_chart)

128

129

if __name__ == '__main__':

130

pf.run_filter(filter_func)

131

132

# Processes code blocks like:

133

# ```chart

134

# type: line

135

# title: Sales Data

136

# ---

137

# January: 100

138

# February: 150

139

# March: 120

140

# ```

141

"""

142

```

143

144

### External Command Execution

145

146

Execute external commands and shell scripts from within filters.

147

148

```python { .api }

149

def shell(args, wait=True, msg=None):

150

"""

151

Execute external command and get its output.

152

153

Parameters:

154

- args: command and arguments (str or list)

155

- wait: wait for command completion (default: True)

156

- msg: input message to send to command (bytes, optional)

157

158

Returns:

159

bytes: command output (if wait=True)

160

161

Raises:

162

IOError: if command fails (non-zero exit code)

163

164

Example:

165

import panflute as pf

166

167

# Run a simple command

168

output = pf.shell(['ls', '-la'])

169

170

# Run with input

171

result = pf.shell('grep -i python', msg=b'This is Python code\\nThis is Java code\\n')

172

173

# Run command with string (automatically parsed)

174

output = pf.shell('pandoc --version')

175

176

# Run without waiting (fire and forget)

177

pf.shell(['notify-send', 'Filter completed'], wait=False)

178

"""

179

180

def run_pandoc(text='', args=None, pandoc_path=None) -> str:

181

"""

182

Low-level function to call Pandoc with input text and arguments.

183

184

Parameters:

185

- text: input text to process (str)

186

- args: Pandoc command-line arguments (list)

187

- pandoc_path: path to pandoc executable (optional)

188

189

Returns:

190

str: Pandoc output

191

192

Example:

193

import panflute as pf

194

195

# Get Pandoc version

196

version = pf.run_pandoc(args=['--version'])

197

198

# Convert markdown to HTML

199

html = pf.run_pandoc(

200

'# Hello\\n\\nThis is **markdown**.',

201

args=['--from=markdown', '--to=html']

202

)

203

204

# Use specific Pandoc installation

205

output = pf.run_pandoc(

206

'Some text',

207

args=['--to=latex'],

208

pandoc_path='/usr/local/bin/pandoc'

209

)

210

"""

211

```

212

213

### Metadata and Options Handling

214

215

Retrieve configuration options from multiple sources with fallback logic.

216

217

```python { .api }

218

def get_option(options=None, local_tag=None, doc=None, doc_tag=None, default=None, error_on_none=True):

219

"""

220

Fetch option from element attributes, document metadata, or default value.

221

222

Parameters:

223

- options: element attributes dict (local level)

224

- local_tag: attribute key to look for (str)

225

- doc: document object (for metadata access)

226

- doc_tag: metadata key to look for (str, supports dot notation)

227

- default: fallback value if not found

228

- error_on_none: raise ValueError if no value found (default: True)

229

230

Returns:

231

any: Retrieved option value

232

233

The search order is: local > document > default

234

235

Example:

236

import panflute as pf

237

238

def process_div(elem, doc):

239

if isinstance(elem, pf.Div):

240

# Get style from div attributes, fallback to document metadata

241

style = pf.get_option(

242

elem.attributes, 'style',

243

doc, 'default-div-style',

244

default='bordered'

245

)

246

247

# Get nested metadata with dot notation

248

font_size = pf.get_option(

249

None, None,

250

doc, 'formatting.font.size',

251

default='12pt'

252

)

253

254

elem.attributes['data-style'] = style

255

elem.attributes['data-font-size'] = font_size

256

257

if __name__ == '__main__':

258

pf.run_filter(process_div)

259

"""

260

261

def meta2builtin(meta):

262

"""

263

Convert MetaValue elements to Python builtin types.

264

265

Parameters:

266

- meta: MetaValue element to convert

267

268

Returns:

269

any: Python builtin equivalent (str, bool, list, dict, etc.)

270

271

Conversion rules:

272

- MetaBool -> bool

273

- MetaString -> str

274

- MetaList -> list (recursively converted)

275

- MetaMap -> dict (recursively converted)

276

- MetaInlines/MetaBlocks -> str (via stringify)

277

278

Example:

279

import panflute as pf

280

281

# Convert metadata to Python types

282

doc = pf.load()

283

284

# Convert MetaBool to bool

285

show_toc = pf.meta2builtin(doc.metadata.get('show-toc')) # True/False

286

287

# Convert MetaList to list

288

authors = pf.meta2builtin(doc.metadata.get('authors')) # ['John', 'Jane']

289

290

# Convert MetaMap to dict

291

settings = pf.meta2builtin(doc.metadata.get('settings')) # {'key': 'value'}

292

"""

293

```

294

295

### Pandoc Version Information

296

297

Access runtime Pandoc version and configuration information.

298

299

```python { .api }

300

class PandocVersion:

301

"""

302

Get runtime Pandoc version and configuration.

303

304

Use PandocVersion().version for comparing versions.

305

Lazily calls pandoc --version only once.

306

307

Methods:

308

- __str__(): return version string (e.g., "2.19.2")

309

- __repr__(): return full pandoc --version output

310

311

Properties:

312

- version: tuple of version numbers for comparison

313

- data_dir: list of Pandoc data directories (with /filters appended)

314

315

Example:

316

import panflute as pf

317

318

pv = pf.PandocVersion()

319

print(str(pv)) # "2.19.2"

320

print(pv.version) # (2, 19, 2)

321

print(pv.data_dir) # ['/home/user/.local/share/pandoc/filters', ...]

322

323

# Version comparison

324

if pv.version >= (2, 17):

325

# Use newer Pandoc features

326

pass

327

"""

328

329

def __init__(self): ...

330

def __str__(self) -> str: ...

331

def __repr__(self) -> str: ...

332

333

@property

334

def version(self) -> tuple: ...

335

336

@property

337

def data_dir(self) -> list: ...

338

339

# Global instance for convenient access

340

pandoc_version: PandocVersion

341

```

342

343

### Debug Output

344

345

Print debug messages to stderr without interfering with Pandoc processing.

346

347

```python { .api }

348

def debug(*args, **kwargs):

349

"""

350

Same as print, but prints to stderr (which is not intercepted by Pandoc).

351

352

Parameters:

353

- *args: arguments to print (same as print())

354

- **kwargs: keyword arguments (same as print())

355

356

Example:

357

import panflute as pf

358

359

def my_filter(elem, doc):

360

if isinstance(elem, pf.Header):

361

pf.debug(f"Processing header: {pf.stringify(elem)}")

362

pf.debug("Header level:", elem.level)

363

return elem

364

365

if __name__ == '__main__':

366

pf.run_filter(my_filter)

367

"""

368

```

369

370

### Element Keyword Replacement

371

372

Replace specific text strings with element structures throughout documents.

373

374

```python { .api }

375

# Method added to Element class

376

def replace_keyword(self, keyword: str, replacement, count=0):

377

"""

378

Replace keyword strings with replacement elements.

379

380

Parameters:

381

- keyword: exact text string to find and replace

382

- replacement: Element to substitute (Inline or Block)

383

- count: maximum replacements (0 = unlimited)

384

385

Returns:

386

Element: modified element tree

387

388

Example:

389

import panflute as pf

390

391

# Replace text with styled elements

392

doc = pf.load()

393

doc.replace_keyword('TODO', pf.Strong(pf.Str('⚠️ TODO')))

394

395

# Replace with block elements (replaces parent if needed)

396

doc.replace_keyword('PAGEBREAK', pf.RawBlock('\\newpage', 'latex'))

397

398

# Limited replacements

399

doc.replace_keyword('DRAFT', pf.Emph(pf.Str('DRAFT')), count=3)

400

401

pf.dump(doc)

402

"""

403

```

404

405

## Usage Examples

406

407

### Advanced Text Processing Pipeline

408

409

```python

410

import panflute as pf

411

import re

412

413

def process_special_syntax(elem, doc):

414

"""Process custom syntax in text elements."""

415

if isinstance(elem, pf.Str):

416

text = elem.text

417

418

# Convert @mentions to links

419

text = re.sub(

420

r'@(\w+)',

421

lambda m: f'[@{m.group(1)}](https://github.com/{m.group(1)})',

422

text

423

)

424

425

# Convert [[wikilinks]] to proper links

426

text = re.sub(

427

r'\[\[([^\]]+)\]\]',

428

lambda m: f'[{m.group(1)}](wiki/{m.group(1).replace(" ", "_")})',

429

text

430

)

431

432

if text != elem.text:

433

# Convert back to elements if changed

434

elements = pf.convert_text(text, input_format='markdown')

435

return elements if len(elements) > 1 else elements[0].content

436

437

def generate_bibliography(elem, doc):

438

"""Generate bibliography from citations."""

439

if isinstance(elem, pf.Cite):

440

if not hasattr(doc, 'citations'):

441

doc.citations = set()

442

443

for citation in elem.citations:

444

doc.citations.add(citation.id)

445

446

def finalize_document(doc):

447

"""Add bibliography section to document."""

448

if hasattr(doc, 'citations') and doc.citations:

449

bib_header = pf.Header(pf.Str('References'), level=2)

450

bib_list = pf.BulletList()

451

452

for citation_id in sorted(doc.citations):

453

# Load citation details (would normally come from database/file)

454

bib_item = pf.ListItem(pf.Plain(pf.Str(f'Reference for {citation_id}')))

455

bib_list.content.append(bib_item)

456

457

doc.content.extend([bib_header, bib_list])

458

459

if __name__ == '__main__':

460

pf.run_filters(

461

[process_special_syntax, generate_bibliography],

462

finalize=finalize_document

463

)

464

```

465

466

### Dynamic Content Generation

467

468

```python

469

import panflute as pf

470

import json

471

from datetime import datetime

472

473

def process_data_blocks(options, data, element, doc):

474

"""Generate charts and tables from YAML + data."""

475

chart_type = options.get('type', 'table')

476

title = options.get('title', 'Data')

477

478

# Parse data section

479

lines = [line.strip() for line in data.split('\n') if line.strip()]

480

data_dict = {}

481

482

for line in lines:

483

if ':' in line:

484

key, value = line.split(':', 1)

485

try:

486

data_dict[key.strip()] = float(value.strip())

487

except ValueError:

488

data_dict[key.strip()] = value.strip()

489

490

if chart_type == 'table':

491

# Generate table

492

header_row = pf.TableRow(

493

pf.TableCell(pf.Plain(pf.Str('Item'))),

494

pf.TableCell(pf.Plain(pf.Str('Value')))

495

)

496

497

data_rows = []

498

for key, value in data_dict.items():

499

row = pf.TableRow(

500

pf.TableCell(pf.Plain(pf.Str(key))),

501

pf.TableCell(pf.Plain(pf.Str(str(value))))

502

)

503

data_rows.append(row)

504

505

table = pf.Table(

506

pf.TableHead(header_row),

507

pf.TableBody(*data_rows),

508

caption=pf.Caption(pf.Plain(pf.Str(title)))

509

)

510

return table

511

512

elif chart_type == 'summary':

513

# Generate summary paragraph

514

total = sum(v for v in data_dict.values() if isinstance(v, (int, float)))

515

count = len(data_dict)

516

avg = total / count if count > 0 else 0

517

518

summary = pf.Para(

519

pf.Strong(pf.Str(f'{title}: ')),

520

pf.Str(f'{count} items, total: {total:.2f}, average: {avg:.2f}')

521

)

522

return summary

523

524

return element # Fallback

525

526

def data_filter(elem, doc):

527

"""Apply YAML filter to data blocks."""

528

return pf.yaml_filter(elem, doc, tag='data', function=process_data_blocks)

529

530

if __name__ == '__main__':

531

pf.run_filter(data_filter)

532

```