0
# Syntax Tree Processing
1
2
Tree representation utilities for converting linear token streams into hierarchical structures for advanced document analysis and manipulation. This module is unique to the Python implementation and not part of the original JavaScript markdown-it.
3
4
## Capabilities
5
6
### SyntaxTreeNode Class
7
8
Hierarchical representation of markdown document structure.
9
10
```python { .api }
11
class SyntaxTreeNode:
    """
    A Markdown syntax tree node representing either:
    - Root of the document
    - Single unnested token
    - Token pair (open/close) with nested content
    """

    def __init__(self, tokens: Sequence[Token] = (), *, create_root: bool = True):
        """
        Initialize syntax tree from token stream.

        Parameters:
        - tokens: token stream to convert to tree; any sequence of tokens is
          accepted (the default is an empty tuple, so the annotation is
          ``Sequence`` rather than ``list``)
        - create_root: whether to create a root node for the document;
          keyword-only
        """

    # Properties
    token: Token | None  # Associated token (for leaf nodes)
    nester_tokens: tuple[Token, Token] | None  # Opening/closing token pair (for containers)
    parent: SyntaxTreeNode | None  # Parent node
    children: list[SyntaxTreeNode]  # Child nodes
33
```
34
35
### Tree Construction
36
37
Build tree structures from token streams:
38
39
```python { .api }
40
# Class methods for tree creation
41
@classmethod
def from_tokens(cls, tokens: list[Token]) -> SyntaxTreeNode:
    """
    Create syntax tree from token list.

    NOTE(review): every usage example in this document builds a tree by
    calling ``SyntaxTreeNode(tokens)`` directly; confirm that a
    ``from_tokens`` classmethod exists in the markdown-it-py version being
    documented.

    Parameters:
    - tokens: list of tokens to convert

    Returns:
    - SyntaxTreeNode: root node of constructed tree
    """
52
```
53
54
**Usage Example:**
55
56
```python
57
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode

# Sample document: a heading, a paragraph with emphasis, a two-item list
source = """
# Heading

Paragraph with **bold** text.

- Item 1
- Item 2
"""

md = MarkdownIt()
tokens = md.parse(source)

# Nest the flat token stream into a tree
tree = SyntaxTreeNode(tokens)

# Inspect the top level of the tree
top_level = tree.children
print(f"Root has {len(top_level)} children")
for child in top_level:
    # Nodes without a single associated token are reported as 'container'
    print(f"Child type: {child.token.type if child.token else 'container'}")
77
```
78
79
### Tree Traversal
80
81
Navigate and inspect tree structure:
82
83
```python { .api }
84
def walk(self, filter: Callable[[SyntaxTreeNode], bool] | None = None) -> Generator[SyntaxTreeNode, None, None]:
    """
    Walk the tree depth-first, yielding nodes.

    NOTE(review): the parameter name shadows the built-in ``filter``; it is
    kept because the examples pass it by keyword (``tree.walk(filter=...)``).
    NOTE(review): confirm that ``walk`` accepts a filter argument in the
    markdown-it-py version being documented.

    Parameters:
    - filter: optional predicate called with a node; presumably only nodes
      for which it returns a truthy value are yielded (TODO confirm)

    Yields:
    - SyntaxTreeNode: tree nodes in depth-first order
    """
94
95
@property
def is_root(self) -> bool:
    """True if this is the root node.

    The root is the document-level node created when a tree is built with
    ``create_root=True``.
    """
98
99
@property
def is_leaf(self) -> bool:
    """True if this node has no children.

    NOTE(review): confirm this property exists in the markdown-it-py version
    being documented.
    """
102
103
@property
def is_container(self) -> bool:
    """True if this node represents a token pair container.

    Such nodes carry their opening/closing tokens in ``nester_tokens``.

    NOTE(review): confirm this property exists in the markdown-it-py version
    being documented.
    """
106
```
107
108
**Usage Example:**
109
110
```python
111
from markdown_it.tree import SyntaxTreeNode

# Tree traversal
for node in tree.walk():
    # Headings are open/close pairs: the opening token is held in
    # nester_tokens, while node.token is only set for unnested tokens.
    if node.nester_tokens and node.nester_tokens[0].type == "heading_open":
        level = int(node.nester_tokens[0].tag[1])  # h1->1, h2->2, etc.
        print(f"Found heading level {level}")

# Filter specific node types
def is_paragraph(node):
    """Return True for container nodes opened by a paragraph_open token."""
    return bool(node.nester_tokens) and node.nester_tokens[0].type == "paragraph_open"

for para_node in tree.walk(filter=is_paragraph):
    print("Found paragraph")

# Check node types
for node in tree.children:
    if node.is_container:
        print(f"Container with {len(node.children)} children")
    elif node.is_leaf:
        print(f"Leaf node: {node.token.type}")
132
```
133
134
### Tree Manipulation
135
136
Modify tree structure and content:
137
138
```python { .api }
139
def remove_child(self, child: SyntaxTreeNode) -> None:
    """
    Remove child node from this node.

    NOTE(review): what happens when ``child`` is not one of this node's
    children is not specified here; confirm before relying on it.
    NOTE(review): confirm this method exists in the markdown-it-py version
    being documented.

    Parameters:
    - child: child node to remove
    """
146
147
def add_child(self, child: SyntaxTreeNode) -> None:
    """
    Add child node to this node.

    NOTE(review): presumably the child is appended after the existing
    children and its ``parent`` is updated; neither is specified here, so
    confirm.
    NOTE(review): confirm this method exists in the markdown-it-py version
    being documented.

    Parameters:
    - child: child node to add
    """
154
155
def replace_child(self, old_child: SyntaxTreeNode, new_child: SyntaxTreeNode) -> None:
    """
    Replace existing child with new child.

    NOTE(review): the ``wrap_paragraphs_in_divs`` example sets the ``parent``
    links itself before calling this method; whether this method also
    updates ``parent`` is not specified here, so confirm.
    NOTE(review): confirm this method exists in the markdown-it-py version
    being documented.

    Parameters:
    - old_child: child to replace
    - new_child: replacement child
    """
163
```
164
165
**Usage Example:**
166
167
```python
168
from markdown_it.tree import SyntaxTreeNode
from markdown_it.token import Token

# Create new nodes
# An opening token (nesting=1) stored alone as a leaf would leave the
# linearized token stream without a matching close, so the node is built
# from a balanced open/close pair instead.
div_open = Token("div_open", "div", 1)
div_close = Token("div_close", "div", -1)
new_node = SyntaxTreeNode([div_open, div_close], create_root=False)

# Add to tree
tree.add_child(new_node)

# Remove nodes
for node in list(tree.children):  # Copy list since we're modifying
    if node.token and node.token.type == "hr":
        tree.remove_child(node)
183
```
184
185
### Tree Conversion
186
187
Convert between tree and token representations:
188
189
```python { .api }
190
def to_tokens(self) -> list[Token]:
    """
    Convert tree back to linear token stream.

    The round-trip example in this document renders both the original and
    the returned tokens and expects identical HTML.

    Returns:
    - list[Token]: linearized token representation
    """
197
198
def to_pretty(self, *, indent: int = 2, show_text: bool = False) -> str:
    """
    Generate pretty-printed tree representation.

    Both options are keyword-only.

    NOTE(review): upstream markdown-it-py appears to name this method
    ``pretty``; confirm the method name against the version being
    documented.

    Parameters:
    - indent: indentation spaces per level
    - show_text: whether to show text content

    Returns:
    - str: formatted tree structure
    """
209
```
210
211
**Usage Example:**
212
213
```python
214
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode

md = MarkdownIt()
source = "# Title\n\nParagraph text."
tokens = md.parse(source)

# Token stream -> Tree -> Token stream
tree = SyntaxTreeNode(tokens)
reconstructed_tokens = tree.to_tokens()

# Verify round-trip consistency: both streams must render to the same HTML
render = md.renderer.render
original_html = render(tokens, md.options, {})
reconstructed_html = render(reconstructed_tokens, md.options, {})
assert original_html == reconstructed_html

# Pretty print tree structure
print(tree.to_pretty(show_text=True))
231
```
232
233
## Advanced Tree Operations
234
235
### Content Extraction
236
237
Extract specific content from tree structure:
238
239
```python
240
def extract_headings(tree):
    """Collect every heading in the tree with its level, text and node.

    Parameters:
    - tree: node whose ``walk()`` yields the nodes to inspect

    Returns:
    - list[dict]: one entry per heading container, in walk order, with keys
      ``'level'`` (digit taken from the opening token's tag, e.g. ``h2`` -> 2),
      ``'text'`` (content of the first ``inline`` child, ``""`` if none) and
      ``'node'`` (the heading node itself)
    """
    def first_inline_content(heading_node):
        # Only the first inline child is consulted
        for child in heading_node.children:
            if child.token and child.token.type == "inline":
                return child.token.content
        return ""

    found = []
    for node in tree.walk():
        if not node.is_container:
            continue
        pair = node.nester_tokens
        if not pair or pair[0].type != "heading_open":
            continue
        found.append({
            'level': int(pair[0].tag[1]),
            'text': first_inline_content(node),
            'node': node,
        })
    return found
265
266
def extract_links(tree):
    """Extract all links with URLs and text.

    Parameters:
    - tree: node whose ``walk()`` yields the nodes to inspect

    Returns:
    - list[dict]: one entry per link container, in walk order, with keys
      ``'url'`` (the opening token's ``href`` attribute via ``attrGet``),
      ``'text'`` (content of every ``text`` token inside the link, joined in
      document order; ``""`` if there is none) and ``'node'`` (the link node)
    """
    def iter_text(node):
        # Depth-first, so text nested inside emphasis/strong containers is
        # collected in reading order instead of being dropped.
        for child in node.children:
            if child.token and child.token.type == "text":
                yield child.token.content
            else:
                yield from iter_text(child)

    links = []
    for node in tree.walk():
        if not (node.is_container and node.nester_tokens):
            continue
        opening = node.nester_tokens[0]
        if opening.type != "link_open":
            continue
        links.append({
            'url': opening.attrGet("href"),
            'text': "".join(iter_text(node)),
            'node': node,
        })
    return links
291
```
292
293
### Tree Transformation
294
295
Transform tree structure for custom processing:
296
297
```python
298
def wrap_paragraphs_in_divs(tree):
    """Wrap paragraphs in div containers.

    Only the direct children of ``tree`` are examined; paragraphs nested
    deeper (for example inside list items) are left untouched. The tree is
    modified in place and nothing is returned.

    Parameters:
    - tree: node whose paragraph children are each replaced by a
      ``div.paragraph-wrapper`` container holding the original paragraph
    """
    # Local imports so the function works when copied on its own
    from markdown_it.token import Token
    from markdown_it.tree import SyntaxTreeNode

    for node in list(tree.children):  # Copy since we're modifying
        if (node.is_container and
                node.nester_tokens and
                node.nester_tokens[0].type == "paragraph_open"):

            # Create wrapper div
            div_open = Token("div_open", "div", 1)
            div_open.attrSet("class", "paragraph-wrapper")
            div_close = Token("div_close", "div", -1)

            # Create new container node
            wrapper_node = SyntaxTreeNode()
            wrapper_node.parent = tree
            wrapper_node.nester_tokens = (div_open, div_close)
            wrapper_node.children = [node]

            # Update parent relationships
            node.parent = wrapper_node

            # Replace in tree
            tree.replace_child(node, wrapper_node)
323
324
def add_table_of_contents(tree):
    """Add table of contents based on headings.

    Builds a ``div.table-of-contents`` container holding an ``h2`` title and
    a bullet list with one link per heading found by ``extract_headings``,
    then inserts it as the first child of ``tree``. Does nothing when the
    tree has no headings. The tree is modified in place.

    Link targets are derived from the heading text by lower-casing it and
    replacing spaces with hyphens.
    NOTE(review): confirm this matches how heading anchors are generated
    elsewhere in the pipeline.

    Parameters:
    - tree: root node to receive the table of contents
    """
    # Local imports so the function works when copied on its own
    from markdown_it.token import Token
    from markdown_it.tree import SyntaxTreeNode

    headings = extract_headings(tree)
    if not headings:
        return

    def text_token(content):
        return Token("text", "", 0, content=content)

    def inline_token(content, children):
        # The HTML renderer emits an inline token by rendering its children,
        # so visible text (and any link tokens) must be placed there; an
        # inline token carrying only `content` renders as nothing.
        return Token("inline", "", 0, content=content, children=children)

    # Create TOC tokens
    toc_title = "Table of Contents"
    toc_tokens = [
        Token("div_open", "div", 1, attrs={"class": "table-of-contents"}),
        Token("heading_open", "h2", 1),
        inline_token(toc_title, [text_token(toc_title)]),
        Token("heading_close", "h2", -1),
        Token("bullet_list_open", "ul", 1),
    ]

    for heading in headings:
        title = heading['text']
        href = f"#{title.lower().replace(' ', '-')}"
        toc_tokens.extend([
            Token("list_item_open", "li", 1),
            Token("paragraph_open", "p", 1),
            inline_token(title, [
                Token("link_open", "a", 1, attrs={"href": href}),
                text_token(title),
                Token("link_close", "a", -1),
            ]),
            Token("paragraph_close", "p", -1),
            Token("list_item_close", "li", -1),
        ])

    toc_tokens.extend([
        Token("bullet_list_close", "ul", -1),
        Token("div_close", "div", -1),
    ])

    # Create TOC tree node; with create_root=False the first and last tokens
    # (the div pair) become the container's nester tokens.
    toc_tree = SyntaxTreeNode(toc_tokens, create_root=False)

    # Insert at beginning
    tree.children.insert(0, toc_tree)
    toc_tree.parent = tree
362
```
363
364
### Tree Analysis
365
366
Analyze document structure using tree representation:
367
368
```python
369
def analyze_document_structure(tree):
    """Analyze document structure and return statistics.

    Visits every descendant of ``tree`` (the tree node itself is not
    counted). Direct children are at nesting level 0.

    Parameters:
    - tree: root node whose ``children`` are analyzed recursively

    Returns:
    - dict with keys ``'total_nodes'``, ``'headings'`` (list of heading
      levels in visit order), ``'paragraphs'``, ``'lists'``,
      ``'code_blocks'``, ``'links'``, ``'images'`` and
      ``'max_nesting_level'``
    """
    stats = {
        'total_nodes': 0,
        'headings': [],
        'paragraphs': 0,
        'lists': 0,
        'code_blocks': 0,
        'links': 0,
        'images': 0,
        'max_nesting_level': 0,
    }

    def primary_token(node):
        # Unnested nodes carry their token in `token`; open/close pairs keep
        # the opening token in `nester_tokens`. Reading only `token` would
        # never see any "*_open" type.
        if node.token:
            return node.token
        if node.nester_tokens:
            return node.nester_tokens[0]
        return None

    def analyze_node(node, level=0):
        stats['total_nodes'] += 1
        stats['max_nesting_level'] = max(stats['max_nesting_level'], level)

        token = primary_token(node)
        if token is not None:
            token_type = token.type
            if token_type == "heading_open":
                stats['headings'].append(int(token.tag[1]))
            elif token_type == "paragraph_open":
                stats['paragraphs'] += 1
            elif token_type in ("bullet_list_open", "ordered_list_open"):
                stats['lists'] += 1
            elif token_type in ("code_block", "fence"):
                stats['code_blocks'] += 1
            elif token_type == "link_open":
                stats['links'] += 1
            elif token_type == "image":
                stats['images'] += 1

        for child in node.children:
            analyze_node(child, level + 1)

    for child in tree.children:
        analyze_node(child)

    return stats
408
409
# Usage
410
stats = analyze_document_structure(tree)
411
print(f"Document has {stats['paragraphs']} paragraphs")
412
print(f"Heading levels: {set(stats['headings'])}")
413
print(f"Maximum nesting: {stats['max_nesting_level']}")
414
```