# Link Processing and Security

URL validation, normalization, and link processing utilities with built-in security features to prevent XSS attacks and ensure safe link handling in markdown documents.

## Capabilities

### URL Validation

Security-focused URL validation to prevent malicious links.

```python { .api }
def validateLink(self, url: str) -> bool:
    """
    Validate if URL link is allowed in output.

    This validator can prohibit more than really needed to prevent XSS.
    It's a tradeoff to keep code simple and to be secure by default.

    Parameters:
    - url: URL to validate (should be normalized and entities decoded)

    Returns:
    - bool: True if URL is considered safe
    """
```

**Usage Example:**

```python
from markdown_it import MarkdownIt

md = MarkdownIt()

# Test URL validation
safe_urls = [
    "https://example.com",
    "http://example.com/path",
    "mailto:user@example.com",
    "/relative/path",
    "#anchor"
]

unsafe_urls = [
    "javascript:alert('xss')",
    "data:text/html,<script>alert('xss')</script>",
    "vbscript:msgbox('xss')"
]

for url in safe_urls:
    print(f"{url}: {md.validateLink(url)}")  # Should be True

for url in unsafe_urls:
    print(f"{url}: {md.validateLink(url)}")  # Should be False
```

### URL Normalization

Normalize URLs for consistency and security.

```python { .api }
def normalizeLink(self, url: str) -> str:
    """
    Normalize destination URLs in links.

    Used for link destinations like:
    [label]: destination 'title'
             ^^^^^^^^^^^

    Parameters:
    - url: raw URL to normalize

    Returns:
    - str: normalized URL
    """

def normalizeLinkText(self, link: str) -> str:
    """
    Normalize autolink content.

    Used for autolink content like:
    <destination>
     ~~~~~~~~~~~

    Parameters:
    - link: raw link text to normalize

    Returns:
    - str: normalized link text
    """
```

**Usage Example:**

```python
from markdown_it import MarkdownIt

md = MarkdownIt()

# URL normalization
raw_url = "HTTP://EXAMPLE.COM/Path With Spaces"
normalized = md.normalizeLink(raw_url)
print(normalized)  # "http://example.com/Path%20With%20Spaces"

# Link text normalization
raw_link = "www.example.com/path"
normalized_text = md.normalizeLinkText(raw_link)
print(normalized_text)  # Normalized for display
```

### Link Helper Functions

Low-level utilities for parsing link components.

```python { .api }
from markdown_it.helpers import parseLinkDestination, parseLinkLabel, parseLinkTitle

def parseLinkDestination(str: str, pos: int, max: int) -> dict:
    """
    Parse link destination from input string.

    Parameters:
    - str: input string
    - pos: starting position
    - max: maximum position

    Returns:
    - dict: {ok: bool, pos: int, str: str} - parse result
    """

def parseLinkLabel(str: str, pos: int, max: int) -> dict:
    """
    Parse link label from input string.

    Parameters:
    - str: input string
    - pos: starting position
    - max: maximum position

    Returns:
    - dict: {ok: bool, pos: int, str: str} - parse result
    """

def parseLinkTitle(str: str, pos: int, max: int) -> dict:
    """
    Parse link title from input string.

    Parameters:
    - str: input string
    - pos: starting position
    - max: maximum position

    Returns:
    - dict: {ok: bool, pos: int, str: str, marker: str} - parse result
    """
```

**Usage Example:**

```python
from markdown_it.helpers import parseLinkDestination, parseLinkLabel, parseLinkTitle

# Parse link destination
text = '<https://example.com> "Title"'
result = parseLinkDestination(text, 1, len(text) - 1)
print(result)  # {ok: True, pos: 19, str: 'https://example.com'}

# Parse link label
text = '[Link Text]'
result = parseLinkLabel(text, 0, len(text))
print(result)  # {ok: True, pos: 11, str: 'Link Text'}

# Parse link title
text = '"Title Here"'
result = parseLinkTitle(text, 0, len(text))
print(result)  # {ok: True, pos: 12, str: 'Title Here', marker: '"'}
```

## Security Features

### XSS Prevention

Built-in protection against cross-site scripting attacks:

```python
def custom_link_validator(url):
    """Custom link validation with additional security checks."""
    from markdown_it.common.normalize_url import validateLink

    # Use built-in validation first
    if not validateLink(url):
        return False

    # Additional custom checks
    lower_url = url.lower()

    # Block additional dangerous protocols
    dangerous_protocols = ['file:', 'ftp:', 'news:', 'gopher:']
    if any(lower_url.startswith(proto) for proto in dangerous_protocols):
        return False

    # Block URLs with suspicious patterns
    suspicious_patterns = ['<script', 'javascript:', 'vbscript:', 'data:']
    if any(pattern in lower_url for pattern in suspicious_patterns):
        return False

    return True

# Override validation in renderer
def secure_link_open(tokens, idx, options, env):
    """Secure link rendering with validation."""
    token = tokens[idx]
    href = token.attrGet("href")

    if href and not custom_link_validator(href):
        # Replace with safe placeholder
        token.attrSet("href", "#invalid-link")
        token.attrSet("class", "invalid-link")
        token.attrSet("title", "Invalid or potentially unsafe link")

    return default_link_open(tokens, idx, options, env)
```

### Content Security

Sanitize and validate link content:

```python
def sanitize_link_content(tokens):
    """Sanitize link tokens for security."""
    for token in tokens:
        if token.type == "link_open":
            href = token.attrGet("href")
            if href:
                # Normalize URL
                from markdown_it.common.normalize_url import normalizeLink
                normalized_href = normalizeLink(href)

                # Validate normalized URL
                from markdown_it.common.normalize_url import validateLink
                if validateLink(normalized_href):
                    token.attrSet("href", normalized_href)
                    # Add security attributes
                    if normalized_href.startswith(('http://', 'https://')):
                        token.attrSet("rel", "noopener noreferrer")
                        token.attrSet("target", "_blank")
                else:
                    # Remove unsafe link
                    token.type = "text"
                    token.tag = ""
                    token.content = href

        elif token.type == "image":
            src = token.attrGet("src")
            if src:
                # Validate image URLs
                from markdown_it.common.normalize_url import normalizeLink, validateLink
                normalized_src = normalizeLink(src)
                if validateLink(normalized_src):
                    token.attrSet("src", normalized_src)
                else:
                    # Remove unsafe image
                    token.attrSet("src", "")
                    token.attrSet("alt", f"[Invalid image: {src}]")

    return tokens
```

## Link Processing Utilities

### Reference Link Handling

Process reference-style links and their definitions:

```python
def extract_reference_links(env):
    """Extract reference link definitions from environment."""
    references = env.get('references', {})

    links = []
    for label, ref_data in references.items():
        links.append({
            'label': label,
            'href': ref_data.get('href', ''),
            'title': ref_data.get('title', '')
        })

    return links

def add_reference_link(env, label, href, title=""):
    """Add reference link definition to environment."""
    if 'references' not in env:
        env['references'] = {}

    env['references'][label.lower()] = {
        'href': href,
        'title': title
    }

# Usage
md = MarkdownIt()
env = {}

# Parse markdown with reference links
text = """
[Link 1][ref1]
[Link 2][ref2]

[ref1]: https://example.com "Example"
[ref2]: https://another.com
"""

tokens = md.parse(text, env)
references = extract_reference_links(env)

for ref in references:
    print(f"Reference '{ref['label']}': {ref['href']}")
```

### Autolink Processing

Handle automatic link detection and processing:

```python
def extract_autolinks(tokens):
    """Extract automatically detected links from tokens."""
    autolinks = []

    for token in tokens:
        if token.type == "link_open" and token.info == "auto":
            # This is an autolink
            href = token.attrGet("href")
            autolinks.append(href)
        elif token.children:
            # Recursively check children
            autolinks.extend(extract_autolinks(token.children))

    return autolinks

def disable_autolinks_for_domains(md, blocked_domains):
    """Disable autolink processing for specific domains."""
    original_linkify = md.core.ruler.getRules("")[3]  # linkify rule

    def filtered_linkify(state):
        # Run original linkify
        original_linkify(state)

        # Filter out blocked domains
        for token in state.tokens:
            if (token.type == "inline" and token.children):
                for child in token.children:
                    if (child.type == "link_open" and
                            child.info == "auto"):
                        href = child.attrGet("href")
                        if any(domain in href for domain in blocked_domains):
                            # Convert back to text
                            child.type = "text"
                            child.content = href

    # Replace linkify rule
    md.core.ruler.at("linkify", filtered_linkify)
```

### Link Analysis

Analyze and report on links in documents:

```python
def analyze_links(tokens):
    """Analyze all links in token stream."""
    analysis = {
        'total_links': 0,
        'external_links': 0,
        'internal_links': 0,
        'reference_links': 0,
        'autolinks': 0,
        'images': 0,
        'broken_links': [],
        'domains': set()
    }

    def analyze_token_links(token_list):
        for token in token_list:
            if token.type == "link_open":
                analysis['total_links'] += 1
                href = token.attrGet("href")

                if token.info == "auto":
                    analysis['autolinks'] += 1

                if href:
                    if href.startswith(('http://', 'https://')):
                        analysis['external_links'] += 1
                        # Extract domain
                        from urllib.parse import urlparse
                        domain = urlparse(href).netloc
                        analysis['domains'].add(domain)
                    elif href.startswith('#'):
                        analysis['internal_links'] += 1
                elif not href:
                    analysis['broken_links'].append(token)

            elif token.type == "image":
                analysis['images'] += 1
                src = token.attrGet("src")
                if src and src.startswith(('http://', 'https://')):
                    from urllib.parse import urlparse
                    domain = urlparse(src).netloc
                    analysis['domains'].add(domain)

            elif token.children:
                analyze_token_links(token.children)

    analyze_token_links(tokens)
    analysis['domains'] = list(analysis['domains'])

    return analysis

# Usage
md = MarkdownIt('gfm-like')
tokens = md.parse("""
# Document

[External link](https://example.com)
[Internal link](#section)
https://auto.link.com
![Image](https://img.example.com/pic.png)
""")

link_analysis = analyze_links(tokens)
print(f"Found {link_analysis['total_links']} links")
print(f"External domains: {link_analysis['domains']}")
```