# Link Processing and Security

URL validation, normalization, and link processing utilities with built-in security features to prevent XSS attacks and ensure safe link handling in markdown documents.

## Capabilities

### URL Validation

Security-focused URL validation to prevent malicious links.

```python { .api }
def validateLink(self, url: str) -> bool:
    """
    Validate if URL link is allowed in output.

    This validator can prohibit more than really needed to prevent XSS.
    It's a tradeoff to keep code simple and to be secure by default.

    Parameters:
    - url: URL to validate (should be normalized and entities decoded)

    Returns:
    - bool: True if URL is considered safe
    """
```

**Usage Example:**

```python
from markdown_it import MarkdownIt

md = MarkdownIt()

# Test URL validation
safe_urls = [
    "https://example.com",
    "http://example.com/path",
    "mailto:user@example.com",
    "/relative/path",
    "#anchor"
]

unsafe_urls = [
    "javascript:alert('xss')",
    "data:text/html,<script>alert('xss')</script>",
    "vbscript:msgbox('xss')"
]

for url in safe_urls:
    print(f"{url}: {md.validateLink(url)}")  # Should be True

for url in unsafe_urls:
    print(f"{url}: {md.validateLink(url)}")  # Should be False
```

### URL Normalization

Normalize URLs for consistency and security.

```python { .api }
def normalizeLink(self, url: str) -> str:
    """
    Normalize destination URLs in links.

    Used for link destinations like:
    [label]: destination 'title'
             ^^^^^^^^^^^

    Parameters:
    - url: raw URL to normalize

    Returns:
    - str: normalized URL
    """

def normalizeLinkText(self, link: str) -> str:
    """
    Normalize autolink content.

    Used for autolink content like:
    <destination>
     ~~~~~~~~~~~

    Parameters:
    - link: raw link text to normalize

    Returns:
    - str: normalized link text
    """
```

**Usage Example:**

```python
from markdown_it import MarkdownIt

md = MarkdownIt()

# URL normalization
raw_url = "HTTP://EXAMPLE.COM/Path With Spaces"
normalized = md.normalizeLink(raw_url)
print(normalized)  # "http://example.com/Path%20With%20Spaces"

# Link text normalization
raw_link = "www.example.com/path"
normalized_text = md.normalizeLinkText(raw_link)
print(normalized_text)  # Normalized for display
```

### Link Helper Functions

Low-level utilities for parsing link components.

```python { .api }
from markdown_it.helpers import parseLinkDestination, parseLinkLabel, parseLinkTitle

def parseLinkDestination(str: str, pos: int, max: int) -> dict:
    """
    Parse link destination from input string.

    Parameters:
    - str: input string
    - pos: starting position
    - max: maximum position

    Returns:
    - dict: {ok: bool, pos: int, str: str} - parse result
    """

def parseLinkLabel(str: str, pos: int, max: int) -> dict:
    """
    Parse link label from input string.

    Parameters:
    - str: input string
    - pos: starting position
    - max: maximum position

    Returns:
    - dict: {ok: bool, pos: int, str: str} - parse result
    """

def parseLinkTitle(str: str, pos: int, max: int) -> dict:
    """
    Parse link title from input string.

    Parameters:
    - str: input string
    - pos: starting position
    - max: maximum position

    Returns:
    - dict: {ok: bool, pos: int, str: str, marker: str} - parse result
    """
```

**Usage Example:**

```python
from markdown_it.helpers import parseLinkDestination, parseLinkLabel, parseLinkTitle

# Parse link destination
text = '<https://example.com> "Title"'
result = parseLinkDestination(text, 1, len(text) - 1)
print(result)  # {ok: True, pos: 19, str: 'https://example.com'}

# Parse link label
text = '[Link Text]'
result = parseLinkLabel(text, 0, len(text))
print(result)  # {ok: True, pos: 11, str: 'Link Text'}

# Parse link title
text = '"Title Here"'
result = parseLinkTitle(text, 0, len(text))
print(result)  # {ok: True, pos: 12, str: 'Title Here', marker: '"'}
```

## Security Features

### XSS Prevention

Built-in protection against cross-site scripting attacks:

```python
def custom_link_validator(url):
    """Custom link validation with additional security checks."""
    from markdown_it.common.normalize_url import validateLink

    # Use built-in validation first
    if not validateLink(url):
        return False

    # Additional custom checks
    lower_url = url.lower()

    # Block additional dangerous protocols
    dangerous_protocols = ['file:', 'ftp:', 'news:', 'gopher:']
    if any(lower_url.startswith(proto) for proto in dangerous_protocols):
        return False

    # Block URLs with suspicious patterns
    suspicious_patterns = ['<script', 'javascript:', 'vbscript:', 'data:']
    if any(pattern in lower_url for pattern in suspicious_patterns):
        return False

    return True

# Override validation in renderer
def secure_link_open(tokens, idx, options, env):
    """Secure link rendering with validation."""
    token = tokens[idx]
    href = token.attrGet("href")

    if href and not custom_link_validator(href):
        # Replace with safe placeholder
        token.attrSet("href", "#invalid-link")
        token.attrSet("class", "invalid-link")
        token.attrSet("title", "Invalid or potentially unsafe link")

    return default_link_open(tokens, idx, options, env)
```

### Content Security

Sanitize and validate link content:

```python
def sanitize_link_content(tokens):
    """Sanitize link tokens for security."""
    for token in tokens:
        if token.type == "link_open":
            href = token.attrGet("href")
            if href:
                # Normalize URL
                from markdown_it.common.normalize_url import normalizeLink
                normalized_href = normalizeLink(href)

                # Validate normalized URL
                from markdown_it.common.normalize_url import validateLink
                if validateLink(normalized_href):
                    token.attrSet("href", normalized_href)
                    # Add security attributes
                    if normalized_href.startswith(('http://', 'https://')):
                        token.attrSet("rel", "noopener noreferrer")
                        token.attrSet("target", "_blank")
                else:
                    # Remove unsafe link
                    token.type = "text"
                    token.tag = ""
                    token.content = href

        elif token.type == "image":
            src = token.attrGet("src")
            if src:
                # Validate image URLs
                from markdown_it.common.normalize_url import normalizeLink, validateLink
                normalized_src = normalizeLink(src)
                if validateLink(normalized_src):
                    token.attrSet("src", normalized_src)
                else:
                    # Remove unsafe image
                    token.attrSet("src", "")
                    token.attrSet("alt", f"[Invalid image: {src}]")

    return tokens
```

## Link Processing Utilities

### Reference Link Handling

Process reference-style links and their definitions:

```python
def extract_reference_links(env):
    """Extract reference link definitions from environment."""
    references = env.get('references', {})

    links = []
    for label, ref_data in references.items():
        links.append({
            'label': label,
            'href': ref_data.get('href', ''),
            'title': ref_data.get('title', '')
        })

    return links

def add_reference_link(env, label, href, title=""):
    """Add reference link definition to environment."""
    if 'references' not in env:
        env['references'] = {}

    env['references'][label.lower()] = {
        'href': href,
        'title': title
    }

# Usage
md = MarkdownIt()
env = {}

# Parse markdown with reference links
text = """
[Link 1][ref1]
[Link 2][ref2]

[ref1]: https://example.com "Example"
[ref2]: https://another.com
"""

tokens = md.parse(text, env)
references = extract_reference_links(env)

for ref in references:
    print(f"Reference '{ref['label']}': {ref['href']}")
```

### Autolink Processing

Handle automatic link detection and processing:

```python
def extract_autolinks(tokens):
    """Extract automatically detected links from tokens."""
    autolinks = []

    for token in tokens:
        if token.type == "link_open" and token.info == "auto":
            # This is an autolink
            href = token.attrGet("href")
            autolinks.append(href)
        elif token.children:
            # Recursively check children
            autolinks.extend(extract_autolinks(token.children))

    return autolinks

def disable_autolinks_for_domains(md, blocked_domains):
    """Disable autolink processing for specific domains."""
    original_linkify = md.core.ruler.getRules("")[3]  # linkify rule

    def filtered_linkify(state):
        # Run original linkify
        original_linkify(state)

        # Filter out blocked domains
        for token in state.tokens:
            if (token.type == "inline" and token.children):
                for child in token.children:
                    if (child.type == "link_open" and
                            child.info == "auto"):
                        href = child.attrGet("href")
                        if any(domain in href for domain in blocked_domains):
                            # Convert back to text
                            child.type = "text"
                            child.content = href

    # Replace linkify rule
    md.core.ruler.at("linkify", filtered_linkify)
```

### Link Analysis

Analyze and report on links in documents:

```python
def analyze_links(tokens):
    """Analyze all links in token stream."""
    analysis = {
        'total_links': 0,
        'external_links': 0,
        'internal_links': 0,
        'reference_links': 0,
        'autolinks': 0,
        'images': 0,
        'broken_links': [],
        'domains': set()
    }

    def analyze_token_links(token_list):
        for token in token_list:
            if token.type == "link_open":
                analysis['total_links'] += 1
                href = token.attrGet("href")

                if token.info == "auto":
                    analysis['autolinks'] += 1

                if href:
                    if href.startswith(('http://', 'https://')):
                        analysis['external_links'] += 1
                        # Extract domain
                        from urllib.parse import urlparse
                        domain = urlparse(href).netloc
                        analysis['domains'].add(domain)
                    elif href.startswith('#'):
                        analysis['internal_links'] += 1
                elif not href:
                    analysis['broken_links'].append(token)

            elif token.type == "image":
                analysis['images'] += 1
                src = token.attrGet("src")
                if src and src.startswith(('http://', 'https://')):
                    from urllib.parse import urlparse
                    domain = urlparse(src).netloc
                    analysis['domains'].add(domain)

            elif token.children:
                analyze_token_links(token.children)

    analyze_token_links(tokens)
    analysis['domains'] = list(analysis['domains'])

    return analysis

# Usage
md = MarkdownIt('gfm-like')
tokens = md.parse("""
# Document

[External link](https://example.com)
[Internal link](#section)
https://auto.link.com
![Image](https://img.example.com/pic.png)
""")

link_analysis = analyze_links(tokens)
print(f"Found {link_analysis['total_links']} links")
print(f"External domains: {link_analysis['domains']}")
```