or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cli.md · configuration.md · core-parsing.md · index.md · link-processing.md · rendering.md · syntax-tree.md · token-system.md

docs/link-processing.md

0

# Link Processing and Security

1

2

URL validation, normalization, and link processing utilities with built-in security features to prevent XSS attacks and ensure safe link handling in markdown documents.

3

4

## Capabilities

5

6

### URL Validation

7

8

Security-focused URL validation to prevent malicious links.

9

10

```python { .api }

11

def validateLink(self, url: str) -> bool:

12

"""

13

Validate if URL link is allowed in output.

14

15

This validator can prohibit more than really needed to prevent XSS.

16

It's a tradeoff to keep code simple and to be secure by default.

17

18

Parameters:

19

- url: URL to validate (should be normalized and entities decoded)

20

21

Returns:

22

- bool: True if URL is considered safe

23

"""

24

```

25

26

**Usage Example:**

27

28

```python

29

from markdown_it import MarkdownIt

30

31

md = MarkdownIt()

32

33

# Test URL validation

34

safe_urls = [

35

"https://example.com",

36

"http://example.com/path",

37

"mailto:user@example.com",

38

"/relative/path",

39

"#anchor"

40

]

41

42

unsafe_urls = [

43

"javascript:alert('xss')",

44

"data:text/html,<script>alert('xss')</script>",

45

"vbscript:msgbox('xss')"

46

]

47

48

for url in safe_urls:

49

print(f"{url}: {md.validateLink(url)}") # Should be True

50

51

for url in unsafe_urls:

52

print(f"{url}: {md.validateLink(url)}") # Should be False

53

```

54

55

### URL Normalization

56

57

Normalize URLs for consistency and security.

58

59

```python { .api }

60

def normalizeLink(self, url: str) -> str:

61

"""

62

Normalize destination URLs in links.

63

64

Used for link destinations like:

65

[label]: destination 'title'

66

^^^^^^^^^^^

67

68

Parameters:

69

- url: raw URL to normalize

70

71

Returns:

72

- str: normalized URL

73

"""

74

75

def normalizeLinkText(self, link: str) -> str:

76

"""

77

Normalize autolink content.

78

79

Used for autolink content like:

80

<destination>

81

~~~~~~~~~~~

82

83

Parameters:

84

- link: raw link text to normalize

85

86

Returns:

87

- str: normalized link text

88

"""

89

```

90

91

**Usage Example:**

92

93

```python

94

from markdown_it import MarkdownIt

95

96

md = MarkdownIt()

97

98

# URL normalization

99

raw_url = "HTTP://EXAMPLE.COM/Path With Spaces"

100

normalized = md.normalizeLink(raw_url)

101

print(normalized) # "http://example.com/Path%20With%20Spaces"

102

103

# Link text normalization

104

raw_link = "www.example.com/path"

105

normalized_text = md.normalizeLinkText(raw_link)

106

print(normalized_text) # Normalized for display

107

```

108

109

### Link Helper Functions

110

111

Low-level utilities for parsing link components.

112

113

```python { .api }

114

from markdown_it.helpers import parseLinkDestination, parseLinkLabel, parseLinkTitle

115

116

def parseLinkDestination(str: str, pos: int, max: int) -> dict:

117

"""

118

Parse link destination from input string.

119

120

Parameters:

121

- str: input string

122

- pos: starting position

123

- max: maximum position

124

125

Returns:

126

- dict: {ok: bool, pos: int, str: str} - parse result

127

"""

128

129

def parseLinkLabel(str: str, pos: int, max: int) -> dict:

130

"""

131

Parse link label from input string.

132

133

Parameters:

134

- str: input string

135

- pos: starting position

136

- max: maximum position

137

138

Returns:

139

- dict: {ok: bool, pos: int, str: str} - parse result

140

"""

141

142

def parseLinkTitle(str: str, pos: int, max: int) -> dict:

143

"""

144

Parse link title from input string.

145

146

Parameters:

147

- str: input string

148

- pos: starting position

149

- max: maximum position

150

151

Returns:

152

- dict: {ok: bool, pos: int, str: str, marker: str} - parse result

153

"""

154

```

155

156

**Usage Example:**

157

158

```python

159

from markdown_it.helpers import parseLinkDestination, parseLinkLabel, parseLinkTitle

160

161

# Parse link destination

162

text = '<https://example.com> "Title"'

163

result = parseLinkDestination(text, 0, len(text))

164

print(result)  # {ok: True, pos: 21, str: 'https://example.com'} — pos is just past the closing '>'

165

166

# Parse link label

167

text = '[Link Text]'

168

result = parseLinkLabel(text, 0, len(text))

169

print(result) # {ok: True, pos: 11, str: 'Link Text'}

170

171

# Parse link title

172

text = '"Title Here"'

173

result = parseLinkTitle(text, 0, len(text))

174

print(result) # {ok: True, pos: 12, str: 'Title Here', marker: '"'}

175

```

176

177

## Security Features

178

179

### XSS Prevention

180

181

Built-in protection against cross-site scripting attacks:

182

183

```python

184

def custom_link_validator(url):

185

"""Custom link validation with additional security checks."""

186

from markdown_it.common.normalize_url import validateLink

187

188

# Use built-in validation first

189

if not validateLink(url):

190

return False

191

192

# Additional custom checks

193

lower_url = url.lower()

194

195

# Block additional dangerous protocols

196

dangerous_protocols = ['file:', 'ftp:', 'news:', 'gopher:']

197

if any(lower_url.startswith(proto) for proto in dangerous_protocols):

198

return False

199

200

# Block URLs with suspicious patterns

201

suspicious_patterns = ['<script', 'javascript:', 'vbscript:', 'data:']

202

if any(pattern in lower_url for pattern in suspicious_patterns):

203

return False

204

205

return True

206

207

# Override validation in renderer

208

def secure_link_open(tokens, idx, options, env):

209

"""Secure link rendering with validation."""

210

token = tokens[idx]

211

href = token.attrGet("href")

212

213

if href and not custom_link_validator(href):

214

# Replace with safe placeholder

215

token.attrSet("href", "#invalid-link")

216

token.attrSet("class", "invalid-link")

217

token.attrSet("title", "Invalid or potentially unsafe link")

218

219

return default_link_open(tokens, idx, options, env)

220

```

221

222

### Content Security

223

224

Sanitize and validate link content:

225

226

```python

227

def sanitize_link_content(tokens):

228

"""Sanitize link tokens for security."""

229

for token in tokens:

230

if token.type == "link_open":

231

href = token.attrGet("href")

232

if href:

233

# Normalize URL

234

from markdown_it.common.normalize_url import normalizeLink

235

normalized_href = normalizeLink(href)

236

237

# Validate normalized URL

238

from markdown_it.common.normalize_url import validateLink

239

if validateLink(normalized_href):

240

token.attrSet("href", normalized_href)

241

# Add security attributes

242

if normalized_href.startswith(('http://', 'https://')):

243

token.attrSet("rel", "noopener noreferrer")

244

token.attrSet("target", "_blank")

245

else:

246

# Remove unsafe link

247

token.type = "text"

248

token.tag = ""

249

token.content = href

250

251

elif token.type == "image":

252

src = token.attrGet("src")

253

if src:

254

# Validate image URLs

255

from markdown_it.common.normalize_url import normalizeLink, validateLink

256

normalized_src = normalizeLink(src)

257

if validateLink(normalized_src):

258

token.attrSet("src", normalized_src)

259

else:

260

# Remove unsafe image

261

token.attrSet("src", "")

262

token.attrSet("alt", f"[Invalid image: {src}]")

263

264

return tokens

265

```

266

267

## Link Processing Utilities

268

269

### Reference Link Handling

270

271

Process reference-style links and their definitions:

272

273

```python

274

def extract_reference_links(env):

275

"""Extract reference link definitions from environment."""

276

references = env.get('references', {})

277

278

links = []

279

for label, ref_data in references.items():

280

links.append({

281

'label': label,

282

'href': ref_data.get('href', ''),

283

'title': ref_data.get('title', '')

284

})

285

286

return links

287

288

def add_reference_link(env, label, href, title=""):

289

"""Add reference link definition to environment."""

290

if 'references' not in env:

291

env['references'] = {}

292

293

env['references'][label.lower()] = {

294

'href': href,

295

'title': title

296

}

297

298

# Usage

299

md = MarkdownIt()

300

env = {}

301

302

# Parse markdown with reference links

303

text = """

304

[Link 1][ref1]

305

[Link 2][ref2]

306

307

[ref1]: https://example.com "Example"

308

[ref2]: https://another.com

309

"""

310

311

tokens = md.parse(text, env)

312

references = extract_reference_links(env)

313

314

for ref in references:

315

print(f"Reference '{ref['label']}': {ref['href']}")

316

```

317

318

### Autolink Processing

319

320

Handle automatic link detection and processing:

321

322

```python

323

def extract_autolinks(tokens):

324

"""Extract automatically detected links from tokens."""

325

autolinks = []

326

327

for token in tokens:

328

if token.type == "link_open" and token.info == "auto":

329

# This is an autolink

330

href = token.attrGet("href")

331

autolinks.append(href)

332

elif token.children:

333

# Recursively check children

334

autolinks.extend(extract_autolinks(token.children))

335

336

return autolinks

337

338

def disable_autolinks_for_domains(md, blocked_domains):

339

"""Disable autolink processing for specific domains."""

340

original_linkify = md.core.ruler.getRules("")[3] # linkify rule

341

342

def filtered_linkify(state):

343

# Run original linkify

344

original_linkify(state)

345

346

# Filter out blocked domains

347

for token in state.tokens:

348

if (token.type == "inline" and token.children):

349

for child in token.children:

350

if (child.type == "link_open" and

351

child.info == "auto"):

352

href = child.attrGet("href")

353

if any(domain in href for domain in blocked_domains):

354

# Convert back to text

355

child.type = "text"

356

child.content = href

357

358

# Replace linkify rule

359

md.core.ruler.at("linkify", filtered_linkify)

360

```

361

362

### Link Analysis

363

364

Analyze and report on links in documents:

365

366

```python

367

def analyze_links(tokens):

368

"""Analyze all links in token stream."""

369

analysis = {

370

'total_links': 0,

371

'external_links': 0,

372

'internal_links': 0,

373

'reference_links': 0,

374

'autolinks': 0,

375

'images': 0,

376

'broken_links': [],

377

'domains': set()

378

}

379

380

def analyze_token_links(token_list):

381

for token in token_list:

382

if token.type == "link_open":

383

analysis['total_links'] += 1

384

href = token.attrGet("href")

385

386

if token.info == "auto":

387

analysis['autolinks'] += 1

388

389

if href:

390

if href.startswith(('http://', 'https://')):

391

analysis['external_links'] += 1

392

# Extract domain

393

from urllib.parse import urlparse

394

domain = urlparse(href).netloc

395

analysis['domains'].add(domain)

396

elif href.startswith('#'):

397

analysis['internal_links'] += 1

398

elif not href:

399

analysis['broken_links'].append(token)

400

401

elif token.type == "image":

402

analysis['images'] += 1

403

src = token.attrGet("src")

404

if src and src.startswith(('http://', 'https://')):

405

from urllib.parse import urlparse

406

domain = urlparse(src).netloc

407

analysis['domains'].add(domain)

408

409

elif token.children:

410

analyze_token_links(token.children)

411

412

analyze_token_links(tokens)

413

analysis['domains'] = list(analysis['domains'])

414

415

return analysis

416

417

# Usage

418

md = MarkdownIt('gfm-like')

419

tokens = md.parse("""

420

# Document

421

422

[External link](https://example.com)

423

[Internal link](#section)

424

https://auto.link.com

425

![Image](https://images.example.com/pic.jpg)

426

""")

427

428

link_analysis = analyze_links(tokens)

429

print(f"Found {link_analysis['total_links']} links")

430

print(f"External domains: {link_analysis['domains']}")

431

```