or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

css-selection.md dom-manipulation.md form-handling.md html-sanitization.md http-connection.md index.md parsing.md

docs/html-sanitization.md

0

# HTML Sanitization

1

2

Security-focused HTML cleaning using configurable allowlists to prevent XSS attacks while preserving safe content. jsoup's sanitization system provides comprehensive protection against malicious HTML injection.

3

4

## Capabilities

5

6

### HTML Cleaning

7

8

Clean untrusted HTML content using predefined or custom allowlists.

9

10

```java { .api }

11

/**

12

* Clean HTML content using a safelist of allowed elements and attributes.

13

* @param bodyHtml untrusted HTML content (body fragment)

14

* @param safelist allowlist of permitted HTML elements and attributes

15

* @return sanitized HTML content

16

*/

17

public static String clean(String bodyHtml, Safelist safelist);

18

19

/**

20

* Clean HTML content with base URI for relative URL resolution.

21

* @param bodyHtml untrusted HTML content

22

* @param baseUri base URI for resolving relative URLs

23

* @param safelist allowlist of permitted elements

24

* @return sanitized HTML content

25

*/

26

public static String clean(String bodyHtml, String baseUri, Safelist safelist);

27

28

/**

29

* Clean HTML with custom output settings.

30

* @param bodyHtml untrusted HTML content

31

* @param baseUri base URI for relative URLs

32

* @param safelist allowlist of permitted elements

33

* @param outputSettings document output configuration

34

* @return sanitized HTML content

35

*/

36

public static String clean(String bodyHtml, String baseUri, Safelist safelist, Document.OutputSettings outputSettings);

37

```

38

39

**Usage Examples:**

40

41

```java

42

import org.jsoup.Jsoup;

43

import org.jsoup.safety.Safelist;

44

45

String userInput = "<p>Hello <script>alert('XSS')</script> <b>World</b>!</p>";

46

47

// Basic cleaning

48

String clean = Jsoup.clean(userInput, Safelist.basic());

49

// Result: "<p>Hello <b>World</b>!</p>"

50

51

// Clean with base URI

52

String htmlWithLinks = "<p><a href='/page'>Link</a></p>";

53

String cleanWithBase = Jsoup.clean(htmlWithLinks, "https://example.com", Safelist.basic());

54

55

// Clean with custom output settings

56

Document.OutputSettings settings = new Document.OutputSettings();

57

settings.prettyPrint(false);

58

String compactClean = Jsoup.clean(userInput, "", Safelist.basic(), settings);

59

```

60

61

### HTML Validation

62

63

Test if HTML content is valid according to a safelist without modifying it.

64

65

```java { .api }

66

/**

67

* Test if HTML content is valid according to the safelist.

68

* @param bodyHtml HTML content to validate

69

* @param safelist allowlist to test against

70

* @return true if HTML passes safelist validation

71

*/

72

public static boolean isValid(String bodyHtml, Safelist safelist);

73

```

74

75

**Usage Example:**

76

77

```java

78

String userContent = "<p>Safe content with <b>bold</b> text</p>";

79

String maliciousContent = "<p>Bad content <script>alert('XSS')</script></p>";

80

81

boolean isSafe = Jsoup.isValid(userContent, Safelist.basic()); // true

82

boolean isMalicious = Jsoup.isValid(maliciousContent, Safelist.basic()); // false

83

84

// Use for form validation

85

if (!Jsoup.isValid(userInput, Safelist.basic())) {

86

throw new ValidationException("HTML content contains unsafe elements");

87

}

88

89

// Always clean even if valid (for normalization)

90

String normalizedHtml = Jsoup.clean(userInput, Safelist.basic());

91

```

92

93

## Safelist Configuration

94

95

### Predefined Safelists

96

97

jsoup provides several predefined safelists for common use cases.

98

99

```java { .api }

100

/**

101

* Allow no HTML elements - text content only.

102

* @return Safelist that removes all HTML tags

103

*/

104

public static Safelist none();

105

106

/**

107

* Allow simple text formatting elements.

108

* Permits: b, em, i, strong, u

109

* @return Safelist for basic text formatting

110

*/

111

public static Safelist simpleText();

112

113

/**

114

* Allow basic HTML elements, including links, but no images.

115

* Permits: a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li, ol, p, pre, q, small, span, strike, strong, sub, sup, u, ul

116

* @return Safelist for basic HTML content

117

*/

118

public static Safelist basic();

119

120

/**

121

* Allow basic HTML elements plus images.

122

* Includes everything from basic() plus: img (with src, alt, title, width, height attributes)

123

* @return Safelist for basic HTML with images

124

*/

125

public static Safelist basicWithImages();

126

127

/**

128

* Allow a wide range of HTML elements for rich content.

129

* Includes structural elements, tables, formatting, and more.

130

* @return Safelist for comprehensive HTML content

131

*/

132

public static Safelist relaxed();

133

```

134

135

**Usage Examples:**

136

137

```java

138

String html = "<p>Text with <script>alert('xss')</script> and <b>formatting</b></p>";

139

140

// No HTML allowed

141

String textOnly = Jsoup.clean(html, Safelist.none());

142

// Result: "Text with and formatting"

143

144

// Simple formatting only

145

String simpleFormatted = Jsoup.clean(html, Safelist.simpleText());

146

// Result: "Text with and <b>formatting</b>"

147

148

// Basic HTML elements

149

String basicHtml = Jsoup.clean(html, Safelist.basic());

150

// Result: "<p>Text with and <b>formatting</b></p>"

151

152

// Compare safelists

153

Safelist basic = Safelist.basic();

154

Safelist withImages = Safelist.basicWithImages();

155

Safelist rich = Safelist.relaxed();

156

```

157

158

### Custom Safelist Configuration

159

160

Create and configure custom safelists for specific requirements.

161

162

```java { .api }

163

/**

164

* Create empty safelist.

165

*/

166

public Safelist();

167

168

/**

169

* Copy constructor for extending existing safelists.

170

* @param copy Safelist to copy

171

*/

172

public Safelist(Safelist copy);

173

174

/**

175

* Add allowed tag names.

176

* @param tags tag names to allow

177

* @return this Safelist for chaining

178

*/

179

public Safelist addTags(String... tags);

180

181

/**

182

* Remove allowed tag names.

183

* @param tags tag names to remove

184

* @return this Safelist for chaining

185

*/

186

public Safelist removeTags(String... tags);

187

188

/**

189

* Add allowed attributes for specific tags.

190

* @param tag tag name

191

* @param attributes attribute names to allow

192

* @return this Safelist for chaining

193

*/

194

public Safelist addAttributes(String tag, String... attributes);

195

196

/**

197

* Remove allowed attributes for specific tags.

198

* @param tag tag name

199

* @param attributes attribute names to remove

200

* @return this Safelist for chaining

201

*/

202

public Safelist removeAttributes(String tag, String... attributes);

203

```

204

205

**Usage Examples:**

206

207

```java

208

// Start with basic safelist and customize

209

Safelist customList = new Safelist(Safelist.basic())

210

.addTags("h1", "h2", "h3", "h4", "h5", "h6") // Add heading tags

211

.addAttributes("a", "target") // Allow target on links

212

.addAttributes("img", "class") // Allow class on images

213

.removeTags("cite", "q"); // Remove citation tags

214

215

// Build from scratch

216

Safelist minimal = new Safelist()

217

.addTags("p", "br", "strong", "em")

218

.addAttributes("p", "class")

219

.addAttributes("strong", "class");

220

221

String html = "<h1>Title</h1><p class='intro'>Text with <strong class='highlight'>emphasis</strong></p>";

222

String cleaned = Jsoup.clean(html, customList);

223

```

224

225

### Enforced Attributes

226

227

Ensure specific attributes are always present on certain elements.

228

229

```java { .api }

230

/**

231

* Add enforced attribute that will be set on matching elements.

232

* @param tag tag name

233

* @param attribute attribute name

234

* @param value attribute value to enforce

235

* @return this Safelist for chaining

236

*/

237

public Safelist addEnforcedAttribute(String tag, String attribute, String value);

238

239

/**

240

* Remove enforced attribute.

241

* @param tag tag name

242

* @param attribute attribute name

243

* @return this Safelist for chaining

244

*/

245

public Safelist removeEnforcedAttribute(String tag, String attribute);

246

247

/**

248

* Get enforced attributes for a tag.

249

* @param tagName tag name

250

* @return Map of enforced attributes

251

*/

252

public Map<String, String> getEnforcedAttributes(String tagName);

253

```

254

255

**Usage Examples:**

256

257

```java

258

Safelist safelist = Safelist.basic()

259

.addEnforcedAttribute("a", "rel", "nofollow") // All links get rel="nofollow"

260

.addEnforcedAttribute("a", "target", "_blank") // All links open in new window

261

.addEnforcedAttribute("img", "loading", "lazy"); // All images lazy load

262

263

String html = "<a href='https://example.com'>Link</a>";

264

String cleaned = Jsoup.clean(html, safelist);

265

// Result: "<a href='https://example.com' rel='nofollow' target='_blank'>Link</a>"

266

```

267

268

### Protocol Validation

269

270

Control which URL protocols are allowed in link and image attributes.

271

272

```java { .api }

273

/**

274

* Add allowed protocols for URL attributes.

275

* @param tag tag name

276

* @param attribute attribute name (href, src, etc.)

277

* @param protocols allowed URL protocols

278

* @return this Safelist for chaining

279

*/

280

public Safelist addProtocols(String tag, String attribute, String... protocols);

281

282

/**

283

* Remove allowed protocols for URL attributes.

284

* @param tag tag name

285

* @param attribute attribute name

286

* @param removeProtocols protocols to remove

287

* @return this Safelist for chaining

288

*/

289

public Safelist removeProtocols(String tag, String attribute, String... removeProtocols);

290

291

/**

292

* Control whether relative links are preserved.

293

* @param preserve true to preserve relative links

294

* @return this Safelist for chaining

295

*/

296

public Safelist preserveRelativeLinks(boolean preserve);

297

```

298

299

**Usage Examples:**

300

301

```java

302

Safelist safelist = Safelist.basic()

303

.addProtocols("a", "href", "http", "https", "mailto")

304

.addProtocols("img", "src", "http", "https", "data")

305

.preserveRelativeLinks(true);

306

307

// URLs with disallowed protocols are removed

308

String html = "<a href='javascript:alert(\"xss\")'>Bad Link</a>" +

309

"<a href='https://safe.com'>Good Link</a>";

310

String cleaned = Jsoup.clean(html, safelist);

311

// Result: "<a>Bad Link</a><a href='https://safe.com'>Good Link</a>"

312

```

313

314

## Cleaner Class

315

316

For more advanced cleaning scenarios, use the Cleaner class directly.

317

318

```java { .api }

319

/**

320

* Create a cleaner with the specified safelist.

321

* @param safelist allowlist for cleaning

322

*/

323

public Cleaner(Safelist safelist);

324

325

/**

326

* Clean a full Document (not just body fragment).

327

* @param dirtyDocument document to clean

328

* @return new cleaned Document

329

*/

330

public Document clean(Document dirtyDocument);

331

332

/**

333

* Test if a Document is valid according to the safelist.

334

* @param dirtyDocument document to validate

335

* @return true if document passes validation

336

*/

337

public boolean isValid(Document dirtyDocument);

338

339

/**

340

* Test if HTML body fragment is valid according to the safelist.

341

* @param bodyHtml HTML fragment to validate

342

* @return true if HTML is valid

343

*/

344

public boolean isValidBodyHtml(String bodyHtml);

345

```

346

347

**Usage Examples:**

348

349

```java

350

import org.jsoup.safety.Cleaner;

351

352

Cleaner cleaner = new Cleaner(Safelist.basic());

353

354

// Clean full documents

355

Document dirtyDoc = Jsoup.parse("<html><body><script>alert('xss')</script><p>Content</p></body></html>");

356

Document cleanDoc = cleaner.clean(dirtyDoc);

357

358

// Validate documents

359

boolean isDocumentSafe = cleaner.isValid(dirtyDoc);

360

361

// Validate HTML fragments

362

boolean isFragmentSafe = cleaner.isValidBodyHtml("<p>Safe content</p>");

363

```

364

365

## Security Best Practices

366

367

### XSS Prevention

368

369

```java

370

// Always clean user input before storing or displaying

371

public String sanitizeUserContent(String userHtml) {

372

return Jsoup.clean(userHtml, Safelist.basic());

373

}

374

375

// Use strict safelists for untrusted content

376

public String sanitizeComment(String comment) {

377

return Jsoup.clean(comment, Safelist.simpleText());

378

}

379

380

// Validate before cleaning for logging/monitoring

381

public String processUserSubmission(String html) {

382

if (!Jsoup.isValid(html, Safelist.basic())) {

383

logger.warn("Potentially malicious HTML submitted: " + html);

384

}

385

return Jsoup.clean(html, Safelist.basic());

386

}

387

```

388

389

### Content Security

390

391

```java

392

// Create restrictive safelist for user comments

393

Safelist commentSafelist = new Safelist()

394

.addTags("p", "br", "strong", "em", "code")

395

.addAttributes("code", "class"); // Allow syntax highlighting classes

396

397

// Create permissive safelist for trusted editors

398

Safelist editorSafelist = new Safelist(Safelist.relaxed())

399

.addEnforcedAttribute("a", "rel", "nofollow") // SEO protection

400

.addEnforcedAttribute("img", "loading", "lazy") // Performance

401

.addProtocols("img", "src", "http", "https"); // Permit http/https image sources (data: URLs stay disallowed unless explicitly added)

402

403

// Different cleaning for different contexts

404

public String cleanForDisplay(String html, UserRole role) {

405

switch (role) {

406

case ADMIN:

407

return Jsoup.clean(html, Safelist.relaxed());

408

case EDITOR:

409

return Jsoup.clean(html, editorSafelist);

410

case USER:

411

return Jsoup.clean(html, commentSafelist);

412

default:

413

return Jsoup.clean(html, Safelist.none());

414

}

415

}

416

```

417

418

### Configuration Validation

419

420

```java

421

// Test safelist configuration

422

public void validateSafelistConfiguration() {

423

Safelist safelist = createCustomSafelist();

424

425

String[] testCases = {

426

"<script>alert('xss')</script>", // Should be removed

427

"<p onclick='alert()'>Text</p>", // onclick should be removed

428

"<a href='javascript:void(0)'>Link</a>", // javascript: should be removed

429

"<img src='data:image/svg+xml,...'>", // data: URLs if not allowed

430

};

431

432

for (String testCase : testCases) {

433

String cleaned = Jsoup.clean(testCase, safelist);

434

assertFalse("Unsafe content not removed: " + testCase,

435

cleaned.contains("script") ||

436

cleaned.contains("onclick") ||

437

cleaned.contains("javascript:"));

438

}

439

}

440

```

441

442

This comprehensive HTML sanitization system provides enterprise-grade security for processing untrusted HTML content while maintaining usability and performance.