0
# HTML Sanitization
1
2
Security-focused HTML cleaning using configurable allowlists to prevent XSS attacks while preserving safe content. jsoup's sanitization system provides comprehensive protection against malicious HTML injection.
3
4
## Capabilities
5
6
### HTML Cleaning
7
8
Clean untrusted HTML content using predefined or custom allowlists.
9
10
```java { .api }
11
/**
12
* Clean HTML content using a safelist of allowed elements and attributes.
13
* @param bodyHtml untrusted HTML content (body fragment)
14
* @param safelist allowlist of permitted HTML elements and attributes
15
* @return sanitized HTML content
16
*/
17
public static String clean(String bodyHtml, Safelist safelist);
18
19
/**
20
* Clean HTML content with base URI for relative URL resolution.
21
* @param bodyHtml untrusted HTML content
22
* @param baseUri base URI for resolving relative URLs
23
* @param safelist allowlist of permitted elements
24
* @return sanitized HTML content
25
*/
26
public static String clean(String bodyHtml, String baseUri, Safelist safelist);
27
28
/**
29
* Clean HTML with custom output settings.
30
* @param bodyHtml untrusted HTML content
31
* @param baseUri base URI for relative URLs
32
* @param safelist allowlist of permitted elements
33
* @param outputSettings document output configuration
34
* @return sanitized HTML content
35
*/
36
public static String clean(String bodyHtml, String baseUri, Safelist safelist, Document.OutputSettings outputSettings);
37
```
38
39
**Usage Examples:**
40
41
```java
42
import org.jsoup.Jsoup;
43
import org.jsoup.safety.Safelist;
44
45
String userInput = "<p>Hello <script>alert('XSS')</script> <b>World</b>!</p>";
46
47
// Basic cleaning
48
String clean = Jsoup.clean(userInput, Safelist.basic());
49
// Result: "<p>Hello <b>World</b>!</p>"
50
51
// Clean with base URI
52
String htmlWithLinks = "<p><a href='/page'>Link</a></p>";
53
String cleanWithBase = Jsoup.clean(htmlWithLinks, "https://example.com", Safelist.basic());
54
55
// Clean with custom output settings
56
Document.OutputSettings settings = new Document.OutputSettings();
57
settings.prettyPrint(false);
58
String compactClean = Jsoup.clean(userInput, "", Safelist.basic(), settings);
59
```
60
61
### HTML Validation
62
63
Test if HTML content is valid according to a safelist without modifying it.
64
65
```java { .api }
66
/**
67
* Test if HTML content is valid according to the safelist.
68
* @param bodyHtml HTML content to validate
69
* @param safelist allowlist to test against
70
* @return true if HTML passes safelist validation
71
*/
72
public static boolean isValid(String bodyHtml, Safelist safelist);
73
```
74
75
**Usage Example:**
76
77
```java
78
String userContent = "<p>Safe content with <b>bold</b> text</p>";
79
String maliciousContent = "<p>Bad content <script>alert('XSS')</script></p>";
80
81
boolean isSafe = Jsoup.isValid(userContent, Safelist.basic()); // true
82
boolean isMalicious = Jsoup.isValid(maliciousContent, Safelist.basic()); // false
83
84
// Use for form validation
85
if (!Jsoup.isValid(userInput, Safelist.basic())) {
86
throw new ValidationException("HTML content contains unsafe elements");
87
}
88
89
// Always clean even if valid (for normalization)
90
String normalizedHtml = Jsoup.clean(userInput, Safelist.basic());
91
```
92
93
## Safelist Configuration
94
95
### Predefined Safelists
96
97
jsoup provides several predefined safelists for common use cases.
98
99
```java { .api }
100
/**
101
* Allow no HTML elements - text content only.
102
* @return Safelist that removes all HTML tags
103
*/
104
public static Safelist none();
105
106
/**
107
* Allow simple text formatting elements.
108
* Permits: b, em, i, strong, u
109
* @return Safelist for basic text formatting
110
*/
111
public static Safelist simpleText();
112
113
/**
114
* Allow basic HTML elements, including links, but not images.
115
* Permits: a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li, ol, p, pre, q, small, span, strike, strong, sub, sup, u, ul
116
* @return Safelist for basic HTML content
117
*/
118
public static Safelist basic();
119
120
/**
121
* Allow basic HTML elements plus images.
122
* Includes everything from basic() plus: img (with align, alt, height, src, title, width attributes; src limited to http and https)
123
* @return Safelist for basic HTML with images
124
*/
125
public static Safelist basicWithImages();
126
127
/**
128
* Allow a wide range of HTML elements for rich content.
129
* Includes structural elements, tables, formatting, and more.
130
* @return Safelist for comprehensive HTML content
131
*/
132
public static Safelist relaxed();
133
```
134
135
**Usage Examples:**
136
137
```java
138
String html = "<p>Text with <script>alert('xss')</script> and <b>formatting</b></p>";
139
140
// No HTML allowed
141
String textOnly = Jsoup.clean(html, Safelist.none());
142
// Result: "Text with and formatting"
143
144
// Simple formatting only
145
String simpleFormatted = Jsoup.clean(html, Safelist.simpleText());
146
// Result: "Text with and <b>formatting</b>"
147
148
// Basic HTML elements
149
String basicHtml = Jsoup.clean(html, Safelist.basic());
150
// Result: "<p>Text with and <b>formatting</b></p>"
151
152
// Compare safelists
153
Safelist basic = Safelist.basic();
154
Safelist withImages = Safelist.basicWithImages();
155
Safelist rich = Safelist.relaxed();
156
```
157
158
### Custom Safelist Configuration
159
160
Create and configure custom safelists for specific requirements.
161
162
```java { .api }
163
/**
164
* Create empty safelist.
165
*/
166
public Safelist();
167
168
/**
169
* Copy constructor for extending existing safelists.
170
* @param copy Safelist to copy
171
*/
172
public Safelist(Safelist copy);
173
174
/**
175
* Add allowed tag names.
176
* @param tags tag names to allow
177
* @return this Safelist for chaining
178
*/
179
public Safelist addTags(String... tags);
180
181
/**
182
* Remove allowed tag names.
183
* @param tags tag names to remove
184
* @return this Safelist for chaining
185
*/
186
public Safelist removeTags(String... tags);
187
188
/**
189
* Add allowed attributes for specific tags.
190
* @param tag tag name
191
* @param attributes attribute names to allow
192
* @return this Safelist for chaining
193
*/
194
public Safelist addAttributes(String tag, String... attributes);
195
196
/**
197
* Remove allowed attributes for specific tags.
198
* @param tag tag name
199
* @param attributes attribute names to remove
200
* @return this Safelist for chaining
201
*/
202
public Safelist removeAttributes(String tag, String... attributes);
203
```
204
205
**Usage Examples:**
206
207
```java
208
// Start with basic safelist and customize
209
Safelist customList = new Safelist(Safelist.basic())
210
.addTags("h1", "h2", "h3", "h4", "h5", "h6") // Add heading tags
211
.addAttributes("a", "target") // Allow target on links
212
.addAttributes("img", "class") // Allow class on images
213
.removeTags("cite", "q"); // Remove citation tags
214
215
// Build from scratch
216
Safelist minimal = new Safelist()
217
.addTags("p", "br", "strong", "em")
218
.addAttributes("p", "class")
219
.addAttributes("strong", "class");
220
221
String html = "<h1>Title</h1><p class='intro'>Text with <strong class='highlight'>emphasis</strong></p>";
222
String cleaned = Jsoup.clean(html, customList);
223
```
224
225
### Enforced Attributes
226
227
Ensure specific attributes are always present on certain elements.
228
229
```java { .api }
230
/**
231
* Add enforced attribute that will be set on matching elements.
232
* @param tag tag name
233
* @param attribute attribute name
234
* @param value attribute value to enforce
235
* @return this Safelist for chaining
236
*/
237
public Safelist addEnforcedAttribute(String tag, String attribute, String value);
238
239
/**
240
* Remove enforced attribute.
241
* @param tag tag name
242
* @param attribute attribute name
243
* @return this Safelist for chaining
244
*/
245
public Safelist removeEnforcedAttribute(String tag, String attribute);
246
247
/**
248
* Get enforced attributes for a tag.
249
* @param tagName tag name
250
* @return Map of enforced attributes
251
*/
252
public Map<String, String> getEnforcedAttributes(String tagName);
253
```
254
255
**Usage Examples:**
256
257
```java
258
Safelist safelist = Safelist.basic()
259
.addEnforcedAttribute("a", "rel", "nofollow") // All links get rel="nofollow"
260
.addEnforcedAttribute("a", "target", "_blank") // All links open in new window
261
.addEnforcedAttribute("img", "loading", "lazy"); // All images lazy load
262
263
String html = "<a href='https://example.com'>Link</a>";
264
String cleaned = Jsoup.clean(html, safelist);
265
// Result: "<a href='https://example.com' rel='nofollow' target='_blank'>Link</a>"
266
```
267
268
### Protocol Validation
269
270
Control which URL protocols are allowed in link and image attributes.
271
272
```java { .api }
273
/**
274
* Add allowed protocols for URL attributes.
275
* @param tag tag name
276
* @param attribute attribute name (href, src, etc.)
277
* @param protocols allowed URL protocols
278
* @return this Safelist for chaining
279
*/
280
public Safelist addProtocols(String tag, String attribute, String... protocols);
281
282
/**
283
* Remove allowed protocols for URL attributes.
284
* @param tag tag name
285
* @param attribute attribute name
286
* @param removeProtocols protocols to remove
287
* @return this Safelist for chaining
288
*/
289
public Safelist removeProtocols(String tag, String attribute, String... removeProtocols);
290
291
/**
292
* Control whether relative links are preserved.
293
* @param preserve true to preserve relative links
294
* @return this Safelist for chaining
295
*/
296
public Safelist preserveRelativeLinks(boolean preserve);
297
```
298
299
**Usage Examples:**
300
301
```java
302
Safelist safelist = Safelist.basic()
303
.addProtocols("a", "href", "http", "https", "mailto")
304
.addProtocols("img", "src", "http", "https", "data")
305
.preserveRelativeLinks(true);
306
307
// URLs with disallowed protocols are removed
308
String html = "<a href='javascript:alert(\"xss\")'>Bad Link</a>" +
309
"<a href='https://safe.com'>Good Link</a>";
310
String cleaned = Jsoup.clean(html, safelist);
311
// Result: "<a rel='nofollow'>Bad Link</a><a href='https://safe.com' rel='nofollow'>Good Link</a>"
312
```
313
314
## Cleaner Class
315
316
For more advanced cleaning scenarios, use the Cleaner class directly.
317
318
```java { .api }
319
/**
320
* Create a cleaner with the specified safelist.
321
* @param safelist allowlist for cleaning
322
*/
323
public Cleaner(Safelist safelist);
324
325
/**
326
* Clean a full Document (not just body fragment).
327
* @param dirtyDocument document to clean
328
* @return new cleaned Document
329
*/
330
public Document clean(Document dirtyDocument);
331
332
/**
333
* Test if a Document is valid according to the safelist.
334
* @param dirtyDocument document to validate
335
* @return true if document passes validation
336
*/
337
public boolean isValid(Document dirtyDocument);
338
339
/**
340
* Test if HTML body fragment is valid according to the safelist.
341
* @param bodyHtml HTML fragment to validate
342
* @return true if HTML is valid
343
*/
344
public boolean isValidBodyHtml(String bodyHtml);
345
```
346
347
**Usage Examples:**
348
349
```java
350
import org.jsoup.safety.Cleaner;
351
352
Cleaner cleaner = new Cleaner(Safelist.basic());
353
354
// Clean full documents
355
Document dirtyDoc = Jsoup.parse("<html><body><script>alert('xss')</script><p>Content</p></body></html>");
356
Document cleanDoc = cleaner.clean(dirtyDoc);
357
358
// Validate documents
359
boolean isDocumentSafe = cleaner.isValid(dirtyDoc);
360
361
// Validate HTML fragments
362
boolean isFragmentSafe = cleaner.isValidBodyHtml("<p>Safe content</p>");
363
```
364
365
## Security Best Practices
366
367
### XSS Prevention
368
369
```java
370
// Always clean user input before storing or displaying
371
public String sanitizeUserContent(String userHtml) {
372
return Jsoup.clean(userHtml, Safelist.basic());
373
}
374
375
// Use strict safelists for untrusted content
376
public String sanitizeComment(String comment) {
377
return Jsoup.clean(comment, Safelist.simpleText());
378
}
379
380
// Validate before cleaning for logging/monitoring
381
public String processUserSubmission(String html) {
382
if (!Jsoup.isValid(html, Safelist.basic())) {
383
logger.warn("Potentially malicious HTML submitted: " + html);
384
}
385
return Jsoup.clean(html, Safelist.basic());
386
}
387
```
388
389
### Content Security
390
391
```java
392
// Create restrictive safelist for user comments
393
Safelist commentSafelist = new Safelist()
394
.addTags("p", "br", "strong", "em", "code")
395
.addAttributes("code", "class"); // Allow syntax highlighting classes
396
397
// Create permissive safelist for trusted editors
398
Safelist editorSafelist = new Safelist(Safelist.relaxed())
399
.addEnforcedAttribute("a", "rel", "nofollow") // SEO protection
400
.addEnforcedAttribute("img", "loading", "lazy") // Performance
401
.addProtocols("img", "src", "http", "https"); // Block data URLs
402
403
// Different cleaning for different contexts
404
public String cleanForDisplay(String html, UserRole role) {
405
switch (role) {
406
case ADMIN:
407
return Jsoup.clean(html, Safelist.relaxed());
408
case EDITOR:
409
return Jsoup.clean(html, editorSafelist);
410
case USER:
411
return Jsoup.clean(html, commentSafelist);
412
default:
413
return Jsoup.clean(html, Safelist.none());
414
}
415
}
416
```
417
418
### Configuration Validation
419
420
```java
421
// Test safelist configuration
422
public void validateSafelistConfiguration() {
423
Safelist safelist = createCustomSafelist();
424
425
String[] testCases = {
426
"<script>alert('xss')</script>", // Should be removed
427
"<p onclick='alert()'>Text</p>", // onclick should be removed
428
"<a href='javascript:void(0)'>Link</a>", // javascript: should be removed
429
"<img src='data:image/svg+xml,...'>", // data: URLs if not allowed
430
};
431
432
for (String testCase : testCases) {
433
String cleaned = Jsoup.clean(testCase, safelist);
434
assertFalse("Unsafe content not removed: " + testCase,
435
cleaned.contains("script") ||
436
cleaned.contains("onclick") ||
437
cleaned.contains("javascript:"));
438
}
439
}
440
```
441
442
Together, `Jsoup.clean`, `Jsoup.isValid`, `Safelist`, and `Cleaner` let an application accept untrusted HTML while keeping only the elements, attributes, and URL protocols it explicitly allows.