Java HTML parser library implementing the WHATWG HTML5 specification for parsing, manipulating, and sanitizing HTML and XML documents.
—
Security-focused HTML cleaning using configurable allowlists to prevent XSS attacks while preserving safe content. jsoup's sanitization system provides comprehensive protection against malicious HTML injection.
Clean untrusted HTML content using predefined or custom allowlists.
/**
* Clean HTML content using a safelist of allowed elements and attributes.
* @param bodyHtml untrusted HTML content (body fragment)
* @param safelist allowlist of permitted HTML elements and attributes
* @return sanitized HTML content
*/
public static String clean(String bodyHtml, Safelist safelist);
/**
* Clean HTML content with base URI for relative URL resolution.
* @param bodyHtml untrusted HTML content
* @param baseUri base URI for resolving relative URLs
* @param safelist allowlist of permitted elements
* @return sanitized HTML content
*/
public static String clean(String bodyHtml, String baseUri, Safelist safelist);
/**
* Clean HTML with custom output settings.
* @param bodyHtml untrusted HTML content
* @param baseUri base URI for relative URLs
* @param safelist allowlist of permitted elements
* @param outputSettings document output configuration
* @return sanitized HTML content
*/
public static String clean(String bodyHtml, String baseUri, Safelist safelist, Document.OutputSettings outputSettings);
Usage Examples:
import org.jsoup.Jsoup;
import org.jsoup.safety.Safelist;
String userInput = "<p>Hello <script>alert('XSS')</script> <b>World</b>!</p>";
// Basic cleaning
String clean = Jsoup.clean(userInput, Safelist.basic());
// Result: "<p>Hello <b>World</b>!</p>"
// Clean with base URI
String htmlWithLinks = "<p><a href='/page'>Link</a></p>";
String cleanWithBase = Jsoup.clean(htmlWithLinks, "https://example.com", Safelist.basic());
// Clean with custom output settings
Document.OutputSettings settings = new Document.OutputSettings();
settings.prettyPrint(false);
String compactClean = Jsoup.clean(userInput, "", Safelist.basic(), settings);
Test if HTML content is valid according to a safelist without modifying it.
/**
* Test if HTML content is valid according to the safelist.
* @param bodyHtml HTML content to validate
* @param safelist allowlist to test against
* @return true if HTML passes safelist validation
*/
public static boolean isValid(String bodyHtml, Safelist safelist);
Usage Example:
String userContent = "<p>Safe content with <b>bold</b> text</p>";
String maliciousContent = "<p>Bad content <script>alert('XSS')</script></p>";
boolean isSafe = Jsoup.isValid(userContent, Safelist.basic()); // true
boolean isMalicious = Jsoup.isValid(maliciousContent, Safelist.basic()); // false
// Use for form validation
if (!Jsoup.isValid(userInput, Safelist.basic())) {
throw new ValidationException("HTML content contains unsafe elements");
}
// Always clean even if valid (for normalization)
String normalizedHtml = Jsoup.clean(userInput, Safelist.basic());
jsoup provides several predefined safelists for common use cases.
/**
* Allow no HTML elements - text content only.
* @return Safelist that removes all HTML tags
*/
public static Safelist none();
/**
* Allow simple text formatting elements.
* Permits: b, em, i, strong, u
* @return Safelist for basic text formatting
*/
public static Safelist simpleText();
/**
* Allow basic HTML elements, including links (a) but not images.
* Permits: a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li, ol, p, pre, q, small, span, strike, strong, sub, sup, u, ul
* @return Safelist for basic HTML content
*/
public static Safelist basic();
/**
* Allow basic HTML elements plus images.
* Includes everything from basic() plus: img (with align, alt, height, src, title, width attributes; src limited to http and https)
* @return Safelist for basic HTML with images
*/
public static Safelist basicWithImages();
/**
* Allow a wide range of HTML elements for rich content.
* Includes structural elements, tables, formatting, and more.
* @return Safelist for comprehensive HTML content
*/
public static Safelist relaxed();
Usage Examples:
String html = "<p>Text with <script>alert('xss')</script> and <b>formatting</b></p>";
// No HTML allowed
String textOnly = Jsoup.clean(html, Safelist.none());
// Result: "Text with and formatting"
// Simple formatting only
String simpleFormatted = Jsoup.clean(html, Safelist.simpleText());
// Result: "Text with and <b>formatting</b>"
// Basic HTML elements
String basicHtml = Jsoup.clean(html, Safelist.basic());
// Result: "<p>Text with and <b>formatting</b></p>"
// Compare safelists
Safelist basic = Safelist.basic();
Safelist withImages = Safelist.basicWithImages();
Safelist rich = Safelist.relaxed();
Create and configure custom safelists for specific requirements.
/**
* Create empty safelist.
*/
public Safelist();
/**
* Copy constructor for extending existing safelists.
* @param copy Safelist to copy
*/
public Safelist(Safelist copy);
/**
* Add allowed tag names.
* @param tags tag names to allow
* @return this Safelist for chaining
*/
public Safelist addTags(String... tags);
/**
* Remove allowed tag names.
* @param tags tag names to remove
* @return this Safelist for chaining
*/
public Safelist removeTags(String... tags);
/**
* Add allowed attributes for specific tags.
* @param tag tag name
* @param attributes attribute names to allow
* @return this Safelist for chaining
*/
public Safelist addAttributes(String tag, String... attributes);
/**
* Remove allowed attributes for specific tags.
* @param tag tag name
* @param attributes attribute names to remove
* @return this Safelist for chaining
*/
public Safelist removeAttributes(String tag, String... attributes);
Usage Examples:
// Start with basic safelist and customize
Safelist customList = new Safelist(Safelist.basic())
.addTags("h1", "h2", "h3", "h4", "h5", "h6") // Add heading tags
.addAttributes("a", "target") // Allow target on links
.addAttributes("img", "class") // Allow class on images
.removeTags("cite", "q"); // Remove citation tags
// Build from scratch
Safelist minimal = new Safelist()
.addTags("p", "br", "strong", "em")
.addAttributes("p", "class")
.addAttributes("strong", "class");
String html = "<h1>Title</h1><p class='intro'>Text with <strong class='highlight'>emphasis</strong></p>";
String cleaned = Jsoup.clean(html, customList);
Ensure specific attributes are always present on certain elements.
/**
* Add enforced attribute that will be set on matching elements.
* @param tag tag name
* @param attribute attribute name
* @param value attribute value to enforce
* @return this Safelist for chaining
*/
public Safelist addEnforcedAttribute(String tag, String attribute, String value);
/**
* Remove enforced attribute.
* @param tag tag name
* @param attribute attribute name
* @return this Safelist for chaining
*/
public Safelist removeEnforcedAttribute(String tag, String attribute);
/**
* Get enforced attributes for a tag.
* @param tagName tag name
* @return Map of enforced attributes
*/
public Map<String, String> getEnforcedAttributes(String tagName);
Usage Examples:
Safelist safelist = Safelist.basic()
.addEnforcedAttribute("a", "rel", "nofollow") // All links get rel="nofollow"
.addEnforcedAttribute("a", "target", "_blank") // All links open in new window
.addEnforcedAttribute("img", "loading", "lazy"); // All images lazy load
String html = "<a href='https://example.com'>Link</a>";
String cleaned = Jsoup.clean(html, safelist);
// Result: <a href="https://example.com" rel="nofollow" target="_blank">Link</a>
Control which URL protocols are allowed in link and image attributes.
/**
* Add allowed protocols for URL attributes.
* @param tag tag name
* @param attribute attribute name (href, src, etc.)
* @param protocols allowed URL protocols
* @return this Safelist for chaining
*/
public Safelist addProtocols(String tag, String attribute, String... protocols);
/**
* Remove allowed protocols for URL attributes.
* @param tag tag name
* @param attribute attribute name
* @param removeProtocols protocols to remove
* @return this Safelist for chaining
*/
public Safelist removeProtocols(String tag, String attribute, String... removeProtocols);
/**
* Control whether relative links are preserved.
* @param preserve true to preserve relative links
* @return this Safelist for chaining
*/
public Safelist preserveRelativeLinks(boolean preserve);
Usage Examples:
Safelist safelist = Safelist.basic()
.addProtocols("a", "href", "http", "https", "mailto")
.addProtocols("img", "src", "http", "https", "data")
.preserveRelativeLinks(true);
// URLs with disallowed protocols are removed
String html = "<a href='javascript:alert(\"xss\")'>Bad Link</a>" +
"<a href='https://safe.com'>Good Link</a>";
String cleaned = Jsoup.clean(html, safelist);
// Result: <a rel="nofollow">Bad Link</a><a href="https://safe.com" rel="nofollow">Good Link</a> (Safelist.basic() enforces rel="nofollow" on links)
For more advanced cleaning scenarios, use the Cleaner class directly.
/**
* Create a cleaner with the specified safelist.
* @param safelist allowlist for cleaning
*/
public Cleaner(Safelist safelist);
/**
* Clean a parsed Document; only content from the dirty document's body is kept, and the original document is not modified.
* @param dirtyDocument document to clean
* @return new cleaned Document
*/
public Document clean(Document dirtyDocument);
/**
* Test if a Document is valid according to the safelist.
* @param dirtyDocument document to validate
* @return true if document passes validation
*/
public boolean isValid(Document dirtyDocument);
/**
* Test if HTML body fragment is valid according to the safelist.
* @param bodyHtml HTML fragment to validate
* @return true if HTML is valid
*/
public boolean isValidBodyHtml(String bodyHtml);
Usage Examples:
import org.jsoup.safety.Cleaner;
Cleaner cleaner = new Cleaner(Safelist.basic());
// Clean full documents
Document dirtyDoc = Jsoup.parse("<html><body><script>alert('xss')</script><p>Content</p></body></html>");
Document cleanDoc = cleaner.clean(dirtyDoc);
// Validate documents
boolean isDocumentSafe = cleaner.isValid(dirtyDoc);
// Validate HTML fragments
boolean isFragmentSafe = cleaner.isValidBodyHtml("<p>Safe content</p>");
// Always clean user input before storing or displaying
public String sanitizeUserContent(String userHtml) {
return Jsoup.clean(userHtml, Safelist.basic());
}
// Use strict safelists for untrusted content
public String sanitizeComment(String comment) {
return Jsoup.clean(comment, Safelist.simpleText());
}
// Validate before cleaning for logging/monitoring
public String processUserSubmission(String html) {
if (!Jsoup.isValid(html, Safelist.basic())) {
logger.warn("Potentially malicious HTML submitted: " + html);
}
return Jsoup.clean(html, Safelist.basic());
}
// Create restrictive safelist for user comments
Safelist commentSafelist = new Safelist()
.addTags("p", "br", "strong", "em", "code")
.addAttributes("code", "class"); // Allow syntax highlighting classes
// Create permissive safelist for trusted editors
Safelist editorSafelist = new Safelist(Safelist.relaxed())
.addEnforcedAttribute("a", "rel", "nofollow") // SEO protection
.addEnforcedAttribute("img", "loading", "lazy") // Performance
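.addProtocols("img", "src", "http", "https"); // Explicitly allow http/https; addProtocols only adds protocols, and relaxed() does not permit data: URLs for img src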
// Different cleaning for different contexts
public String cleanForDisplay(String html, UserRole role) {
switch (role) {
case ADMIN:
return Jsoup.clean(html, Safelist.relaxed());
case EDITOR:
return Jsoup.clean(html, editorSafelist);
case USER:
return Jsoup.clean(html, commentSafelist);
default:
return Jsoup.clean(html, Safelist.none());
}
}
// Test safelist configuration
public void validateSafelistConfiguration() {
Safelist safelist = createCustomSafelist();
String[] testCases = {
"<script>alert('xss')</script>", // Should be removed
"<p onclick='alert()'>Text</p>", // onclick should be removed
"<a href='javascript:void(0)'>Link</a>", // javascript: should be removed
"<img src='data:image/svg+xml,...'>", // data: URLs if not allowed
};
for (String testCase : testCases) {
String cleaned = Jsoup.clean(testCase, safelist);
assertFalse("Unsafe content not removed: " + testCase,
cleaned.contains("script") ||
cleaned.contains("onclick") ||
cleaned.contains("javascript:"));
}
}
This comprehensive HTML sanitization system provides enterprise-grade security for processing untrusted HTML content while maintaining usability and performance.
Install with Tessl CLI
npx tessl i tessl/maven-org-jsoup--jsoup