Java HTML parser library implementing the WHATWG HTML5 specification for parsing, manipulating, and sanitizing HTML and XML documents.
—
Document Object Model manipulation with Document, Element, and Node classes providing methods for traversing, modifying, and extracting content from parsed HTML. jsoup provides a jQuery-like API for intuitive DOM operations.
Document class extends Element and represents the root of the HTML document tree.
/**
* Get the document's head element.
* @return head Element, or null if not found
*/
public Element head();
/**
* Get the document's body element.
* @return body Element, or null if not found
*/
public Element body();
/**
* Get the document title text.
* @return title text from title element
*/
public String title();
/**
* Set the document title.
* @param title new title text
*/
public void title(String title);
/**
* Create a new Element with the given tag name.
* @param tagName element tag name
* @return new Element instance
*/
public Element createElement(String tagName);
/**
* Get the document's base URI.
* @return base URI string
*/
public String location();
/**
* Get the HTTP connection used to fetch this document.
* @return Connection object, or null if document was not fetched via HTTP
*/
public Connection connection();
/**
* Create empty document shell with basic HTML structure.
* @param baseUri base URI for the document
* @return new Document with html, head, and body elements
*/
public static Document createShell(String baseUri);
Usage Examples:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
Document doc = Jsoup.parse("<html><head><title>Old Title</title></head><body></body></html>");
// Access document structure
Element head = doc.head();
Element body = doc.body();
// Modify document title
String currentTitle = doc.title(); // "Old Title"
doc.title("New Title");
// Create and add new elements
Element newParagraph = doc.createElement("p");
newParagraph.text("Hello World");
body.appendChild(newParagraph);
// Access connection used to fetch document (if fetched via HTTP)
Connection conn = doc.connection();
if (conn != null) {
System.out.println("Document was fetched from: " + conn.request().url());
}
// Create empty document shell
Document emptyDoc = Document.createShell("https://example.com");
Manipulate element text content and HTML content.
/**
* Get the combined text content of this element and its descendants.
* @return text content with whitespace normalized
*/
public String text();
/**
* Set the text content of this element (removes all child elements).
* @param text new text content
* @return this Element for chaining
*/
public Element text(String text);
/**
* Test if this element has non-empty text content.
* @return true if element has text content
*/
public boolean hasText();
/**
* Get the inner HTML content of this element.
* @return HTML content inside this element
*/
public String html();
/**
* Set the inner HTML content of this element.
* @param html new HTML content
* @return this Element for chaining
*/
public Element html(String html);
/**
* Get the outer HTML of this element including the element itself.
* @return complete HTML representation
*/
public String outerHtml();
/**
* Get combined data content for script and style elements.
* @return data content
*/
public String data();
Usage Examples:
Element paragraph = doc.selectFirst("p");
// Text operations
String text = paragraph.text();
paragraph.text("New text content");
boolean hasText = paragraph.hasText();
// HTML operations
String innerHtml = paragraph.html();
paragraph.html("<strong>Bold text</strong>");
String outerHtml = paragraph.outerHtml();
// Data content (for script/style elements)
Element script = doc.selectFirst("script");
String scriptContent = script.data();
Manipulate element attributes and properties.
/**
* Get an attribute value by key.
* @param attributeKey attribute name
* @return attribute value, or empty string if not set
*/
public String attr(String attributeKey);
/**
* Set an attribute value.
* @param attributeKey attribute name
* @param attributeValue attribute value
* @return this Element for chaining
*/
public Element attr(String attributeKey, String attributeValue);
/**
* Test if this element has the specified attribute.
* @param attributeKey attribute name
* @return true if attribute exists
*/
public boolean hasAttr(String attributeKey);
/**
* Remove an attribute from this element.
* @param attributeKey attribute name to remove
* @return this Element for chaining
*/
public Element removeAttr(String attributeKey);
/**
* Get all attributes of this element.
* @return Attributes collection
*/
public Attributes attributes();
/**
* Get data-* attributes as a Map.
* @return Map of data attribute keys to values
*/
public Map<String, String> dataset();
/**
* Get absolute URL for an attribute (if it contains a relative URL).
* @param attributeKey attribute name
* @return absolute URL, or empty string if not found or not a URL
*/
public String absUrl(String attributeKey);
Usage Examples:
Element link = doc.selectFirst("a");
// Attribute operations
String href = link.attr("href");
link.attr("href", "https://newlink.com");
link.attr("target", "_blank");
boolean hasClass = link.hasAttr("class");
link.removeAttr("target");
// Get all attributes
Attributes attrs = link.attributes();
for (Attribute attr : attrs) {
System.out.println(attr.getKey() + "=" + attr.getValue());
}
// Data attributes
Element div = doc.selectFirst("div[data-id]");
Map<String, String> data = div.dataset();
String dataId = data.get("id"); // Gets data-id value
// Absolute URLs
String absoluteUrl = link.absUrl("href");
Navigate the DOM tree structure to find parent, child, and sibling elements.
/**
* Get the parent element of this element.
* @return parent Element, or null if this is root
*/
public Element parent();
/**
* Get all ancestor elements of this element.
* @return Elements collection of ancestors
*/
public Elements parents();
/**
* Get direct child elements of this element.
* @return Elements collection of child elements
*/
public Elements children();
/**
* Get the number of direct child elements.
* @return count of child elements
*/
public int childrenSize();
/**
* Get a child element by index.
* @param index zero-based index
* @return child Element at index
* @throws IndexOutOfBoundsException if index is invalid
*/
public Element child(int index);
/**
* Get the first child element.
* @return first child Element, or null if no children
*/
public Element firstElementChild();
/**
* Get the last child element.
* @return last child Element, or null if no children
*/
public Element lastElementChild();
Usage Examples:
Element paragraph = doc.selectFirst("p");
// Parent navigation
Element parent = paragraph.parent();
Elements ancestors = paragraph.parents();
// Child navigation
Elements children = paragraph.children();
int childCount = paragraph.childrenSize();
if (childCount > 0) {
Element firstChild = paragraph.child(0);
Element lastChild = paragraph.lastElementChild();
}
// Find specific ancestor
Element bodyAncestor = paragraph.parents().select("body").first();
Navigate between sibling elements at the same level.
/**
* Get the next sibling element.
* @return next sibling Element, or null if none
*/
public Element nextElementSibling();
/**
* Get the previous sibling element.
* @return previous sibling Element, or null if none
*/
public Element previousElementSibling();
/**
* Get all sibling elements (excluding this element).
* @return Elements collection of siblings
*/
public Elements siblingElements();
/**
* Get the index of this element among its siblings.
* @return zero-based index among element siblings
*/
public int elementSiblingIndex();
Usage Examples:
Element listItem = doc.selectFirst("li");
// Sibling navigation
Element nextItem = listItem.nextElementSibling();
Element prevItem = listItem.previousElementSibling();
Elements allSiblings = listItem.siblingElements();
int position = listItem.elementSiblingIndex();
System.out.println("This is list item #" + (position + 1));
Add, remove, and modify DOM structure.
/**
* Add a child node at the end of this element's children.
* @param child Node to add
* @return this Element for chaining
*/
public Element appendChild(Node child);
/**
* Add a child node at the beginning of this element's children.
* @param child Node to add
* @return this Element for chaining
*/
public Element prependChild(Node child);
/**
* Insert child nodes at the specified index.
* @param index insertion index
* @param children nodes to insert
* @return this Element for chaining
*/
public Element insertChildren(int index, Collection<? extends Node> children);
/**
* Create and append a new child element.
* @param tagName tag name for new element
* @return the new child Element
*/
public Element appendElement(String tagName);
/**
* Create and prepend a new child element.
* @param tagName tag name for new element
* @return the new child Element
*/
public Element prependElement(String tagName);
/**
* Add text content at the end of this element.
* @param text text to append
* @return this Element for chaining
*/
public Element appendText(String text);
/**
* Add text content at the beginning of this element.
* @param text text to prepend
* @return this Element for chaining
*/
public Element prependText(String text);
Usage Examples:
Element container = doc.selectFirst("div");
// Add child elements
Element newParagraph = doc.createElement("p");
newParagraph.text("New paragraph");
container.appendChild(newParagraph);
// Create and add in one step
Element header = container.appendElement("h2");
header.text("Section Header");
// Add text content
container.appendText("Additional text");
container.prependText("Prefix text");
// Insert at specific position
Element span = doc.createElement("span");
span.text("Inserted span");
container.insertChildren(1, Arrays.asList(span));
Insert HTML content relative to elements.
/**
* Parse and append HTML content to this element.
* @param html HTML to parse and append
* @return this Element for chaining
*/
public Element append(String html);
/**
* Parse and prepend HTML content to this element.
* @param html HTML to parse and prepend
* @return this Element for chaining
*/
public Element prepend(String html);
/**
* Parse and insert HTML before this element.
* @param html HTML to parse and insert
* @return this Element for chaining
*/
public Element before(String html);
/**
* Parse and insert HTML after this element.
* @param html HTML to parse and insert
* @return this Element for chaining
*/
public Element after(String html);
/**
* Wrap this element with the provided HTML.
* @param html HTML to wrap around this element
* @return this Element for chaining
*/
public Element wrap(String html);
/**
* Remove this element from the DOM but keep its children.
* @return first child that replaced this element, or null
*/
public Node unwrap();
Usage Examples:
Element paragraph = doc.selectFirst("p");
// Insert HTML content
paragraph.append("<strong>Bold text</strong>");
paragraph.prepend("<em>Italic text</em>");
// Insert relative to element
paragraph.before("<hr>");
paragraph.after("<br><br>");
// Wrap element
paragraph.wrap("<div class='wrapper'></div>");
// Unwrap (remove wrapper but keep content)
Element wrapper = doc.selectFirst(".wrapper");
wrapper.unwrap();
Remove elements and clear content.
/**
* Remove this element from the DOM.
* @return this Element
*/
public Element remove();
/**
* Remove all child nodes from this element.
* @return this Element for chaining
*/
public Element empty();
Usage Examples:
// Remove specific elements
Elements ads = doc.select(".advertisement");
ads.remove();
// Clear element content
Element container = doc.selectFirst("#content");
container.empty(); // Removes all children but keeps the container
Manipulate CSS classes on elements.
/**
* Get the CSS class attribute value.
* @return class attribute value
*/
public String className();
/**
* Set the CSS class attribute.
* @param className new class attribute value
* @return this Element for chaining
*/
public Element className(String className);
/**
* Get CSS class names as a Set.
* @return Set of class names
*/
public Set<String> classNames();
/**
* Add a CSS class name.
* @param className class name to add
* @return this Element for chaining
*/
public Element addClass(String className);
/**
* Remove a CSS class name.
* @param className class name to remove
* @return this Element for chaining
*/
public Element removeClass(String className);
/**
* Toggle a CSS class name.
* @param className class name to toggle
* @return this Element for chaining
*/
public Element toggleClass(String className);
/**
* Test if this element has the specified CSS class.
* @param className class name to test
* @return true if element has the class
*/
public boolean hasClass(String className);
Usage Examples:
Element div = doc.selectFirst("div");
// Class operations
div.addClass("highlight");
div.addClass("active");
div.removeClass("hidden");
div.toggleClass("expanded");
boolean isActive = div.hasClass("active");
Set<String> classes = div.classNames();
// Set all classes at once
div.className("new-class another-class");
Work with form input values.
/**
* Get the form element value (input, textarea, select).
* @return element value
*/
public String val();
/**
* Set the form element value.
* @param value new value
* @return this Element for chaining
*/
public Element val(String value);
Usage Examples:
// Input elements
Element textInput = doc.selectFirst("input[type=text]");
String currentValue = textInput.val();
textInput.val("New value");
// Textarea
Element textarea = doc.selectFirst("textarea");
textarea.val("New textarea content");
// Select elements
Element select = doc.selectFirst("select");
select.val("option2"); // Select option with value="option2"
Create copies of elements.
/**
* Create a deep copy of this element and its descendants.
* @return cloned Element
*/
public Element clone();
/**
* Create a shallow copy of this element (no children).
* @return shallow cloned Element
*/
public Element shallowClone();
Usage Examples:
Element original = doc.selectFirst("div.template");
// Deep clone (includes all children)
Element fullCopy = original.clone();
fullCopy.attr("id", "copy1");
// Shallow clone (element only, no children)
Element shallowCopy = original.shallowClone();
shallowCopy.text("New content");
// Add clones to document
doc.body().appendChild(fullCopy);
doc.body().appendChild(shallowCopy);
All DOM objects inherit from Node, providing basic tree navigation and manipulation.
/**
* Get the node name (tag name for elements, "#text" for text nodes, etc.).
* @return node name
*/
public String nodeName();
/**
* Get child nodes (including text nodes and elements).
* @return List of child nodes
*/
public List<Node> childNodes();
/**
* Get the number of child nodes.
* @return count of child nodes
*/
public int childNodeSize();
/**
* Get parent node.
* @return parent Node, or null if root
*/
public Node parentNode();
/**
* Get the document that contains this node.
* @return owner Document
*/
public Document ownerDocument();
/**
* Remove this node from the DOM.
*/
public void remove();
/**
* Replace this node with another node.
* @param in replacement node
*/
public void replace(Node in);
This comprehensive DOM manipulation API provides all the tools needed for programmatic HTML document modification and content extraction.
Install with Tessl CLI
npx tessl i tessl/maven-org-jsoup--jsoup