CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/maven-org-jsoup--jsoup

Java HTML parser library implementing the WHATWG HTML5 specification for parsing, manipulating, and sanitizing HTML and XML documents.

Pending
Overview
Eval results
Files

docs/dom-manipulation.md

DOM Manipulation

Document Object Model manipulation with Document, Element, and Node classes providing methods for traversing, modifying, and extracting content from parsed HTML. jsoup provides a jQuery-like API for intuitive DOM operations.

Capabilities

Document Operations

Document class extends Element and represents the root of the HTML document tree.

/**
 * Get the document's head element.
 * @return head Element (created, along with the basic html structure, if the document does not already have one)
 */
public Element head();

/**
 * Get the document's body element.
 * @return body Element (or the outermost frameset for frameset documents); created if the document does not already have one
 */
public Element body();

/**
 * Get the document title text.
 * @return title text from title element
 */
public String title();

/**
 * Set the document title.
 * @param title new title text
 */
public void title(String title);

/**
 * Create a new Element with the given tag name.
 * @param tagName element tag name
 * @return new Element instance
 */
public Element createElement(String tagName);

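/**
 * Get the URL this document was parsed from (the final URL if the request was redirected).
 * @return location URL string, or an empty string if the location is unknown (e.g. when parsed from a String)
 */
public String location();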

/**
 * Get the Connection (request/response) that was used to fetch this document.
 * @return the Connection used for the fetch, or a new default Connection if the document was not fetched via HTTP (never null)
 */
public Connection connection();

/**
 * Create empty document shell with basic HTML structure.
 * @param baseUri base URI for the document
 * @return new Document with html, head, and body elements
 */
public static Document createShell(String baseUri);

Usage Examples:

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

Document doc = Jsoup.parse("<html><head><title>Old Title</title></head><body></body></html>");

// Access document structure
Element head = doc.head();
Element body = doc.body();

// Modify document title
String currentTitle = doc.title();  // "Old Title"
doc.title("New Title");

// Create and add new elements
Element newParagraph = doc.createElement("p");
newParagraph.text("Hello World");
body.appendChild(newParagraph);

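// Access the connection associated with this document. connection() never returns null:
// a new default Connection is returned when the document was not fetched via HTTP,
// so use location() to find out where the document came from.
Connection conn = doc.connection();
String location = doc.location();  // empty string when parsed from a String without a base URI
if (!location.isEmpty()) {
    System.out.println("Document location: " + location);
}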

// Create empty document shell
Document emptyDoc = Document.createShell("https://example.com");

Element Content Operations

Manipulate element text content and HTML content.

/**
 * Get the combined text content of this element and its descendants.
 * @return text content with whitespace normalized
 */
public String text();

/**
 * Set the text content of this element (clears all existing child nodes, both text and elements).
 * @param text new text content
 * @return this Element for chaining
 */
public Element text(String text);

/**
 * Test if this element has non-empty text content.
 * @return true if element has text content
 */
public boolean hasText();

/**
 * Get the inner HTML content of this element.
 * @return HTML content inside this element
 */
public String html();

/**
 * Set the inner HTML content of this element.
 * @param html new HTML content
 * @return this Element for chaining
 */
public Element html(String html);

/**
 * Get the outer HTML of this element including the element itself.
 * @return complete HTML representation
 */
public String outerHtml();

/**
 * Get combined data content for script and style elements.
 * @return data content
 */
public String data();

Usage Examples:

Element paragraph = doc.selectFirst("p");

// Text operations
String text = paragraph.text();
paragraph.text("New text content");
boolean hasText = paragraph.hasText();

// HTML operations
String innerHtml = paragraph.html();
paragraph.html("<strong>Bold text</strong>");
String outerHtml = paragraph.outerHtml();

// Data content (for script/style elements)
Element script = doc.selectFirst("script");
String scriptContent = script.data();

Attribute Operations

Manipulate element attributes and properties.

/**
 * Get an attribute value by key.
 * @param attributeKey attribute name
 * @return attribute value, or empty string if not set
 */
public String attr(String attributeKey);

/**
 * Set an attribute value.
 * @param attributeKey attribute name
 * @param attributeValue attribute value
 * @return this Element for chaining
 */
public Element attr(String attributeKey, String attributeValue);

/**
 * Test if this element has the specified attribute.
 * @param attributeKey attribute name
 * @return true if attribute exists
 */
public boolean hasAttr(String attributeKey);

/**
 * Remove an attribute from this element.
 * @param attributeKey attribute name to remove
 * @return this Element for chaining
 */
public Element removeAttr(String attributeKey);

/**
 * Get all attributes of this element.
 * @return Attributes collection
 */
public Attributes attributes();

/**
 * Get data-* attributes as a Map.
 * @return Map of data attribute keys to values
 */
public Map<String, String> dataset();

/**
 * Get absolute URL for an attribute (if it contains a relative URL).
 * @param attributeKey attribute name
 * @return absolute URL, or empty string if not found or not a URL
 */
public String absUrl(String attributeKey);

Usage Examples:

Element link = doc.selectFirst("a");

// Attribute operations
String href = link.attr("href");
link.attr("href", "https://newlink.com");
link.attr("target", "_blank");

boolean hasClass = link.hasAttr("class");
link.removeAttr("target");

// Get all attributes
Attributes attrs = link.attributes();
for (Attribute attr : attrs) {
    System.out.println(attr.getKey() + "=" + attr.getValue());
}

// Data attributes
Element div = doc.selectFirst("div[data-id]");
Map<String, String> data = div.dataset();
String dataId = data.get("id");  // Gets data-id value

// Absolute URLs
String absoluteUrl = link.absUrl("href");

Element Hierarchy Navigation

Navigate the DOM tree structure to find parent, child, and sibling elements.

/**
 * Get the parent element of this element.
 * @return parent Element, or null if this is root
 */
public Element parent();

/**
 * Get all ancestor elements of this element.
 * @return Elements collection of ancestors
 */
public Elements parents();

/**
 * Get direct child elements of this element.
 * @return Elements collection of child elements
 */
public Elements children();

/**
 * Get the number of direct child elements.
 * @return count of child elements
 */
public int childrenSize();

/**
 * Get a child element by index.
 * @param index zero-based index
 * @return child Element at index
 * @throws IndexOutOfBoundsException if index is invalid
 */
public Element child(int index);

/**
 * Get the first child element.
 * @return first child Element, or null if no children
 */
public Element firstElementChild();

/**
 * Get the last child element.
 * @return last child Element, or null if no children
 */
public Element lastElementChild();

Usage Examples:

Element paragraph = doc.selectFirst("p");

// Parent navigation
Element parent = paragraph.parent();
Elements ancestors = paragraph.parents();

// Child navigation
Elements children = paragraph.children();
int childCount = paragraph.childrenSize();

if (childCount > 0) {
    Element firstChild = paragraph.child(0);
    Element lastChild = paragraph.lastElementChild();
}

// Find specific ancestor
Element bodyAncestor = paragraph.parents().select("body").first();

Sibling Navigation

Navigate between sibling elements at the same level.

/**
 * Get the next sibling element.
 * @return next sibling Element, or null if none
 */
public Element nextElementSibling();

/**
 * Get the previous sibling element.
 * @return previous sibling Element, or null if none
 */
public Element previousElementSibling();

/**
 * Get all sibling elements (excluding this element).
 * @return Elements collection of siblings
 */
public Elements siblingElements();

/**
 * Get the index of this element among its siblings.
 * @return zero-based index among element siblings
 */
public int elementSiblingIndex();

Usage Examples:

Element listItem = doc.selectFirst("li");

// Sibling navigation
Element nextItem = listItem.nextElementSibling();
Element prevItem = listItem.previousElementSibling();
Elements allSiblings = listItem.siblingElements();

int position = listItem.elementSiblingIndex();
System.out.println("This is list item #" + (position + 1));

DOM Modification

Add, remove, and modify DOM structure.

/**
 * Add a child node at the end of this element's children.
 * @param child Node to add
 * @return this Element for chaining
 */
public Element appendChild(Node child);

/**
 * Add a child node at the beginning of this element's children.
 * @param child Node to add
 * @return this Element for chaining
 */
public Element prependChild(Node child);

/**
 * Insert child nodes at the specified index.
 * @param index insertion index
 * @param children nodes to insert
 * @return this Element for chaining
 */
public Element insertChildren(int index, Collection<? extends Node> children);

/**
 * Create and append a new child element.
 * @param tagName tag name for new element
 * @return the new child Element
 */
public Element appendElement(String tagName);

/**
 * Create and prepend a new child element.
 * @param tagName tag name for new element
 * @return the new child Element
 */
public Element prependElement(String tagName);

/**
 * Add text content at the end of this element.
 * @param text text to append
 * @return this Element for chaining
 */
public Element appendText(String text);

/**
 * Add text content at the beginning of this element.
 * @param text text to prepend
 * @return this Element for chaining
 */
public Element prependText(String text);

Usage Examples:

Element container = doc.selectFirst("div");

// Add child elements
Element newParagraph = doc.createElement("p");
newParagraph.text("New paragraph");
container.appendChild(newParagraph);

// Create and add in one step
Element header = container.appendElement("h2");
header.text("Section Header");

// Add text content
container.appendText("Additional text");
container.prependText("Prefix text");

// Insert at specific position
Element span = doc.createElement("span");
span.text("Inserted span");
container.insertChildren(1, Arrays.asList(span));

HTML Insertion

Insert HTML content relative to elements.

/**
 * Parse and append HTML content to this element.
 * @param html HTML to parse and append
 * @return this Element for chaining
 */
public Element append(String html);

/**
 * Parse and prepend HTML content to this element.
 * @param html HTML to parse and prepend
 * @return this Element for chaining
 */
public Element prepend(String html);

/**
 * Parse and insert HTML before this element.
 * @param html HTML to parse and insert
 * @return this Element for chaining
 */
public Element before(String html);

/**
 * Parse and insert HTML after this element.
 * @param html HTML to parse and insert
 * @return this Element for chaining
 */
public Element after(String html);

/**
 * Wrap this element with the provided HTML.
 * @param html HTML to wrap around this element
 * @return this Element for chaining
 */
public Element wrap(String html);

/**
 * Remove this element from the DOM but keep its children.
 * @return first child that replaced this element, or null
 */
public Node unwrap();

Usage Examples:

Element paragraph = doc.selectFirst("p");

// Insert HTML content
paragraph.append("<strong>Bold text</strong>");
paragraph.prepend("<em>Italic text</em>");

// Insert relative to element
paragraph.before("<hr>");
paragraph.after("<br><br>");

// Wrap element
paragraph.wrap("<div class='wrapper'></div>");

// Unwrap (remove wrapper but keep content)
Element wrapper = doc.selectFirst(".wrapper");
wrapper.unwrap();

Element Removal and Clearing

Remove elements and clear content.

/**
 * Remove this element from the DOM.
 * @return this Element
 */
public Element remove();

/**
 * Remove all child nodes from this element.
 * @return this Element for chaining
 */
public Element empty();

Usage Examples:

// Remove specific elements
Elements ads = doc.select(".advertisement");
ads.remove();

// Clear element content
Element container = doc.selectFirst("#content");
container.empty();  // Removes all children but keeps the container

CSS Class Operations

Manipulate CSS classes on elements.

/**
 * Get the CSS class attribute value.
 * @return class attribute value
 */
public String className();

/**
 * Set the CSS class attribute from a set of class names.
 * @param classNames new set of class names
 * @return this Element for chaining
 */
public Element classNames(Set<String> classNames);

/**
 * Get CSS class names as a Set.
 * @return Set of class names
 */
public Set<String> classNames();

/**
 * Add a CSS class name.
 * @param className class name to add
 * @return this Element for chaining
 */
public Element addClass(String className);

/**
 * Remove a CSS class name.
 * @param className class name to remove
 * @return this Element for chaining
 */
public Element removeClass(String className);

/**
 * Toggle a CSS class name.
 * @param className class name to toggle
 * @return this Element for chaining
 */
public Element toggleClass(String className);

/**
 * Test if this element has the specified CSS class.
 * @param className class name to test
 * @return true if element has the class
 */
public boolean hasClass(String className);

Usage Examples:

Element div = doc.selectFirst("div");

// Class operations
div.addClass("highlight");
div.addClass("active");
div.removeClass("hidden");
div.toggleClass("expanded");

boolean isActive = div.hasClass("active");
Set<String> classes = div.classNames();

// Set all classes at once (by writing the class attribute directly)
div.attr("class", "new-class another-class");

Form Element Values

Work with form input values.

/**
 * Get the form element value: the text of a textarea, otherwise the element's value attribute.
 * @return element value, or empty string if not set
 */
public String val();

/**
 * Set the form element value.
 * @param value new value
 * @return this Element for chaining
 */
public Element val(String value);

Usage Examples:

// Input elements
Element textInput = doc.selectFirst("input[type=text]");
String currentValue = textInput.val();
textInput.val("New value");

// Textarea
Element textarea = doc.selectFirst("textarea");
textarea.val("New textarea content");

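// Select elements: val(String) only sets a value attribute on the <select> itself,
// so choose an option by moving the selected attribute to it
Element select = doc.selectFirst("select");
select.select("option[selected]").removeAttr("selected");
select.selectFirst("option[value=option2]").attr("selected", "selected");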

Element Cloning

Create copies of elements.

/**
 * Create a deep copy of this element and its descendants.
 * @return cloned Element
 */
public Element clone();

/**
 * Create a shallow copy of this element (no children).
 * @return shallow cloned Element
 */
public Element shallowClone();

Usage Examples:

Element original = doc.selectFirst("div.template");

// Deep clone (includes all children)
Element fullCopy = original.clone();
fullCopy.attr("id", "copy1");

// Shallow clone (element only, no children)
Element shallowCopy = original.shallowClone();
shallowCopy.text("New content");

// Add clones to document
doc.body().appendChild(fullCopy);
doc.body().appendChild(shallowCopy);

Node Base Class

All DOM objects inherit from Node, providing basic tree navigation and manipulation.

/**
 * Get the node name (tag name for elements, "#text" for text nodes, etc.).
 * @return node name
 */
public String nodeName();

/**
 * Get child nodes (including text nodes and elements).
 * @return List of child nodes
 */
public List<Node> childNodes();

/**
 * Get the number of child nodes.
 * @return count of child nodes
 */
public int childNodeSize();

/**
 * Get parent node.
 * @return parent Node, or null if root
 */
public Node parentNode();

/**
 * Get the document that contains this node.
 * @return owner Document, or null if this node is not part of a document
 */
public Document ownerDocument();

/**
 * Remove this node from the DOM.
 */
public void remove();

/**
 * Replace this node in the DOM with another node.
 * @param in replacement node
 */
public void replaceWith(Node in);

This comprehensive DOM manipulation API provides all the tools needed for programmatic HTML document modification and content extraction.

Install with Tessl CLI

npx tessl i tessl/maven-org-jsoup--jsoup

docs

css-selection.md

dom-manipulation.md

form-handling.md

html-sanitization.md

http-connection.md

index.md

parsing.md

tile.json