Java HTML parser library implementing the WHATWG HTML5 specification for parsing, manipulating, and sanitizing HTML and XML documents.
—
HTTP client functionality for fetching web pages with full configuration control including headers, cookies, timeouts, and session management. jsoup provides a comprehensive HTTP client designed specifically for web scraping and HTML processing.
Create HTTP connections for fetching and parsing web content.
/**
* Create a new HTTP connection to the specified URL.
* @param url URL to connect to (must be http or https)
* @return Connection instance for configuration and execution
*/
public static Connection connect(String url);
/**
* Create a new HTTP session for maintaining connection settings across requests.
* @return Connection instance configured as a session
*/
public static Connection newSession();
Usage Examples:
import org.jsoup.Jsoup;
import org.jsoup.Connection;
import org.jsoup.nodes.Document;
// Single request connection
Connection conn = Jsoup.connect("https://example.com");
// Session for multiple requests
Connection session = Jsoup.newSession()
.timeout(30000)
.userAgent("MyBot 1.0");
Configure the target URL and related settings.
/**
* Set the request URL.
* @param url target URL as string
* @return this Connection for chaining
*/
public Connection url(String url);
/**
* Set the request URL.
* @param url target URL object
* @return this Connection for chaining
*/
public Connection url(URL url);
/**
* Set the referrer header.
* @param referrer referrer URL
* @return this Connection for chaining
*/
public Connection referrer(String referrer);
/**
* Set whether to follow HTTP redirects.
* @param followRedirects true to follow redirects (default: true)
* @return this Connection for chaining
*/
public Connection followRedirects(boolean followRedirects);
Usage Examples:
Connection conn = Jsoup.connect("https://example.com")
.referrer("https://google.com")
.followRedirects(true);
// Change URL dynamically
conn.url("https://different-site.com/page");
Configure HTTP method, headers, and connection behavior.
/**
* Set the HTTP request method.
* @param method HTTP method (GET, POST, PUT, PATCH, DELETE, HEAD, OPTIONS, TRACE)
* @return this Connection for chaining
*/
public Connection method(Connection.Method method);
/**
* Set the User-Agent header.
* @param userAgent user agent string
* @return this Connection for chaining
*/
public Connection userAgent(String userAgent);
/**
* Set connection and read timeout.
* @param millis timeout in milliseconds (0 = infinite)
* @return this Connection for chaining
*/
public Connection timeout(int millis);
/**
* Set maximum response body size.
* @param bytes maximum body size in bytes (0 = unlimited)
* @return this Connection for chaining
*/
public Connection maxBodySize(int bytes);
/**
* Set whether to ignore HTTP error status codes.
* @param ignoreHttpErrors true to ignore errors (default: false)
* @return this Connection for chaining
*/
public Connection ignoreHttpErrors(boolean ignoreHttpErrors);
/**
* Set whether to ignore unsupported content types.
* @param ignoreContentType true to ignore content type checks (default: false)
* @return this Connection for chaining
*/
public Connection ignoreContentType(boolean ignoreContentType);
/**
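* Set whether to validate TLS certificates.
* NOTE(review): this method appears to have been deprecated and then removed in jsoup 1.12.1; confirm it exists in the targeted version. Later versions configure TLS via sslSocketFactory(SSLSocketFactory).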
* @param value true to validate certificates (default: true)
* @return this Connection for chaining
*/
public Connection validateTLSCertificates(boolean value);
Usage Examples:
Connection conn = Jsoup.connect("https://api.example.com")
.method(Connection.Method.POST)
.userAgent("Mozilla/5.0 (compatible; MyBot/1.0)")
.timeout(10000) // 10 second timeout
.maxBodySize(1024 * 1024) // 1MB max response
.ignoreHttpErrors(true)
.ignoreContentType(true);
Configure HTTP headers and cookies for requests.
/**
* Set a request header.
* @param name header name
* @param value header value
* @return this Connection for chaining
*/
public Connection header(String name, String value);
/**
* Set multiple request headers.
* @param headers Map of header names to values
* @return this Connection for chaining
*/
public Connection headers(Map<String, String> headers);
/**
* Set a cookie.
* @param name cookie name
* @param value cookie value
* @return this Connection for chaining
*/
public Connection cookie(String name, String value);
/**
* Set multiple cookies.
* @param cookies Map of cookie names to values
* @return this Connection for chaining
*/
public Connection cookies(Map<String, String> cookies);
Usage Examples:
Connection conn = Jsoup.connect("https://example.com")
.header("Accept", "text/html,application/xhtml+xml")
.header("Accept-Language", "en-US,en;q=0.5")
.cookie("session", "abc123")
.cookie("preferences", "theme=dark");
// Set multiple at once
Map<String, String> headers = new HashMap<>();
headers.put("Authorization", "Bearer token123");
headers.put("Content-Type", "application/json");
conn.headers(headers);
Send form data and request payloads.
/**
* Add form data parameter.
* @param key parameter name
* @param value parameter value
* @return this Connection for chaining
*/
public Connection data(String key, String value);
/**
* Add file upload data.
* @param key parameter name
* @param filename filename for upload
* @param inputStream file content stream
* @return this Connection for chaining
*/
public Connection data(String key, String filename, InputStream inputStream);
/**
* Set form data from collection.
* @param data Collection of Connection.KeyVal pairs
* @return this Connection for chaining
*/
public Connection data(Collection<Connection.KeyVal> data);
/**
* Set form data from map.
* @param data Map of parameter names to values
* @return this Connection for chaining
*/
public Connection data(Map<String, String> data);
/**
* Set the request body directly.
* @param body request body content
* @return this Connection for chaining
*/
public Connection requestBody(String body);
Usage Examples:
// Form data
Connection conn = Jsoup.connect("https://example.com/api")
.method(Connection.Method.POST)
.data("username", "john")
.data("password", "secret")
.data("remember", "true");
// File upload
FileInputStream fileStream = new FileInputStream("document.pdf");
conn.data("file", "document.pdf", fileStream);
// Raw request body
conn.requestBody("{\"name\":\"John\",\"age\":30}")
.header("Content-Type", "application/json");
Execute HTTP requests and handle responses.
/**
* Execute a GET request and parse the response as HTML.
* @return Document containing parsed HTML
* @throws IOException if request fails or response cannot be parsed
*/
public Document get() throws IOException;
/**
* Execute a POST request and parse the response as HTML.
* @return Document containing parsed HTML
* @throws IOException if request fails or response cannot be parsed
*/
public Document post() throws IOException;
/**
* Execute the configured request and return the raw response.
* @return Connection.Response containing response data
* @throws IOException if request fails
*/
public Connection.Response execute() throws IOException;
Usage Examples:
// GET request
Document doc = Jsoup.connect("https://example.com")
.userAgent("Mozilla/5.0")
.get();
// POST request
Document result = Jsoup.connect("https://example.com/search")
.data("q", "jsoup")
.post();
// Get raw response
Connection.Response response = Jsoup.connect("https://api.example.com")
.ignoreContentType(true)
.execute();
String responseBody = response.body();
int statusCode = response.statusCode();
Use sessions to maintain cookies and settings across multiple requests.
/**
* Create a new request using this connection's session settings.
* @return new Connection that starts from this session's settings
*/
public Connection newRequest();
/**
* Get the current request configuration.
* @return Connection.Request object
*/
public Connection.Request request();
/**
* Set the request configuration.
* @param request Connection.Request object
* @return this Connection for chaining
*/
public Connection request(Connection.Request request);
/**
* Get the response from the last executed request.
* @return Connection.Response object from the last executed request
* @throws IllegalArgumentException if called before a request has been executed
*/
public Connection.Response response();
/**
* Set the response object.
* @param response Connection.Response object
* @return this Connection for chaining
*/
public Connection response(Connection.Response response);
Usage Examples:
// Create session with common settings
Connection session = Jsoup.newSession()
.timeout(30000)
.userAgent("MyBot/1.0")
.cookie("auth", "token123");
// Make multiple requests with shared session
Document page1 = session.newRequest()
.url("https://example.com/page1")
.get();
Document page2 = session.newRequest()
.url("https://example.com/page2")
.get();
// Session maintains cookies automatically
Connection.Response loginResponse = session.newRequest()
.url("https://example.com/login")
.data("username", "user")
.data("password", "pass")
.method(Connection.Method.POST)
.execute();
// Subsequent requests include login cookies
Document protectedPage = session.newRequest()
.url("https://example.com/protected")
.get();
Work with HTTP response data and metadata.
// Response interface methods
public interface Response {
/** Get response status code */
int statusCode();
/** Get response status message */
String statusMessage();
/** Get response body as string */
String body();
/** Get response body as bytes */
byte[] bodyAsBytes();
/** Get response headers */
Map<String, String> headers();
/** Get specific response header */
String header(String name);
/** Get response cookies */
Map<String, String> cookies();
/** Get specific response cookie */
String cookie(String name);
/** Get response content type */
String contentType();
/** Get response charset */
String charset();
/** Parse response body as Document */
Document parse() throws IOException;
}
Usage Examples:
Connection.Response response = Jsoup.connect("https://api.example.com")
.ignoreContentType(true)
.execute();
// Response metadata
int status = response.statusCode();
String contentType = response.contentType();
Map<String, String> headers = response.headers();
Map<String, String> cookies = response.cookies();
// Response content
String body = response.body();
byte[] rawBytes = response.bodyAsBytes();
// Parse as HTML if needed
if (contentType.contains("text/html")) {
Document doc = response.parse();
}
Configure custom parsers for response processing.
/**
* Set custom parser for processing responses.
* @param parser Parser instance (HTML or XML)
* @return this Connection for chaining
*/
public Connection parser(Parser parser);
Usage Example:
import org.jsoup.parser.Parser;
// Use XML parser for XML responses
Connection conn = Jsoup.connect("https://example.com/data.xml")
.parser(Parser.xmlParser());
Document xmlDoc = conn.get();
Monitor download progress for large responses.
/**
* Set progress callback for response downloads.
* @param progress Progress callback interface
* @return this Connection for chaining
*/
public Connection onResponseProgress(Progress progress);
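// Progress interface
// NOTE(review): in jsoup 1.18.1 this appears to be the generic org.jsoup.Progress<ProgressContext>, with
// onProgress(int processed, int total, float percent, ProgressContext context), registered through
// onResponseProgress(Progress<Connection.Response>). Confirm the signature below against the targeted version.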
public interface Progress {
/**
* Called during response download with progress information.
* @param bytesRead bytes downloaded so far
* @param totalBytes total response size (may be -1 if unknown)
* @param percent completion percentage (0.0 to 100.0)
*/
void onProgress(long bytesRead, long totalBytes, float percent);
}
Usage Example:
Connection conn = Jsoup.connect("https://example.com/large-page")
.onResponseProgress(new Progress() {
@Override
public void onProgress(long bytesRead, long totalBytes, float percent) {
System.out.printf("Downloaded: %d/%d bytes (%.1f%%)\n",
bytesRead, totalBytes, percent);
}
});
Document doc = conn.get();
Handle specific HTTP and connection errors.
// HTTP status exceptions
public class HttpStatusException extends IOException {
public int getStatusCode();
public String getUrl();
}
// Unsupported content type exceptions
public class UnsupportedMimeTypeException extends IOException {
public String getMimeType();
public String getUrl();
}
Usage Example:
try {
Document doc = Jsoup.connect("https://example.com")
.timeout(5000)
.get();
} catch (HttpStatusException e) {
System.err.println("HTTP error: " + e.getStatusCode() + " for URL: " + e.getUrl());
} catch (UnsupportedMimeTypeException e) {
System.err.println("Unsupported content type: " + e.getMimeType());
} catch (SocketTimeoutException e) {
System.err.println("Request timed out");
} catch (IOException e) {
System.err.println("Connection error: " + e.getMessage());
}
HTTP methods supported by jsoup Connection.
public enum Method {
GET, POST, PUT, PATCH, DELETE, HEAD, OPTIONS, TRACE
}
Usage Example:
// Different HTTP methods
Connection conn = Jsoup.connect("https://api.example.com");
// GET (default)
Document getResponse = conn.method(Connection.Method.GET).get();
// POST
Document postResponse = conn.method(Connection.Method.POST).post();
// Other methods
Connection.Response response = conn.method(Connection.Method.PUT).execute();
This comprehensive HTTP connection API provides enterprise-grade web scraping capabilities with full control over request configuration, session management, and response handling.
Install with Tessl CLI
npx tessl i tessl/maven-org-jsoup--jsoup