or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

css-selection.md dom-manipulation.md form-handling.md html-sanitization.md http-connection.md index.md parsing.md

docs/parsing.md

0

# HTML/XML Parsing

1

2

Core parsing functionality for converting HTML and XML strings, files, and streams into navigable DOM structures. jsoup implements the WHATWG HTML5 specification and handles malformed HTML gracefully.

3

4

## Capabilities

5

6

### Parse from String

7

8

Parse HTML content from strings with optional base URI for resolving relative URLs.

9

10

```java { .api }

11

/**

12

* Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML.

13

* @param html HTML to parse

14

* @return Document with parsed HTML

15

*/

16

public static Document parse(String html);

17

18

/**

19

* Parse HTML into a Document with base URI for resolving relative URLs.

20

* @param html HTML to parse

21

* @param baseUri The URL where the HTML was retrieved from

22

* @return Document with parsed HTML

23

*/

24

public static Document parse(String html, String baseUri);

25

26

/**

27

* Parse HTML with custom parser (e.g., XML parser).

28

* @param html HTML to parse

29

* @param baseUri Base URI for resolving relative URLs

30

* @param parser Parser to use (Parser.htmlParser() or Parser.xmlParser())

31

* @return Document with parsed content

32

*/

33

public static Document parse(String html, String baseUri, Parser parser);

34

```

35

36

**Usage Examples:**

37

38

```java

39

import org.jsoup.Jsoup;

40

import org.jsoup.nodes.Document;

41

import org.jsoup.parser.Parser;

42

43

// Basic HTML parsing

44

Document doc = Jsoup.parse("<html><body><h1>Title</h1></body></html>");

45

46

// Parse with base URI for relative URL resolution

47

Document doc = Jsoup.parse(

48

"<html><body><a href='/page'>Link</a></body></html>",

49

"https://example.com"

50

);

51

52

// Parse XML content

53

Document xmlDoc = Jsoup.parse(

54

"<root><item id='1'>Value</item></root>",

55

"",

56

Parser.xmlParser()

57

);

58

```

59

60

### Parse HTML Fragments

61

62

Parse partial HTML content intended as body fragments rather than complete documents.

63

64

```java { .api }

65

/**

66

* Parse a fragment of HTML, with the assumption that it forms the body of the HTML.

67

* @param bodyHtml body HTML fragment

68

* @return Document with HTML fragment wrapped in basic document structure

69

*/

70

public static Document parseBodyFragment(String bodyHtml);

71

72

/**

73

* Parse HTML fragment with base URI for relative URL resolution.

74

* @param bodyHtml body HTML fragment

75

* @param baseUri URL to resolve relative URLs against

76

* @return Document with HTML fragment

77

*/

78

public static Document parseBodyFragment(String bodyHtml, String baseUri);

79

```

80

81

**Usage Examples:**

82

83

```java

84

// Parse HTML fragment

85

Document doc = Jsoup.parseBodyFragment("<p>Hello <b>world</b>!</p>");

86

Element body = doc.body(); // Access the generated body element

87

88

// Parse fragment with base URI

89

Document doc = Jsoup.parseBodyFragment(

90

"<img src='/image.jpg' alt='Image'>",

91

"https://example.com"

92

);

93

```

94

95

### Parse from File

96

97

Parse HTML content from files, either auto-detecting the character encoding or using an explicitly specified one.

98

99

```java { .api }

100

/**

101

* Parse the contents of a file as HTML with auto-detected charset.

102

* @param file file to load HTML from (supports gzipped files)

103

* @return Document with parsed HTML

104

* @throws IOException if the file could not be found or read

105

*/

106

public static Document parse(File file) throws IOException;

107

108

/**

109

* Parse file with specified character encoding.

110

* @param file file to load HTML from

111

* @param charsetName character set of file contents (null for auto-detection)

112

* @return Document with parsed HTML

113

* @throws IOException if the file could not be found, read, or charset is invalid

114

*/

115

public static Document parse(File file, String charsetName) throws IOException;

116

117

/**

118

* Parse file with charset and base URI.

119

* @param file file to load HTML from

120

* @param charsetName character set (null for auto-detection)

121

* @param baseUri base URI for resolving relative URLs

122

* @return Document with parsed HTML

123

* @throws IOException if the file could not be found, read, or charset is invalid

124

*/

125

public static Document parse(File file, String charsetName, String baseUri) throws IOException;

126

127

/**

128

* Parse file with custom parser.

129

* @param file file to load HTML from

130

* @param charsetName character set (null for auto-detection)

131

* @param baseUri base URI for resolving relative URLs

132

* @param parser custom parser to use

133

* @return Document with parsed content

134

* @throws IOException if the file could not be found, read, or charset is invalid

135

*/

136

public static Document parse(File file, String charsetName, String baseUri, Parser parser) throws IOException;

137

```

138

139

**Usage Examples:**

140

141

```java

142

import java.io.File;

143

144

// Parse with auto-detected encoding

145

Document doc = Jsoup.parse(new File("index.html"));

146

147

// Parse with specific encoding

148

Document doc = Jsoup.parse(new File("page.html"), "UTF-8");

149

150

// Parse with encoding and base URI

151

Document doc = Jsoup.parse(

152

new File("content.html"),

153

"UTF-8",

154

"https://example.com"

155

);

156

```

157

158

### Parse from Path

159

160

Parse HTML content from Java NIO `Path` objects (`java.nio.file.Path`, part of the JDK since Java 7).

161

162

```java { .api }

163

/**

164

* Parse the contents of a file path as HTML with auto-detected charset.

165

* @param path file path to load HTML from (supports gzipped files)

166

* @return Document with parsed HTML

167

* @throws IOException if the file could not be found or read

168

*/

169

public static Document parse(Path path) throws IOException;

170

171

/**

172

* Parse path with specified character encoding.

173

* @param path file path to load HTML from

174

* @param charsetName character set of file contents (null for auto-detection)

175

* @return Document with parsed HTML

176

* @throws IOException if the path could not be found, read, or charset is invalid

177

*/

178

public static Document parse(Path path, String charsetName) throws IOException;

179

180

/**

181

* Parse path with charset and base URI.

182

* @param path file path to load HTML from

183

* @param charsetName character set (null for auto-detection)

184

* @param baseUri base URI for resolving relative URLs

185

* @return Document with parsed HTML

186

* @throws IOException if the path could not be found, read, or charset is invalid

187

*/

188

public static Document parse(Path path, String charsetName, String baseUri) throws IOException;

189

190

/**

191

* Parse path with custom parser.

192

* @param path file path to load HTML from

193

* @param charsetName character set (null for auto-detection)

194

* @param baseUri base URI for resolving relative URLs

195

* @param parser custom parser to use

196

* @return Document with parsed content

197

* @throws IOException if the path could not be found, read, or charset is invalid

198

*/

199

public static Document parse(Path path, String charsetName, String baseUri, Parser parser) throws IOException;

200

```

201

202

### Parse from InputStream

203

204

Parse HTML content from input streams, using a specified character encoding or auto-detection when `charsetName` is null.

205

206

```java { .api }

207

/**

208

* Read an input stream, and parse it to a Document.

209

* @param in input stream to read (will be closed after reading)

210

* @param charsetName character set of stream contents (null for auto-detection)

211

* @param baseUri base URI for resolving relative URLs

212

* @return Document with parsed HTML

213

* @throws IOException if the stream could not be read or charset is invalid

214

*/

215

public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException;

216

217

/**

218

* Parse InputStream with custom parser.

219

* @param in input stream to read

220

* @param charsetName character set (null for auto-detection)

221

* @param baseUri base URI for resolving relative URLs

222

* @param parser custom parser to use

223

* @return Document with parsed content

224

* @throws IOException if the stream could not be read or charset is invalid

225

*/

226

public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException;

227

```

228

229

### Parse from URL

230

231

Fetch and parse HTML content directly from URLs with timeout control.

232

233

```java { .api }

234

/**

235

* Fetch a URL, and parse it as HTML.

236

* @param url URL to fetch (must be http or https)

237

* @param timeoutMillis Connection and read timeout in milliseconds

238

* @return Document with parsed HTML

239

* @throws IOException if connection fails, times out, or returns error status

240

* @throws HttpStatusException if HTTP response is not OK

241

* @throws UnsupportedMimeTypeException if response MIME type is not supported

242

*/

243

public static Document parse(URL url, int timeoutMillis) throws IOException;

244

```

245

246

**Usage Example:**

247

248

```java

249

import java.net.URL;

250

251

// Fetch and parse URL with 5-second timeout

252

Document doc = Jsoup.parse(new URL("https://example.com"), 5000);

253

```

254

255

## Parser Configuration

256

257

### Parser Class

258

259

Create and configure custom parsers for specific parsing requirements.

260

261

```java { .api }

262

/**

263

* HTML parser factory method.

264

* @return Parser configured for HTML parsing

265

*/

266

public static Parser htmlParser();

267

268

/**

269

* XML parser factory method.

270

* @return Parser configured for XML parsing

271

*/

272

public static Parser xmlParser();

273

274

/**

275

* Parse HTML input with this parser.

276

* @param html HTML content to parse

277

* @param baseUri base URI for relative URL resolution

278

* @return Document with parsed content

279

*/

280

public Document parseInput(String html, String baseUri);

281

282

/**

283

* Parse HTML fragment with context element.

284

* @param fragment HTML fragment to parse

285

* @param context Element providing parsing context

286

* @param baseUri base URI for relative URLs

287

* @return List of parsed nodes

288

*/

289

public List<Node> parseFragmentInput(String fragment, Element context, String baseUri);

290

```

291

292

### Parse Settings

293

294

Control case sensitivity and normalization behavior during parsing.

295

296

```java { .api }

297

public class ParseSettings {

298

/** Default HTML settings (case-insensitive tags and attributes) */

299

public static final ParseSettings htmlDefault;

300

301

/** Preserve case settings (case-sensitive tags and attributes) */

302

public static final ParseSettings preserveCase;

303

304

/**

305

* Create custom parse settings.

306

* @param preserveTagCase whether to preserve tag name case

307

* @param preserveAttributeCase whether to preserve attribute name case

308

*/

309

public ParseSettings(boolean preserveTagCase, boolean preserveAttributeCase);

310

}

311

```

312

313

**Usage Examples:**

314

315

```java

316

import org.jsoup.parser.Parser;

317

import org.jsoup.parser.ParseSettings;

318

319

// Create HTML parser with case-sensitive settings

320

Parser parser = Parser.htmlParser();

321

parser.settings(ParseSettings.preserveCase);

322

323

// Parse with custom parser

324

Document doc = Jsoup.parse(html, baseUri, parser);

325

326

// XML parsing (automatically case-sensitive)

327

Parser xmlParser = Parser.xmlParser();

328

Document xmlDoc = Jsoup.parse(xmlContent, "", xmlParser);

329

```

330

331

## Error Handling and Position Tracking

332

333

Enable error tracking and position information during parsing for debugging and validation.

334

335

```java { .api }

336

/**

337

* Enable parse error tracking.

338

* @param maxErrors maximum number of errors to track (0 disables error tracking)

339

* @return this parser for chaining

340

*/

341

public Parser setTrackErrors(int maxErrors);

342

343

/**

344

* Get parse errors if error tracking is enabled.

345

* @return List of ParseError objects

346

*/

347

public List<ParseError> getErrors();

348

349

/**

350

* Enable position tracking for parsed nodes.

351

* @param trackPosition whether to track source positions

352

* @return this parser for chaining

353

*/

354

public Parser setTrackPosition(boolean trackPosition);

355

```

356

357

**Usage Example:**

358

359

```java

360

// Create parser with error tracking

361

Parser parser = Parser.htmlParser();

362

parser.setTrackErrors(50); // Track up to 50 errors

363

parser.setTrackPosition(true); // Track source positions

364

365

Document doc = parser.parseInput(html, baseUri);

366

367

// Check for parse errors

368

List<ParseError> errors = parser.getErrors();

369

if (!errors.isEmpty()) {

370

System.out.println("Parse errors found: " + errors.size());

371

for (ParseError error : errors) {

372

System.out.println("Error: " + error.getErrorMessage());

373

}

374

}

375

```

376

377

## Character Encoding

378

379

jsoup determines the character encoding from the following sources:

380

381

1. Byte-order mark (BOM) in the input

382

2. `<meta charset>` declaration in HTML

383

3. `http-equiv` meta tag with charset information

384

4. Specified encoding parameter

385

5. UTF-8 fallback (if no encoding detected)

386

387

**Encoding Priority:**

388

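1. BOM detection (a byte-order mark, when present, overrides any specified encoding)

389

2. Explicitly specified encoding parameter

390

3. HTML meta declarations (consulted only when no encoding is specified)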

391

4. UTF-8 default

392

393

This ensures reliable parsing of HTML content regardless of encoding inconsistencies.