0
# HTML DOM Manipulation
1
2
HtmlUnit provides comprehensive HTML DOM access and manipulation capabilities, including element lookup, CSS selectors, XPath queries, and DOM tree navigation. These capabilities are essential for web scraping and automated testing.
3
4
## Capabilities
5
6
### HtmlPage Class
7
8
Primary entry point for HTML page manipulation and content extraction.
9
10
```java { .api }
11
/**
12
* HTML page with full DOM manipulation support
13
*/
14
public class HtmlPage extends SgmlPage {
15
/** Get element by ID attribute */
16
public HtmlElement getElementById(String id);
17
18
/** Get elements by tag name */
19
public List<HtmlElement> getElementsByTagName(String tagName);
20
21
/** Get elements by name attribute */
22
public List<HtmlElement> getElementsByName(String name);
23
24
/** Get elements by CSS class name */
25
public List<HtmlElement> getElementsByClassName(String className);
26
27
/** Get first element matching CSS selector */
28
public HtmlElement querySelector(String selectors);
29
30
/** Get all elements matching CSS selector */
31
public List<HtmlElement> querySelectorAll(String selectors);
32
33
/** Get first node matching XPath expression */
34
public DomNode getFirstByXPath(String xpathExpr);
35
36
/** Get all nodes matching XPath expression */
37
public List<?> getByXPath(String xpathExpr);
38
39
/** Get element by name (first match) */
40
public HtmlElement getElementByName(String name);
41
42
/** Get typed element by ID */
43
public <T extends HtmlElement> T getHtmlElementById(String id);
44
45
/** Get all forms on the page */
46
public List<HtmlForm> getForms();
47
48
/** Get form by name attribute */
49
public HtmlForm getFormByName(String name);
50
51
/** Get all anchor elements */
52
public List<HtmlAnchor> getAnchors();
53
54
/** Get anchor by name attribute */
55
public HtmlAnchor getAnchorByName(String name);
56
57
/** Get anchor by href attribute */
58
public HtmlAnchor getAnchorByHref(String href);
59
60
/** Get page title text */
61
public String getTitleText();
62
63
/** Get page content as plain text */
64
public String asText();
65
66
/** Get page content as XML string */
67
public String asXml();
68
69
/** Get head element */
70
public HtmlHead getHead();
71
72
/** Get body element */
73
public HtmlBody getBody();
74
75
/** Get document element (html) */
76
public HtmlHtml getDocumentElement();
77
78
/** Execute JavaScript code in page context */
79
public ScriptResult executeJavaScript(String sourceCode);
80
81
/** Execute JavaScript with source location info */
82
public ScriptResult executeJavaScript(String sourceCode, String sourceName, int startLine);
83
84
/** Click anywhere on the page */
85
public <P extends Page> P click() throws IOException;
86
87
/** Get character set encoding */
88
public Charset getCharset();
89
90
/** Refresh the page */
91
public void refresh() throws IOException;
92
93
/** Check whether the given page feature is supported */
94
public boolean hasFeature(PageFeature feature);
95
}
96
```
97
98
**Usage Examples:**
99
100
```java
101
import com.gargoylesoftware.htmlunit.WebClient;
102
import com.gargoylesoftware.htmlunit.html.HtmlPage;
103
import com.gargoylesoftware.htmlunit.html.HtmlElement;
104
105
try (WebClient webClient = new WebClient()) {
106
HtmlPage page = webClient.getPage("https://example.com");
107
108
// Element lookup by ID
109
HtmlElement loginButton = page.getElementById("login-btn");
110
if (loginButton != null) {
111
loginButton.click();
112
}
113
114
// CSS selector queries
115
HtmlElement navbar = page.querySelector(".navbar");
116
List<HtmlElement> menuItems = page.querySelectorAll(".menu-item");
117
118
// XPath queries
119
List<?> allLinks = page.getByXPath("//a[@href]");
120
HtmlElement firstParagraph = (HtmlElement) page.getFirstByXPath("//p[1]");
121
122
// Content extraction
123
String pageTitle = page.getTitleText();
124
String pageText = page.asText();
125
String pageHtml = page.asXml();
126
127
System.out.println("Title: " + pageTitle);
128
System.out.println("Found " + menuItems.size() + " menu items");
129
}
130
```
131
132
### DomNode Class
133
134
Base class for all DOM nodes providing tree navigation and manipulation.
135
136
```java { .api }
137
/**
138
* Base class for all DOM nodes
139
*/
140
public abstract class DomNode implements Cloneable {
141
/** Get parent node */
142
public DomNode getParentNode();
143
144
/** Get all child nodes */
145
public DomNodeList<DomNode> getChildNodes();
146
147
/** Get first child node */
148
public DomNode getFirstChild();
149
150
/** Get last child node */
151
public DomNode getLastChild();
152
153
/** Get next sibling node */
154
public DomNode getNextSibling();
155
156
/** Get previous sibling node */
157
public DomNode getPreviousSibling();
158
159
/** Get node name */
160
public String getNodeName();
161
162
/** Get node value */
163
public String getNodeValue();
164
165
/** Set node value */
166
public void setNodeValue(String nodeValue);
167
168
/** Get DOM node type constant */
169
public short getNodeType();
170
171
/** Append child node */
172
public DomNode appendChild(DomNode node);
173
174
/** Insert child before reference node */
175
public DomNode insertBefore(DomNode newChild, DomNode refChild);
176
177
/** Replace child node */
178
public DomNode replaceChild(DomNode newChild, DomNode oldChild);
179
180
/** Remove child node */
181
public DomNode removeChild(DomNode child);
182
183
/** Remove this node from parent */
184
public void remove();
185
186
/** Clone node (deep or shallow) */
187
public DomNode cloneNode(boolean deep);
188
189
/** Check if node has child nodes */
190
public boolean hasChildNodes();
191
192
/** Get text content of node and descendants */
193
public String getTextContent();
194
195
/** Set text content */
196
public void setTextContent(String textContent);
197
198
/** Get text representation of this node */
199
public String asText();
200
201
/** Get node as XML string */
202
public String asXml();
203
204
/** Get containing page */
205
public SgmlPage getPage();
206
207
/** Get containing document */
208
public DomDocumentFragment getOwnerDocument();
209
210
/** Get nodes matching XPath expression */
211
public List<?> getByXPath(String xpathExpr);
212
213
/** Get canonical XPath to this node */
214
public String getCanonicalXPath();
215
216
/** Check if node is ancestor of another node */
217
public boolean isAncestorOf(DomNode node);
218
219
/** Check if node is descendant of another node */
220
public boolean isDescendantOf(DomNode node);
221
}
222
```
223
224
### DomElement Class
225
226
Base class for HTML/XML elements with attribute management.
227
228
```java { .api }
229
/**
230
* Base class for HTML and XML elements
231
*/
232
public class DomElement extends DomNode {
233
/** Get element tag name */
234
public String getTagName();
235
236
/** Get attribute value */
237
public String getAttribute(String attributeName);
238
239
/** Get attribute value with namespace */
240
public String getAttributeNS(String namespaceURI, String localName);
241
242
/** Set attribute value */
243
public void setAttribute(String attributeName, String attributeValue);
244
245
/** Set attribute with namespace */
246
public void setAttributeNS(String namespaceURI, String qualifiedName, String attributeValue);
247
248
/** Remove attribute */
249
public void removeAttribute(String attributeName);
250
251
/** Remove attribute with namespace */
252
public void removeAttributeNS(String namespaceURI, String localName);
253
254
/** Check if attribute exists */
255
public boolean hasAttribute(String attributeName);
256
257
/** Check if namespaced attribute exists */
258
public boolean hasAttributeNS(String namespaceURI, String localName);
259
260
/** Get all attribute names */
261
public Iterable<String> getAttributeNames();
262
263
/** Get element ID */
264
public String getId();
265
266
/** Set element ID */
267
public void setId(String id);
268
269
/** Get CSS class names */
270
public String getClassName();
271
272
/** Set CSS class names */
273
public void setClassName(String className);
274
275
/** Add CSS class */
276
public void addClass(String className);
277
278
/** Remove CSS class */
279
public void removeClass(String className);
280
281
/** Check if has CSS class */
282
public boolean hasClass(String className);
283
284
/** Get elements by tag name (descendants) */
285
public DomNodeList<DomElement> getElementsByTagName(String name);
286
287
/** Get elements by tag name with namespace */
288
public DomNodeList<DomElement> getElementsByTagNameNS(String namespaceURI, String localName);
289
290
/** Click the element */
291
public <P extends Page> P click() throws IOException;
292
293
/** Double-click the element */
294
public <P extends Page> P dblClick() throws IOException;
295
296
/** Right-click the element */
297
public <P extends Page> P rightClick() throws IOException;
298
299
/** Focus the element */
300
public void focus();
301
302
/** Remove focus from element */
303
public void blur();
304
305
/** Check if element is displayed */
306
public boolean isDisplayed();
307
308
/** Get element's computed style */
309
public ComputedCSSStyleDeclaration getComputedStyle();
310
311
/** Get element's inline style */
312
public ElementCssStyleDeclaration getStyle();
313
314
/** Check if element matches CSS selector */
315
public boolean matches(String selector);
316
317
/** Get closest ancestor matching selector */
318
public DomElement closest(String selector);
319
}
320
```
321
322
**Usage Examples:**
323
324
```java
325
import com.gargoylesoftware.htmlunit.html.DomElement;
326
import com.gargoylesoftware.htmlunit.html.HtmlElement;
327
328
// Element attribute manipulation
329
HtmlElement element = page.getElementById("myElement");
330
element.setAttribute("data-value", "123");
331
String value = element.getAttribute("data-value");
332
boolean hasClass = element.hasAttribute("class");
333
334
// CSS class manipulation
335
element.addClass("active");
336
element.removeClass("disabled");
337
boolean isActive = element.hasClass("active");
338
339
// DOM tree navigation
340
DomNode parent = element.getParentNode();
341
DomNodeList<DomNode> children = element.getChildNodes();
342
DomNode nextSibling = element.getNextSibling();
343
344
// Element interaction
345
if (element.isDisplayed()) {
346
HtmlPage newPage = element.click();
347
}
348
```
349
350
### HtmlElement Class
351
352
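Base class for HTML elements. It extends `DomElement` with accessors for common HTML attributes and event-handler attributes, keyboard input simulation, and layout metrics (offset, client, and scroll values).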
353
354
```java { .api }
355
/**
356
* Base class for HTML elements with interaction capabilities
357
*/
358
public abstract class HtmlElement extends DomElement {
359
/** Get element's lang attribute */
360
public String getLangAttribute();
361
362
/** Get element's dir attribute */
363
public String getDirAttribute();
364
365
/** Get element's title attribute */
366
public String getTitleAttribute();
367
368
/** Get element's tabindex */
369
public String getTabIndexAttribute();
370
371
/** Get element's onclick attribute */
372
public String getOnClickAttribute();
373
374
/** Get element's ondblclick attribute */
375
public String getOnDblClickAttribute();
376
377
/** Get element's onmousedown attribute */
378
public String getOnMouseDownAttribute();
379
380
/** Get element's onmouseup attribute */
381
public String getOnMouseUpAttribute();
382
383
/** Get element's onmouseover attribute */
384
public String getOnMouseOverAttribute();
385
386
/** Get element's onmousemove attribute */
387
public String getOnMouseMoveAttribute();
388
389
/** Get element's onmouseout attribute */
390
public String getOnMouseOutAttribute();
391
392
/** Get element's onkeypress attribute */
393
public String getOnKeyPressAttribute();
394
395
/** Get element's onkeydown attribute */
396
public String getOnKeyDownAttribute();
397
398
/** Get element's onkeyup attribute */
399
public String getOnKeyUpAttribute();
400
401
/** Type text into element (fires keyboard events) */
402
public void type(String text) throws IOException;
403
404
/** Type single character */
405
public void type(char c) throws IOException;
406
407
/** Simulate key press */
408
public void keyDown(int keyCode);
409
410
/** Simulate key release */
411
public void keyUp(int keyCode);
412
413
/** Get element's bounding rectangle */
414
public Rectangle getBoundingClientRect();
415
416
/** Scroll element into view */
417
public void scrollIntoView();
418
419
/** Check if element is enabled */
420
public boolean isEnabled();
421
422
/** Get element's offset parent */
423
public HtmlElement getOffsetParent();
424
425
/** Get offset dimensions (height, width) and offset position (left, top) */
426
public int getOffsetHeight();
427
public int getOffsetWidth();
428
public int getOffsetLeft();
429
public int getOffsetTop();
430
431
/** Get client area dimensions (height, width) and client left/top offsets */
432
public int getClientHeight();
433
public int getClientWidth();
434
public int getClientLeft();
435
public int getClientTop();
436
437
/** Get scroll dimensions (height, width) and scroll position (left, top) */
438
public int getScrollHeight();
439
public int getScrollWidth();
440
public int getScrollLeft();
441
public int getScrollTop();
442
443
/** Set scroll position */
444
public void setScrollLeft(int scrollLeft);
445
public void setScrollTop(int scrollTop);
446
}
447
```
448
449
### DOM Node Types
450
451
```java { .api }
452
/**
453
* DOM node type constants (from DOM specification)
454
*/
455
public interface Node {
456
public static final short ELEMENT_NODE = 1;
457
public static final short ATTRIBUTE_NODE = 2;
458
public static final short TEXT_NODE = 3;
459
public static final short CDATA_SECTION_NODE = 4;
460
public static final short ENTITY_REFERENCE_NODE = 5;
461
public static final short ENTITY_NODE = 6;
462
public static final short PROCESSING_INSTRUCTION_NODE = 7;
463
public static final short COMMENT_NODE = 8;
464
public static final short DOCUMENT_NODE = 9;
465
public static final short DOCUMENT_TYPE_NODE = 10;
466
public static final short DOCUMENT_FRAGMENT_NODE = 11;
467
public static final short NOTATION_NODE = 12;
468
}
469
```
470
471
### Common HTML Elements
472
473
```java { .api }
474
/**
475
* HTML document structure elements
476
*/
477
public class HtmlHtml extends HtmlElement {
478
// Root HTML element
479
}
480
481
public class HtmlHead extends HtmlElement {
482
// Document head element
483
}
484
485
public class HtmlBody extends HtmlElement {
486
// Document body element
487
}
488
489
public class HtmlTitle extends HtmlElement {
490
// Document title element
491
}
492
493
/**
494
* Common content elements
495
*/
496
public class HtmlDivision extends HtmlElement {
497
// Division element (<div>)
498
}
499
500
public class HtmlSpan extends HtmlElement {
501
// Span element (<span>)
502
}
503
504
public class HtmlParagraph extends HtmlElement {
505
// Paragraph element (<p>)
506
}
507
508
public class HtmlHeading1 extends HtmlElement {
509
// H1 heading element
510
}
511
512
public class HtmlHeading2 extends HtmlElement {
513
// H2 heading element
514
}
515
516
public class HtmlHeading3 extends HtmlElement {
517
// H3 heading element
518
}
519
520
public class HtmlHeading4 extends HtmlElement {
521
// H4 heading element
522
}
523
524
public class HtmlHeading5 extends HtmlElement {
525
// H5 heading element
526
}
527
528
public class HtmlHeading6 extends HtmlElement {
529
// H6 heading element
530
}
531
532
/**
533
* List elements
534
*/
535
public class HtmlUnorderedList extends HtmlElement {
536
// <ul> element
537
}
538
539
public class HtmlOrderedList extends HtmlElement {
540
// <ol> element
541
}
542
543
public class HtmlListItem extends HtmlElement {
544
// <li> element
545
}
546
547
/**
548
* Text formatting elements
549
*/
550
public class HtmlEmphasis extends HtmlElement {
551
// <em> element
552
}
553
554
public class HtmlStrong extends HtmlElement {
555
// <strong> element
556
}
557
558
public class HtmlBold extends HtmlElement {
559
// <b> element
560
}
561
562
public class HtmlItalic extends HtmlElement {
563
// <i> element
564
}
565
```
566
567
### CSS Selector Support
568
569
HtmlUnit supports comprehensive CSS selector syntax:
570
571
**Basic Selectors:**
572
- Element: `div`, `p`, `span`
573
- Class: `.className`, `.class1.class2`
574
- ID: `#elementId`
575
- Attribute: `[attribute]`, `[attribute="value"]`, `[attribute^="prefix"]`
576
577
**Combinators:**
578
- Descendant: `div p` (p inside div)
579
- Child: `div > p` (p direct child of div)
580
- Adjacent sibling: `h1 + p` (p immediately after h1)
581
- General sibling: `h1 ~ p` (any p that follows h1 and shares the same parent)
582
583
**Pseudo-classes:**
584
- `:first-child`, `:last-child`, `:nth-child(n)`
585
- `:first-of-type`, `:last-of-type`, `:nth-of-type(n)`
586
- `:not(selector)`
587
- `:checked`, `:disabled`, `:enabled`
588
589
**Usage Example:**
590
591
```java
592
// Complex CSS selectors
593
List<HtmlElement> activeMenuItems = page.querySelectorAll(".menu .item.active");
594
HtmlElement firstTableCell = page.querySelector("table tr:first-child td:first-child");
595
List<HtmlElement> checkedInputs = page.querySelectorAll("input[type='checkbox']:checked");
596
HtmlElement submitButton = page.querySelector("form input[type='submit'], form button[type='submit']");
597
```