0
# XML Parsing
1
2
Groovy XML provides two main approaches to parsing XML documents: XmlParser for Node-based manipulation and XmlSlurper for XPath-like navigation using GPathResult.
3
4
## XmlParser
5
6
Parses XML into a tree of Node objects that can be directly manipulated, modified, and traversed.
7
8
```java { .api }
9
public class XmlParser implements ContentHandler {
10
// Constructors
11
public XmlParser();
12
public XmlParser(boolean validating, boolean namespaceAware);
13
public XmlParser(boolean validating, boolean namespaceAware, boolean allowDocTypeDeclaration);
14
public XmlParser(XMLReader reader);
15
public XmlParser(SAXParser parser);
16
17
// Parsing methods
18
public Node parse(File file) throws IOException, SAXException;
19
public Node parse(InputSource input) throws IOException, SAXException;
20
public Node parse(InputStream input) throws IOException, SAXException;
21
public Node parse(Reader in) throws IOException, SAXException;
22
public Node parse(String uri) throws IOException, SAXException;
23
public Node parseText(String text) throws SAXException;
24
25
// Configuration methods
26
public boolean isTrimWhitespace();
27
public void setTrimWhitespace(boolean trimWhitespace);
28
public boolean isKeepIgnorableWhitespace();
29
public void setKeepIgnorableWhitespace(boolean keepIgnorableWhitespace);
30
public boolean isNamespaceAware();
31
public void setNamespaceAware(boolean namespaceAware);
32
33
// SAX ContentHandler configuration
34
public void setContentHandler(ContentHandler contentHandler);
35
public void setErrorHandler(ErrorHandler errorHandler);
36
public void setEntityResolver(EntityResolver entityResolver);
37
public void setDTDHandler(DTDHandler dtdHandler);
38
}
39
```
40
41
### XmlParser Usage
42
43
```groovy
44
// Basic parsing
45
def parser = new XmlParser()
46
def root = parser.parseText('''
47
<catalog>
48
<book id="1" category="fiction">
49
<title>The Great Gatsby</title>
50
<author>F. Scott Fitzgerald</author>
51
<price currency="USD">12.99</price>
52
</book>
53
<book id="2" category="science">
54
<title>A Brief History of Time</title>
55
<author>Stephen Hawking</author>
56
<price currency="USD">15.99</price>
57
</book>
58
</catalog>
59
''')
60
61
// Access elements and attributes
62
println root.name() // "catalog"
63
println root.book.size() // 2
64
println root.book[0].title.text() // "The Great Gatsby"
65
println root.book[0].'@id' // "1"
66
println root.book[0].'@category' // "fiction"
67
68
// Modify the structure
69
root.book[0].title[0].value = 'New Title'
70
root.book[0].author[0].value = 'New Author'
71
72
// Add new elements
73
root.appendNode('publisher', 'Penguin Books')
74
root.book[0].appendNode('isbn', '978-0-7432-7356-5')
75
76
// Parse from file
77
def fileNode = parser.parse(new File('catalog.xml'))
78
79
// Parse with validation and namespace awareness
80
def validatingParser = new XmlParser(true, true)
81
validatingParser.setTrimWhitespace(true)
82
validatingParser.setKeepIgnorableWhitespace(false)
83
def validatedRoot = validatingParser.parseText(xmlString)
84
```
85
86
## XmlSlurper
87
88
Parses XML into GPathResult objects providing XPath-like navigation and lazy evaluation.
89
90
```java { .api }
91
public class XmlSlurper extends DefaultHandler {
92
// Constructors
93
public XmlSlurper();
94
public XmlSlurper(boolean validating, boolean namespaceAware);
95
public XmlSlurper(boolean validating, boolean namespaceAware, boolean allowDocTypeDeclaration);
96
public XmlSlurper(XMLReader reader);
97
public XmlSlurper(SAXParser parser);
98
99
// Parsing methods
100
public GPathResult parse(InputSource input) throws IOException, SAXException;
101
public GPathResult parse(File file) throws IOException, SAXException;
102
public GPathResult parse(InputStream input) throws IOException, SAXException;
103
public GPathResult parse(Reader in) throws IOException, SAXException;
104
public GPathResult parse(String uri) throws IOException, SAXException;
105
public GPathResult parseText(String text) throws SAXException;
106
107
// Configuration methods
108
public GPathResult getDocument();
109
public void setKeepIgnorableWhitespace(boolean keepIgnorableWhitespace);
110
public boolean isKeepIgnorableWhitespace();
111
public void setEntityBaseUrl(URL base);
112
113
// SAX Handler configuration
114
public void setContentHandler(ContentHandler contentHandler);
115
public void setErrorHandler(ErrorHandler errorHandler);
116
public void setEntityResolver(EntityResolver entityResolver);
117
public void setDTDHandler(DTDHandler dtdHandler);
118
}
119
```
120
121
### XmlSlurper Usage
122
123
```groovy
124
// Basic slurping
125
def slurper = new XmlSlurper()
126
def catalog = slurper.parseText('''
127
<catalog>
128
<book id="1" category="fiction">
129
<title>The Great Gatsby</title>
130
<author>F. Scott Fitzgerald</author>
131
<price currency="USD">12.99</price>
132
</book>
133
<book id="2" category="science">
134
<title>A Brief History of Time</title>
135
<author>Stephen Hawking</author>
136
<price currency="USD">15.99</price>
137
</book>
138
</catalog>
139
''')
140
141
// XPath-like navigation
142
println catalog.book.title.text() // All titles as text
143
println catalog.book[0].title // "The Great Gatsby"
144
println catalog.book.'@category' // All category attributes
145
println catalog.book.find { it.'@id' == '1' }.title // Find by attribute
146
147
// Advanced navigation
148
println catalog.'**'.findAll { it.name() == 'price' }*.text() // All prices (findAll returns a List, so spread text() over it)
149
println catalog.book.findAll { it.price.toDouble() > 13.0 } // Books over $13
150
151
// Attribute access
152
catalog.book.each { book ->
153
println "Book ${book.'@id'}: ${book.title} by ${book.author}"
154
println "Price: ${book.price.'@currency'} ${book.price.text()}"
155
}
156
157
// Parse from file with configuration
158
def namespaceSlurper = new XmlSlurper(false, true) // not validating, namespace aware
159
namespaceSlurper.setKeepIgnorableWhitespace(false)
160
def result = namespaceSlurper.parse(new File('document.xml'))
161
```
162
163
## Node Type (XmlParser Result)
164
165
The Node class represents parsed XML elements from XmlParser.
166
167
```java { .api }
168
public class Node implements Serializable {
169
// Basic properties
170
public String name();
171
public String text();
172
public List<Node> children();
173
public Map<String, String> attributes();
174
175
// Content access
176
public Object get(String key);
177
public Object getAt(String key);
178
public void putAt(String key, Object value);
179
180
// Modification methods
181
public void setValue(String value);
182
public Node appendNode(String name);
183
public Node appendNode(String name, String value);
184
public Node appendNode(String name, Map<String, Object> attributes);
185
public Node appendNode(String name, Map<String, Object> attributes, String value);
186
public boolean remove(Node child);
187
188
// Navigation
189
public Node parent();
190
public List<Node> breadthFirst();
191
public List<Node> depthFirst();
192
193
// Utility methods
194
public Node plus(Node node);
195
public Iterator<Node> iterator();
196
}
197
```
198
199
### Node Usage Examples
200
201
```groovy
202
def parser = new XmlParser()
203
def root = parser.parseText('<root><item id="1">value</item></root>')
204
205
// Access node properties
206
println root.name() // "root"
207
println root.item[0].text() // "value"
208
println root.item[0].'@id' // "1"
209
210
// Traverse and modify
211
root.children().each { child ->
212
println "Child: ${child.name()} = ${child.text()}"
213
}
214
215
// Add new nodes
216
def newItem = root.appendNode('item', [id: '2'], 'new value')
217
root.appendNode('metadata').with { // appendNode has no closure overload; use with {} to add children to the new node
218
appendNode('created', new Date().toString())
219
appendNode('version', '1.0')
220
}
221
222
// Remove nodes
223
root.item.findAll { it.'@id' == '1' }.each { root.remove(it) }
224
```
225
226
## Parser Configuration
227
228
Both parsers support extensive configuration for different parsing scenarios:
229
230
```groovy
231
// Validation and namespace configuration
232
def validatingParser = new XmlParser(
233
true, // validating
234
true, // namespace aware
235
false // allowDocTypeDeclaration = false: DOCTYPE declarations are rejected
236
)
237
238
// Whitespace handling
239
parser.setTrimWhitespace(true) // Trim whitespace around text
240
parser.setKeepIgnorableWhitespace(false) // Don't keep insignificant whitespace
241
242
// Custom SAX configuration
243
parser.setErrorHandler(new MyErrorHandler())
244
parser.setEntityResolver(new MyEntityResolver())
245
246
// For XmlSlurper
247
def slurper = new XmlSlurper(false, true) // not validating, namespace aware
248
slurper.setKeepIgnorableWhitespace(false)
249
slurper.setEntityBaseUrl(new URL('http://example.com/'))
250
```
251
252
## Error Handling
253
254
Both parsers can throw SAXException and IOException during parsing:
255
256
```groovy
257
try {
258
def parser = new XmlParser()
259
def result = parser.parseText(invalidXml)
260
} catch (SAXException e) {
261
println "XML parsing error: ${e.message}"
262
} catch (IOException e) {
263
println "IO error: ${e.message}"
264
}
265
266
// Custom error handling
267
parser.setErrorHandler(new ErrorHandler() {
268
void error(SAXParseException e) throws SAXException {
269
println "Parsing error at line ${e.lineNumber}: ${e.message}"
270
}
271
272
void fatalError(SAXParseException e) throws SAXException {
273
throw e // Re-throw fatal errors
274
}
275
276
void warning(SAXParseException e) throws SAXException {
277
println "Warning: ${e.message}"
278
}
279
})
280
```
281
282
## Comparison: XmlParser vs XmlSlurper
283
284
| Feature | XmlParser | XmlSlurper |
285
|---------|-----------|------------|
286
| Result Type | Node (mutable) | GPathResult (immutable) |
287
| Memory Usage | Higher (full DOM) | Lower (lazy evaluation) |
288
| Navigation | Object traversal | XPath-like |
289
| Modification | Yes | No (read-only) |
290
| Performance | Better for modification | Better for navigation |
291
| Suitable For | Manipulating XML | Querying XML |
292
293
Choose XmlParser when you need to modify XML structures, and XmlSlurper when you primarily need to read and navigate XML content.
294
295
## Parser Factory Classes
296
297
Factory classes provide convenient methods for creating parser instances with standard configurations.
298
299
### XmlParserFactory
300
301
```groovy { .api }
302
class XmlParserFactory {
303
static Object newParser(Object... args)
304
}
305
```
306
307
### XmlSlurperFactory
308
309
```groovy { .api }
310
class XmlSlurperFactory {
311
static Object newSlurper(Object... args)
312
}
313
```
314
315
### Factory Usage
316
317
```groovy
318
// Using parser factory with default settings
319
def parser = XmlParserFactory.newParser()
320
321
// Using parser factory with custom settings
322
def validatingParser = XmlParserFactory.newParser(
323
true, // validating
324
true // namespace aware
325
)
326
327
// Using slurper factory
328
def slurper = XmlSlurperFactory.newSlurper(false, true) // not validating, namespace aware
329
330
// Factories handle ParserConfigurationException and SAXException internally
331
try {
332
def customParser = XmlParserFactory.newParser(true, true, false) // validating, namespace aware, no DOCTYPE
333
def result = customParser.parseText(xmlString)
334
} catch (Exception e) {
335
println "Parser creation or parsing failed: ${e.message}"
336
}
337
```
338
339
## Advanced Error Handling
340
341
### Comprehensive Exception Handling
342
343
```groovy
344
import javax.xml.parsers.ParserConfigurationException
345
import org.xml.sax.SAXException
346
import org.xml.sax.SAXParseException
347
348
def robustParse = { xmlString ->
349
try {
350
def parser = new XmlParser()
351
return parser.parseText(xmlString)
352
353
} catch (ParserConfigurationException e) {
354
println "Parser configuration error: ${e.message}"
355
println "Check your XML parser installation and configuration"
356
return null
357
358
} catch (SAXParseException e) {
359
println "XML structure error at line ${e.lineNumber}, column ${e.columnNumber}:"
360
println " ${e.message}"
361
println " System ID: ${e.systemId}"
362
println " Public ID: ${e.publicId}"
363
return null
364
365
} catch (SAXException e) {
366
println "XML parsing error: ${e.message}"
367
if (e.exception) {
368
println "Root cause: ${e.exception.message}"
369
}
370
return null
371
372
} catch (IOException e) {
373
println "IO error while parsing: ${e.message}"
374
return null
375
376
} catch (Exception e) {
377
println "Unexpected error during parsing: ${e.message}"
378
e.printStackTrace()
379
return null
380
}
381
}
382
383
// Usage
384
def xml = '<root><item>valid</item></root>'
385
def result = robustParse(xml)
386
387
def invalidXml = '<root><item>unclosed'
388
def failedResult = robustParse(invalidXml) // Will handle the error gracefully
389
```
390
391
### Custom Error Handlers
392
393
```groovy
394
import org.xml.sax.ErrorHandler
395
import org.xml.sax.SAXParseException
396
397
class DetailedErrorHandler implements ErrorHandler {
398
List<String> warnings = []
399
List<String> errors = []
400
List<String> fatalErrors = []
401
402
@Override
403
void warning(SAXParseException e) throws SAXException {
404
def msg = "Warning at line ${e.lineNumber}: ${e.message}"
405
warnings << msg
406
println msg
407
}
408
409
@Override
410
void error(SAXParseException e) throws SAXException {
411
def msg = "Error at line ${e.lineNumber}: ${e.message}"
412
errors << msg
413
println msg
414
// Don't throw - allow parsing to continue
415
}
416
417
@Override
418
void fatalError(SAXParseException e) throws SAXException {
419
def msg = "Fatal error at line ${e.lineNumber}: ${e.message}"
420
fatalErrors << msg
421
println msg
422
throw e // Must throw for fatal errors
423
}
424
425
boolean hasErrors() {
426
return !errors.isEmpty() || !fatalErrors.isEmpty()
427
}
428
429
void printSummary() {
430
println "Parsing summary:"
431
println " Warnings: ${warnings.size()}"
432
println " Errors: ${errors.size()}"
433
println " Fatal errors: ${fatalErrors.size()}"
434
}
435
}
436
437
// Usage with custom error handler
438
def parseWithDetailedErrors = { xmlString ->
439
def errorHandler = new DetailedErrorHandler()
440
441
try {
442
def parser = new XmlParser()
443
parser.setErrorHandler(errorHandler)
444
445
def result = parser.parseText(xmlString)
446
errorHandler.printSummary()
447
448
if (errorHandler.hasErrors()) {
449
println "Parsing completed with errors - results may be incomplete"
450
}
451
452
return result
453
454
} catch (Exception e) {
455
errorHandler.printSummary()
456
println "Parsing failed: ${e.message}"
457
return null
458
}
459
}
460
```
461
462
### Validation Error Handling
463
464
```groovy
465
import javax.xml.validation.SchemaFactory
466
import javax.xml.validation.Schema
467
import javax.xml.XMLConstants
468
import javax.xml.transform.stream.StreamSource
469
470
def parseWithSchemaValidation = { xmlString, xsdFile ->
471
try {
472
// Create schema from XSD file
473
def schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
474
def schema = schemaFactory.newSchema(xsdFile)
475
476
// Create validating parser
477
def parser = XmlUtil.newSAXParser(XMLConstants.W3C_XML_SCHEMA_NS_URI, xsdFile) // public overloads accept a File, URL, or Source..., not a Schema
478
def xmlParser = new XmlParser(parser)
479
480
// Custom error handler for validation errors
481
def validationErrors = []
482
xmlParser.setErrorHandler(new ErrorHandler() {
483
void warning(SAXParseException e) {
484
validationErrors << "Validation warning: ${e.message}"
485
}
486
487
void error(SAXParseException e) {
488
validationErrors << "Validation error: ${e.message}"
489
}
490
491
void fatalError(SAXParseException e) throws SAXException {
492
validationErrors << "Fatal validation error: ${e.message}"
493
throw e
494
}
495
})
496
497
def result = xmlParser.parseText(xmlString)
498
499
if (validationErrors) {
500
println "Validation issues found:"
501
validationErrors.each { println " ${it}" }
502
} else {
503
println "Document is valid according to schema"
504
}
505
506
return result
507
508
} catch (Exception e) {
509
println "Schema validation failed: ${e.message}"
510
return null
511
}
512
}
513
514
// Usage
515
def xsdFile = new File('catalog.xsd')
516
def validXml = '''
517
<catalog xmlns="http://example.com/catalog">
518
<book id="1">
519
<title>Valid Book</title>
520
<author>Valid Author</author>
521
</book>
522
</catalog>
523
'''
524
525
def result = parseWithSchemaValidation(validXml, xsdFile)
526
```
527
528
### Recovery Strategies
529
530
```groovy
531
class XmlParsingRecovery {
532
533
static Node parseWithFallback(String xmlString) {
534
// Try strict parsing first
535
try {
536
def parser = new XmlParser(true, true) // validating, namespace aware
537
return parser.parseText(xmlString)
538
} catch (Exception e) {
539
println "Strict parsing failed: ${e.message}"
540
}
541
542
// Try lenient parsing
543
try {
544
def parser = new XmlParser(false, false) // non-validating, not namespace aware
545
parser.setTrimWhitespace(true)
546
return parser.parseText(xmlString.trim())
547
} catch (Exception e) {
548
println "Lenient parsing failed: ${e.message}"
549
}
550
551
// Try to fix common issues and parse again
552
try {
553
def fixedXml = fixCommonXmlIssues(xmlString)
554
def parser = new XmlParser(false, false)
555
return parser.parseText(fixedXml)
556
} catch (Exception e) {
557
println "Recovery parsing failed: ${e.message}"
558
}
559
560
return null
561
}
562
563
static String fixCommonXmlIssues(String xml) {
564
return xml
565
.replaceAll(/&(?![a-zA-Z0-9#]+;)/, '&amp;') // Fix unescaped ampersands
566
.replaceAll(/<(?!\/?[a-zA-Z])/, '&lt;') // Fix unescaped less-than
567
.replaceAll(/(?<![a-zA-Z0-9])>/, '&gt;') // Fix unescaped greater-than
568
.replaceAll(/\r\n|\r/, '\n') // Normalize line endings
569
.trim()
570
}
571
572
static GPathResult slurpWithFallback(String xmlString) {
573
// Similar fallback strategy for XmlSlurper
574
try {
575
def slurper = new XmlSlurper(true, true)
576
return slurper.parseText(xmlString)
577
} catch (Exception e) {
578
println "Strict slurping failed: ${e.message}"
579
}
580
581
try {
582
def slurper = new XmlSlurper(false, false)
583
slurper.setKeepIgnorableWhitespace(false)
584
return slurper.parseText(xmlString.trim())
585
} catch (Exception e) {
586
println "Lenient slurping failed: ${e.message}"
587
}
588
589
try {
590
def fixedXml = fixCommonXmlIssues(xmlString)
591
def slurper = new XmlSlurper(false, false)
592
return slurper.parseText(fixedXml)
593
} catch (Exception e) {
594
println "Recovery slurping failed: ${e.message}"
595
}
596
597
return null
598
}
599
}
600
601
// Usage
602
def problematicXml = '<root><item>Text with & unescaped chars < ></item></root>'
603
def recovered = XmlParsingRecovery.parseWithFallback(problematicXml)
604
605
if (recovered) {
606
println "Successfully recovered and parsed XML"
607
println recovered.item.text()
608
} else {
609
println "Could not recover the XML"
610
}
611
```
612
613
## Performance Considerations
614
615
### Memory Management for Large Documents
616
617
```groovy
618
// For large XML files, prefer XmlSlurper over XmlParser
619
def processLargeXml = { file ->
620
if (file.size() > 10 * 1024 * 1024) { // > 10MB
621
println "Large file detected, using XmlSlurper for better memory efficiency"
622
def slurper = new XmlSlurper()
623
slurper.setKeepIgnorableWhitespace(false)
624
return slurper.parse(file)
625
} else {
626
println "Small file, using XmlParser for full DOM access"
627
def parser = new XmlParser()
628
return parser.parse(file)
629
}
630
}
631
632
// Process top-level elements one at a time (note: slurper.parse still reads the entire document before iteration starts)
633
def processXmlInChunks = { file, chunkProcessor ->
634
def slurper = new XmlSlurper()
635
def doc = slurper.parse(file)
636
637
// Process top-level elements one at a time
638
doc.children().each { element ->
639
chunkProcessor(element)
640
// Allow garbage collection of processed elements
641
System.gc()
642
}
643
}
644
```