or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

builders.mdentities.mdindex.mdjaxb.mdnamespaces.mdnavigation.mdparsing.mdstreaming.mdutilities.md

parsing.mddocs/

0

# XML Parsing

1

2

Groovy XML provides two main approaches to parsing XML documents: XmlParser for Node-based manipulation and XmlSlurper for XPath-like navigation using GPathResult.

3

4

## XmlParser

5

6

Parses XML into a tree of Node objects that can be directly manipulated, modified, and traversed.

7

8

```java { .api }

9

public class XmlParser implements ContentHandler {

10

// Constructors

11

public XmlParser();

12

public XmlParser(boolean validating, boolean namespaceAware);

13

public XmlParser(boolean validating, boolean namespaceAware, boolean allowDocTypeDeclaration);

14

public XmlParser(XMLReader reader);

15

public XmlParser(SAXParser parser);

16

17

// Parsing methods

18

public Node parse(File file) throws IOException, SAXException;

19

public Node parse(InputSource input) throws IOException, SAXException;

20

public Node parse(InputStream input) throws IOException, SAXException;

21

public Node parse(Reader in) throws IOException, SAXException;

22

public Node parse(String uri) throws IOException, SAXException;

23

public Node parseText(String text) throws SAXException;

24

25

// Configuration methods

26

public boolean isTrimWhitespace();

27

public void setTrimWhitespace(boolean trimWhitespace);

28

public boolean isKeepIgnorableWhitespace();

29

public void setKeepIgnorableWhitespace(boolean keepIgnorableWhitespace);

30

public boolean isNamespaceAware();

31

public void setNamespaceAware(boolean namespaceAware);

32

33

// SAX ContentHandler configuration

34

public void setContentHandler(ContentHandler contentHandler);

35

public void setErrorHandler(ErrorHandler errorHandler);

36

public void setEntityResolver(EntityResolver entityResolver);

37

public void setDTDHandler(DTDHandler dtdHandler);

38

}

39

```

40

41

### XmlParser Usage

42

43

```groovy

44

// Basic parsing

45

def parser = new XmlParser()

46

def root = parser.parseText('''

47

<catalog>

48

<book id="1" category="fiction">

49

<title>The Great Gatsby</title>

50

<author>F. Scott Fitzgerald</author>

51

<price currency="USD">12.99</price>

52

</book>

53

<book id="2" category="science">

54

<title>A Brief History of Time</title>

55

<author>Stephen Hawking</author>

56

<price currency="USD">15.99</price>

57

</book>

58

</catalog>

59

''')

60

61

// Access elements and attributes

62

println root.name() // "catalog"

63

println root.book.size() // 2

64

println root.book[0].title.text() // "The Great Gatsby"

65

println root.book[0].'@id' // "1"

66

println root.book[0].'@category' // "fiction"

67

68

// Modify the structure

69

root.book[0].title[0].value = 'New Title'

70

root.book[0].author[0].value = 'New Author'

71

72

// Add new elements

73

root.appendNode('publisher', 'Penguin Books')

74

root.book[0].appendNode('isbn', '978-0-7432-7356-5')

75

76

// Parse from file

77

def fileNode = parser.parse(new File('catalog.xml'))

78

79

// Parse with validation and namespace awareness

80

def validatingParser = new XmlParser(true, true)

81

validatingParser.setTrimWhitespace(true)

82

validatingParser.setKeepIgnorableWhitespace(false)

83

def validatedRoot = validatingParser.parseText(xmlString)

84

```

85

86

## XmlSlurper

87

88

Parses XML into GPathResult objects providing XPath-like navigation and lazy evaluation.

89

90

```java { .api }

91

public class XmlSlurper extends DefaultHandler {

92

// Constructors

93

public XmlSlurper();

94

public XmlSlurper(boolean validating, boolean namespaceAware);

95

public XmlSlurper(boolean validating, boolean namespaceAware, boolean allowDocTypeDeclaration);

96

public XmlSlurper(XMLReader reader);

97

public XmlSlurper(SAXParser parser);

98

99

// Parsing methods

100

public GPathResult parse(InputSource input) throws IOException, SAXException;

101

public GPathResult parse(File file) throws IOException, SAXException;

102

public GPathResult parse(InputStream input) throws IOException, SAXException;

103

public GPathResult parse(Reader in) throws IOException, SAXException;

104

public GPathResult parse(String uri) throws IOException, SAXException;

105

public GPathResult parseText(String text) throws SAXException;

106

107

// Configuration methods

108

public GPathResult getDocument();

109

public void setKeepIgnorableWhitespace(boolean keepIgnorableWhitespace);

110

public boolean isKeepIgnorableWhitespace();

111

public void setEntityBaseUrl(URL base);

112

113

// SAX Handler configuration

114

public void setContentHandler(ContentHandler contentHandler);

115

public void setErrorHandler(ErrorHandler errorHandler);

116

public void setEntityResolver(EntityResolver entityResolver);

117

public void setDTDHandler(DTDHandler dtdHandler);

118

}

119

```

120

121

### XmlSlurper Usage

122

123

```groovy

124

// Basic slurping

125

def slurper = new XmlSlurper()

126

def catalog = slurper.parseText('''

127

<catalog>

128

<book id="1" category="fiction">

129

<title>The Great Gatsby</title>

130

<author>F. Scott Fitzgerald</author>

131

<price currency="USD">12.99</price>

132

</book>

133

<book id="2" category="science">

134

<title>A Brief History of Time</title>

135

<author>Stephen Hawking</author>

136

<price currency="USD">15.99</price>

137

</book>

138

</catalog>

139

''')

140

141

// XPath-like navigation

142

println catalog.book.title.text() // Text of all titles, concatenated into a single string

143

println catalog.book[0].title // "The Great Gatsby"

144

println catalog.book.'@category' // All category attributes

145

println catalog.book.find { it.'@id' == '1' }.title // Find by attribute

146

147

// Advanced navigation

148

println catalog.'**'.findAll { it.name() == 'price' }.text() // All prices

149

println catalog.book.findAll { it.price.toDouble() > 13.0 } // Books over $13

150

151

// Attribute access

152

catalog.book.each { book ->

153

println "Book ${book.'@id'}: ${book.title} by ${book.author}"

154

println "Price: ${book.price.'@currency'} ${book.price.text()}"

155

}

156

157

// Parse from file with configuration

158

def namespaceSlurper = new XmlSlurper(false, true) // not validating, namespace aware

159

namespaceSlurper.setKeepIgnorableWhitespace(false)

160

def result = namespaceSlurper.parse(new File('document.xml'))

161

```

162

163

## Node Type (XmlParser Result)

164

165

The Node class represents parsed XML elements from XmlParser.

166

167

```java { .api }

168

public class Node implements Serializable {

169

// Basic properties

170

public String name();

171

public String text();

172

public List<Node> children();

173

public Map<String, String> attributes();

174

175

// Content access

176

public Object get(String key);

177

public Object getAt(String key);

178

public void putAt(String key, Object value);

179

180

// Modification methods

181

public void setValue(String value);

182

public Node appendNode(String name);

183

public Node appendNode(String name, String value);

184

public Node appendNode(String name, Map<String, Object> attributes);

185

public Node appendNode(String name, Map<String, Object> attributes, String value);

186

public boolean remove(Node child);

187

188

// Navigation

189

public Node parent();

190

public List<Node> breadthFirst();

191

public List<Node> depthFirst();

192

193

// Utility methods

194

public Node plus(Node node);

195

public Iterator<Node> iterator();

196

}

197

```

198

199

### Node Usage Examples

200

201

```groovy

202

def parser = new XmlParser()

203

def root = parser.parseText('<root><item id="1">value</item></root>')

204

205

// Access node properties

206

println root.name() // "root"

207

println root.item[0].text() // "value"

208

println root.item[0].'@id' // "1"

209

210

// Traverse and modify

211

root.children().each { child ->

212

println "Child: ${child.name()} = ${child.text()}"

213

}

214

215

// Add new nodes

216

def newItem = root.appendNode('item', [id: '2'], 'new value')

217

root.appendNode('metadata') {

218

appendNode('created', new Date().toString())

219

appendNode('version', '1.0')

220

}

221

222

// Remove nodes

223

root.item.findAll { it.'@id' == '1' }.each { root.remove(it) }

224

```

225

226

## Parser Configuration

227

228

Both parsers support extensive configuration for different parsing scenarios:

229

230

```groovy

231

// Validation and namespace configuration

232

def validatingParser = new XmlParser(

233

true, // validating

234

true, // namespace aware

235

false // allowDocTypeDeclaration: false means documents containing a DOCTYPE are rejected

236

)

237

238

// Whitespace handling

239

parser.setTrimWhitespace(true) // Trim whitespace around text

240

parser.setKeepIgnorableWhitespace(false) // Don't keep insignificant whitespace

241

242

// Custom SAX configuration

243

parser.setErrorHandler(new MyErrorHandler())

244

parser.setEntityResolver(new MyEntityResolver())

245

246

// For XmlSlurper

247

def slurper = new XmlSlurper(false, true) // not validating, namespace aware

248

slurper.setKeepIgnorableWhitespace(false)

249

slurper.setEntityBaseUrl(new URL('http://example.com/'))

250

```

251

252

## Error Handling

253

254

Both parsers can throw SAXException and IOException during parsing:

255

256

```groovy

257

try {

258

def parser = new XmlParser()

259

def result = parser.parseText(invalidXml)

260

} catch (SAXException e) {

261

println "XML parsing error: ${e.message}"

262

} catch (IOException e) {

263

println "IO error: ${e.message}"

264

}

265

266

// Custom error handling

267

parser.setErrorHandler(new ErrorHandler() {

268

void error(SAXParseException e) throws SAXException {

269

println "Parsing error at line ${e.lineNumber}: ${e.message}"

270

}

271

272

void fatalError(SAXParseException e) throws SAXException {

273

throw e // Re-throw fatal errors

274

}

275

276

void warning(SAXParseException e) throws SAXException {

277

println "Warning: ${e.message}"

278

}

279

})

280

```

281

282

## Comparison: XmlParser vs XmlSlurper

283

284

| Feature | XmlParser | XmlSlurper |

285

|---------|-----------|------------|

286

| Result Type | Node (mutable) | GPathResult (lazily evaluated) |

287

| Memory Usage | Higher (full DOM) | Lower (lazy evaluation) |

288

| Navigation | Object traversal | XPath-like |

289

| Modification | Yes (applied immediately) | Limited (deferred; changes appear only after re-serializing) |

290

| Performance | Better for modification | Better for navigation |

291

| Suitable For | Manipulating XML | Querying XML |

292

293

Choose XmlParser when you need to modify XML structures, and XmlSlurper when you primarily need to read and navigate XML content.

294

295

## Parser Factory Classes

296

297

Factory classes provide convenient methods for creating parser instances with standard configurations.

298

299

### XmlParserFactory

300

301

```groovy { .api }

302

class XmlParserFactory {

303

static Object newParser(Object... args)

304

}

305

```

306

307

### XmlSlurperFactory

308

309

```groovy { .api }

310

class XmlSlurperFactory {

311

static Object newSlurper(Object... args)

312

}

313

```

314

315

### Factory Usage

316

317

```groovy

318

// Using parser factory with default settings

319

def parser = XmlParserFactory.newParser()

320

321

// Using parser factory with custom settings

322

def validatingParser = XmlParserFactory.newParser(

323

true, // validating

324

true // namespace aware

325

)

326

327

// Using slurper factory

328

def slurper = XmlSlurperFactory.newSlurper(false, true) // not validating, namespace aware

329

330

// Factory calls can still fail (e.g. ParserConfigurationException, SAXException), so guard creation and parsing together

331

try {

332

def customParser = XmlParserFactory.newParser(true, true, false) // validating, namespace aware, no DOCTYPE

333

def result = customParser.parseText(xmlString)

334

} catch (Exception e) {

335

println "Parser creation or parsing failed: ${e.message}"

336

}

337

```

338

339

## Advanced Error Handling

340

341

### Comprehensive Exception Handling

342

343

```groovy

344

import javax.xml.parsers.ParserConfigurationException

345

import org.xml.sax.SAXException

346

import org.xml.sax.SAXParseException

347

348

def robustParse = { xmlString ->

349

try {

350

def parser = new XmlParser()

351

return parser.parseText(xmlString)

352

353

} catch (ParserConfigurationException e) {

354

println "Parser configuration error: ${e.message}"

355

println "Check your XML parser installation and configuration"

356

return null

357

358

} catch (SAXParseException e) {

359

println "XML structure error at line ${e.lineNumber}, column ${e.columnNumber}:"

360

println " ${e.message}"

361

println " System ID: ${e.systemId}"

362

println " Public ID: ${e.publicId}"

363

return null

364

365

} catch (SAXException e) {

366

println "XML parsing error: ${e.message}"

367

if (e.exception) {

368

println "Root cause: ${e.exception.message}"

369

}

370

return null

371

372

} catch (IOException e) {

373

println "IO error while parsing: ${e.message}"

374

return null

375

376

} catch (Exception e) {

377

println "Unexpected error during parsing: ${e.message}"

378

e.printStackTrace()

379

return null

380

}

381

}

382

383

// Usage

384

def xml = '<root><item>valid</item></root>'

385

def result = robustParse(xml)

386

387

def invalidXml = '<root><item>unclosed'

388

def failedResult = robustParse(invalidXml) // Will handle the error gracefully

389

```

390

391

### Custom Error Handlers

392

393

```groovy

394

import org.xml.sax.ErrorHandler

395

import org.xml.sax.SAXParseException

396

397

class DetailedErrorHandler implements ErrorHandler {

398

List<String> warnings = []

399

List<String> errors = []

400

List<String> fatalErrors = []

401

402

@Override

403

void warning(SAXParseException e) throws SAXException {

404

def msg = "Warning at line ${e.lineNumber}: ${e.message}"

405

warnings << msg

406

println msg

407

}

408

409

@Override

410

void error(SAXParseException e) throws SAXException {

411

def msg = "Error at line ${e.lineNumber}: ${e.message}"

412

errors << msg

413

println msg

414

// Don't throw - allow parsing to continue

415

}

416

417

@Override

418

void fatalError(SAXParseException e) throws SAXException {

419

def msg = "Fatal error at line ${e.lineNumber}: ${e.message}"

420

fatalErrors << msg

421

println msg

422

throw e // Must throw for fatal errors

423

}

424

425

boolean hasErrors() {

426

return !errors.isEmpty() || !fatalErrors.isEmpty()

427

}

428

429

void printSummary() {

430

println "Parsing summary:"

431

println " Warnings: ${warnings.size()}"

432

println " Errors: ${errors.size()}"

433

println " Fatal errors: ${fatalErrors.size()}"

434

}

435

}

436

437

// Usage with custom error handler

438

def parseWithDetailedErrors = { xmlString ->

439

def errorHandler = new DetailedErrorHandler()

440

441

try {

442

def parser = new XmlParser()

443

parser.setErrorHandler(errorHandler)

444

445

def result = parser.parseText(xmlString)

446

errorHandler.printSummary()

447

448

if (errorHandler.hasErrors()) {

449

println "Parsing completed with errors - results may be incomplete"

450

}

451

452

return result

453

454

} catch (Exception e) {

455

errorHandler.printSummary()

456

println "Parsing failed: ${e.message}"

457

return null

458

}

459

}

460

```

461

462

### Validation Error Handling

463

464

```groovy

465

import javax.xml.validation.SchemaFactory

466

import javax.xml.validation.Schema

467

import javax.xml.XMLConstants

468

import javax.xml.transform.stream.StreamSource

469

470

def parseWithSchemaValidation = { xmlString, xsdFile ->

471

try {

472

// Create schema from XSD file

473

def schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)

474

def schema = schemaFactory.newSchema(xsdFile)

475

476

// Create validating parser

477

def parser = XmlUtil.newSAXParser(XMLConstants.W3C_XML_SCHEMA_NS_URI, schema)

478

def xmlParser = new XmlParser(parser)

479

480

// Custom error handler for validation errors

481

def validationErrors = []

482

xmlParser.setErrorHandler(new ErrorHandler() {

483

void warning(SAXParseException e) {

484

validationErrors << "Validation warning: ${e.message}"

485

}

486

487

void error(SAXParseException e) {

488

validationErrors << "Validation error: ${e.message}"

489

}

490

491

void fatalError(SAXParseException e) throws SAXException {

492

validationErrors << "Fatal validation error: ${e.message}"

493

throw e

494

}

495

})

496

497

def result = xmlParser.parseText(xmlString)

498

499

if (validationErrors) {

500

println "Validation issues found:"

501

validationErrors.each { println " ${it}" }

502

} else {

503

println "Document is valid according to schema"

504

}

505

506

return result

507

508

} catch (Exception e) {

509

println "Schema validation failed: ${e.message}"

510

return null

511

}

512

}

513

514

// Usage

515

def xsdFile = new File('catalog.xsd')

516

def validXml = '''

517

<catalog xmlns="http://example.com/catalog">

518

<book id="1">

519

<title>Valid Book</title>

520

<author>Valid Author</author>

521

</book>

522

</catalog>

523

'''

524

525

def result = parseWithSchemaValidation(validXml, xsdFile)

526

```

527

528

### Recovery Strategies

529

530

```groovy

531

class XmlParsingRecovery {

532

533

static Node parseWithFallback(String xmlString) {

534

// Try strict parsing first

535

try {

536

def parser = new XmlParser(true, true) // validating, namespace aware

537

return parser.parseText(xmlString)

538

} catch (Exception e) {

539

println "Strict parsing failed: ${e.message}"

540

}

541

542

// Try lenient parsing

543

try {

544

def parser = new XmlParser(false, false) // non-validating, not namespace aware

545

parser.setTrimWhitespace(true)

546

return parser.parseText(xmlString.trim())

547

} catch (Exception e) {

548

println "Lenient parsing failed: ${e.message}"

549

}

550

551

// Try to fix common issues and parse again

552

try {

553

def fixedXml = fixCommonXmlIssues(xmlString)

554

def parser = new XmlParser(false, false)

555

return parser.parseText(fixedXml)

556

} catch (Exception e) {

557

println "Recovery parsing failed: ${e.message}"

558

}

559

560

return null

561

}

562

563

static String fixCommonXmlIssues(String xml) {

564

return xml

565

.replaceAll(/&(?![a-zA-Z0-9#]+;)/, '&amp;') // Fix unescaped ampersands

566

.replaceAll(/<(?!\/?[a-zA-Z])/,'&lt;') // Fix unescaped less-than (caution: also escapes the openers of <?xml ...?>, <!-- --> and <![CDATA[ ]]>)

567

.replaceAll(/(?<![a-zA-Z0-9])>/, '&gt;') // Fix unescaped greater-than (caution: also escapes the > closing tags that end in a quote or slash, e.g. <a id="1"> or <br/>)

568

.replaceAll(/\r\n|\r/, '\n') // Normalize line endings

569

.trim()

570

}

571

572

static GPathResult slurpWithFallback(String xmlString) {

573

// Similar fallback strategy for XmlSlurper

574

try {

575

def slurper = new XmlSlurper(true, true)

576

return slurper.parseText(xmlString)

577

} catch (Exception e) {

578

println "Strict slurping failed: ${e.message}"

579

}

580

581

try {

582

def slurper = new XmlSlurper(false, false)

583

slurper.setKeepIgnorableWhitespace(false)

584

return slurper.parseText(xmlString.trim())

585

} catch (Exception e) {

586

println "Lenient slurping failed: ${e.message}"

587

}

588

589

try {

590

def fixedXml = fixCommonXmlIssues(xmlString)

591

def slurper = new XmlSlurper(false, false)

592

return slurper.parseText(fixedXml)

593

} catch (Exception e) {

594

println "Recovery slurping failed: ${e.message}"

595

}

596

597

return null

598

}

599

}

600

601

// Usage

602

def problematicXml = '<root><item>Text with & unescaped chars < ></item></root>'

603

def recovered = XmlParsingRecovery.parseWithFallback(problematicXml)

604

605

if (recovered) {

606

println "Successfully recovered and parsed XML"

607

println recovered.item.text()

608

} else {

609

println "Could not recover the XML"

610

}

611

```

612

613

## Performance Considerations

614

615

### Memory Management for Large Documents

616

617

```groovy

618

// For large XML files, prefer XmlSlurper over XmlParser

619

def processLargeXml = { file ->

620

if (file.size() > 10 * 1024 * 1024) { // > 10MB

621

println "Large file detected, using XmlSlurper for better memory efficiency"

622

def slurper = new XmlSlurper()

623

slurper.setKeepIgnorableWhitespace(false)

624

return slurper.parse(file)

625

} else {

626

println "Small file, using XmlParser for full DOM access"

627

def parser = new XmlParser()

628

return parser.parse(file)

629

}

630

}

631

632

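// Handle top-level elements one at a time (the whole document is still parsed into memory first; use a SAX/StAX streaming parser for files that do not fit in memory)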

633

def processXmlInChunks = { file, chunkProcessor ->

634

def slurper = new XmlSlurper()

635

def doc = slurper.parse(file)

636

637

// Process top-level elements one at a time

638

doc.children().each { element ->

639

chunkProcessor(element)

640

// NOTE: System.gc() is only a hint; processed elements stay reachable through doc, so this does not reduce the parsed document's footprint

641

System.gc()

642

}

643

}

644

```