0
# Content Processing
1
2
A SAX-based content handler system for processing and transforming document content during parsing. It includes specialized handlers for text extraction, HTML/XML conversion, link extraction, and XPath-based content matching.
3
4
## Capabilities
5
6
### Content Handler Base Classes
7
8
#### BodyContentHandler
9
10
Primary content handler for extracting textual content from documents with configurable output limits and encoding support.
11
12
```java { .api }
13
/**
14
* Content handler that extracts textual content from documents
15
*/
16
public class BodyContentHandler extends DefaultHandler implements WriteOutContentHandler {
17
/**
18
* Creates a BodyContentHandler with default StringWriter
19
*/
20
public BodyContentHandler();
21
22
/**
23
* Creates a BodyContentHandler with custom Writer
24
* @param writer Writer to receive extracted content
25
*/
26
public BodyContentHandler(Writer writer);
27
28
/**
29
* Creates a BodyContentHandler with write limit
30
* @param writeLimit Maximum characters to write (-1 for no limit)
31
*/
32
public BodyContentHandler(int writeLimit);
33
34
/**
35
* Gets the extracted content as string
36
* @return String containing extracted textual content
37
*/
38
@Override
39
public String toString();
40
41
/**
42
* Checks if write limit has been reached
43
* @return true if write limit exceeded
44
*/
45
public boolean isWriteLimitReached();
46
}
47
```
48
49
#### WriteOutContentHandler
50
51
Interface for content handlers that support write limits and output control.
52
53
```java { .api }
54
/**
55
* Interface for content handlers with write limit support
56
*/
57
public interface WriteOutContentHandler {
58
/**
59
* Gets the extracted content as string
60
* @return String representation of extracted content
61
*/
62
String toString();
63
64
/**
65
* Checks if configured write limit has been reached
66
* @return true if write limit exceeded, false otherwise
67
*/
68
boolean isWriteLimitReached();
69
}
70
```
71
72
#### ContentHandlerDecorator
73
74
Base decorator class for wrapping and extending content handler functionality.
75
76
```java { .api }
77
/**
78
* Abstract base class for decorating ContentHandler instances
79
*/
80
public abstract class ContentHandlerDecorator implements ContentHandler {
81
/**
82
* Creates decorator around existing ContentHandler
83
* @param handler ContentHandler to decorate
84
*/
85
protected ContentHandlerDecorator(ContentHandler handler);
86
87
/**
88
* Gets the wrapped ContentHandler
89
* @return Underlying ContentHandler instance
90
*/
91
protected ContentHandler getContentHandler();
92
}
93
```
94
95
### Format Conversion Handlers
96
97
#### ToXMLContentHandler
98
99
Converts document content to well-formed XML output with proper encoding and namespace handling.
100
101
```java { .api }
102
/**
103
* Content handler that converts document content to XML format
104
*/
105
public class ToXMLContentHandler extends ContentHandlerDecorator {
106
/**
107
* Creates ToXMLContentHandler with default XML output
108
*/
109
public ToXMLContentHandler();
110
111
/**
112
* Creates ToXMLContentHandler with custom Result target
113
* @param result Result object for XML output
114
*/
115
public ToXMLContentHandler(Result result);
116
117
/**
118
* Creates ToXMLContentHandler with encoding specification
119
* @param encoding Character encoding for XML output
120
*/
121
public ToXMLContentHandler(String encoding);
122
123
/**
124
* Gets the XML content as string
125
* @return String containing XML representation
126
*/
127
@Override
128
public String toString();
129
}
130
```
131
132
#### ToHTMLContentHandler
133
134
Converts document content to HTML format with proper tag structure and encoding.
135
136
```java { .api }
137
/**
138
* Content handler that converts document content to HTML format
139
*/
140
public class ToHTMLContentHandler extends ToXMLContentHandler {
141
/**
142
* Creates ToHTMLContentHandler with default HTML output
143
*/
144
public ToHTMLContentHandler();
145
146
/**
147
* Creates ToHTMLContentHandler with custom Writer
148
* @param writer Writer for HTML output
149
*/
150
public ToHTMLContentHandler(Writer writer);
151
152
/**
153
* Creates ToHTMLContentHandler with encoding specification
154
* @param encoding Character encoding for HTML output
155
*/
156
public ToHTMLContentHandler(String encoding);
157
}
158
```
159
160
#### ToTextContentHandler
161
162
Extracts plain text content without formatting or markup elements.
163
164
```java { .api }
165
/**
166
* Content handler that extracts plain text content
167
*/
168
public class ToTextContentHandler extends ContentHandlerDecorator {
169
/**
170
* Creates ToTextContentHandler with default text extraction
171
*/
172
public ToTextContentHandler();
173
174
/**
175
* Creates ToTextContentHandler with custom Writer
176
* @param writer Writer for plain text output
177
*/
178
public ToTextContentHandler(Writer writer);
179
180
/**
181
* Gets the extracted plain text
182
* @return String containing plain text content
183
*/
184
@Override
185
public String toString();
186
}
187
```
188
189
### Specialized Content Handlers
190
191
#### LinkContentHandler
192
193
Extracts and collects hyperlinks and references from document content.
194
195
```java { .api }
196
/**
197
* Content handler that extracts links from document content
198
*/
199
public class LinkContentHandler extends ContentHandlerDecorator {
200
/**
201
* Creates LinkContentHandler for link extraction
202
*/
203
public LinkContentHandler();
204
205
/**
206
* Creates LinkContentHandler with base URI for resolving relative links
207
* @param base Base URI for link resolution
208
*/
209
public LinkContentHandler(String base);
210
211
/**
212
* Gets all extracted links
213
* @return List of Link objects representing extracted hyperlinks
214
*/
215
public List<Link> getLinks();
216
217
/**
218
* Inner class representing an extracted link
219
*/
220
public static class Link {
221
/**
222
* Gets the link type (e.g., "a", "img", "link")
223
* @return String representing link element type
224
*/
225
public String getType();
226
227
/**
228
* Gets the link URI
229
* @return String containing link URI
230
*/
231
public String getUri();
232
233
/**
234
* Gets the link title or alt text
235
* @return String containing link title
236
*/
237
public String getTitle();
238
239
/**
240
* Gets the anchor text content
241
* @return String containing link text content
242
*/
243
public String getText();
244
245
/**
246
* Gets the relationship attribute
247
* @return String containing rel attribute value
248
*/
249
public String getRel();
250
}
251
}
252
```
253
254
#### TeeContentHandler
255
256
Forwards every SAX event to each of several content handlers in turn, so that multiple outputs can be produced from a single parse. The handlers are invoked sequentially, not concurrently.
257
258
```java { .api }
259
/**
260
* Content handler that delegates events to multiple handlers
261
*/
262
public class TeeContentHandler extends DefaultHandler {
263
/**
264
* Creates TeeContentHandler with array of handlers
265
* @param handlers Array of ContentHandler instances to receive events
266
*/
267
public TeeContentHandler(ContentHandler... handlers);
268
269
/**
270
* Creates TeeContentHandler with list of handlers
271
* @param handlers List of ContentHandler instances
272
*/
273
public TeeContentHandler(List<ContentHandler> handlers);
274
275
/**
276
* Gets all registered content handlers
277
* @return List of ContentHandler instances receiving events
278
*/
279
public List<ContentHandler> getHandlers();
280
}
281
```
282
283
#### SafeContentHandler
284
285
Wraps content handlers with error handling and recovery mechanisms.
286
287
```java { .api }
288
/**
289
* Content handler wrapper that provides error handling and recovery
290
*/
291
public class SafeContentHandler extends ContentHandlerDecorator {
292
/**
293
* Creates SafeContentHandler wrapping another handler
294
* @param handler ContentHandler to wrap with error handling
295
*/
296
public SafeContentHandler(ContentHandler handler);
297
298
/**
299
* Gets any exception that occurred during processing
300
* @return Exception that occurred, or null if none
301
*/
302
public Exception getException();
303
304
/**
305
* Checks if processing completed without errors
306
* @return true if no exceptions occurred
307
*/
308
public boolean hasCompleted();
309
}
310
```
311
312
### Advanced Content Handlers
313
314
#### ExpandedTitleContentHandler
315
316
Extracts and expands document titles using various heuristics and content analysis.
317
318
```java { .api }
319
/**
320
* Content handler that extracts and expands document titles
321
*/
322
public class ExpandedTitleContentHandler extends ContentHandlerDecorator {
323
/**
324
* Creates ExpandedTitleContentHandler with default title extraction
325
* @param handler Underlying ContentHandler
326
*/
327
public ExpandedTitleContentHandler(ContentHandler handler);
328
329
/**
330
* Gets the extracted and expanded title
331
* @return String containing document title
332
*/
333
public String getTitle();
334
}
335
```
336
337
#### PhoneExtractingContentHandler
338
339
Specialized handler for extracting phone numbers from document content using pattern recognition.
340
341
```java { .api }
342
/**
343
* Content handler that extracts phone numbers from content
344
*/
345
public class PhoneExtractingContentHandler extends ContentHandlerDecorator {
346
/**
347
* Creates PhoneExtractingContentHandler
348
* @param handler Underlying ContentHandler
349
* @param metadata Metadata for context
350
*/
351
public PhoneExtractingContentHandler(ContentHandler handler, Metadata metadata);
352
353
/**
354
* Gets all extracted phone numbers
355
* @return Set of phone number strings found in content
356
*/
357
public Set<String> getPhoneNumbers();
358
}
359
```
360
361
#### TaggedContentHandler
362
363
Tags content elements with identifiers for tracking and reference purposes.
364
365
```java { .api }
366
/**
367
* Content handler that adds tracking tags to content elements
368
*/
369
public class TaggedContentHandler extends ContentHandlerDecorator {
370
/**
371
* Creates TaggedContentHandler with element tagging
372
* @param handler Underlying ContentHandler
373
*/
374
public TaggedContentHandler(ContentHandler handler);
375
376
/**
377
* Gets mapping of tags to content elements
378
* @return Map of tag identifiers to content strings
379
*/
380
public Map<String, String> getTaggedContent();
381
}
382
```
383
384
### XHTML Processing
385
386
#### XHTMLContentHandler
387
388
Specialized handler for processing XHTML content with namespace awareness and structure preservation.
389
390
```java { .api }
391
/**
392
* Content handler specialized for XHTML document processing
393
*/
394
public class XHTMLContentHandler extends DefaultHandler {
395
/**
396
* Creates XHTMLContentHandler with default XHTML processing
397
*/
398
public XHTMLContentHandler();
399
400
/**
401
* Creates XHTMLContentHandler with custom ContentHandler
402
* @param handler ContentHandler for XHTML events
403
*/
404
public XHTMLContentHandler(ContentHandler handler);
405
406
/**
407
* Starts an XHTML element with namespace support
408
* @param name Element name
409
* @param attributes Element attributes
410
*/
411
public void startElement(String name, AttributesImpl attributes);
412
413
/**
414
* Ends an XHTML element
415
* @param name Element name
416
*/
417
public void endElement(String name);
418
419
/**
420
* Adds character content
421
* @param ch Character array
422
* @param start Start offset
423
* @param length Length of content
424
*/
425
public void characters(char[] ch, int start, int length);
426
}
427
```
428
429
### Embedded Document Handling
430
431
#### EmbeddedContentHandler
432
433
Handles extraction and processing of embedded documents within parent documents.
434
435
```java { .api }
436
/**
437
* Content handler for processing embedded documents
438
*/
439
public class EmbeddedContentHandler extends ContentHandlerDecorator {
440
/**
441
* Creates EmbeddedContentHandler for embedded document processing
442
* @param handler ContentHandler for embedded content
443
*/
444
public EmbeddedContentHandler(ContentHandler handler);
445
446
/**
447
* Sets the embedded document extractor
448
* @param extractor EmbeddedDocumentExtractor for processing embedded docs
449
*/
450
public void setEmbeddedDocumentExtractor(EmbeddedDocumentExtractor extractor);
451
452
/**
453
* Gets the embedded document extractor
454
* @return EmbeddedDocumentExtractor currently in use
455
*/
456
public EmbeddedDocumentExtractor getEmbeddedDocumentExtractor();
457
}
458
```
459
460
## XPath Content Matching
461
462
### XPath Parser and Matching
463
464
#### XPathParser
465
466
Parser for XPath expressions used in content matching and selection operations.
467
468
```java { .api }
469
/**
470
* Parser for XPath expressions used in content matching
471
*/
472
public class XPathParser {
473
/**
474
* Parses XPath expression into Matcher
475
* @param xpath XPath expression string
476
* @return Matcher for the XPath expression
477
* @throws ParseException if XPath syntax is invalid
478
*/
479
public static Matcher parse(String xpath) throws ParseException;
480
481
/**
482
* Creates composite matcher from multiple XPath expressions
483
* @param xpaths Array of XPath expression strings
484
* @return CompositeMatcher combining all expressions
485
*/
486
public static Matcher parseMultiple(String... xpaths);
487
}
488
```
489
490
#### Matcher Interface
491
492
Interface for matching content elements based on XPath-like expressions.
493
494
```java { .api }
495
/**
496
* Interface for matching content elements using path-based expressions
497
*/
498
public interface Matcher {
499
/**
500
* Checks if current parse state matches this matcher
501
* @param namespaceURI Namespace URI of current element
502
* @param localName Local name of current element
503
* @param qName Qualified name of current element
504
* @return true if current state matches
505
*/
506
boolean matches(String namespaceURI, String localName, String qName);
507
508
/**
509
* Updates matcher state for element start
510
* @param namespaceURI Namespace URI
511
* @param localName Local name
512
* @param qName Qualified name
513
* @return Updated matcher for child elements
514
*/
515
Matcher descend(String namespaceURI, String localName, String qName);
516
}
517
```
518
519
#### MatchingContentHandler
520
521
Content handler that applies XPath matching to selectively process document elements.
522
523
```java { .api }
524
/**
525
* Content handler that uses XPath matching for selective processing
526
*/
527
public class MatchingContentHandler extends ContentHandlerDecorator {
528
/**
529
* Creates MatchingContentHandler with XPath matcher
530
* @param handler ContentHandler to receive matched content
531
* @param matcher Matcher defining selection criteria
532
*/
533
public MatchingContentHandler(ContentHandler handler, Matcher matcher);
534
535
/**
536
* Creates MatchingContentHandler with XPath expression
537
* @param handler ContentHandler to receive matched content
538
* @param xpath XPath expression for matching
539
*/
540
public MatchingContentHandler(ContentHandler handler, String xpath);
541
542
/**
543
* Gets the current matcher
544
* @return Matcher being used for content selection
545
*/
546
public Matcher getMatcher();
547
548
/**
549
* Checks if currently inside a matching element
550
* @return true if processing matched content
551
*/
552
public boolean isMatching();
553
}
554
```
555
556
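## Usage Examples

The examples after Basic Text Extraction are fragments: they assume a `parser`, a `metadata` object, and an open `stream` created as shown in that first example.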
557
558
### Basic Text Extraction
559
560
```java
561
// Extract plain text with size limit
562
BodyContentHandler textHandler = new BodyContentHandler(1000000);
563
AutoDetectParser parser = new AutoDetectParser();
564
Metadata metadata = new Metadata();
565
566
try (InputStream stream = new FileInputStream("document.pdf")) {
567
parser.parse(stream, textHandler, metadata, new ParseContext());
568
String extractedText = textHandler.toString();
569
570
if (textHandler.isWriteLimitReached()) {
571
System.out.println("Content truncated due to size limit");
572
}
573
}
574
```
575
576
### Multiple Format Output
577
578
```java
579
// Generate both HTML and plain text simultaneously
580
BodyContentHandler textHandler = new BodyContentHandler();
581
ToHTMLContentHandler htmlHandler = new ToHTMLContentHandler();
582
TeeContentHandler teeHandler = new TeeContentHandler(textHandler, htmlHandler);
583
584
parser.parse(stream, teeHandler, metadata, new ParseContext());
585
586
String plainText = textHandler.toString();
587
String htmlContent = htmlHandler.toString();
588
```
589
590
### Link Extraction
591
592
```java
593
// Extract all links from document
594
LinkContentHandler linkHandler = new LinkContentHandler();
595
parser.parse(stream, linkHandler, metadata, new ParseContext());
596
597
List<LinkContentHandler.Link> links = linkHandler.getLinks();
598
for (LinkContentHandler.Link link : links) {
599
System.out.println("Type: " + link.getType());
600
System.out.println("URI: " + link.getUri());
601
System.out.println("Title: " + link.getTitle());
602
System.out.println("Text: " + link.getText());
603
}
604
```
605
606
### XPath-based Content Selection
607
608
```java
609
// Extract only table content using XPath
610
String xpath = "//table";
611
BodyContentHandler tableHandler = new BodyContentHandler();
612
MatchingContentHandler matcher = new MatchingContentHandler(tableHandler, xpath);
613
614
parser.parse(stream, matcher, metadata, new ParseContext());
615
String tableContent = tableHandler.toString();
616
```
617
618
### Error-Safe Processing
619
620
```java
621
// Process with error handling
622
BodyContentHandler textHandler = new BodyContentHandler();
623
SafeContentHandler safeHandler = new SafeContentHandler(textHandler);
624
625
parser.parse(stream, safeHandler, metadata, new ParseContext());
626
627
if (safeHandler.hasCompleted()) {
628
String content = textHandler.toString();
629
} else {
630
Exception error = safeHandler.getException();
631
System.err.println("Processing failed: " + error.getMessage());
632
}
633
```