Tessl Tile for maven/org.apache.tika/tika-core@3.2.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

configuration.md content-processing.md detection.md embedded-extraction.md embedding.md exceptions.md index.md io-utilities.md language.md metadata.md mime-types.md parsing.md pipes.md process-forking.md rendering.md

docs/content-processing.md

0
# Content Processing
1

2
SAX-based content handler system for processing and transforming document content during parsing, including specialized handlers for text extraction, HTML/XML conversion, link extraction, and XPath-based content matching.
3

4
## Capabilities
5

6
### Content Handler Base Classes
7

8
#### BodyContentHandler
9

10
Primary content handler for extracting the textual content of a document, writing either to an internal string buffer or to a caller-supplied `Writer`, with an optional limit on the number of characters written.
11

12
```java { .api }
13
/**
14
 * Content handler that extracts textual content from documents
15
 */
16
public class BodyContentHandler extends DefaultHandler implements WriteOutContentHandler {
17
    /**
18
     * Creates a BodyContentHandler with default StringWriter
19
     */
20
    public BodyContentHandler();
21
    
22
    /**
23
     * Creates a BodyContentHandler with custom Writer
24
     * @param writer Writer to receive extracted content
25
     */
26
    public BodyContentHandler(Writer writer);
27
    
28
    /**
29
     * Creates a BodyContentHandler with write limit
30
     * @param writeLimit Maximum characters to write (-1 for no limit)
31
     */
32
    public BodyContentHandler(int writeLimit);
33
    
34
    /**
35
     * Gets the extracted content as string
36
     * @return String containing extracted textual content
37
     */
38
    @Override
39
    public String toString();
40
    
41
    /**
42
     * Checks if write limit has been reached
43
     * @return true if write limit exceeded
44
     */
45
    public boolean isWriteLimitReached();
46
}
47
```
48

49
#### WriteOutContentHandler
50

51
Interface for content handlers that support write limits and output control.
52

53
```java { .api }
54
/**
55
 * Interface for content handlers with write limit support
56
 */
57
public interface WriteOutContentHandler {
58
    /**
59
     * Gets the extracted content as string
60
     * @return String representation of extracted content
61
     */
62
    String toString();
63
    
64
    /**
65
     * Checks if configured write limit has been reached
66
     * @return true if write limit exceeded, false otherwise
67
     */
68
    boolean isWriteLimitReached();
69
}
70
```
71

72
#### ContentHandlerDecorator
73

74
Base decorator class for wrapping and extending content handler functionality.
75

76
```java { .api }
77
/**
78
 * Abstract base class for decorating ContentHandler instances
79
 */
80
public abstract class ContentHandlerDecorator implements ContentHandler {
81
    /**
82
     * Creates decorator around existing ContentHandler
83
     * @param handler ContentHandler to decorate
84
     */
85
    protected ContentHandlerDecorator(ContentHandler handler);
86
    
87
    /**
88
     * Gets the wrapped ContentHandler
89
     * @return Underlying ContentHandler instance
90
     */
91
    protected ContentHandler getContentHandler();
92
}
93
```
94

95
### Format Conversion Handlers
96

97
#### ToXMLContentHandler
98

99
Converts document content to well-formed XML output with proper encoding and namespace handling.
100

101
```java { .api }
102
/**
103
 * Content handler that converts document content to XML format
104
 */
105
public class ToXMLContentHandler extends ContentHandlerDecorator {
106
    /**
107
     * Creates ToXMLContentHandler with default XML output
108
     */
109
    public ToXMLContentHandler();
110
    
111
    /**
112
     * Creates ToXMLContentHandler with custom Result target
113
     * @param result Result object for XML output
114
     */
115
    public ToXMLContentHandler(Result result);
116
    
117
    /**
118
     * Creates ToXMLContentHandler with encoding specification
119
     * @param encoding Character encoding for XML output
120
     */
121
    public ToXMLContentHandler(String encoding);
122
    
123
    /**
124
     * Gets the XML content as string
125
     * @return String containing XML representation
126
     */
127
    @Override
128
    public String toString();
129
}
130
```
131

132
#### ToHTMLContentHandler
133

134
Converts document content to HTML format with proper tag structure and encoding.
135

136
```java { .api }
137
/**
138
 * Content handler that converts document content to HTML format
139
 */
140
public class ToHTMLContentHandler extends ToXMLContentHandler {
141
    /**
142
     * Creates ToHTMLContentHandler with default HTML output
143
     */
144
    public ToHTMLContentHandler();
145
    
146
    /**
147
     * Creates ToHTMLContentHandler with custom Writer
148
     * @param writer Writer for HTML output
149
     */
150
    public ToHTMLContentHandler(Writer writer);
151
    
152
    /**
153
     * Creates ToHTMLContentHandler with encoding specification
154
     * @param encoding Character encoding for HTML output
155
     */
156
    public ToHTMLContentHandler(String encoding);
157
}
158
```
159

160
#### ToTextContentHandler
161

162
Extracts plain text content without formatting or markup elements.
163

164
```java { .api }
165
/**
166
 * Content handler that extracts plain text content
167
 */
168
public class ToTextContentHandler extends ContentHandlerDecorator {
169
    /**
170
     * Creates ToTextContentHandler with default text extraction
171
     */
172
    public ToTextContentHandler();
173
    
174
    /**
175
     * Creates ToTextContentHandler with custom Writer
176
     * @param writer Writer for plain text output
177
     */
178
    public ToTextContentHandler(Writer writer);
179
    
180
    /**
181
     * Gets the extracted plain text
182
     * @return String containing plain text content
183
     */
184
    @Override
185
    public String toString();
186
}
187
```
188

189
### Specialized Content Handlers
190

191
#### LinkContentHandler
192

193
Extracts and collects hyperlinks and references from document content.
194

195
```java { .api }
196
/**
197
 * Content handler that extracts links from document content
198
 */
199
public class LinkContentHandler extends ContentHandlerDecorator {
200
    /**
201
     * Creates LinkContentHandler for link extraction
202
     */
203
    public LinkContentHandler();
204
    
205
    /**
206
     * Creates LinkContentHandler with base URI for resolving relative links
207
     * @param base Base URI for link resolution
208
     */
209
    public LinkContentHandler(String base);
210
    
211
    /**
212
     * Gets all extracted links
213
     * @return List of Link objects representing extracted hyperlinks
214
     */
215
    public List<Link> getLinks();
216
    
217
    /**
218
     * Inner class representing an extracted link
219
     */
220
    public static class Link {
221
        /**
222
         * Gets the link type (e.g., "a", "img", "link")
223
         * @return String representing link element type
224
         */
225
        public String getType();
226
        
227
        /**
228
         * Gets the link URI
229
         * @return String containing link URI
230
         */
231
        public String getUri();
232
        
233
        /**
234
         * Gets the link title or alt text
235
         * @return String containing link title
236
         */
237
        public String getTitle();
238
        
239
        /**
240
         * Gets the anchor text content
241
         * @return String containing link text content
242
         */
243
        public String getText();
244
        
245
        /**
246
         * Gets the relationship attribute
247
         * @return String containing rel attribute value
248
         */
249
        public String getRel();
250
    }
251
}
252
```
253

254
#### TeeContentHandler
255

256
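Forwards each SAX event to every registered content handler in turn, so that several outputs (for example plain text and HTML) can be produced from a single parse. The handlers are invoked sequentially on the parsing thread, not concurrently.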
257

258
```java { .api }
259
/**
260
 * Content handler that delegates events to multiple handlers
261
 */
262
public class TeeContentHandler extends DefaultHandler {
263
    /**
264
     * Creates TeeContentHandler with array of handlers
265
     * @param handlers Array of ContentHandler instances to receive events
266
     */
267
    public TeeContentHandler(ContentHandler... handlers);
268
    
269
    /**
270
     * Creates TeeContentHandler with list of handlers
271
     * @param handlers List of ContentHandler instances
272
     */
273
    public TeeContentHandler(List<ContentHandler> handlers);
274
    
275
    /**
276
     * Gets all registered content handlers
277
     * @return List of ContentHandler instances receiving events
278
     */
279
    public List<ContentHandler> getHandlers();
280
}
281
```
282

283
#### SafeContentHandler
284

285
Wraps content handlers with error handling and recovery mechanisms.
286

287
```java { .api }
288
/**
289
 * Content handler wrapper that provides error handling and recovery
290
 */
291
public class SafeContentHandler extends ContentHandlerDecorator {
292
    /**
293
     * Creates SafeContentHandler wrapping another handler
294
     * @param handler ContentHandler to wrap with error handling
295
     */
296
    public SafeContentHandler(ContentHandler handler);
297
    
298
    /**
299
     * Gets any exception that occurred during processing
300
     * @return Exception that occurred, or null if none
301
     */
302
    public Exception getException();
303
    
304
    /**
305
     * Checks if processing completed without errors
306
     * @return true if no exceptions occurred
307
     */
308
    public boolean hasCompleted();
309
}
310
```
311

312
### Advanced Content Handlers
313

314
#### ExpandedTitleContentHandler
315

316
Extracts and expands document titles using various heuristics and content analysis.
317

318
```java { .api }
319
/**
320
 * Content handler that extracts and expands document titles
321
 */
322
public class ExpandedTitleContentHandler extends ContentHandlerDecorator {
323
    /**
324
     * Creates ExpandedTitleContentHandler with default title extraction
325
     * @param handler Underlying ContentHandler
326
     */
327
    public ExpandedTitleContentHandler(ContentHandler handler);
328
    
329
    /**
330
     * Gets the extracted and expanded title
331
     * @return String containing document title
332
     */
333
    public String getTitle();
334
}
335
```
336

337
#### PhoneExtractingContentHandler
338

339
Specialized handler for extracting phone numbers from document content using pattern recognition.
340

341
```java { .api }
342
/**
343
 * Content handler that extracts phone numbers from content
344
 */
345
public class PhoneExtractingContentHandler extends ContentHandlerDecorator {
346
    /**
347
     * Creates PhoneExtractingContentHandler
348
     * @param handler Underlying ContentHandler
349
     * @param metadata Metadata for context
350
     */
351
    public PhoneExtractingContentHandler(ContentHandler handler, Metadata metadata);
352
    
353
    /**
354
     * Gets all extracted phone numbers
355
     * @return Set of phone number strings found in content
356
     */
357
    public Set<String> getPhoneNumbers();
358
}
359
```
360

361
#### TaggedContentHandler
362

363
Tags content elements with identifiers for tracking and reference purposes.
364

365
```java { .api }
366
/**
367
 * Content handler that adds tracking tags to content elements
368
 */
369
public class TaggedContentHandler extends ContentHandlerDecorator {
370
    /**
371
     * Creates TaggedContentHandler with element tagging
372
     * @param handler Underlying ContentHandler
373
     */
374
    public TaggedContentHandler(ContentHandler handler);
375
    
376
    /**
377
     * Gets mapping of tags to content elements
378
     * @return Map of tag identifiers to content strings
379
     */
380
    public Map<String, String> getTaggedContent();
381
}
382
```
383

384
### XHTML Processing
385

386
#### XHTMLContentHandler
387

388
Specialized handler for processing XHTML content with namespace awareness and structure preservation.
389

390
```java { .api }
391
/**
392
 * Content handler specialized for XHTML document processing
393
 */
394
public class XHTMLContentHandler extends DefaultHandler {
395
    /**
396
     * Creates XHTMLContentHandler with default XHTML processing
397
     */
398
    public XHTMLContentHandler();
399
    
400
    /**
401
     * Creates XHTMLContentHandler with custom ContentHandler
402
     * @param handler ContentHandler for XHTML events
403
     */
404
    public XHTMLContentHandler(ContentHandler handler);
405
    
406
    /**
407
     * Starts an XHTML element with namespace support
408
     * @param name Element name
409
     * @param attributes Element attributes
410
     */
411
    public void startElement(String name, AttributesImpl attributes);
412
    
413
    /**
414
     * Ends an XHTML element
415
     * @param name Element name
416
     */
417
    public void endElement(String name);
418
    
419
    /**
420
     * Adds character content
421
     * @param ch Character array
422
     * @param start Start offset
423
     * @param length Length of content
424
     */
425
    public void characters(char[] ch, int start, int length);
426
}
427
```
428

429
### Embedded Document Handling
430

431
#### EmbeddedContentHandler
432

433
Handles extraction and processing of embedded documents within parent documents.
434

435
```java { .api }
436
/**
437
 * Content handler for processing embedded documents
438
 */
439
public class EmbeddedContentHandler extends ContentHandlerDecorator {
440
    /**
441
     * Creates EmbeddedContentHandler for embedded document processing
442
     * @param handler ContentHandler for embedded content
443
     */
444
    public EmbeddedContentHandler(ContentHandler handler);
445
    
446
    /**
447
     * Sets the embedded document extractor
448
     * @param extractor EmbeddedDocumentExtractor for processing embedded docs
449
     */
450
    public void setEmbeddedDocumentExtractor(EmbeddedDocumentExtractor extractor);
451
    
452
    /**
453
     * Gets the embedded document extractor
454
     * @return EmbeddedDocumentExtractor currently in use
455
     */
456
    public EmbeddedDocumentExtractor getEmbeddedDocumentExtractor();
457
}
458
```
459

460
## XPath Content Matching
461

462
### XPath Parser and Matching
463

464
#### XPathParser
465

466
Parser for XPath expressions used in content matching and selection operations.
467

468
```java { .api }
469
/**
470
 * Parser for XPath expressions used in content matching
471
 */
472
public class XPathParser {
473
    /**
474
     * Parses XPath expression into Matcher
475
     * @param xpath XPath expression string
476
     * @return Matcher for the XPath expression
477
     * @throws ParseException if XPath syntax is invalid
478
     */
479
    public static Matcher parse(String xpath) throws ParseException;
480
    
481
    /**
482
     * Creates composite matcher from multiple XPath expressions
483
     * @param xpaths Array of XPath expression strings
484
     * @return CompositeMatcher combining all expressions
485
     */
486
    public static Matcher parseMultiple(String... xpaths);
487
}
488
```
489

490
#### Matcher Interface
491

492
Interface for matching content elements based on XPath-like expressions.
493

494
```java { .api }
495
/**
496
 * Interface for matching content elements using path-based expressions
497
 */
498
public interface Matcher {
499
    /**
500
     * Checks if current parse state matches this matcher
501
     * @param namespaceURI Namespace URI of current element
502
     * @param localName Local name of current element
503
     * @param qName Qualified name of current element
504
     * @return true if current state matches
505
     */
506
    boolean matches(String namespaceURI, String localName, String qName);
507
    
508
    /**
509
     * Updates matcher state for element start
510
     * @param namespaceURI Namespace URI
511
     * @param localName Local name
512
     * @param qName Qualified name
513
     * @return Updated matcher for child elements
514
     */
515
    Matcher descend(String namespaceURI, String localName, String qName);
516
}
517
```
518

519
#### MatchingContentHandler
520

521
Content handler that applies XPath matching to selectively process document elements.
522

523
```java { .api }
524
/**
525
 * Content handler that uses XPath matching for selective processing
526
 */
527
public class MatchingContentHandler extends ContentHandlerDecorator {
528
    /**
529
     * Creates MatchingContentHandler with XPath matcher
530
     * @param handler ContentHandler to receive matched content
531
     * @param matcher Matcher defining selection criteria
532
     */
533
    public MatchingContentHandler(ContentHandler handler, Matcher matcher);
534
    
535
    /**
536
     * Creates MatchingContentHandler with XPath expression
537
     * @param handler ContentHandler to receive matched content  
538
     * @param xpath XPath expression for matching
539
     */
540
    public MatchingContentHandler(ContentHandler handler, String xpath);
541
    
542
    /**
543
     * Gets the current matcher
544
     * @return Matcher being used for content selection
545
     */
546
    public Matcher getMatcher();
547
    
548
    /**
549
     * Checks if currently inside a matching element
550
     * @return true if processing matched content
551
     */
552
    public boolean isMatching();
553
}
554
```
555

556
## Usage Examples
557

558
### Basic Text Extraction
559

560
```java { .api }
561
// Extract plain text with size limit
562
BodyContentHandler textHandler = new BodyContentHandler(1000000);
563
AutoDetectParser parser = new AutoDetectParser();
564
Metadata metadata = new Metadata();
565

566
try (InputStream stream = new FileInputStream("document.pdf")) {
567
    parser.parse(stream, textHandler, metadata, new ParseContext());
568
    String extractedText = textHandler.toString();
569
    
570
    if (textHandler.isWriteLimitReached()) {
571
        System.out.println("Content truncated due to size limit");
572
    }
573
}
574
```
575

576
### Multiple Format Output
577

578
```java { .api }
579
// Generate both HTML and plain text simultaneously
580
BodyContentHandler textHandler = new BodyContentHandler();
581
ToHTMLContentHandler htmlHandler = new ToHTMLContentHandler();
582
TeeContentHandler teeHandler = new TeeContentHandler(textHandler, htmlHandler);
583

584
parser.parse(stream, teeHandler, metadata, new ParseContext());
585

586
String plainText = textHandler.toString();
587
String htmlContent = htmlHandler.toString();
588
```
589

590
### Link Extraction
591

592
```java { .api }
593
// Extract all links from document
594
LinkContentHandler linkHandler = new LinkContentHandler();
595
parser.parse(stream, linkHandler, metadata, new ParseContext());
596

597
List<LinkContentHandler.Link> links = linkHandler.getLinks();
598
for (LinkContentHandler.Link link : links) {
599
    System.out.println("Type: " + link.getType());
600
    System.out.println("URI: " + link.getUri()); 
601
    System.out.println("Title: " + link.getTitle());
602
    System.out.println("Text: " + link.getText());
603
}
604
```
605

606
### XPath-based Content Selection
607

608
```java { .api }
609
// Extract only table content using XPath
610
String xpath = "//table";
611
BodyContentHandler tableHandler = new BodyContentHandler();
612
MatchingContentHandler matcher = new MatchingContentHandler(tableHandler, xpath);
613

614
parser.parse(stream, matcher, metadata, new ParseContext());
615
String tableContent = tableHandler.toString();
616
```
617

618
### Error-Safe Processing
619

620
```java { .api }
621
// Process with error handling
622
BodyContentHandler textHandler = new BodyContentHandler();
623
SafeContentHandler safeHandler = new SafeContentHandler(textHandler);
624

625
parser.parse(stream, safeHandler, metadata, new ParseContext());
626

627
if (safeHandler.hasCompleted()) {
628
    String content = textHandler.toString();
629
} else {
630
    Exception error = safeHandler.getException();
631
    System.err.println("Processing failed: " + error.getMessage());
632
}
633
```

Version

Tile

Files

docs/content-processing.md

docs/content-processing.md