or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md, content-processing.md, detection.md, embedded-extraction.md, embedding.md, exceptions.md, index.md, io-utilities.md, language.md, metadata.md, mime-types.md, parsing.md, pipes.md, process-forking.md, rendering.md

docs/content-processing.md

0

# Content Processing

1

2

This module provides a SAX-based content handler system for processing and transforming document content during parsing. It includes specialized handlers for text extraction, HTML/XML conversion, link extraction, and XPath-based content matching.

3

4

## Capabilities

5

6

### Content Handler Base Classes

7

8

#### BodyContentHandler

9

10

Primary content handler for extracting textual content from documents with configurable output limits and encoding support.

11

12

```java { .api }

13

/**

14

* Content handler that extracts textual content from documents

15

*/

16

public class BodyContentHandler extends DefaultHandler implements WriteOutContentHandler {

17

/**

18

* Creates a BodyContentHandler with default StringWriter

19

*/

20

public BodyContentHandler();

21

22

/**

23

* Creates a BodyContentHandler with custom Writer

24

* @param writer Writer to receive extracted content

25

*/

26

public BodyContentHandler(Writer writer);

27

28

/**

29

* Creates a BodyContentHandler with write limit

30

* @param writeLimit Maximum characters to write (-1 for no limit)

31

*/

32

public BodyContentHandler(int writeLimit);

33

34

/**

35

* Gets the extracted content as string

36

* @return String containing extracted textual content

37

*/

38

@Override

39

public String toString();

40

41

/**

42

* Checks if write limit has been reached

43

* @return true if write limit exceeded

44

*/

45

public boolean isWriteLimitReached();

46

}

47

```

48

49

#### WriteOutContentHandler

50

51

Interface for content handlers that support write limits and output control.

52

53

```java { .api }

54

/**

55

* Interface for content handlers with write limit support

56

*/

57

public interface WriteOutContentHandler {

58

/**

59

* Gets the extracted content as string

60

* @return String representation of extracted content

61

*/

62

String toString();

63

64

/**

65

* Checks if configured write limit has been reached

66

* @return true if write limit exceeded, false otherwise

67

*/

68

boolean isWriteLimitReached();

69

}

70

```

71

72

#### ContentHandlerDecorator

73

74

Base decorator class for wrapping and extending content handler functionality.

75

76

```java { .api }

77

/**

78

* Abstract base class for decorating ContentHandler instances

79

*/

80

public abstract class ContentHandlerDecorator implements ContentHandler {

81

/**

82

* Creates decorator around existing ContentHandler

83

* @param handler ContentHandler to decorate

84

*/

85

protected ContentHandlerDecorator(ContentHandler handler);

86

87

/**

88

* Gets the wrapped ContentHandler

89

* @return Underlying ContentHandler instance

90

*/

91

protected ContentHandler getContentHandler();

92

}

93

```

94

95

### Format Conversion Handlers

96

97

#### ToXMLContentHandler

98

99

Converts document content to well-formed XML output with proper encoding and namespace handling.

100

101

```java { .api }

102

/**

103

* Content handler that converts document content to XML format

104

*/

105

public class ToXMLContentHandler extends ContentHandlerDecorator {

106

/**

107

* Creates ToXMLContentHandler with default XML output

108

*/

109

public ToXMLContentHandler();

110

111

/**

112

* Creates ToXMLContentHandler with custom Result target

113

* @param result Result object for XML output

114

*/

115

public ToXMLContentHandler(Result result);

116

117

/**

118

* Creates ToXMLContentHandler with encoding specification

119

* @param encoding Character encoding for XML output

120

*/

121

public ToXMLContentHandler(String encoding);

122

123

/**

124

* Gets the XML content as string

125

* @return String containing XML representation

126

*/

127

@Override

128

public String toString();

129

}

130

```

131

132

#### ToHTMLContentHandler

133

134

Converts document content to HTML format with proper tag structure and encoding.

135

136

```java { .api }

137

/**

138

* Content handler that converts document content to HTML format

139

*/

140

public class ToHTMLContentHandler extends ToXMLContentHandler {

141

/**

142

* Creates ToHTMLContentHandler with default HTML output

143

*/

144

public ToHTMLContentHandler();

145

146

/**

147

* Creates ToHTMLContentHandler with custom Writer

148

* @param writer Writer for HTML output

149

*/

150

public ToHTMLContentHandler(Writer writer);

151

152

/**

153

* Creates ToHTMLContentHandler with encoding specification

154

* @param encoding Character encoding for HTML output

155

*/

156

public ToHTMLContentHandler(String encoding);

157

}

158

```

159

160

#### ToTextContentHandler

161

162

Extracts plain text content without formatting or markup elements.

163

164

```java { .api }

165

/**

166

* Content handler that extracts plain text content

167

*/

168

public class ToTextContentHandler extends ContentHandlerDecorator {

169

/**

170

* Creates ToTextContentHandler with default text extraction

171

*/

172

public ToTextContentHandler();

173

174

/**

175

* Creates ToTextContentHandler with custom Writer

176

* @param writer Writer for plain text output

177

*/

178

public ToTextContentHandler(Writer writer);

179

180

/**

181

* Gets the extracted plain text

182

* @return String containing plain text content

183

*/

184

@Override

185

public String toString();

186

}

187

```

188

189

### Specialized Content Handlers

190

191

#### LinkContentHandler

192

193

Extracts and collects hyperlinks and references from document content.

194

195

```java { .api }

196

/**

197

* Content handler that extracts links from document content

198

*/

199

public class LinkContentHandler extends ContentHandlerDecorator {

200

/**

201

* Creates LinkContentHandler for link extraction

202

*/

203

public LinkContentHandler();

204

205

/**

206

* Creates LinkContentHandler with base URI for resolving relative links

207

* @param base Base URI for link resolution

208

*/

209

public LinkContentHandler(String base);

210

211

/**

212

* Gets all extracted links

213

* @return List of Link objects representing extracted hyperlinks

214

*/

215

public List<Link> getLinks();

216

217

/**

218

* Inner class representing an extracted link

219

*/

220

public static class Link {

221

/**

222

* Gets the link type (e.g., "a", "img", "link")

223

* @return String representing link element type

224

*/

225

public String getType();

226

227

/**

228

* Gets the link URI

229

* @return String containing link URI

230

*/

231

public String getUri();

232

233

/**

234

* Gets the link title or alt text

235

* @return String containing link title

236

*/

237

public String getTitle();

238

239

/**

240

* Gets the anchor text content

241

* @return String containing link text content

242

*/

243

public String getText();

244

245

/**

246

* Gets the relationship attribute

247

* @return String containing rel attribute value

248

*/

249

public String getRel();

250

}

251

}

252

```

253

254

#### TeeContentHandler

255

256

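Forwards each SAX event to multiple content handlers in turn, so that several outputs can be produced from a single parse. The handlers are invoked sequentially on the parsing thread, not concurrently.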

257

258

```java { .api }

259

/**

260

* Content handler that delegates events to multiple handlers

261

*/

262

public class TeeContentHandler extends DefaultHandler {

263

/**

264

* Creates TeeContentHandler with array of handlers

265

* @param handlers Array of ContentHandler instances to receive events

266

*/

267

public TeeContentHandler(ContentHandler... handlers);

268

269

/**

270

* Creates TeeContentHandler with list of handlers

271

* @param handlers List of ContentHandler instances

272

*/

273

public TeeContentHandler(List<ContentHandler> handlers);

274

275

/**

276

* Gets all registered content handlers

277

* @return List of ContentHandler instances receiving events

278

*/

279

public List<ContentHandler> getHandlers();

280

}

281

```

282

283

#### SafeContentHandler

284

285

Wraps content handlers with error handling and recovery mechanisms.

286

287

```java { .api }

288

/**

289

* Content handler wrapper that provides error handling and recovery

290

*/

291

public class SafeContentHandler extends ContentHandlerDecorator {

292

/**

293

* Creates SafeContentHandler wrapping another handler

294

* @param handler ContentHandler to wrap with error handling

295

*/

296

public SafeContentHandler(ContentHandler handler);

297

298

/**

299

* Gets any exception that occurred during processing

300

* @return Exception that occurred, or null if none

301

*/

302

public Exception getException();

303

304

/**

305

* Checks if processing completed without errors

306

* @return true if no exceptions occurred

307

*/

308

public boolean hasCompleted();

309

}

310

```

311

312

### Advanced Content Handlers

313

314

#### ExpandedTitleContentHandler

315

316

Extracts and expands document titles using various heuristics and content analysis.

317

318

```java { .api }

319

/**

320

* Content handler that extracts and expands document titles

321

*/

322

public class ExpandedTitleContentHandler extends ContentHandlerDecorator {

323

/**

324

* Creates ExpandedTitleContentHandler with default title extraction

325

* @param handler Underlying ContentHandler

326

*/

327

public ExpandedTitleContentHandler(ContentHandler handler);

328

329

/**

330

* Gets the extracted and expanded title

331

* @return String containing document title

332

*/

333

public String getTitle();

334

}

335

```

336

337

#### PhoneExtractingContentHandler

338

339

Specialized handler for extracting phone numbers from document content using pattern recognition.

340

341

```java { .api }

342

/**

343

* Content handler that extracts phone numbers from content

344

*/

345

public class PhoneExtractingContentHandler extends ContentHandlerDecorator {

346

/**

347

* Creates PhoneExtractingContentHandler

348

* @param handler Underlying ContentHandler

349

* @param metadata Metadata for context

350

*/

351

public PhoneExtractingContentHandler(ContentHandler handler, Metadata metadata);

352

353

/**

354

* Gets all extracted phone numbers

355

* @return Set of phone number strings found in content

356

*/

357

public Set<String> getPhoneNumbers();

358

}

359

```

360

361

#### TaggedContentHandler

362

363

Tags content elements with identifiers for tracking and reference purposes.

364

365

```java { .api }

366

/**

367

* Content handler that adds tracking tags to content elements

368

*/

369

public class TaggedContentHandler extends ContentHandlerDecorator {

370

/**

371

* Creates TaggedContentHandler with element tagging

372

* @param handler Underlying ContentHandler

373

*/

374

public TaggedContentHandler(ContentHandler handler);

375

376

/**

377

* Gets mapping of tags to content elements

378

* @return Map of tag identifiers to content strings

379

*/

380

public Map<String, String> getTaggedContent();

381

}

382

```

383

384

### XHTML Processing

385

386

#### XHTMLContentHandler

387

388

Specialized handler for processing XHTML content with namespace awareness and structure preservation.

389

390

```java { .api }

391

/**

392

* Content handler specialized for XHTML document processing

393

*/

394

public class XHTMLContentHandler extends DefaultHandler {

395

/**

396

* Creates XHTMLContentHandler with default XHTML processing

397

*/

398

public XHTMLContentHandler();

399

400

/**

401

* Creates XHTMLContentHandler with custom ContentHandler

402

* @param handler ContentHandler for XHTML events

403

*/

404

public XHTMLContentHandler(ContentHandler handler);

405

406

/**

407

* Starts an XHTML element with namespace support

408

* @param name Element name

409

* @param attributes Element attributes

410

*/

411

public void startElement(String name, AttributesImpl attributes);

412

413

/**

414

* Ends an XHTML element

415

* @param name Element name

416

*/

417

public void endElement(String name);

418

419

/**

420

* Adds character content

421

* @param ch Character array

422

* @param start Start offset

423

* @param length Length of content

424

*/

425

public void characters(char[] ch, int start, int length);

426

}

427

```

428

429

### Embedded Document Handling

430

431

#### EmbeddedContentHandler

432

433

Handles extraction and processing of embedded documents within parent documents.

434

435

```java { .api }

436

/**

437

* Content handler for processing embedded documents

438

*/

439

public class EmbeddedContentHandler extends ContentHandlerDecorator {

440

/**

441

* Creates EmbeddedContentHandler for embedded document processing

442

* @param handler ContentHandler for embedded content

443

*/

444

public EmbeddedContentHandler(ContentHandler handler);

445

446

/**

447

* Sets the embedded document extractor

448

* @param extractor EmbeddedDocumentExtractor for processing embedded docs

449

*/

450

public void setEmbeddedDocumentExtractor(EmbeddedDocumentExtractor extractor);

451

452

/**

453

* Gets the embedded document extractor

454

* @return EmbeddedDocumentExtractor currently in use

455

*/

456

public EmbeddedDocumentExtractor getEmbeddedDocumentExtractor();

457

}

458

```

459

460

## XPath Content Matching

461

462

### XPath Parser and Matching

463

464

#### XPathParser

465

466

Parser for XPath expressions used in content matching and selection operations.

467

468

```java { .api }

469

/**

470

* Parser for XPath expressions used in content matching

471

*/

472

public class XPathParser {

473

/**

474

* Parses XPath expression into Matcher

475

* @param xpath XPath expression string

476

* @return Matcher for the XPath expression

477

* @throws ParseException if XPath syntax is invalid

478

*/

479

public static Matcher parse(String xpath) throws ParseException;

480

481

/**

482

* Creates composite matcher from multiple XPath expressions

483

* @param xpaths Array of XPath expression strings

484

* @return CompositeMatcher combining all expressions

485

*/

486

public static Matcher parseMultiple(String... xpaths);

487

}

488

```

489

490

#### Matcher Interface

491

492

Interface for matching content elements based on XPath-like expressions.

493

494

```java { .api }

495

/**

496

* Interface for matching content elements using path-based expressions

497

*/

498

public interface Matcher {

499

/**

500

* Checks if current parse state matches this matcher

501

* @param namespaceURI Namespace URI of current element

502

* @param localName Local name of current element

503

* @param qName Qualified name of current element

504

* @return true if current state matches

505

*/

506

boolean matches(String namespaceURI, String localName, String qName);

507

508

/**

509

* Updates matcher state for element start

510

* @param namespaceURI Namespace URI

511

* @param localName Local name

512

* @param qName Qualified name

513

* @return Updated matcher for child elements

514

*/

515

Matcher descend(String namespaceURI, String localName, String qName);

516

}

517

```

518

519

#### MatchingContentHandler

520

521

Content handler that applies XPath matching to selectively process document elements.

522

523

```java { .api }

524

/**

525

* Content handler that uses XPath matching for selective processing

526

*/

527

public class MatchingContentHandler extends ContentHandlerDecorator {

528

/**

529

* Creates MatchingContentHandler with XPath matcher

530

* @param handler ContentHandler to receive matched content

531

* @param matcher Matcher defining selection criteria

532

*/

533

public MatchingContentHandler(ContentHandler handler, Matcher matcher);

534

535

/**

536

* Creates MatchingContentHandler with XPath expression

537

* @param handler ContentHandler to receive matched content

538

* @param xpath XPath expression for matching

539

*/

540

public MatchingContentHandler(ContentHandler handler, String xpath);

541

542

/**

543

* Gets the current matcher

544

* @return Matcher being used for content selection

545

*/

546

public Matcher getMatcher();

547

548

/**

549

* Checks if currently inside a matching element

550

* @return true if processing matched content

551

*/

552

public boolean isMatching();

553

}

554

```

555

556

## Usage Examples

557

558

### Basic Text Extraction

559

560

```java { .api }

561

// Extract plain text with size limit

562

BodyContentHandler textHandler = new BodyContentHandler(1000000);

563

AutoDetectParser parser = new AutoDetectParser();

564

Metadata metadata = new Metadata();

565

566

try (InputStream stream = new FileInputStream("document.pdf")) {

567

parser.parse(stream, textHandler, metadata, new ParseContext());

568

String extractedText = textHandler.toString();

569

570

if (textHandler.isWriteLimitReached()) {

571

System.out.println("Content truncated due to size limit");

572

}

573

}

574

```

575

576

### Multiple Format Output

577

578

```java { .api }

579

// Generate both HTML and plain text simultaneously

580

BodyContentHandler textHandler = new BodyContentHandler();

581

ToHTMLContentHandler htmlHandler = new ToHTMLContentHandler();

582

TeeContentHandler teeHandler = new TeeContentHandler(textHandler, htmlHandler);

583

584

parser.parse(stream, teeHandler, metadata, new ParseContext());

585

586

String plainText = textHandler.toString();

587

String htmlContent = htmlHandler.toString();

588

```

589

590

### Link Extraction

591

592

```java { .api }

593

// Extract all links from document

594

LinkContentHandler linkHandler = new LinkContentHandler();

595

parser.parse(stream, linkHandler, metadata, new ParseContext());

596

597

List<LinkContentHandler.Link> links = linkHandler.getLinks();

598

for (LinkContentHandler.Link link : links) {

599

System.out.println("Type: " + link.getType());

600

System.out.println("URI: " + link.getUri());

601

System.out.println("Title: " + link.getTitle());

602

System.out.println("Text: " + link.getText());

603

}

604

```

605

606

### XPath-based Content Selection

607

608

```java { .api }

609

// Extract only table content using XPath

610

String xpath = "//table";

611

BodyContentHandler tableHandler = new BodyContentHandler();

612

MatchingContentHandler matcher = new MatchingContentHandler(tableHandler, xpath);

613

614

parser.parse(stream, matcher, metadata, new ParseContext());

615

String tableContent = tableHandler.toString();

616

```

617

618

### Error-Safe Processing

619

620

```java { .api }

621

// Process with error handling

622

BodyContentHandler textHandler = new BodyContentHandler();

623

SafeContentHandler safeHandler = new SafeContentHandler(textHandler);

624

625

parser.parse(stream, safeHandler, metadata, new ParseContext());

626

627

if (safeHandler.hasCompleted()) {

628

String content = textHandler.toString();

629

} else {

630

Exception error = safeHandler.getException();

631

System.err.println("Processing failed: " + error.getMessage());

632

}

633

```