tessl/maven-org-apache-tika--tika-core

Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.

—

Pending

Overview

Eval results

Files

Content Processing

Name: tessl/maven-org-apache-tika--tika-core
Author: tessl

SAX-based content handler system for processing and transforming document content during parsing, including specialized handlers for text extraction, HTML/XML conversion, link extraction, and XPath-based content matching.

Capabilities

Content Handler Base Classes

BodyContentHandler

Primary content handler for extracting textual content from documents with configurable output limits and encoding support.

/**
 * Content handler that extracts the textual content of a parsed document.
 * As listed here it is a {@code DefaultHandler} that also implements
 * {@code WriteOutContentHandler}, so it exposes the collected text through
 * {@link #toString()} and the limit status through {@link #isWriteLimitReached()}.
 *
 * NOTE(review): upstream Tika Javadoc appears to declare this class as a
 * {@code ContentHandlerDecorator} subclass and to report an exceeded write
 * limit by throwing a {@code SAXException} from the parse — confirm against
 * the Tika version in use.
 */
public class BodyContentHandler extends DefaultHandler implements WriteOutContentHandler {
    /**
     * Creates a BodyContentHandler that collects the text in an internal
     * StringWriter; read it back with {@link #toString()}.
     *
     * NOTE(review): upstream documents a default cap of 100,000 characters for
     * this constructor — confirm before relying on unlimited output.
     */
    public BodyContentHandler();
    
    /**
     * Creates a BodyContentHandler that sends the extracted text to a
     * caller-supplied Writer.
     * @param writer Writer to receive extracted content
     */
    public BodyContentHandler(Writer writer);
    
    /**
     * Creates a BodyContentHandler with an explicit write limit.
     * @param writeLimit Maximum characters to write (-1 for no limit)
     */
    public BodyContentHandler(int writeLimit);
    
    /**
     * Gets the extracted content as a string.
     * @return String containing extracted textual content
     */
    @Override
    public String toString();
    
    /**
     * Checks whether the configured write limit has been reached.
     * @return true if write limit exceeded
     */
    public boolean isWriteLimitReached();
}

WriteOutContentHandler

Interface for content handlers that support write limits and output control.

/**
 * Contract for content handlers that expose their collected output and
 * report whether a write limit was hit.
 *
 * NOTE(review): upstream Tika appears to ship {@code WriteOutContentHandler}
 * as a concrete class rather than an interface — confirm against the Tika
 * version in use.
 */
public interface WriteOutContentHandler {
    /**
     * Gets the extracted content as a string.
     * @return String representation of extracted content
     */
    String toString();
    
    /**
     * Checks whether the configured write limit has been reached.
     * @return true if write limit exceeded, false otherwise
     */
    boolean isWriteLimitReached();
}

ContentHandlerDecorator

Base decorator class for wrapping and extending content handler functionality.

/**
 * Base class for handlers that wrap another ContentHandler in order to
 * extend or alter its behaviour. Most handlers in this listing extend it.
 *
 * NOTE(review): upstream Tika appears to declare this class as non-abstract,
 * extending {@code DefaultHandler}, with a public constructor — confirm the
 * modifiers shown here against the Tika version in use.
 */
public abstract class ContentHandlerDecorator implements ContentHandler {
    /**
     * Creates a decorator around an existing ContentHandler.
     * @param handler ContentHandler to decorate
     */
    protected ContentHandlerDecorator(ContentHandler handler);
    
    /**
     * Gets the wrapped ContentHandler.
     * @return Underlying ContentHandler instance
     */
    protected ContentHandler getContentHandler();
}

Format Conversion Handlers

ToXMLContentHandler

Converts document content to well-formed XML output with proper encoding and namespace handling.

/**
 * Content handler that serializes the document content as XML.
 * The no-argument and encoding constructors buffer the output so that it
 * can be read back with {@link #toString()}.
 *
 * NOTE(review): upstream Tika appears to offer an
 * {@code (OutputStream, String encoding)} constructor rather than one taking
 * a {@code Result} — confirm against the Tika version in use.
 */
public class ToXMLContentHandler extends ContentHandlerDecorator {
    /**
     * Creates a ToXMLContentHandler with default XML output.
     */
    public ToXMLContentHandler();
    
    /**
     * Creates a ToXMLContentHandler that writes to a custom Result target.
     * @param result Result object for XML output
     */
    public ToXMLContentHandler(Result result);
    
    /**
     * Creates a ToXMLContentHandler with an explicit output encoding.
     * @param encoding Character encoding for XML output
     */
    public ToXMLContentHandler(String encoding);
    
    /**
     * Gets the XML content as a string.
     * @return String containing XML representation
     */
    @Override
    public String toString();
}

ToHTMLContentHandler

Converts document content to HTML format with proper tag structure and encoding.

/**
 * Content handler that serializes the document content as HTML.
 * It specializes {@code ToXMLContentHandler}, so the inherited
 * {@code toString()} returns the generated markup.
 *
 * NOTE(review): upstream Tika appears to offer only a no-argument and an
 * {@code (OutputStream, String encoding)} constructor — confirm the Writer
 * and String overloads against the Tika version in use.
 */
public class ToHTMLContentHandler extends ToXMLContentHandler {
    /**
     * Creates a ToHTMLContentHandler with default HTML output.
     */
    public ToHTMLContentHandler();
    
    /**
     * Creates a ToHTMLContentHandler that writes to a custom Writer.
     * @param writer Writer for HTML output
     */
    public ToHTMLContentHandler(Writer writer);
    
    /**
     * Creates a ToHTMLContentHandler with an explicit output encoding.
     * @param encoding Character encoding for HTML output
     */
    public ToHTMLContentHandler(String encoding);
}

ToTextContentHandler

Extracts plain text content without formatting or markup elements.

/**
 * Content handler that emits only the plain text of a document, without
 * any markup.
 */
public class ToTextContentHandler extends ContentHandlerDecorator {
    /**
     * Creates a ToTextContentHandler with default text extraction; the text
     * is read back with {@link #toString()}.
     */
    public ToTextContentHandler();
    
    /**
     * Creates a ToTextContentHandler that writes to a custom Writer.
     * @param writer Writer for plain text output
     */
    public ToTextContentHandler(Writer writer);
    
    /**
     * Gets the extracted plain text.
     * @return String containing plain text content
     */
    @Override
    public String toString();
}

Specialized Content Handlers

LinkContentHandler

Extracts and collects hyperlinks and references from document content.

/**
 * Content handler that collects the links found in document content and
 * exposes them as {@link Link} objects via {@link #getLinks()}.
 *
 * NOTE(review): upstream Tika appears to declare {@code Link} as a top-level
 * class in the same package, and its one-argument constructor appears to
 * take a boolean whitespace-collapsing flag rather than a base URI — confirm
 * against the Tika version in use.
 */
public class LinkContentHandler extends ContentHandlerDecorator {
    /**
     * Creates a LinkContentHandler for link extraction.
     */
    public LinkContentHandler();
    
    /**
     * Creates a LinkContentHandler with a base URI for resolving relative links.
     * @param base Base URI for link resolution
     */
    public LinkContentHandler(String base);
    
    /**
     * Gets all extracted links.
     * @return List of Link objects representing extracted hyperlinks
     */
    public List<Link> getLinks();
    
    /**
     * A single extracted link: its element type, target URI, and the
     * descriptive attributes and text that accompanied it.
     */
    public static class Link {
        /**
         * Gets the link type (e.g., "a", "img", "link").
         * @return String representing link element type
         */
        public String getType();
        
        /**
         * Gets the link URI.
         * @return String containing link URI
         */
        public String getUri();
        
        /**
         * Gets the link title or alt text.
         * @return String containing link title
         */
        public String getTitle();
        
        /**
         * Gets the anchor text content.
         * @return String containing link text content
         */
        public String getText();
        
        /**
         * Gets the relationship attribute.
         * @return String containing rel attribute value
         */
        public String getRel();
    }
}

TeeContentHandler

Forwards each SAX event to every registered content handler, so that several outputs (for example plain text and HTML) can be produced from a single parse.

/**
 * Content handler that delegates every event it receives to multiple
 * handlers, allowing one parse to feed several consumers.
 *
 * NOTE(review): upstream Tika appears to provide only the varargs
 * constructor — confirm the List constructor and {@code getHandlers()}
 * against the Tika version in use.
 */
public class TeeContentHandler extends DefaultHandler {
    /**
     * Creates a TeeContentHandler from an array (or varargs list) of handlers.
     * @param handlers Array of ContentHandler instances to receive events
     */
    public TeeContentHandler(ContentHandler... handlers);
    
    /**
     * Creates a TeeContentHandler from a list of handlers.
     * @param handlers List of ContentHandler instances
     */
    public TeeContentHandler(List<ContentHandler> handlers);
    
    /**
     * Gets all registered content handlers.
     * @return List of ContentHandler instances receiving events
     */
    public List<ContentHandler> getHandlers();
}

SafeContentHandler

Wraps content handlers with error handling and recovery mechanisms.

/**
 * Content handler wrapper described here as providing error handling and
 * recovery around another handler.
 *
 * NOTE(review): upstream Tika documents {@code SafeContentHandler} as a
 * filter for characters that are invalid in XML, and does not appear to
 * expose {@code getException()} or {@code hasCompleted()} — confirm against
 * the Tika version in use.
 */
public class SafeContentHandler extends ContentHandlerDecorator {
    /**
     * Creates a SafeContentHandler wrapping another handler.
     * @param handler ContentHandler to wrap with error handling
     */
    public SafeContentHandler(ContentHandler handler);
    
    /**
     * Gets any exception that occurred during processing.
     * @return Exception that occurred, or null if none
     */
    public Exception getException();
    
    /**
     * Checks whether processing completed without errors.
     * @return true if no exceptions occurred
     */
    public boolean hasCompleted();
}

Advanced Content Handlers

ExpandedTitleContentHandler

Extracts and expands document titles using various heuristics and content analysis.

/**
 * Content handler described here as extracting and expanding document titles.
 *
 * NOTE(review): upstream Tika documents this class as a decorator that makes
 * an empty title element serialize as an explicit open/close pair, and does
 * not appear to expose {@code getTitle()} — confirm against the Tika version
 * in use.
 */
public class ExpandedTitleContentHandler extends ContentHandlerDecorator {
    /**
     * Creates an ExpandedTitleContentHandler around an existing handler.
     * @param handler Underlying ContentHandler
     */
    public ExpandedTitleContentHandler(ContentHandler handler);
    
    /**
     * Gets the extracted and expanded title.
     * @return String containing document title
     */
    public String getTitle();
}

PhoneExtractingContentHandler

Specialized handler for extracting phone numbers from document content using pattern recognition.

/**
 * Content handler that extracts phone numbers from the content it sees
 * while passing events on to the wrapped handler.
 *
 * NOTE(review): upstream Tika appears to publish the extracted numbers
 * through the supplied {@code Metadata} object rather than through a
 * {@code getPhoneNumbers()} accessor — confirm against the Tika version in use.
 */
public class PhoneExtractingContentHandler extends ContentHandlerDecorator {
    /**
     * Creates a PhoneExtractingContentHandler.
     * @param handler Underlying ContentHandler
     * @param metadata Metadata for context
     */
    public PhoneExtractingContentHandler(ContentHandler handler, Metadata metadata);
    
    /**
     * Gets all extracted phone numbers.
     * @return Set of phone number strings found in content
     */
    public Set<String> getPhoneNumbers();
}

TaggedContentHandler

Tags content elements with identifiers for tracking and reference purposes.

/**
 * Content handler described here as adding tracking tags to content elements.
 *
 * NOTE(review): upstream Tika documents {@code TaggedContentHandler} as
 * tagging the SAXExceptions thrown by the wrapped handler so their origin can
 * be identified, and does not appear to expose {@code getTaggedContent()} —
 * confirm against the Tika version in use.
 */
public class TaggedContentHandler extends ContentHandlerDecorator {
    /**
     * Creates a TaggedContentHandler around an existing handler.
     * @param handler Underlying ContentHandler
     */
    public TaggedContentHandler(ContentHandler handler);
    
    /**
     * Gets the mapping of tags to content elements.
     * @return Map of tag identifiers to content strings
     */
    public Map<String, String> getTaggedContent();
}

XHTML Processing

XHTMLContentHandler

Specialized handler for processing XHTML content with namespace awareness and structure preservation.

/**
 * Content handler specialized for producing XHTML events. Besides the
 * constructors it offers short-hand element methods that take only an
 * element name, plus the standard character-data callback.
 *
 * NOTE(review): upstream Tika appears to declare the constructor as
 * {@code (ContentHandler, Metadata)} and to extend {@code SafeContentHandler}
 * — confirm against the Tika version in use.
 */
public class XHTMLContentHandler extends DefaultHandler {
    /**
     * Creates an XHTMLContentHandler with default XHTML processing.
     */
    public XHTMLContentHandler();
    
    /**
     * Creates an XHTMLContentHandler that forwards to a custom ContentHandler.
     * @param handler ContentHandler for XHTML events
     */
    public XHTMLContentHandler(ContentHandler handler);
    
    /**
     * Starts an XHTML element; only the element name is supplied by the caller.
     * @param name Element name
     * @param attributes Element attributes
     */
    public void startElement(String name, AttributesImpl attributes);
    
    /**
     * Ends an XHTML element.
     * @param name Element name
     */
    public void endElement(String name);
    
    /**
     * Adds character content taken from a slice of the given array.
     * @param ch Character array
     * @param start Start offset
     * @param length Length of content
     */
    public void characters(char[] ch, int start, int length);
}

Embedded Document Handling

EmbeddedContentHandler

Handles extraction and processing of embedded documents within parent documents.

/**
 * Content handler used when processing documents embedded inside a parent
 * document.
 *
 * NOTE(review): upstream Tika appears to provide only the single-argument
 * constructor on this class, with the {@code EmbeddedDocumentExtractor}
 * supplied through the parse context instead — confirm the accessor pair
 * below against the Tika version in use.
 */
public class EmbeddedContentHandler extends ContentHandlerDecorator {
    /**
     * Creates an EmbeddedContentHandler for embedded document processing.
     * @param handler ContentHandler for embedded content
     */
    public EmbeddedContentHandler(ContentHandler handler);
    
    /**
     * Sets the embedded document extractor.
     * @param extractor EmbeddedDocumentExtractor for processing embedded docs
     */
    public void setEmbeddedDocumentExtractor(EmbeddedDocumentExtractor extractor);
    
    /**
     * Gets the embedded document extractor.
     * @return EmbeddedDocumentExtractor currently in use
     */
    public EmbeddedDocumentExtractor getEmbeddedDocumentExtractor();
}

XPath Content Matching

XPath Parser and Matching

XPathParser

Parser for XPath expressions used in content matching and selection operations.

/**
 * Parser that turns XPath expression strings into {@code Matcher} objects
 * used for content matching.
 *
 * NOTE(review): upstream Tika appears to expose {@code parse} as an instance
 * method on a parser constructed with a namespace prefix mapping, and to
 * support only a small subset of XPath — confirm against the Tika version in use.
 */
public class XPathParser {
    /**
     * Parses an XPath expression into a Matcher.
     * @param xpath XPath expression string
     * @return Matcher for the XPath expression
     * @throws ParseException if XPath syntax is invalid
     */
    public static Matcher parse(String xpath) throws ParseException;
    
    /**
     * Creates a single matcher that combines multiple XPath expressions.
     * Unlike {@link #parse(String)}, this signature declares no checked
     * exception; how invalid expressions are reported is not stated here.
     * @param xpaths Array of XPath expression strings
     * @return Matcher combining all expressions (declared type is Matcher)
     */
    public static Matcher parseMultiple(String... xpaths);
}

Matcher Interface

Interface for matching content elements based on XPath-like expressions.

/**
 * Contract for matching content elements using path-based expressions.
 * A matcher is consulted for the current element and produces the matcher
 * that applies to that element's children.
 *
 * NOTE(review): upstream Tika appears to declare {@code Matcher} as an
 * abstract class with {@code descend(namespace, name)} and separate
 * element/attribute/text match methods — confirm against the Tika version in use.
 */
public interface Matcher {
    /**
     * Checks whether the current parse state matches this matcher.
     * @param namespaceURI Namespace URI of current element
     * @param localName Local name of current element
     * @param qName Qualified name of current element
     * @return true if current state matches
     */
    boolean matches(String namespaceURI, String localName, String qName);
    
    /**
     * Produces the matcher to use inside the element that is being started.
     * @param namespaceURI Namespace URI
     * @param localName Local name
     * @param qName Qualified name
     * @return Updated matcher for child elements
     */
    Matcher descend(String namespaceURI, String localName, String qName);
}

MatchingContentHandler

Content handler that applies XPath matching to selectively process document elements.

/**
 * Content handler that forwards only the content selected by an XPath
 * matcher to the wrapped handler.
 *
 * NOTE(review): upstream Tika appears to provide only the
 * {@code (ContentHandler, Matcher)} constructor — confirm the String overload
 * and the two accessors against the Tika version in use.
 */
public class MatchingContentHandler extends ContentHandlerDecorator {
    /**
     * Creates a MatchingContentHandler from a ready-made matcher.
     * @param handler ContentHandler to receive matched content
     * @param matcher Matcher defining selection criteria
     */
    public MatchingContentHandler(ContentHandler handler, Matcher matcher);
    
    /**
     * Creates a MatchingContentHandler from an XPath expression string.
     * @param handler ContentHandler to receive matched content
     * @param xpath XPath expression for matching
     */
    public MatchingContentHandler(ContentHandler handler, String xpath);
    
    /**
     * Gets the current matcher.
     * @return Matcher being used for content selection
     */
    public Matcher getMatcher();
    
    /**
     * Checks whether the handler is currently inside a matching element.
     * @return true if processing matched content
     */
    public boolean isMatching();
}

Usage Examples

Basic Text Extraction

// Extract plain text with size limit
// The constructor argument is the write limit in characters (-1 disables it).
BodyContentHandler textHandler = new BodyContentHandler(1000000);
AutoDetectParser parser = new AutoDetectParser();
Metadata metadata = new Metadata();

// try-with-resources closes the input stream once parsing is finished.
try (InputStream stream = new FileInputStream("document.pdf")) {
    parser.parse(stream, textHandler, metadata, new ParseContext());
    String extractedText = textHandler.toString();
    
    // NOTE(review): upstream Tika appears to signal an exceeded limit by
    // throwing a SAXException from parse() rather than via this accessor —
    // confirm against the Tika version in use.
    if (textHandler.isWriteLimitReached()) {
        System.out.println("Content truncated due to size limit");
    }
}

Multiple Format Output

// Generate both HTML and plain text simultaneously
// Assumes `parser`, `metadata` and an open `stream` set up as in the
// "Basic Text Extraction" example; they are not declared in this snippet.
BodyContentHandler textHandler = new BodyContentHandler();
ToHTMLContentHandler htmlHandler = new ToHTMLContentHandler();
// The tee handler passes every parse event to both handlers.
TeeContentHandler teeHandler = new TeeContentHandler(textHandler, htmlHandler);

parser.parse(stream, teeHandler, metadata, new ParseContext());

// Each handler holds its own rendering of the same document.
String plainText = textHandler.toString();
String htmlContent = htmlHandler.toString();

Link Extraction

// Extract all links from document
// Assumes `parser`, `metadata` and an open `stream` set up as in the
// "Basic Text Extraction" example; they are not declared in this snippet.
LinkContentHandler linkHandler = new LinkContentHandler();
parser.parse(stream, linkHandler, metadata, new ParseContext());

// The links are available only after the parse has run.
List<LinkContentHandler.Link> links = linkHandler.getLinks();
for (LinkContentHandler.Link link : links) {
    System.out.println("Type: " + link.getType());
    System.out.println("URI: " + link.getUri()); 
    System.out.println("Title: " + link.getTitle());
    System.out.println("Text: " + link.getText());
}

XPath-based Content Selection

// Extract only table content using XPath
// Assumes `parser`, `metadata` and an open `stream` set up as in the
// "Basic Text Extraction" example; they are not declared in this snippet.
// NOTE(review): Tika's XPath support is reported to cover only a small subset
// of XPath and to expect namespace-prefixed XHTML element names — confirm that
// this expression selects the intended content.
String xpath = "//table";
BodyContentHandler tableHandler = new BodyContentHandler();
// Only content accepted by the matcher reaches tableHandler.
MatchingContentHandler matcher = new MatchingContentHandler(tableHandler, xpath);

parser.parse(stream, matcher, metadata, new ParseContext());
String tableContent = tableHandler.toString();

Error-Safe Processing

// Process with error handling
// Assumes `parser`, `metadata` and an open `stream` set up as in the
// "Basic Text Extraction" example; they are not declared in this snippet.
BodyContentHandler textHandler = new BodyContentHandler();
SafeContentHandler safeHandler = new SafeContentHandler(textHandler);

parser.parse(stream, safeHandler, metadata, new ParseContext());

if (safeHandler.hasCompleted()) {
    String content = textHandler.toString();
} else {
    // getException() is documented as returning null when nothing failed;
    // this branch assumes it is non-null whenever hasCompleted() is false —
    // TODO confirm.
    Exception error = safeHandler.getException();
    System.err.println("Processing failed: " + error.getMessage());
}

Install with Tessl CLI

npx tessl i tessl/maven-org-apache-tika--tika-core

docs

configuration.md

content-processing.md

detection.md

embedded-extraction.md

tessl/maven-org-apache-tika--tika-core