Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.
—
SAX-based content handler system for processing and transforming document content during parsing, including specialized handlers for text extraction, HTML/XML conversion, link extraction, and XPath-based content matching.
Primary content handler for extracting textual content from documents with configurable output limits and encoding support.
/**
 * Content handler that extracts textual content from documents.
 * <p>
 * Output can be collected in a default {@code StringWriter} or sent to a
 * caller-supplied {@code Writer}, and may be capped by a write limit.
 */
public class BodyContentHandler extends DefaultHandler implements WriteOutContentHandler {
/**
 * Creates a BodyContentHandler that writes to a default {@code StringWriter}.
 */
public BodyContentHandler();
/**
 * Creates a BodyContentHandler that writes to a custom {@code Writer}.
 *
 * @param writer Writer to receive extracted content
 */
public BodyContentHandler(Writer writer);
/**
 * Creates a BodyContentHandler with a write limit.
 *
 * @param writeLimit maximum number of characters to write ({@code -1} for no limit)
 */
public BodyContentHandler(int writeLimit);
/**
 * Gets the extracted content as a string.
 *
 * @return String containing the extracted textual content
 */
@Override
public String toString();
/**
 * Checks whether the write limit has been reached.
 *
 * @return {@code true} if the write limit was exceeded
 */
public boolean isWriteLimitReached();
}
Interface for content handlers that support write limits and output control.
/**
 * Interface for content handlers that support write limits and output control.
 * <p>
 * NOTE(review): declared here as an interface; confirm against the
 * {@code org.apache.tika.sax} Javadoc for the tika-core version in use.
 */
public interface WriteOutContentHandler {
/**
 * Gets the extracted content as a string.
 *
 * @return String representation of the extracted content
 */
String toString();
/**
 * Checks whether the configured write limit has been reached.
 *
 * @return {@code true} if the write limit was exceeded, {@code false} otherwise
 */
boolean isWriteLimitReached();
}
Base decorator class for wrapping and extending content handler functionality.
/**
 * Abstract base class for decorating {@code ContentHandler} instances.
 * <p>
 * Subclasses wrap an existing handler to extend its functionality.
 */
public abstract class ContentHandlerDecorator implements ContentHandler {
/**
 * Creates a decorator around an existing {@code ContentHandler}.
 *
 * @param handler ContentHandler to decorate
 */
protected ContentHandlerDecorator(ContentHandler handler);
/**
 * Gets the wrapped {@code ContentHandler}.
 *
 * @return the underlying ContentHandler instance
 */
protected ContentHandler getContentHandler();
}
Converts document content to well-formed XML output with proper encoding and namespace handling.
/**
 * Content handler that converts document content to XML format.
 * <p>
 * The XML produced can be read back through {@link #toString()}.
 */
public class ToXMLContentHandler extends ContentHandlerDecorator {
/**
 * Creates a ToXMLContentHandler with default XML output.
 */
public ToXMLContentHandler();
/**
 * Creates a ToXMLContentHandler with a custom {@code Result} target.
 *
 * @param result Result object that receives the XML output
 */
public ToXMLContentHandler(Result result);
/**
 * Creates a ToXMLContentHandler with an explicit encoding.
 *
 * @param encoding character encoding for the XML output
 */
public ToXMLContentHandler(String encoding);
/**
 * Gets the XML content as a string.
 *
 * @return String containing the XML representation
 */
@Override
public String toString();
}
Converts document content to HTML format with proper tag structure and encoding.
/**
 * Content handler that converts document content to HTML format.
 * <p>
 * Extends {@link ToXMLContentHandler}; no {@code toString()} override is
 * declared in this listing.
 */
public class ToHTMLContentHandler extends ToXMLContentHandler {
/**
 * Creates a ToHTMLContentHandler with default HTML output.
 */
public ToHTMLContentHandler();
/**
 * Creates a ToHTMLContentHandler that writes to a custom {@code Writer}.
 *
 * @param writer Writer that receives the HTML output
 */
public ToHTMLContentHandler(Writer writer);
/**
 * Creates a ToHTMLContentHandler with an explicit encoding.
 *
 * @param encoding character encoding for the HTML output
 */
public ToHTMLContentHandler(String encoding);
}
Extracts plain text content without formatting or markup elements.
/**
 * Content handler that extracts plain text content, without formatting or
 * markup elements.
 */
public class ToTextContentHandler extends ContentHandlerDecorator {
/**
 * Creates a ToTextContentHandler with default text extraction.
 */
public ToTextContentHandler();
/**
 * Creates a ToTextContentHandler that writes to a custom {@code Writer}.
 *
 * @param writer Writer that receives the plain text output
 */
public ToTextContentHandler(Writer writer);
/**
 * Gets the extracted plain text.
 *
 * @return String containing the plain text content
 */
@Override
public String toString();
}
Extracts and collects hyperlinks and references from document content.
/**
 * Content handler that extracts and collects links from document content.
 * <p>
 * Collected links are exposed as {@link Link} objects through
 * {@link #getLinks()}.
 */
public class LinkContentHandler extends ContentHandlerDecorator {
/**
 * Creates a LinkContentHandler for link extraction.
 */
public LinkContentHandler();
/**
 * Creates a LinkContentHandler with a base URI for resolving relative links.
 *
 * @param base base URI for link resolution
 */
public LinkContentHandler(String base);
/**
 * Gets all extracted links.
 *
 * @return List of Link objects representing the extracted hyperlinks
 */
public List<Link> getLinks();
/**
 * Nested class representing a single extracted link.
 * <p>
 * NOTE(review): declared here as a nested class of LinkContentHandler;
 * confirm the actual location of the link type in the tika-core version in use.
 */
public static class Link {
/**
 * Gets the link type (e.g., "a", "img", "link").
 *
 * @return String naming the link element type
 */
public String getType();
/**
 * Gets the link URI.
 *
 * @return String containing the link URI
 */
public String getUri();
/**
 * Gets the link title or alt text.
 *
 * @return String containing the link title
 */
public String getTitle();
/**
 * Gets the anchor text content.
 *
 * @return String containing the link text content
 */
public String getText();
/**
 * Gets the relationship attribute.
 *
 * @return String containing the {@code rel} attribute value
 */
public String getRel();
}
}
Broadcasts SAX events to multiple content handlers simultaneously for parallel processing.
/**
 * Content handler that delegates SAX events to multiple handlers.
 * <p>
 * Handlers can be supplied either as varargs or as a list.
 */
public class TeeContentHandler extends DefaultHandler {
/**
 * Creates a TeeContentHandler from an array (varargs) of handlers.
 *
 * @param handlers ContentHandler instances that receive the events
 */
public TeeContentHandler(ContentHandler... handlers);
/**
 * Creates a TeeContentHandler from a list of handlers.
 *
 * @param handlers List of ContentHandler instances that receive the events
 */
public TeeContentHandler(List<ContentHandler> handlers);
/**
 * Gets all registered content handlers.
 *
 * @return List of ContentHandler instances receiving events
 */
public List<ContentHandler> getHandlers();
}
Wraps content handlers with error handling and recovery mechanisms.
/**
 * Content handler wrapper that provides error handling and recovery around
 * another handler.
 * <p>
 * NOTE(review): confirm {@code getException()} and {@code hasCompleted()}
 * against the {@code org.apache.tika.sax} Javadoc for the tika-core version
 * in use.
 */
public class SafeContentHandler extends ContentHandlerDecorator {
/**
 * Creates a SafeContentHandler wrapping another handler.
 *
 * @param handler ContentHandler to wrap with error handling
 */
public SafeContentHandler(ContentHandler handler);
/**
 * Gets any exception that occurred during processing.
 *
 * @return the Exception that occurred, or {@code null} if none
 */
public Exception getException();
/**
 * Checks whether processing completed without errors.
 *
 * @return {@code true} if no exceptions occurred
 */
public boolean hasCompleted();
}
Extracts and expands document titles using various heuristics and content analysis.
/**
 * Content handler that extracts and expands document titles.
 */
public class ExpandedTitleContentHandler extends ContentHandlerDecorator {
/**
 * Creates an ExpandedTitleContentHandler with default title extraction.
 *
 * @param handler underlying ContentHandler being decorated
 */
public ExpandedTitleContentHandler(ContentHandler handler);
/**
 * Gets the extracted and expanded title.
 *
 * @return String containing the document title
 */
public String getTitle();
}
Specialized handler for extracting phone numbers from document content using pattern recognition.
/**
 * Content handler that extracts phone numbers from document content.
 */
public class PhoneExtractingContentHandler extends ContentHandlerDecorator {
/**
 * Creates a PhoneExtractingContentHandler.
 *
 * @param handler underlying ContentHandler being decorated
 * @param metadata Metadata for context
 */
public PhoneExtractingContentHandler(ContentHandler handler, Metadata metadata);
/**
 * Gets all extracted phone numbers.
 *
 * @return Set of phone number strings found in the content
 */
public Set<String> getPhoneNumbers();
}
Tags content elements with identifiers for tracking and reference purposes.
/**
 * Content handler that adds tracking tags to content elements.
 */
public class TaggedContentHandler extends ContentHandlerDecorator {
/**
 * Creates a TaggedContentHandler with element tagging.
 *
 * @param handler underlying ContentHandler being decorated
 */
public TaggedContentHandler(ContentHandler handler);
/**
 * Gets the mapping of tags to content elements.
 *
 * @return Map of tag identifiers to content strings
 */
public Map<String, String> getTaggedContent();
}
Specialized handler for processing XHTML content with namespace awareness and structure preservation.
/**
 * Content handler specialized for XHTML document processing.
 * <p>
 * Offers simplified element methods that take only an element name, in
 * addition to a {@code characters} method.
 */
public class XHTMLContentHandler extends DefaultHandler {
/**
 * Creates an XHTMLContentHandler with default XHTML processing.
 */
public XHTMLContentHandler();
/**
 * Creates an XHTMLContentHandler that forwards to a custom {@code ContentHandler}.
 *
 * @param handler ContentHandler for XHTML events
 */
public XHTMLContentHandler(ContentHandler handler);
/**
 * Starts an XHTML element.
 * <p>
 * NOTE(review): the original description mentions namespace support, but
 * this overload takes no namespace argument; presumably the XHTML namespace
 * is applied internally. Confirm against the implementation.
 *
 * @param name element name
 * @param attributes element attributes
 */
public void startElement(String name, AttributesImpl attributes);
/**
 * Ends an XHTML element.
 *
 * @param name element name
 */
public void endElement(String name);
/**
 * Adds character content.
 *
 * @param ch character array
 * @param start start offset within {@code ch}
 * @param length number of characters of content
 */
public void characters(char[] ch, int start, int length);
}
Handles extraction and processing of embedded documents within parent documents.
/**
 * Content handler for processing embedded documents within parent documents.
 */
public class EmbeddedContentHandler extends ContentHandlerDecorator {
/**
 * Creates an EmbeddedContentHandler for embedded document processing.
 *
 * @param handler ContentHandler for embedded content
 */
public EmbeddedContentHandler(ContentHandler handler);
/**
 * Sets the embedded document extractor.
 *
 * @param extractor EmbeddedDocumentExtractor used for processing embedded documents
 */
public void setEmbeddedDocumentExtractor(EmbeddedDocumentExtractor extractor);
/**
 * Gets the embedded document extractor.
 *
 * @return the EmbeddedDocumentExtractor currently in use
 */
public EmbeddedDocumentExtractor getEmbeddedDocumentExtractor();
}
Parser for XPath expressions used in content matching and selection operations.
/**
 * Parser for XPath expressions used in content matching and selection.
 * <p>
 * NOTE(review): both methods are declared static here; confirm against the
 * {@code org.apache.tika.sax.xpath} Javadoc for the tika-core version in use.
 */
public class XPathParser {
/**
 * Parses an XPath expression into a {@code Matcher}.
 *
 * @param xpath XPath expression string
 * @return Matcher for the XPath expression
 * @throws ParseException if the XPath syntax is invalid
 */
public static Matcher parse(String xpath) throws ParseException;
/**
 * Creates a composite matcher from multiple XPath expressions.
 * <p>
 * NOTE(review): unlike {@link #parse(String)}, no {@code ParseException} is
 * declared here, and the declared return type is {@code Matcher} rather than
 * a composite type; confirm how invalid expressions are reported.
 *
 * @param xpaths XPath expression strings
 * @return Matcher combining all expressions
 */
public static Matcher parseMultiple(String... xpaths);
}
Interface for matching content elements based on XPath-like expressions.
/**
 * Interface for matching content elements using path-based (XPath-like)
 * expressions.
 * <p>
 * NOTE(review): declared here as an interface; confirm its shape against the
 * {@code org.apache.tika.sax.xpath} Javadoc for the tika-core version in use.
 */
public interface Matcher {
/**
 * Checks whether the current parse state matches this matcher.
 *
 * @param namespaceURI namespace URI of the current element
 * @param localName local name of the current element
 * @param qName qualified name of the current element
 * @return {@code true} if the current state matches
 */
boolean matches(String namespaceURI, String localName, String qName);
/**
 * Updates matcher state for an element start.
 *
 * @param namespaceURI namespace URI of the element being entered
 * @param localName local name of the element being entered
 * @param qName qualified name of the element being entered
 * @return the matcher to use for child elements
 */
Matcher descend(String namespaceURI, String localName, String qName);
}
Content handler that applies XPath matching to selectively process document elements.
/**
 * Content handler that uses XPath matching to selectively pass content on to
 * the wrapped handler.
 */
public class MatchingContentHandler extends ContentHandlerDecorator {
/**
 * Creates a MatchingContentHandler with an XPath matcher.
 *
 * @param handler ContentHandler that receives the matched content
 * @param matcher Matcher defining the selection criteria
 */
public MatchingContentHandler(ContentHandler handler, Matcher matcher);
/**
 * Creates a MatchingContentHandler from an XPath expression.
 * <p>
 * NOTE(review): confirm that a String-based constructor exists in the
 * tika-core version in use; otherwise build the Matcher with XPathParser.
 *
 * @param handler ContentHandler that receives the matched content
 * @param xpath XPath expression used for matching
 */
public MatchingContentHandler(ContentHandler handler, String xpath);
/**
 * Gets the current matcher.
 *
 * @return the Matcher being used for content selection
 */
public Matcher getMatcher();
/**
 * Checks whether processing is currently inside a matching element.
 *
 * @return {@code true} if matched content is being processed
 */
public boolean isMatching();
}
// Extract plain text with size limit
// Cap the extracted text at one million characters.
BodyContentHandler textHandler = new BodyContentHandler(1_000_000);
AutoDetectParser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
try (InputStream stream = new FileInputStream("document.pdf")) {
    ParseContext context = new ParseContext();
    parser.parse(stream, textHandler, metadata, context);
    String extractedText = textHandler.toString();
    // Report truncation so callers know the text may be incomplete.
    boolean truncated = textHandler.isWriteLimitReached();
    if (truncated) {
        System.out.println("Content truncated due to size limit");
    }
}
// Generate both HTML and plain text simultaneously
BodyContentHandler textHandler = new BodyContentHandler();
ToHTMLContentHandler htmlHandler = new ToHTMLContentHandler();
// A single parse feeds both handlers through the tee.
parser.parse(
        stream,
        new TeeContentHandler(textHandler, htmlHandler),
        metadata,
        new ParseContext());
String plainText = textHandler.toString();
String htmlContent = htmlHandler.toString();
// Extract all links from document
LinkContentHandler linkHandler = new LinkContentHandler();
ParseContext linkContext = new ParseContext();
parser.parse(stream, linkHandler, metadata, linkContext);
// Print the main properties of every link collected during the parse.
for (LinkContentHandler.Link extracted : linkHandler.getLinks()) {
    System.out.println("Type: " + extracted.getType());
    System.out.println("URI: " + extracted.getUri());
    System.out.println("Title: " + extracted.getTitle());
    System.out.println("Text: " + extracted.getText());
}
// Extract only table content using XPath
BodyContentHandler tableHandler = new BodyContentHandler();
// Only content matched by the "//table" expression reaches tableHandler.
MatchingContentHandler tableOnly = new MatchingContentHandler(tableHandler, "//table");
parser.parse(stream, tableOnly, metadata, new ParseContext());
String tableContent = tableHandler.toString();
// Process with error handling
BodyContentHandler textHandler = new BodyContentHandler();
SafeContentHandler safeHandler = new SafeContentHandler(textHandler);
parser.parse(stream, safeHandler, metadata, new ParseContext());
// Handle the failure path first; read the text only after a clean run.
if (!safeHandler.hasCompleted()) {
    Exception failure = safeHandler.getException();
    System.err.println("Processing failed: " + failure.getMessage());
} else {
    String content = textHandler.toString();
}
Install with Tessl CLI
npx tessl i tessl/maven-org-apache-tika--tika-core