Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.
—
Framework for extracting embedded documents and resources from container formats such as ZIP archives, Microsoft Office documents, and other compound document formats. Provides both high-level extraction APIs and low-level container processing capabilities with support for nested containers, selective extraction, and custom processing strategies.
Core interface for extracting and processing embedded documents within container formats.
/**
* Interface for extracting embedded documents from container formats
*/
public interface EmbeddedDocumentExtractor {
/**
* Determines whether an embedded document should be parsed
* @param metadata metadata of the embedded document
* @return true if the document should be processed
*/
boolean shouldParseEmbedded(Metadata metadata);
/**
* Processes embedded resource with appropriate parsing
* @param stream input stream containing embedded document
* @param handler SAX content handler for output
* @param metadata metadata for the embedded resource
* @param outputHtml whether to output HTML format
* @throws SAXException if SAX processing fails
* @throws IOException if I/O error occurs
*/
void parseEmbedded(InputStream stream, ContentHandler handler,
Metadata metadata, boolean outputHtml)
throws SAXException, IOException;
}
/**
* Factory interface for creating embedded document extractors
*/
public interface EmbeddedDocumentExtractorFactory extends Serializable {
/**
* Creates new embedded document extractor instance
* @param metadata parent document metadata
* @param parseContext parsing context
* @return configured extractor instance
*/
EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext);
}

Default implementation that uses Tika parsers to process embedded documents.
/**
* Parser-based embedded document extractor for compound documents
*/
public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
/**
* Creates extractor with parsing context
* @param context parse context containing configuration
*/
public ParsingEmbeddedDocumentExtractor(ParseContext context);
/**
* Sets whether to write filename to content output
* @param writeFileNameToContent true to include filenames in output
*/
public void setWriteFileNameToContent(boolean writeFileNameToContent);
/**
* Gets filename writing configuration
* @return true if filenames are written to content
*/
public boolean getWriteFileNameToContent();
}
/**
* Factory for creating parsing embedded document extractors
*/
public class ParsingEmbeddedDocumentExtractorFactory
implements EmbeddedDocumentExtractorFactory {
/**
* Creates factory instance
*/
public ParsingEmbeddedDocumentExtractorFactory();
}

Low-level interface for extracting resources from container formats.
/**
* Interface for extracting embedded resources from container formats
*/
public interface ContainerExtractor extends Serializable {
/**
* Checks if extractor supports the container format
* @param input Tika input stream to examine
* @return true if this extractor can process the container
* @throws IOException if stream cannot be read
*/
boolean isSupported(TikaInputStream input) throws IOException;
/**
* Extracts all embedded resources from container
* @param stream document stream to process
* @param recurseExtractor extractor for nested containers
* @param handler handler for processing extracted resources
* @throws IOException if stream cannot be read
* @throws TikaException if container cannot be parsed
*/
void extract(TikaInputStream stream, ContainerExtractor recurseExtractor,
EmbeddedResourceHandler handler) throws IOException, TikaException;
}
/**
* Parser-based container extractor implementation
*/
public class ParserContainerExtractor implements ContainerExtractor {
/**
* Creates parser-based container extractor
*/
public ParserContainerExtractor();
/**
* Creates parser-based extractor with custom parser
* @param parser parser to use for extraction
*/
public ParserContainerExtractor(Parser parser);
}

Interfaces for processing extracted embedded resources.
/**
* Callback interface for handling extracted embedded resources
*/
public interface EmbeddedResourceHandler {
/**
* Processes an embedded resource
* @param filename filename of embedded resource (if known)
* @param mediaType media type of resource (if known)
* @param stream input stream containing resource content
*/
void handle(String filename, MediaType mediaType, InputStream stream);
}
/**
* Interface for handling embedded document bytes
*/
public interface EmbeddedDocumentBytesHandler {
/**
* Processes bytes from embedded document
* @param embeddedDocumentBytes bytes from embedded document
* @param metadata metadata for the embedded document
* @throws IOException if processing fails
* @throws TikaException if document processing fails
*/
void handleEmbeddedDocumentBytes(byte[] embeddedDocumentBytes, Metadata metadata)
throws IOException, TikaException;
}
/**
* Abstract base class for embedded document bytes handlers
*/
public abstract class AbstractEmbeddedDocumentBytesHandler
implements EmbeddedDocumentBytesHandler {
/**
* Creates handler with temporary resources
* @param temporaryResources temporary resource manager
*/
public AbstractEmbeddedDocumentBytesHandler(TemporaryResources temporaryResources);
}
/**
* Basic implementation of embedded document bytes handler
*/
public class BasicEmbeddedDocumentBytesHandler
extends AbstractEmbeddedDocumentBytesHandler {
/**
* Creates basic bytes handler
* @param temporaryResources temporary resource manager
*/
public BasicEmbeddedDocumentBytesHandler(TemporaryResources temporaryResources);
}

Interfaces for controlling which embedded documents to process.
/**
* Interface for document selection strategies
*/
public interface DocumentSelector {
/**
* Determines if document should be selected for processing
* @param metadata document metadata to evaluate
* @return true if document matches selection criteria
*/
boolean select(Metadata metadata);
}
/**
* Interface for selecting embedded bytes to process
*/
public interface EmbeddedBytesSelector {
/**
* Determines if embedded bytes should be selected
* @param metadata metadata for embedded content
* @return true if bytes should be processed
*/
boolean select(Metadata metadata);
}
/**
* Basic implementation of embedded bytes selector
*/
public class BasicEmbeddedBytesSelector implements EmbeddedBytesSelector {
/**
* Creates basic embedded bytes selector
*/
public BasicEmbeddedBytesSelector();
}

Interfaces for translating embedded streams during extraction.
/**
* Interface for translating embedded streams
*/
public interface EmbeddedStreamTranslator {
/**
* Translates embedded stream content
* @param is input stream to translate
* @param embeddedMetadata metadata for embedded content
* @return translated input stream
* @throws IOException if translation fails
*/
InputStream translate(InputStream is, Metadata embeddedMetadata) throws IOException;
}
/**
* Default implementation of embedded stream translator
*/
public class DefaultEmbeddedStreamTranslator implements EmbeddedStreamTranslator {
/**
* Creates default stream translator
*/
public DefaultEmbeddedStreamTranslator();
}

Helper classes for embedded document processing.
/**
* Utility methods for embedded document processing
*/
public class EmbeddedDocumentUtil {
/**
* Gets file extension from metadata
* @param metadata document metadata
* @return file extension or null
*/
public static String getExtension(Metadata metadata);
/**
* Tries to determine file extension from content type
* @param metadata document metadata containing content type
* @return likely file extension or null
*/
public static String tryToGetExtensionFromContentType(Metadata metadata);
}

Basic Embedded Document Extraction:
import org.apache.tika.extractor.*;
import org.apache.tika.parser.*;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.BodyContentHandler;
import java.io.FileInputStream;
import java.io.InputStream;
// Setup parsing context with embedded extractor
ParseContext context = new ParseContext();
EmbeddedDocumentExtractor extractor = new ParsingEmbeddedDocumentExtractor(context);
context.set(EmbeddedDocumentExtractor.class, extractor);
// Parse document with embedded content
Parser parser = new AutoDetectParser();
BodyContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
try (InputStream stream = new FileInputStream("compound_document.docx")) {
parser.parse(stream, handler, metadata, context);
// Extracted content includes embedded documents
String content = handler.toString();
System.out.println("Content with embedded documents: " + content);
}

Container Extraction with Custom Handler:
import org.apache.tika.extractor.*;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
// Custom resource handler to collect extracted files
/**
 * Resource handler that buffers every embedded resource it receives so the
 * caller can inspect them after extraction has finished.
 *
 * NOTE(review): {@code ExtractedResource} is not defined anywhere in this
 * example; it is assumed to be a simple holder constructed from
 * (filename, mediaType, content) and exposing {@code getFilename()} and
 * {@code getContent()} - define it before compiling this snippet.
 */
class ResourceCollector implements EmbeddedResourceHandler {
    // The list reference never changes; only its contents grow.
    private final List<ExtractedResource> resources = new ArrayList<>();

    /**
     * Reads the whole resource into memory and records it.
     *
     * @param filename  filename of the embedded resource (if known)
     * @param mediaType media type of the resource (if known)
     * @param stream    input stream containing the resource content
     */
    @Override
    public void handle(String filename, MediaType mediaType, InputStream stream) {
        try {
            byte[] content = stream.readAllBytes();
            resources.add(new ExtractedResource(filename, mediaType, content));
            System.out.println("Extracted: " + filename + " (" + mediaType + ")");
        } catch (IOException e) {
            // Best effort: one unreadable resource must not abort the whole
            // extraction, but report the cause so the failure can be diagnosed.
            System.err.println("Failed to read: " + filename + " (" + e.getMessage() + ")");
        }
    }

    /**
     * @return the resources collected so far, in the order they were handled
     */
    public List<ExtractedResource> getResources() {
        return resources;
    }
}
// Extract from ZIP container
ContainerExtractor extractor = new ParserContainerExtractor();
ResourceCollector collector = new ResourceCollector();
try (TikaInputStream stream = TikaInputStream.get(new FileInputStream("archive.zip"))) {
if (extractor.isSupported(stream)) {
extractor.extract(stream, extractor, collector);
// Process extracted resources
for (ExtractedResource resource : collector.getResources()) {
System.out.println("Found: " + resource.getFilename() +
" (" + resource.getContent().length + " bytes)");
}
}
}

Selective Embedded Document Processing:
// Custom document selector for specific file types
/**
 * Document selector that accepts only documents whose metadata declares the
 * PDF media type.
 */
class PDFSelector implements DocumentSelector {
    private static final String PDF_MEDIA_TYPE = "application/pdf";

    /**
     * @param metadata document metadata to evaluate
     * @return true if the declared content type is application/pdf
     */
    @Override
    public boolean select(Metadata metadata) {
        // The MIME type is stored under Content-Type. TikaCoreProperties.TYPE
        // is the Dublin Core dc:type field and does not hold a media type.
        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        if (contentType == null) {
            return false;
        }
        // Ignore any parameters, e.g. "application/pdf; version=1.4".
        int paramStart = contentType.indexOf(';');
        String baseType = paramStart >= 0 ? contentType.substring(0, paramStart) : contentType;
        return PDF_MEDIA_TYPE.equalsIgnoreCase(baseType.trim());
    }
}
// Configure selective extraction
ParseContext context = new ParseContext();
context.set(DocumentSelector.class, new PDFSelector());
EmbeddedDocumentExtractor extractor = new ParsingEmbeddedDocumentExtractor(context);
context.set(EmbeddedDocumentExtractor.class, extractor);
// Only PDF embedded documents will be processed
Parser parser = new AutoDetectParser();
// ... continue with parsing

Embedded Document Bytes Handling:
import org.apache.tika.extractor.*;
import org.apache.tika.io.TemporaryResources;
// Custom bytes handler for processing embedded document bytes
/**
 * Example bytes handler that reports the name and size of each embedded
 * document it is given.
 */
class CustomBytesHandler extends AbstractEmbeddedDocumentBytesHandler {

    /**
     * @param temp temporary resource manager passed through to the base class
     */
    public CustomBytesHandler(TemporaryResources temp) {
        super(temp);
    }

    /**
     * Prints the resource name and byte count of the embedded document.
     *
     * @param bytes    bytes from the embedded document
     * @param metadata metadata for the embedded document
     * @throws IOException   if processing fails
     * @throws TikaException if document processing fails
     */
    @Override
    public void handleEmbeddedDocumentBytes(byte[] bytes, Metadata metadata)
            throws IOException, TikaException {
        final String resourceName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
        final int byteCount = bytes.length;
        String report = "Processing embedded bytes for: " + resourceName
                + " (" + byteCount + " bytes)";
        System.out.println(report);
        // Place custom handling of the embedded bytes here,
        // for example saving them to a file or analyzing the content.
    }
}
// Use custom bytes handler in parsing context
TemporaryResources temp = new TemporaryResources();
EmbeddedDocumentBytesHandler bytesHandler = new CustomBytesHandler(temp);
ParseContext context = new ParseContext();
context.set(EmbeddedDocumentBytesHandler.class, bytesHandler);

The embedded extraction framework provides comprehensive support for handling compound documents, from high-level automatic extraction to low-level container processing with custom handlers and selective processing strategies.
Install with Tessl CLI
npx tessl i tessl/maven-org-apache-tika--tika-core