Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.
npx @tessl/cli install tessl/maven-org-apache-tika--tika-core@3.2.0

Apache Tika Core is the foundational library of the Apache Tika toolkit, providing essential functionality for detecting and extracting metadata and structured text content from various document formats. As the base module from which all other Tika modules inherit functionality, it defines the core APIs, interfaces, and architectural components for document processing, content type identification, metadata handling, and content extraction.
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>3.2.2</version>
</dependency>

implementation 'org.apache.tika:tika-core:3.2.2'

import org.apache.tika.Tika;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.config.TikaConfig;

import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
// Simple facade usage
Tika tika = new Tika();
// Detect content type
String mimeType = tika.detect(new File("document.pdf"));
System.out.println("MIME type: " + mimeType);
// Extract text content
String text = tika.parseToString(new File("document.pdf"));
System.out.println("Extracted text: " + text);
// Parse with metadata extraction
try (InputStream stream = new FileInputStream("document.pdf")) {
Metadata metadata = new Metadata();
String content = tika.parseToString(stream, metadata);
// Access metadata
String title = metadata.get("title");
String author = metadata.get("dc:creator");
}

Apache Tika Core is built around several key architectural components:
- `org.apache.tika.Tika` class provides simplified access to all Tika functionality
- `Parser` interface and implementations for document parsing
- `Detector` interface and implementations for content type detection
- `Metadata` class and property interfaces for document metadata
- `TikaConfig` for advanced setup and service loading

Core document parsing functionality using the `Parser` interface and `AutoDetectParser` for automatic format detection. Supports parsing of documents into structured content with metadata extraction.
public interface Parser {
void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException;
Set<MediaType> getSupportedTypes(ParseContext context);
}
public class AutoDetectParser implements Parser {
public AutoDetectParser();
public AutoDetectParser(TikaConfig config);
public void setFallback(Parser fallback);
public Parser getFallback();
}

Detection system for identifying document formats and MIME types using various detection strategies including magic numbers, file extensions, and neural network models.
public interface Detector {
MediaType detect(InputStream input, Metadata metadata) throws IOException;
}
public class DefaultDetector extends CompositeDetector {
public DefaultDetector();
public DefaultDetector(MimeTypes types);
}

Comprehensive metadata system for extracting, storing, and manipulating document properties with support for standard metadata schemas and custom properties.
public class Metadata implements Serializable {
public String get(String name);
public String[] getValues(String name);
public void set(String name, String value);
public void add(String name, String value);
public void remove(String name);
public String[] names();
}

SAX-based content handler system for extracting, transforming, and processing document content with support for various output formats and specialized processing needs.
public class BodyContentHandler extends WriteOutContentHandler {
public BodyContentHandler();
public BodyContentHandler(Writer writer);
public BodyContentHandler(int writeLimit);
}
public class ToXMLContentHandler extends ContentHandlerDecorator {
public ToXMLContentHandler();
public ToXMLContentHandler(ContentHandler handler, String encoding);
}

Comprehensive MIME type registry and media type handling with support for type relationships, detection patterns, and custom type definitions.
public final class MediaType implements Serializable {
public static MediaType parse(String type);
public String getType();
public String getSubtype();
public String toString();
}
public class MimeTypes {
public static MimeTypes getDefaultMimeTypes();
public MediaType detect(InputStream input, String name) throws IOException;
public MimeType forName(String name) throws MimeTypeException;
}

Configuration management system with support for custom parsers, detectors, and service loading with parameter configuration and initialization handling.
public class TikaConfig {
public static TikaConfig getDefaultConfig();
public Parser getParser();
public Detector getDetector();
public Translator getTranslator();
}

Language detection and translation capabilities for identifying document languages and translating text content with pluggable translator implementations.
public class LanguageIdentifier {
public LanguageIdentifier(String text);
public String getLanguage();
public boolean isReasonablyCertain();
}
public interface Translator {
String translate(String text, String sourceLanguage, String targetLanguage)
throws TikaException, IOException;
boolean isAvailable();
}

Enterprise-grade batch processing framework using the Fetcher/Emitter pattern for scalable document processing with support for async operations and error handling.
public interface Fetcher<T extends FetchKey> {
InputStream fetch(String fetchKey, Metadata metadata) throws IOException, TikaException;
String getName();
}
public interface Emitter {
void emit(String emitKey, List<Metadata> metadataList) throws IOException, TikaException;
String getName();
}

Comprehensive exception hierarchy for handling various error conditions in document processing with specific exceptions for encryption, corruption, and format issues.
public class TikaException extends Exception {
public TikaException(String message);
public TikaException(String message, Throwable cause);
}
public class EncryptedDocumentException extends TikaException;
public class UnsupportedFormatException extends TikaException;
public class CorruptedFileException extends TikaException;

Framework for rendering documents into visual representations such as images, with support for page-based rendering and custom render requests.
public interface Renderer extends Serializable {
Set<MediaType> getSupportedTypes(ParseContext context);
RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext,
RenderRequest... requests) throws IOException, TikaException;
}
public class CompositeRenderer implements Renderer, Initializable {
public CompositeRenderer(ServiceLoader serviceLoader);
public CompositeRenderer(List<Renderer> renderers);
}

Framework for extracting embedded documents and resources from container formats with support for selective extraction and custom processing strategies.
public interface EmbeddedDocumentExtractor {
boolean shouldParseEmbedded(Metadata metadata);
void parseEmbedded(InputStream stream, ContentHandler handler,
Metadata metadata, boolean outputHtml) throws SAXException, IOException;
}
public interface ContainerExtractor extends Serializable {
boolean isSupported(TikaInputStream input) throws IOException;
void extract(TikaInputStream stream, ContainerExtractor recurseExtractor,
EmbeddedResourceHandler handler) throws IOException, TikaException;
}

Framework for embedding metadata into documents, allowing modification and insertion of metadata properties into existing files.
public interface Embedder extends Serializable {
Set<MediaType> getSupportedEmbedTypes(ParseContext context);
void embed(Metadata metadata, InputStream originalStream, OutputStream outputStream,
ParseContext context) throws IOException, TikaException;
}
public class ExternalEmbedder implements Embedder {
public void setCommand(String... command);
public void setMetadataCommandArguments(Map<Property, String[]> arguments);
}

Advanced infrastructure for running document parsing operations in separate JVM processes to provide isolation, memory management, and fault tolerance.
public class ForkParser implements Parser, Closeable {
public ForkParser();
public ForkParser(ClassLoader loader, Parser parser);
public void setPoolSize(int poolSize);
public void setServerParseTimeoutMillis(long serverParseTimeoutMillis);
}
public interface ForkResource {
Throwable process(DataInputStream input, DataOutputStream output) throws IOException;
}

Process Forking Infrastructure

Enhanced I/O streams and utility classes for efficient document processing with support for temporary resources, bounded streams, and system integration.
public class TikaInputStream extends TaggedInputStream {
public static TikaInputStream get(InputStream stream);
public static TikaInputStream get(File file);
public static TikaInputStream get(Path path);
public static TikaInputStream get(URL url);
public boolean hasFile();
public File getFile() throws IOException;
}