Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.
—
Configuration system for managing Tika parsers, detectors, and service loading with XML-based configuration files, parameter management, and service discovery mechanisms.
Central configuration class that manages parser, detector, and translator configurations with support for custom configurations and service loading.
/**
* Main configuration class for Tika components and services.
* <p>
* An instance is obtained from {@link #getDefaultConfig()} or built from an XML
* configuration supplied as a file, stream, URL or classpath resource. It exposes
* the configured parser, detector, translator, MIME type registry, media type
* registry and service loader through the getters below.
*/
public class TikaConfig {
/**
* Gets the default Tika configuration with standard parsers and detectors
* @return TikaConfig instance with default settings
*/
public static TikaConfig getDefaultConfig();
/**
* Creates TikaConfig from XML configuration file
* @param file XML configuration file
* @throws TikaException if configuration is invalid
* @throws IOException if file cannot be read
*/
public TikaConfig(File file) throws TikaException, IOException;
/**
* Creates TikaConfig from XML configuration stream
* @param stream InputStream containing XML configuration
* @throws TikaException if configuration is invalid
* @throws IOException if stream cannot be read
*/
public TikaConfig(InputStream stream) throws TikaException, IOException;
/**
* Creates TikaConfig from XML configuration at URL
* @param url URL pointing to XML configuration
* @throws TikaException if configuration is invalid
* @throws IOException if URL cannot be accessed
*/
public TikaConfig(URL url) throws TikaException, IOException;
/**
* Creates TikaConfig from classpath resource
* <p>
* NOTE(review): unlike the File, InputStream and URL constructors, this one
* declares no IOException; presumably an unreadable resource is reported as
* a TikaException, confirm against the implementation.
* @param resource Resource path in classpath
* @throws TikaException if configuration is invalid
*/
public TikaConfig(String resource) throws TikaException;
/**
* Creates TikaConfig with custom class loader
* <p>
* This constructor declares no checked exceptions.
* @param loader ClassLoader for service discovery
*/
public TikaConfig(ClassLoader loader);
/**
* Gets the configured composite parser
* @return Parser instance configured with all registered parsers
*/
public Parser getParser();
/**
* Gets parser for specific media type
* <p>
* NOTE(review): the result for a media type with no registered parser is not
* documented here; verify before relying on a non-null return.
* @param mimeType MediaType to get parser for
* @return Parser that handles the specified media type
*/
public Parser getParser(MediaType mimeType);
/**
* Gets all configured parsers mapped by media type
* @return Map of MediaType to Parser instances
*/
public Map<MediaType, Parser> getParsers();
/**
* Gets the configured composite detector
* @return Detector instance configured with all registered detectors
*/
public Detector getDetector();
/**
* Gets the configured translator
* @return Translator instance for text translation
*/
public Translator getTranslator();
/**
* Gets the MIME types registry
* @return MimeTypes instance with registered type definitions
*/
public MimeTypes getMimeRepository();
/**
* Gets the media type registry for type relationships
* @return MediaTypeRegistry for managing type hierarchies
*/
public MediaTypeRegistry getMediaTypeRegistry();
/**
* Gets configuration for specific parser class
* <p>
* The map values use the raw {@code Param} type, so callers receive
* {@code Object} from {@code getValue()} and must check the value type themselves.
* @param parserClass Class of parser to get configuration for
* @return Map of configuration parameters for the parser, keyed by parameter name
*/
public Map<String, Param> getParserConfig(Class<? extends Parser> parserClass);
/**
* Gets configuration for specific detector class
* <p>
* The map values use the raw {@code Param} type, so callers receive
* {@code Object} from {@code getValue()} and must check the value type themselves.
* @param detectorClass Class of detector to get configuration for
* @return Map of configuration parameters for the detector, keyed by parameter name
*/
public Map<String, Param> getDetectorConfig(Class<? extends Detector> detectorClass);
/**
* Gets the service loader configuration
* @return ServiceLoader instance used for dynamic service discovery
*/
public ServiceLoader getServiceLoader();
}Service loading utility for dynamic discovery and instantiation of Tika components.
/**
* Service loader for dynamic discovery of Tika components.
* <p>
* A loader is bound to one ClassLoader and carries a flag that says whether
* dynamic loading is enabled; both can be read back through
* {@link #getLoader()} and {@link #isDynamic()}.
*/
public class ServiceLoader {
/**
* Creates ServiceLoader with default class loader
*/
public ServiceLoader();
/**
* Creates ServiceLoader with custom class loader
* @param loader ClassLoader to use for service discovery
*/
public ServiceLoader(ClassLoader loader);
/**
* Creates ServiceLoader with class loader and dynamic loading flag
* @param loader ClassLoader for service discovery
* @param dynamic Whether to enable dynamic loading
*/
public ServiceLoader(ClassLoader loader, boolean dynamic);
/**
* Loads all available services of specified type
* <p>
* NOTE(review): presumably this combines the static and the dynamic
* providers returned by the two methods below; confirm.
* @param <T> Service type requested by the caller
* @param iface Interface or class type to load
* @return List of service instances implementing the interface
*/
public <T> List<T> loadServiceProviders(Class<T> iface);
/**
* Loads static services from META-INF/services files
* @param <T> Service type requested by the caller
* @param iface Interface or class type to load
* @return List of statically declared service instances
*/
public <T> List<T> loadStaticServiceProviders(Class<T> iface);
/**
* Loads dynamic services from configuration
* @param <T> Service type requested by the caller
* @param iface Interface or class type to load
* @return List of dynamically configured service instances
*/
public <T> List<T> loadDynamicServiceProviders(Class<T> iface);
/**
* Gets the class loader used by this service loader
* @return ClassLoader instance used for loading services
*/
public ClassLoader getLoader();
/**
* Checks if dynamic loading is enabled
* @return true if dynamic loading is enabled
*/
public boolean isDynamic();
}Represents a configuration parameter with name, value, and type information.
/**
* Configuration parameter with name, value, and type information.
* <p>
* The value is mutable through {@link #setValue(Object)}; no setter is
* declared for the name or the type.
* @param <T> Type of the parameter value
*/
public class Param<T> {
/**
* Creates Param with name and value
* <p>
* NOTE(review): no type is passed here, so what {@link #getType()} reports
* for such an instance is not visible in this listing; confirm.
* @param name Parameter name
* @param value Parameter value
*/
public Param(String name, T value);
/**
* Creates Param with name, value, and type
* @param name Parameter name
* @param value Parameter value
* @param type Parameter type class
*/
public Param(String name, T value, Class<T> type);
/**
* Gets parameter name
* @return String containing parameter name
*/
public String getName();
/**
* Gets parameter value
* @return Parameter value of type T
*/
public T getValue();
/**
* Gets parameter type
* @return Class representing parameter type
*/
public Class<T> getType();
/**
* Sets parameter value, replacing the value supplied at construction
* @param value New parameter value
*/
public void setValue(T value);
/**
* Gets string representation of value
* @return String representation of parameter value
*/
@Override
public String toString();
}Descriptor for parameter fields with metadata about configuration parameters.
/**
* Field descriptor for configuration parameters with metadata.
* <p>
* Wraps a single field and exposes its name, type, required flag, default
* value and description, plus accessors that read or write the field on a
* target object.
*/
public class ParamField {
/**
* Creates ParamField for specified field
* @param field Field to create descriptor for (presumably a java.lang.reflect.Field; confirm against the imports)
*/
public ParamField(Field field);
/**
* Gets the field name
* @return String containing field name
*/
public String getName();
/**
* Gets the field type
* @return Class representing field type
*/
public Class<?> getType();
/**
* Checks if field is required
* @return true if field is required for configuration
*/
public boolean isRequired();
/**
* Gets default value for field
* @return Default value or null if no default
*/
public Object getDefaultValue();
/**
* Gets field description from annotations
* @return String describing field purpose
*/
public String getDescription();
/**
* Sets field value on target object
* <p>
* The value is accepted as a plain Object; this signature does not show
* whether it is checked against {@link #getType()} first.
* @param target Object to set field value on
* @param value Value to set
* @throws IllegalAccessException if field is not accessible
*/
public void setValue(Object target, Object value) throws IllegalAccessException;
/**
* Gets field value from target object
* @param target Object to get field value from
* @return Field value
* @throws IllegalAccessException if field is not accessible
*/
public Object getValue(Object target) throws IllegalAccessException;
}Base class for configurable Tika components with parameter injection support.
/**
* Base class for configurable components with parameter injection.
* <p>
* Subclasses receive their parameters through {@link #initialize(Map)} and can
* use the protected accessors to look up or store individual parameters and to
* validate the resulting configuration.
*/
public abstract class ConfigBase {
/**
* Initializes component with configuration parameters
* <p>
* The map values use the raw {@code Param} type.
* @param params Map of parameter names to Param objects
* @throws TikaConfigException if initialization fails
*/
public void initialize(Map<String, Param> params) throws TikaConfigException;
/**
* Checks current configuration state
* <p>
* Declares no checked exception; issues are passed to the given handler.
* @param handler Problem handler for reporting issues
*/
public void checkInitialization(InitializableProblemHandler handler);
/**
* Gets all configurable fields for this component
* @return List of ParamField descriptors for configurable fields
*/
public List<ParamField> getConfigurableFields();
/**
* Gets configuration parameter by name
* <p>
* Callers must handle the null result for unknown names; the returned
* {@code Param} is raw, so {@code getValue()} yields {@code Object}.
* @param name Parameter name
* @return Param object or null if not found
*/
protected Param getParam(String name);
/**
* Sets configuration parameter
* @param name Parameter name
* @param value Parameter value
*/
protected void setParam(String name, Object value);
/**
* Validates configuration parameters
* @throws TikaConfigException if validation fails
*/
protected void validateConfig() throws TikaConfigException;
}Interface for handling problems that occur during component initialization.
/**
* Handler for problems encountered during component initialization.
* <p>
* The interface declares a single callback, which receives the class where a
* problem occurred together with a textual description of it.
*/
public interface InitializableProblemHandler {
/**
* Handles a problem encountered during initialization
* <p>
* The method declares no checked exception.
* @param clazz Class where problem occurred
* @param problem Description of the problem
*/
void handleInitializableProblem(Class<?> clazz, String problem);
}Default implementation that collects initialization problems for later analysis.
/**
* Default problem handler that collects initialization issues.
* <p>
* Problems arrive through {@link #handleInitializableProblem(Class, String)}
* and can be read back as a whole or per class.
*/
public class ParsingProblemHandler implements InitializableProblemHandler {
/**
* Creates problem handler for collecting issues
*/
public ParsingProblemHandler();
/**
* Handles initialization problem by recording it
* @param clazz Class where problem occurred
* @param problem Description of the problem
*/
@Override
public void handleInitializableProblem(Class<?> clazz, String problem);
/**
* Gets all recorded problems
* @return List of problems encountered during initialization
*/
public List<String> getProblems();
/**
* Checks if any problems were recorded
* @return true if problems were encountered
*/
public boolean hasProblems();
/**
* Gets problems for specific class
* <p>
* NOTE(review): whether an empty list or null is returned for a class with
* no recorded problems is not stated here; confirm.
* @param clazz Class to get problems for
* @return List of problems for the specified class
*/
public List<String> getProblems(Class<?> clazz);
}<?xml version="1.0" encoding="UTF-8"?>
<properties>
<!-- Root element: each child below configures one kind of Tika component -->
<!-- MIME Types Configuration: type definitions are read from the named resource -->
<mimeTypeRepository resource="custom-mimetypes.xml"/>
<!-- Detectors Configuration: the default detector plus one custom detector with parameters -->
<detectors>
<detector class="org.apache.tika.detect.DefaultDetector"/>
<detector class="org.example.CustomDetector">
<!-- Each param carries a name, a declared type, and its value as element text -->
<params>
<param name="threshold" type="int">90</param>
<param name="enabled" type="boolean">true</param>
</params>
</detector>
</detectors>
<!-- Parsers Configuration: one parser without parameters and the PDF parser with two boolean parameters -->
<parsers>
<parser class="org.apache.tika.parser.AutoDetectParser"/>
<parser class="org.apache.tika.parser.pdf.PDFParser">
<params>
<param name="extractInlineImages" type="boolean">false</param>
<param name="sortByPosition" type="boolean">true</param>
</params>
</parser>
</parsers>
<!-- Translator Configuration: a single translator with one int parameter -->
<translator class="org.apache.tika.language.translate.DefaultTranslator">
<params>
<param name="maxStringLength" type="int">10000</param>
</params>
</translator>
<!-- Service Loader Configuration -->
<!-- NOTE(review): "dynamic" presumably corresponds to the dynamic loading flag of ServiceLoader; the accepted loadErrorHandler values are not listed in this document, confirm before changing it -->
<service-loader dynamic="true" loadErrorHandler="IGNORE"/>
</properties>// Use default configuration
// Start from the built-in configuration and fetch its parser and detector.
TikaConfig config = TikaConfig.getDefaultConfig();
Parser parser = config.getParser();
Detector detector = config.getDetector();

// Parse with configured components.
// try-with-resources closes the document stream once parsing is finished.
Metadata metadata = new Metadata();
try (InputStream documentStream = new FileInputStream("document.pdf")) {
    BodyContentHandler textHandler = new BodyContentHandler();
    ParseContext parseContext = new ParseContext();
    parser.parse(documentStream, textHandler, metadata, parseContext);
}
// Load configuration from file
try {
    // File-based loading goes through the TikaConfig(File) constructor, which
    // declares exactly the two exceptions handled below. The String constructor
    // is documented for classpath resources and declares only TikaException,
    // so catching IOException around it would not compile.
    TikaConfig config = new TikaConfig(new File("tika-config.xml"));
    // Get configured components
    Parser parser = config.getParser();
    Detector detector = config.getDetector();
    Translator translator = config.getTranslator();
} catch (TikaException | IOException e) {
    System.err.println("Configuration error: " + e.getMessage());
}
// Load from classpath resource
// (the String constructor declares TikaException, which the enclosing code must handle)
TikaConfig config = new TikaConfig("/org/example/custom-tika.xml");
// Create service loader with custom class loader
// Discovery runs against the current thread's context class loader,
// with the dynamic loading flag switched on.
ClassLoader customLoader = Thread.currentThread().getContextClassLoader();
ServiceLoader serviceLoader = new ServiceLoader(customLoader, true);

// Load parser services and report how many were found
List<Parser> parsers = serviceLoader.loadServiceProviders(Parser.class);
int parserCount = parsers.size();
System.out.println("Found " + parserCount + " parser services");

// Load detector services and print the implementation class of each one
List<Detector> detectors = serviceLoader.loadServiceProviders(Detector.class);
detectors.forEach(found -> System.out.println("Detector: " + found.getClass().getName()));
// Get parser configuration
// Look up the parameters configured for the PDF parser
TikaConfig config = TikaConfig.getDefaultConfig();
Map<String, Param> pdfConfig = config.getParserConfig(PDFParser.class);

// Check specific parameter; the map lookup yields null when it is not configured
Param extractImages = pdfConfig.get("extractInlineImages");
if (extractImages != null) {
    Object extractImagesValue = extractImages.getValue();
    System.out.println("Extract images: " + extractImagesValue);
}

// Create custom parameters, keyed by the same name each Param carries
Map<String, Param> customParams = new HashMap<>();
Param<Integer> maxStringLength = new Param<>("maxStringLength", 100000, Integer.class);
customParams.put("maxStringLength", maxStringLength);
customParams.put("enableOCR", new Param<>("enableOCR", true, Boolean.class));

/**
 * Example parser whose settings are injected through ConfigBase.
 * <p>
 * {@code maxDocuments} and {@code verbose} are read from the supplied
 * parameters when present; otherwise the field defaults are kept.
 * {@code outputFormat} is declared with a default but is not read from the
 * parameters in this example.
 */
public class CustomParser extends ConfigBase implements Parser {
    private int maxDocuments = 1000;
    private boolean verbose = false;
    private String outputFormat = "text";

    /**
     * Applies the supplied parameters and validates the result.
     *
     * @param params Map of parameter names to Param objects
     * @throws TikaConfigException if a parameter has the wrong type or the
     *         resulting configuration is invalid
     */
    @Override
    public void initialize(Map<String, Param> params) throws TikaConfigException {
        super.initialize(params);

        Integer maxDocs = typedParam("maxDocuments", Integer.class);
        if (maxDocs != null) {
            this.maxDocuments = maxDocs;
        }

        Boolean verboseParam = typedParam("verbose", Boolean.class);
        if (verboseParam != null) {
            this.verbose = verboseParam;
        }

        validateConfig();
    }

    /**
     * Looks up a parameter and checks its value type before handing it back.
     * A mistyped or null value is reported as a configuration error rather
     * than escaping as ClassCastException or NullPointerException.
     *
     * @param name Parameter name
     * @param expectedType Type the parameter value must have
     * @return the typed value, or null when the parameter is not configured
     * @throws TikaConfigException if the value is not an instance of expectedType
     */
    private <T> T typedParam(String name, Class<T> expectedType) throws TikaConfigException {
        Param param = getParam(name);
        if (param == null) {
            return null;
        }
        Object value = param.getValue();
        if (!expectedType.isInstance(value)) {
            throw new TikaConfigException(
                    "Parameter '" + name + "' must be of type " + expectedType.getSimpleName());
        }
        return expectedType.cast(value);
    }

    /**
     * Rejects a non-positive document limit.
     *
     * @throws TikaConfigException if maxDocuments is zero or negative
     */
    @Override
    protected void validateConfig() throws TikaConfigException {
        if (maxDocuments <= 0) {
            throw new TikaConfigException("maxDocuments must be positive");
        }
    }

    /**
     * Placeholder parse step; it only reports the configured limit when verbose.
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
                      Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Implementation using configured parameters
        if (verbose) {
            System.out.println("Parsing with maxDocuments=" + maxDocuments);
        }
    }

    /**
     * @return the single supported media type, plain text
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return Collections.singleton(MediaType.TEXT_PLAIN);
    }
}
// Handle initialization problems
ParsingProblemHandler problemHandler = new ParsingProblemHandler();
try {
    TikaConfig config = new TikaConfig("config-with-issues.xml");
    // Check for initialization problems
    Parser parser = config.getParser(); // This might trigger initialization
    // The handler only knows about problems that are reported to it, so pass
    // it to the component's initialization check before querying it.
    // Only ConfigBase declares checkInitialization, hence the type guard.
    if (parser instanceof ConfigBase) {
        ((ConfigBase) parser).checkInitialization(problemHandler);
    }
    if (problemHandler.hasProblems()) {
        for (String problem : problemHandler.getProblems()) {
            System.err.println("Configuration issue: " + problem);
        }
    }
} catch (TikaException e) {
    System.err.println("Fatal configuration error: " + e.getMessage());
}
Install with Tessl CLI
npx tessl i tessl/maven-org-apache-tika--tika-core