CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/maven-org-apache-tika--tika-core

Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.

Pending
Overview
Eval results
Files

docs/configuration.md

Configuration

Configuration system for managing Tika parsers, detectors, and service loading with XML-based configuration files, parameter management, and service discovery mechanisms.

Capabilities

TikaConfig Class

Central configuration class that manages parser, detector, and translator configurations with support for custom configurations and service loading.

/**
 * Main configuration class for Tika components and services.
 * <p>
 * Instances are obtained either from {@link #getDefaultConfig()} or from one
 * of the constructors below, and expose the configured parser, detector,
 * translator, MIME type repository and service loader through accessors.
 */
public class TikaConfig {
    /**
     * Gets the default Tika configuration with standard parsers and detectors.
     * @return TikaConfig instance with default settings
     */
    public static TikaConfig getDefaultConfig();
    
    /**
     * Creates TikaConfig from an XML configuration file.
     * @param file XML configuration file
     * @throws TikaException if configuration is invalid
     * @throws IOException if file cannot be read
     */
    public TikaConfig(File file) throws TikaException, IOException;
    
    /**
     * Creates TikaConfig from an XML configuration stream.
     * @param stream InputStream containing XML configuration
     * @throws TikaException if configuration is invalid
     * @throws IOException if stream cannot be read
     */
    public TikaConfig(InputStream stream) throws TikaException, IOException;
    
    /**
     * Creates TikaConfig from XML configuration at a URL.
     * @param url URL pointing to XML configuration
     * @throws TikaException if configuration is invalid
     * @throws IOException if URL cannot be accessed
     */
    public TikaConfig(URL url) throws TikaException, IOException;
    
    /**
     * Creates TikaConfig from a classpath resource.
     * <p>
     * NOTE(review): the usage examples on this page pass both a plain file
     * name and a classpath-style path to this constructor; confirm against
     * the upstream TikaConfig Javadoc whether the string is resolved as a
     * file path or as a classpath resource.
     * @param resource Resource path in classpath
     * @throws TikaException if configuration is invalid
     */
    public TikaConfig(String resource) throws TikaException;
    
    /**
     * Creates TikaConfig with a custom class loader.
     * @param loader ClassLoader for service discovery
     */
    public TikaConfig(ClassLoader loader);
    
    /**
     * Gets the configured composite parser.
     * @return Parser instance configured with all registered parsers
     */
    public Parser getParser();
    
    /**
     * Gets the parser for a specific media type.
     * @param mimeType MediaType to get parser for
     * @return Parser that handles the specified media type
     */
    public Parser getParser(MediaType mimeType);
    
    /**
     * Gets all configured parsers mapped by media type.
     * @return Map of MediaType to Parser instances
     */
    public Map<MediaType, Parser> getParsers();
    
    /**
     * Gets the configured composite detector.
     * @return Detector instance configured with all registered detectors
     */
    public Detector getDetector();
    
    /**
     * Gets the configured translator.
     * @return Translator instance for text translation
     */
    public Translator getTranslator();
    
    /**
     * Gets the MIME types registry.
     * @return MimeTypes instance with registered type definitions
     */
    public MimeTypes getMimeRepository();
    
    /**
     * Gets the media type registry for type relationships.
     * @return MediaTypeRegistry for managing type hierarchies
     */
    public MediaTypeRegistry getMediaTypeRegistry();
    
    /**
     * Gets the configuration for a specific parser class.
     * @param parserClass Class of parser to get configuration for
     * @return Map of configuration parameters for the parser, keyed by parameter name
     */
    public Map<String, Param> getParserConfig(Class<? extends Parser> parserClass);
    
    /**
     * Gets the configuration for a specific detector class.
     * @param detectorClass Class of detector to get configuration for
     * @return Map of configuration parameters for the detector, keyed by parameter name
     */
    public Map<String, Param> getDetectorConfig(Class<? extends Detector> detectorClass);
    
    /**
     * Gets the service loader configuration.
     * @return ServiceLoader instance used for dynamic service discovery
     */
    public ServiceLoader getServiceLoader();
}

ServiceLoader Class

Service loading utility for dynamic discovery and instantiation of Tika components.

/**
 * Service loader for dynamic discovery of Tika components.
 * <p>
 * Providers can be loaded all at once, or separately as static providers
 * (declared in META-INF/services files) and dynamic providers.
 */
public class ServiceLoader {
    /**
     * Creates ServiceLoader with the default class loader.
     */
    public ServiceLoader();
    
    /**
     * Creates ServiceLoader with a custom class loader.
     * @param loader ClassLoader to use for service discovery
     */
    public ServiceLoader(ClassLoader loader);
    
    /**
     * Creates ServiceLoader with a class loader and a dynamic loading flag.
     * @param loader ClassLoader for service discovery
     * @param dynamic Whether to enable dynamic loading
     */
    public ServiceLoader(ClassLoader loader, boolean dynamic);
    
    /**
     * Loads all available services of the specified type.
     * @param <T> service type
     * @param iface Interface or class type to load
     * @return List of service instances implementing the interface
     */
    public <T> List<T> loadServiceProviders(Class<T> iface);
    
    /**
     * Loads static services from META-INF/services files.
     * @param <T> service type
     * @param iface Interface or class type to load
     * @return List of statically declared service instances
     */
    public <T> List<T> loadStaticServiceProviders(Class<T> iface);
    
    /**
     * Loads dynamic services from configuration.
     * @param <T> service type
     * @param iface Interface or class type to load
     * @return List of dynamically configured service instances
     */
    public <T> List<T> loadDynamicServiceProviders(Class<T> iface);
    
    /**
     * Gets the class loader used by this service loader.
     * @return ClassLoader instance used for loading services
     */
    public ClassLoader getLoader();
    
    /**
     * Checks if dynamic loading is enabled.
     * @return {@code true} if dynamic loading is enabled
     */
    public boolean isDynamic();
}

Configuration Parameters

Param Class

Represents a configuration parameter with name, value, and type information.

/**
 * Configuration parameter with name, value, and type information.
 *
 * @param <T> type of the parameter value
 */
public class Param<T> {
    /**
     * Creates Param with name and value.
     * @param name Parameter name
     * @param value Parameter value
     */
    public Param(String name, T value);
    
    /**
     * Creates Param with name, value, and type.
     * @param name Parameter name
     * @param value Parameter value
     * @param type Parameter type class
     */
    public Param(String name, T value, Class<T> type);
    
    /**
     * Gets the parameter name.
     * @return String containing parameter name
     */
    public String getName();
    
    /**
     * Gets the parameter value.
     * @return Parameter value of type T
     */
    public T getValue();
    
    /**
     * Gets the parameter type.
     * @return Class representing parameter type
     */
    public Class<T> getType();
    
    /**
     * Sets the parameter value.
     * @param value New parameter value
     */
    public void setValue(T value);
    
    /**
     * Gets the string representation of the value.
     * @return String representation of parameter value
     */
    @Override
    public String toString();
}

ParamField Class

Descriptor for parameter fields with metadata about configuration parameters.

/**
 * Field descriptor for configuration parameters with metadata.
 * <p>
 * Wraps a {@code Field} and exposes its name, type, required flag, default
 * value and description, plus accessors to read or write the field on a
 * target object.
 */
public class ParamField {
    /**
     * Creates ParamField for the specified field.
     * @param field Field to create descriptor for
     */
    public ParamField(Field field);
    
    /**
     * Gets the field name.
     * @return String containing field name
     */
    public String getName();
    
    /**
     * Gets the field type.
     * @return Class representing field type
     */
    public Class<?> getType();
    
    /**
     * Checks if the field is required.
     * @return {@code true} if field is required for configuration
     */
    public boolean isRequired();
    
    /**
     * Gets the default value for the field.
     * @return Default value or {@code null} if no default
     */
    public Object getDefaultValue();
    
    /**
     * Gets the field description from annotations.
     * @return String describing field purpose
     */
    public String getDescription();
    
    /**
     * Sets the field value on a target object.
     * @param target Object to set field value on
     * @param value Value to set
     * @throws IllegalAccessException if field is not accessible
     */
    public void setValue(Object target, Object value) throws IllegalAccessException;
    
    /**
     * Gets the field value from a target object.
     * @param target Object to get field value from
     * @return Field value
     * @throws IllegalAccessException if field is not accessible
     */
    public Object getValue(Object target) throws IllegalAccessException;
}

Configuration Base Classes

ConfigBase Class

Base class for configurable Tika components with parameter injection support.

/**
 * Base class for configurable components with parameter injection.
 * <p>
 * Subclasses receive their parameters through {@link #initialize(Map)} and
 * can look them up, set them, and validate them with the protected helpers.
 */
public abstract class ConfigBase {
    /**
     * Initializes the component with configuration parameters.
     * @param params Map of parameter names to Param objects
     * @throws TikaConfigException if initialization fails
     */
    public void initialize(Map<String, Param> params) throws TikaConfigException;
    
    /**
     * Checks the current configuration state, reporting issues to the handler.
     * @param handler Problem handler for reporting issues
     */
    public void checkInitialization(InitializableProblemHandler handler);
    
    /**
     * Gets all configurable fields for this component.
     * @return List of ParamField descriptors for configurable fields
     */
    public List<ParamField> getConfigurableFields();
    
    /**
     * Gets a configuration parameter by name.
     * @param name Parameter name
     * @return Param object or {@code null} if not found
     */
    protected Param getParam(String name);
    
    /**
     * Sets a configuration parameter.
     * @param name Parameter name
     * @param value Parameter value
     */
    protected void setParam(String name, Object value);
    
    /**
     * Validates the configuration parameters.
     * @throws TikaConfigException if validation fails
     */
    protected void validateConfig() throws TikaConfigException;
}

Problem Handling

InitializableProblemHandler Interface

Interface for handling problems that occur during component initialization.

/**
 * Handler for problems encountered during component initialization.
 * Implementations decide what to do with each reported problem.
 */
public interface InitializableProblemHandler {
    /**
     * Handles a problem encountered during initialization.
     * @param clazz Class where problem occurred
     * @param problem Description of the problem
     */
    void handleInitializableProblem(Class<?> clazz, String problem);
}

ParsingProblemHandler Implementation

Default implementation that collects initialization problems for later analysis.

/**
 * Default problem handler that collects initialization issues.
 * Recorded problems can be retrieved as a whole or per class.
 */
public class ParsingProblemHandler implements InitializableProblemHandler {
    /**
     * Creates a problem handler for collecting issues.
     */
    public ParsingProblemHandler();
    
    /**
     * Handles an initialization problem by recording it.
     * @param clazz Class where problem occurred
     * @param problem Description of the problem
     */
    @Override
    public void handleInitializableProblem(Class<?> clazz, String problem);
    
    /**
     * Gets all recorded problems.
     * @return List of problems encountered during initialization
     */
    public List<String> getProblems();
    
    /**
     * Checks if any problems were recorded.
     * @return {@code true} if problems were encountered
     */
    public boolean hasProblems();
    
    /**
     * Gets the problems recorded for a specific class.
     * @param clazz Class to get problems for
     * @return List of problems for the specified class
     */
    public List<String> getProblems(Class<?> clazz);
}

Configuration File Format

XML Configuration Structure

<?xml version="1.0" encoding="UTF-8"?>
<properties>
  <!-- MIME Types Configuration -->
  <mimeTypeRepository resource="custom-mimetypes.xml"/>
  
  <!-- Detectors Configuration -->
  <detectors>
    <detector class="org.apache.tika.detect.DefaultDetector"/>
    <detector class="org.example.CustomDetector">
      <params>
        <param name="threshold" type="int">90</param>
        <param name="enabled" type="bool">true</param>
      </params>
    </detector>
  </detectors>
  
  <!-- Parsers Configuration -->
  <parsers>
    <!-- DefaultParser pulls in every discovered parser. AutoDetectParser
         cannot be declared in a parser element, and PDFParser is excluded
         here because it is configured explicitly below. -->
    <parser class="org.apache.tika.parser.DefaultParser">
      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
    </parser>
    <!-- Boolean params use the type name "bool" -->
    <parser class="org.apache.tika.parser.pdf.PDFParser">
      <params>
        <param name="extractInlineImages" type="bool">false</param>
        <param name="sortByPosition" type="bool">true</param>
      </params>
    </parser>
  </parsers>
  
  <!-- Translator Configuration -->
  <translator class="org.apache.tika.language.translate.DefaultTranslator">
    <params>
      <param name="maxStringLength" type="int">10000</param>
    </params>
  </translator>
  
  <!-- Service Loader Configuration -->
  <service-loader dynamic="true" loadErrorHandler="IGNORE"/>
</properties>

Usage Examples

Basic Configuration Usage

// Start from the stock configuration and take its composite components
TikaConfig defaults = TikaConfig.getDefaultConfig();
Parser compositeParser = defaults.getParser();
Detector compositeDetector = defaults.getDetector();

// Run the configured parser over a document, collecting its metadata
Metadata docMetadata = new Metadata();
try (InputStream pdfStream = new FileInputStream("document.pdf")) {
    BodyContentHandler textHandler = new BodyContentHandler();
    ParseContext parseContext = new ParseContext();
    compositeParser.parse(pdfStream, textHandler, docMetadata, parseContext);
}

Custom Configuration Loading

// Load configuration from an XML file on disk.
// The File constructor is the one documented to throw IOException, so it
// matches both the "from file" intent and the catch clause below.
try {
    TikaConfig config = new TikaConfig(new File("tika-config.xml"));
    
    // Get configured components
    Parser parser = config.getParser();
    Detector detector = config.getDetector();
    Translator translator = config.getTranslator();
    
} catch (TikaException | IOException e) {
    System.err.println("Configuration error: " + e.getMessage());
}

// Load from classpath resource (the String constructor takes a resource path)
TikaConfig config = new TikaConfig("/org/example/custom-tika.xml");

Working with Service Loader

// Create service loader with custom class loader
ClassLoader customLoader = Thread.currentThread().getContextClassLoader();
ServiceLoader serviceLoader = new ServiceLoader(customLoader, true);

// Load parser services
List<Parser> parsers = serviceLoader.loadServiceProviders(Parser.class);
System.out.println("Found " + parsers.size() + " parser services");

// Load detector services  
List<Detector> detectors = serviceLoader.loadServiceProviders(Detector.class);
for (Detector detector : detectors) {
    System.out.println("Detector: " + detector.getClass().getName());
}

Parameter Configuration

// Look up the parameters configured for the PDF parser
Map<String, Param> pdfParams =
        TikaConfig.getDefaultConfig().getParserConfig(PDFParser.class);

// Print one parameter if it has been configured
Param inlineImages = pdfParams.get("extractInlineImages");
if (inlineImages != null) {
    Object inlineImagesValue = inlineImages.getValue();
    System.out.println("Extract images: " + inlineImagesValue);
}

// Build a parameter map by hand, one typed Param per entry
Map<String, Param> customParams = new HashMap<>();
Param<Integer> maxLength = new Param<>("maxStringLength", 100000, Integer.class);
customParams.put("maxStringLength", maxLength);
Param<Boolean> ocrEnabled = new Param<>("enableOCR", true, Boolean.class);
customParams.put("enableOCR", ocrEnabled);

Configurable Component Implementation

/**
 * Example parser showing how a configurable component reads typed settings
 * from its configuration parameters and validates them before use.
 */
public class CustomParser extends ConfigBase implements Parser {
    private int maxDocuments = 1000;
    private boolean verbose = false;
    // Declared for illustration only; nothing in this example reads it.
    private String outputFormat = "text";
    
    /**
     * Applies the "maxDocuments" and "verbose" parameters when they are
     * present, keeps the field defaults otherwise, then validates the result.
     *
     * @param params Map of parameter names to Param objects
     * @throws TikaConfigException if a parameter has an unexpected type or
     *         value, or if validation fails
     */
    @Override
    public void initialize(Map<String, Param> params) throws TikaConfigException {
        super.initialize(params);
        
        this.maxDocuments = typedParam("maxDocuments", Integer.class, this.maxDocuments);
        this.verbose = typedParam("verbose", Boolean.class, this.verbose);
        
        validateConfig();
    }
    
    /**
     * Reads a parameter and checks its runtime type before returning it, so
     * a wrong-typed or null value is reported as a configuration error
     * rather than as a ClassCastException or NullPointerException.
     *
     * @param name parameter name to look up
     * @param type expected class of the parameter value
     * @param fallback value to return when the parameter is not configured
     * @return the configured value, or {@code fallback} if the parameter is absent
     * @throws TikaConfigException if the value is null or not an instance of {@code type}
     */
    private <V> V typedParam(String name, Class<V> type, V fallback) throws TikaConfigException {
        Param param = getParam(name);
        if (param == null) {
            return fallback;
        }
        Object value = param.getValue();
        if (!type.isInstance(value)) {
            String actual = (value == null) ? "null" : value.getClass().getName();
            throw new TikaConfigException(
                    name + " must be of type " + type.getName() + " but was " + actual);
        }
        return type.cast(value);
    }
    
    /**
     * Rejects a non-positive document limit.
     *
     * @throws TikaConfigException if maxDocuments is zero or negative
     */
    @Override
    protected void validateConfig() throws TikaConfigException {
        if (maxDocuments <= 0) {
            throw new TikaConfigException("maxDocuments must be positive");
        }
    }
    
    /**
     * Placeholder parse implementation; it only logs the configured limit
     * when verbose output is enabled.
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, 
                     Metadata metadata, ParseContext context) 
            throws IOException, SAXException, TikaException {
        // Implementation using configured parameters
        if (verbose) {
            System.out.println("Parsing with maxDocuments=" + maxDocuments);
        }
    }
    
    /**
     * Declares plain text as the only supported media type.
     *
     * @param context parse context (not consulted here)
     * @return singleton set containing {@code MediaType.TEXT_PLAIN}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return Collections.singleton(MediaType.TEXT_PLAIN);
    }
}

Problem Handling

// Collect initialization problems so they can be reported together
ParsingProblemHandler problemHandler = new ParsingProblemHandler();

try {
    TikaConfig config = new TikaConfig("config-with-issues.xml");
    
    Parser parser = config.getParser(); // This might trigger initialization
    
    // The handler only knows about problems that are reported to it, so it
    // must be passed to the component's initialization check; without this
    // step hasProblems() can never return true.
    if (parser instanceof ConfigBase) {
        ((ConfigBase) parser).checkInitialization(problemHandler);
    }
    
    if (problemHandler.hasProblems()) {
        for (String problem : problemHandler.getProblems()) {
            System.err.println("Configuration issue: " + problem);
        }
    }
    
} catch (TikaException e) {
    System.err.println("Fatal configuration error: " + e.getMessage());
}

Install with Tessl CLI

npx tessl i tessl/maven-org-apache-tika--tika-core

docs

configuration.md

content-processing.md

detection.md

embedded-extraction.md

embedding.md

exceptions.md

index.md

io-utilities.md

language.md

metadata.md

mime-types.md

parsing.md

pipes.md

process-forking.md

rendering.md

tile.json