CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/maven-org-apache-tika--tika-core

Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.

Pending
Overview
Eval results
Files

docs/metadata.md

Metadata Management

Comprehensive metadata system for extracting, storing, and manipulating document properties with support for standard metadata schemas, custom properties, and metadata filtering operations.

Capabilities

Metadata Container

The central container class for document metadata, providing a flexible key-value store with support for multiple values per key and standard property interfaces.

/**
 * Container for document metadata properties
 */
public class Metadata implements Serializable {
    /**
     * Creates an empty Metadata container
     */
    public Metadata();
    
    /**
     * Gets the first value associated with the given property name
     * @param name Property name to retrieve
     * @return First value for the property, or null if not set
     */
    public String get(String name);
    
    /**
     * Gets all values associated with the given property name
     * @param name Property name to retrieve
     * @return Array of all values for the property, never null but may be empty
     */
    public String[] getValues(String name);
    
    /**
     * Sets a single value for the given property, replacing any existing values
     * @param name Property name to set
     * @param value Value to set for the property
     */
    public void set(String name, String value);
    
    /**
     * Adds a value to the given property, preserving existing values
     * @param name Property name to add to
     * @param value Value to add for the property
     */
    public void add(String name, String value);
    
    /**
     * Removes all values for the given property
     * @param name Property name to remove
     */
    public void remove(String name);
    
    /**
     * Gets all property names that have been set
     * @return Array of property names with values
     */
    public String[] names();
    
    /**
     * Gets the number of properties with values
     * @return Number of properties that have been set
     */
    public int size();
    
    /**
     * Checks if any properties have been set
     * @return true if no properties have values
     */
    public boolean isEmpty();
}

Usage Examples:

import java.util.Arrays;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.DublinCore;

// Basic metadata operations
Metadata metadata = new Metadata();

// Set standard properties (set() replaces any existing value)
metadata.set(TikaCoreProperties.TITLE, "Document Title");
metadata.set(DublinCore.CREATOR, "John Doe");
metadata.set(TikaCoreProperties.CREATED, "2023-01-15T10:30:00Z");

// Add multiple values for same property (add() keeps earlier values)
metadata.add(DublinCore.SUBJECT, "Technology");
metadata.add(DublinCore.SUBJECT, "Programming");

// Retrieve values: get() returns the first value, getValues() returns all of them
String title = metadata.get(TikaCoreProperties.TITLE);
String[] subjects = metadata.getValues(DublinCore.SUBJECT);

// Iterate through all properties
for (String name : metadata.names()) {
    String[] values = metadata.getValues(name);
    System.out.println(name + ": " + Arrays.toString(values));
}

Property Interfaces

Standard property definitions organized by metadata schemas and document types.

/**
 * Interface defining property constants
 */
public interface Property {
    /**
     * Gets the property name
     * @return String name of the property
     */
    String getName();
    
    /**
     * Checks if this property allows multiple values
     * @return true if multiple values are allowed
     */
    boolean isMultiValuePermitted();
}

/**
 * Core Tika metadata properties.
 *
 * <p>Each constant is created through a {@code Property} factory call; the string
 * argument is the key under which the value is stored.
 */
public interface TikaCoreProperties {
    /** Document title, stored under the key "title" */
    Property TITLE = Property.internalText("title");
    
    /**
     * Document creator/author.
     * NOTE(review): uses the same key "dc:creator" as DublinCore.CREATOR, which this
     * page declares with internalTextBag rather than internalText - confirm which
     * cardinality is intended.
     */
    Property CREATOR = Property.internalText("dc:creator");
    
    /** Document subject/description */
    Property SUBJECT = Property.internalText("subject");
    
    /** Document creation date */
    Property CREATED = Property.internalDate("dcterms:created");
    
    /** Document modification date */
    Property MODIFIED = Property.internalDate("dcterms:modified");
    
    /** Content type/MIME type */
    Property CONTENT_TYPE = Property.internalText("Content-Type");
    
    /** Character encoding */
    Property CONTENT_ENCODING = Property.internalText("Content-Encoding");
    
    /** Document language */
    Property LANGUAGE = Property.internalText("language");
    
    /** Resource name (filename) */
    Property RESOURCE_NAME_KEY = Property.internalText("resourceName");
    
    /** Number of pages (integer property) */
    Property PAGE_COUNT = Property.internalInteger("xmpTPg:NPages");
    
    /** Number of words (integer property) */
    Property WORD_COUNT = Property.internalInteger("meta:word-count");
    
    /** Number of characters (integer property) */
    Property CHARACTER_COUNT = Property.internalInteger("meta:character-count");
}

Dublin Core Properties

Standard Dublin Core metadata elements for bibliographic information.

/**
 * Dublin Core metadata properties.
 *
 * <p>Every key carries the {@code dc:} prefix. CONTRIBUTOR, CREATOR and SUBJECT are
 * declared with {@code internalTextBag}; DATE is declared with {@code internalDate};
 * the remaining constants are declared with {@code internalText}.
 */
public interface DublinCore {
    /** Document contributor (text bag) */
    Property CONTRIBUTOR = Property.internalTextBag("dc:contributor");
    
    /** Document coverage */
    Property COVERAGE = Property.internalText("dc:coverage");
    
    /** Document creator (text bag) */
    Property CREATOR = Property.internalTextBag("dc:creator");
    
    /** Document date */
    Property DATE = Property.internalDate("dc:date");
    
    /** Document description */
    Property DESCRIPTION = Property.internalText("dc:description");
    
    /** Document format */
    Property FORMAT = Property.internalText("dc:format");
    
    /** Document identifier */
    Property IDENTIFIER = Property.internalText("dc:identifier");
    
    /** Document language */
    Property LANGUAGE = Property.internalText("dc:language");
    
    /** Document publisher */
    Property PUBLISHER = Property.internalText("dc:publisher");
    
    /** Document relation */
    Property RELATION = Property.internalText("dc:relation");
    
    /** Document rights */
    Property RIGHTS = Property.internalText("dc:rights");
    
    /** Document source */
    Property SOURCE = Property.internalText("dc:source");
    
    /** Document subject (text bag) */
    Property SUBJECT = Property.internalTextBag("dc:subject");
    
    /** Document title */
    Property TITLE = Property.internalText("dc:title");
    
    /** Document type */
    Property TYPE = Property.internalText("dc:type");
}

Office Document Properties

Properties specific to office documents (Microsoft Office, LibreOffice, etc.).

/**
 * Generic office document properties.
 *
 * <p>KEYWORDS is declared with {@code internalTextBag}; TOTAL_TIME, SECURITY and the
 * {@code *_COUNT} constants are integer properties; everything else is text.
 */
public interface Office {
    /** Application name that created the document */
    Property APPLICATION = Property.internalText("Application-Name");
    
    /** Application version */
    Property APPLICATION_VERSION = Property.internalText("Application-Version");
    
    /** Document category */
    Property CATEGORY = Property.internalText("Category");
    
    /** Document company */
    Property COMPANY = Property.internalText("Company");
    
    /** Document keywords (text bag) */
    Property KEYWORDS = Property.internalTextBag("Keywords");
    
    /** Document manager */
    Property MANAGER = Property.internalText("Manager");
    
    /** Document comments */
    Property COMMENTS = Property.internalText("Comments");
    
    /** Document template */
    Property TEMPLATE = Property.internalText("Template");
    
    /** Total editing time (integer; unit not stated here - TODO confirm) */
    Property TOTAL_TIME = Property.internalInteger("Total-Time");
    
    /** Document revision number (declared as text, not integer) */
    Property REVISION_NUMBER = Property.internalText("Revision-Number");
    
    /** Document security level */
    Property SECURITY = Property.internalInteger("Security");
    
    /** Number of slides (presentations) */
    Property SLIDE_COUNT = Property.internalInteger("Slide-Count");
    
    /** Number of paragraphs */
    Property PARAGRAPH_COUNT = Property.internalInteger("Paragraph-Count");
    
    /** Number of lines */
    Property LINE_COUNT = Property.internalInteger("Line-Count");
}

PDF-Specific Properties

Properties specific to PDF documents.

/**
 * PDF document properties.
 *
 * <p>Most keys use a {@code pdf:}, {@code pdfa:} or {@code pdfua:} prefix; PRODUCER and
 * PERMISSIONS are the exceptions (see the notes on those constants).
 */
public interface PDF {
    /** PDF version */
    Property PDF_VERSION = Property.internalText("pdf:PDFVersion");
    
    /** PDF producer. NOTE(review): key "producer" has no pdf: prefix - confirm. */
    Property PRODUCER = Property.internalText("producer");
    
    /** PDF encryption status */
    Property ENCRYPTED = Property.internalBoolean("pdf:encrypted");
    
    /**
     * PDF permissions.
     * NOTE(review): the key names a single permission ("extract_content") yet the
     * property is declared as an integer - confirm key and value type.
     */
    Property PERMISSIONS = Property.internalInteger("access_permission:extract_content");
    
    /** PDF optimization */
    Property OPTIMIZED = Property.internalBoolean("pdf:optimized");
    
    /** PDF tagged */
    Property TAGGED = Property.internalBoolean("pdf:tagged");
    
    /** Number of characters with spaces */
    Property CHARACTERS_WITH_SPACES = Property.internalInteger("pdf:charsWithSpaces");
    
    /** PDF/A conformance */
    Property PDFA_VERSION = Property.internalText("pdfa:version");
    
    /** PDF/UA compliance */
    Property PDFUA_VERSION = Property.internalText("pdfua:version");
    
    /** Document ID */
    Property DOC_INFO_ID_1 = Property.internalText("pdf:docinfo:id1");
    
    /** Modification date from PDF info */
    Property DOC_INFO_MODIFICATION_DATE = Property.internalDate("pdf:docinfo:modified");
    
    /** Creation date from PDF info */
    Property DOC_INFO_CREATION_DATE = Property.internalDate("pdf:docinfo:created");
}

Image Properties

Properties for image documents and embedded images.

/**
 * TIFF image properties.
 *
 * <p>All keys use the {@code tiff:} prefix except COLOR_SPACE.
 */
public interface TIFF {
    /** Image width in pixels */
    Property IMAGE_WIDTH = Property.internalInteger("tiff:ImageWidth");
    
    /** Image height in pixels (named "length" in the key) */
    Property IMAGE_LENGTH = Property.internalInteger("tiff:ImageLength");
    
    /** Bits per sample (integer sequence) */
    Property BITS_PER_SAMPLE = Property.internalIntegerSequence("tiff:BitsPerSample");
    
    /** Compression type */
    Property COMPRESSION = Property.internalInteger("tiff:Compression");
    
    /** Color space. NOTE(review): key "ColorSpace" has no tiff: prefix - confirm. */
    Property COLOR_SPACE = Property.internalText("ColorSpace");
    
    /** Resolution unit */
    Property RESOLUTION_UNIT = Property.internalInteger("tiff:ResolutionUnit");
    
    /** X resolution (rational) */
    Property X_RESOLUTION = Property.internalRational("tiff:XResolution");
    
    /** Y resolution (rational) */
    Property Y_RESOLUTION = Property.internalRational("tiff:YResolution");
    
    /** Orientation */
    Property ORIENTATION = Property.internalInteger("tiff:Orientation");
}

/**
 * JPEG image properties.
 *
 * <p>Unlike the other schemas on this page, the keys here are plain labels that
 * contain spaces and carry no namespace prefix.
 */
public interface JPEG {
    /** JPEG compression quality (real number) */
    Property COMPRESSION_QUALITY = Property.internalReal("JPEG Compression Quality");
    
    /** Number of color components */
    Property COLOR_COMPONENTS = Property.internalInteger("Number of Components");
    
    /** Image width */
    Property IMAGE_WIDTH = Property.internalInteger("Image Width");
    
    /** Image height */
    Property IMAGE_HEIGHT = Property.internalInteger("Image Height");
}

Metadata Filtering

System for filtering and transforming metadata during extraction and processing.

/**
 * Interface for filtering metadata
 */
public interface MetadataFilter {
    /**
     * Filters the given metadata.
     * The method returns nothing, so the result is presumably applied to the
     * passed-in object - TODO confirm.
     * @param metadata Metadata to filter
     * @param context Parse context for configuration
     * @throws TikaException declared by the signature; conditions are not stated here
     */
    void filter(Metadata metadata, ParseContext context) throws TikaException;
}

/**
 * Composite metadata filter combining multiple filters.
 *
 * <p>This listing shows only the constructor and {@code getFilters()}; the inherited
 * {@code filter} method is not repeated here.
 */
public class CompositeMetadataFilter implements MetadataFilter {
    /**
     * Creates a CompositeMetadataFilter with the specified filters
     * @param filters MetadataFilter instances to combine (varargs); presumably
     *                applied in the order given - TODO confirm
     */
    public CompositeMetadataFilter(MetadataFilter... filters);
    
    /**
     * Gets the list of filters
     * @return List of MetadataFilter instances
     */
    public List<MetadataFilter> getFilters();
}

/**
 * Filter that normalizes date formats
 */
public class DateNormalizingMetadataFilter implements MetadataFilter {
    /**
     * Creates a DateNormalizingMetadataFilter with default configuration
     */
    public DateNormalizingMetadataFilter();
    
    /**
     * Filters metadata by normalizing date formats
     * @param metadata Metadata to process
     * @param context Parse context (unused)
     * @throws TikaException declared by the signature; conditions are not stated here
     */
    public void filter(Metadata metadata, ParseContext context) throws TikaException;
}

/**
 * Filter that clears metadata based on MIME type
 */
public class ClearByMimeMetadataFilter implements MetadataFilter {
    /**
     * Creates a filter that clears metadata for specified MIME types
     * @param mimeTypes Set of MediaType objects to clear metadata for
     */
    public ClearByMimeMetadataFilter(Set<MediaType> mimeTypes);
    
    /**
     * Filters metadata by clearing it for matching MIME types
     * @param metadata Metadata to process
     * @param context Parse context containing MIME type information
     * @throws TikaException declared by the signature; conditions are not stated here
     */
    public void filter(Metadata metadata, ParseContext context) throws TikaException;
}

Write Filtering

System for filtering metadata during write operations to prevent sensitive information leakage.

/**
 * Interface for filtering metadata during write operations.
 *
 * <p>Unlike {@code MetadataFilter.filter}, which returns nothing, this method returns
 * a Metadata object.
 */
public interface MetadataWriteFilter {
    /**
     * Filters metadata before writing
     * @param metadata Metadata to filter
     * @param context Write context (the WriteContext type is not described on this page)
     * @return Filtered metadata safe for writing
     */
    Metadata filterMetadata(Metadata metadata, WriteContext context);
}

/**
 * Standard write filter with common filtering rules.
 *
 * <p>Exclusions can be registered per property ({@code excludeProperty}) or per
 * name pattern ({@code excludePattern}).
 */
public class StandardWriteFilter implements MetadataWriteFilter {
    /**
     * Creates a StandardWriteFilter with default rules
     */
    public StandardWriteFilter();
    
    /**
     * Filters sensitive metadata before writing
     * @param metadata Original metadata
     * @param context Write context
     * @return Filtered metadata
     */
    public Metadata filterMetadata(Metadata metadata, WriteContext context);
    
    /**
     * Adds a property to the exclusion list
     * @param property Property to exclude from output
     */
    public void excludeProperty(Property property);
    
    /**
     * Adds a property pattern to the exclusion list
     * @param pattern Regular expression pattern for property names to exclude
     */
    public void excludePattern(String pattern);
}

List Filtering

Specialized filtering that operates on a whole list of Metadata objects rather than on a single Metadata instance.

/**
 * Interface for filtering metadata lists.
 *
 * <p>Operates on a list of Metadata objects and returns a list.
 */
public interface MetadataListFilter {
    /**
     * Filters a list of metadata objects
     * @param metadataList List of Metadata objects to filter
     * @param context Processing context
     * @return Filtered list of Metadata objects
     * @throws TikaException declared by the signature; conditions are not stated here
     */
    List<Metadata> filter(List<Metadata> metadataList, ParseContext context) throws TikaException;
}

Metadata Schemas and Standards

Standard Property Mappings

Common metadata property mappings across different standards:

// NOTE(review): Office.TITLE, PDF.TITLE, Office.AUTHOR, PDF.AUTHOR and
// Office.CREATION_DATE are not declared in the Office / PDF listings on this
// page - verify these constants against the Tika Javadoc before relying on them.

// Document title mappings
TikaCoreProperties.TITLE        // Generic title
DublinCore.TITLE               // Dublin Core title
Office.TITLE                   // Office document title
PDF.TITLE                      // PDF document title

// Author/Creator mappings
TikaCoreProperties.CREATOR     // Generic creator
DublinCore.CREATOR            // Dublin Core creator
Office.AUTHOR                 // Office document author
PDF.AUTHOR                    // PDF document author

// Date mappings
TikaCoreProperties.CREATED    // Generic creation date
TikaCoreProperties.MODIFIED   // Generic modification date
DublinCore.DATE              // Dublin Core date
Office.CREATION_DATE         // Office creation date
PDF.DOC_INFO_CREATION_DATE   // PDF creation date

Custom Properties

// Working with custom properties
Metadata metadata = new Metadata();

// Set custom properties by raw string name; set() replaces, add() appends,
// so "custom:tags" ends up holding both values
metadata.set("custom:department", "Engineering");
metadata.set("custom:project", "Atlas");
metadata.add("custom:tags", "important");
metadata.add("custom:tags", "review-needed");

// Define custom property interfaces: the same keys as above, exposed as
// Property constants (TAGS is a text bag because it carries several values)
public interface CustomProperties {
    Property DEPARTMENT = Property.internalText("custom:department");
    Property PROJECT = Property.internalText("custom:project");
    Property TAGS = Property.internalTextBag("custom:tags");
}

Advanced Metadata Operations

Metadata Merging

// Merge metadata from multiple sources
Metadata combined = new Metadata();

// Replay every value of every source property through add(), so values that
// "combined" already holds under the same name are kept rather than replaced
for (String propertyName : sourceMetadata.names()) {
    for (String propertyValue : sourceMetadata.getValues(propertyName)) {
        combined.add(propertyName, propertyValue);
    }
}

Type-Safe Property Access

// Type-safe property operations using Property constants
Metadata metadata = new Metadata();

// Set using Property constants; the int overload is used for PAGE_COUNT so the
// value type matches the property's declared integer type
metadata.set(TikaCoreProperties.TITLE, "Document Title");
metadata.set(TikaCoreProperties.PAGE_COUNT, 150);

// Get with type conversion; getInt/getDate return null when the value is
// absent or cannot be parsed (CREATED was never set here, so "created" is null)
String title = metadata.get(TikaCoreProperties.TITLE);
Integer pageCount = metadata.getInt(TikaCoreProperties.PAGE_COUNT);
Date created = metadata.getDate(TikaCoreProperties.CREATED);

Performance and Memory Considerations

  • Property Interning: Property names are interned to reduce memory usage
  • Value Storage: Multiple values per property are stored efficiently
  • Filtering Performance: Metadata filters should be lightweight operations
  • Memory Footprint: Large metadata sets may require streaming processing

Install with Tessl CLI

npx tessl i tessl/maven-org-apache-tika--tika-core

docs

configuration.md

content-processing.md

detection.md

embedded-extraction.md

embedding.md

exceptions.md

index.md

io-utilities.md

language.md

metadata.md

mime-types.md

parsing.md

pipes.md

process-forking.md

rendering.md

tile.json