Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.
—
Comprehensive metadata system for extracting, storing, and manipulating document properties with support for standard metadata schemas, custom properties, and metadata filtering operations.
The central container class for document metadata, providing a flexible key-value store with support for multiple values per key and standard property interfaces.
/**
 * Container for document metadata properties.
 *
 * <p>Values are addressed either by a plain {@code String} name or by a
 * {@link Property} constant. The {@code Property}-keyed overloads and the typed
 * getters listed here are the ones the usage examples in this document call,
 * e.g. {@code metadata.set(TikaCoreProperties.TITLE, "...")} and
 * {@code metadata.getInt(TikaCoreProperties.PAGE_COUNT)}.
 */
public class Metadata implements Serializable {
    /**
     * Creates an empty Metadata container
     */
    public Metadata();

    /**
     * Gets the first value associated with the given property name
     * @param name Property name to retrieve
     * @return First value for the property, or null if not set
     */
    public String get(String name);

    /**
     * Gets the first value associated with the given property definition
     * @param property Property definition to retrieve
     * @return First value for the property, or null if not set
     */
    public String get(Property property);

    /**
     * Gets the value of the given property converted to an integer
     * @param property Property definition to retrieve
     * @return The value as an Integer, or null if it is not set or cannot be read as an integer
     */
    public Integer getInt(Property property);

    /**
     * Gets the value of the given property converted to a date
     * @param property Property definition to retrieve
     * @return The value as a Date, or null if it is not set or cannot be read as a date
     */
    public Date getDate(Property property);

    /**
     * Gets all values associated with the given property name
     * @param name Property name to retrieve
     * @return Array of all values for the property, never null but may be empty
     */
    public String[] getValues(String name);

    /**
     * Gets all values associated with the given property definition
     * @param property Property definition to retrieve
     * @return Array of all values for the property
     */
    public String[] getValues(Property property);

    /**
     * Sets a single value for the given property, replacing any existing values
     * @param name Property name to set
     * @param value Value to set for the property
     */
    public void set(String name, String value);

    /**
     * Sets a single value for the given property definition, replacing any existing values
     * @param property Property definition to set
     * @param value Value to set for the property
     */
    public void set(Property property, String value);

    /**
     * Adds a value to the given property, preserving existing values
     * @param name Property name to add to
     * @param value Value to add for the property
     */
    public void add(String name, String value);

    /**
     * Adds a value to the given property definition, preserving existing values
     * @param property Property definition to add to; intended for properties that permit multiple values
     * @param value Value to add for the property
     */
    public void add(Property property, String value);

    /**
     * Removes all values for the given property
     * @param name Property name to remove
     */
    public void remove(String name);

    /**
     * Gets all property names that have been set
     * @return Array of property names with values
     */
    public String[] names();

    /**
     * Gets the number of properties with values
     * @return Number of properties that have been set
     */
    public int size();

    /**
     * Checks if any properties have been set
     * @return true if no properties have values
     */
    public boolean isEmpty();
}

Usage Examples:
import java.util.Arrays;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.DublinCore;

// Basic metadata operations
Metadata metadata = new Metadata();

// Set standard properties (set replaces any existing values)
metadata.set(TikaCoreProperties.TITLE, "Document Title");
metadata.set(DublinCore.CREATOR, "John Doe");
metadata.set(TikaCoreProperties.CREATED, "2023-01-15T10:30:00Z");

// Add multiple values for same property (add preserves existing values)
metadata.add(DublinCore.SUBJECT, "Technology");
metadata.add(DublinCore.SUBJECT, "Programming");

// Retrieve values: get returns the first value, getValues returns all of them
String title = metadata.get(TikaCoreProperties.TITLE);
String[] subjects = metadata.getValues(DublinCore.SUBJECT);

// Iterate through all properties; Arrays.toString needs the java.util.Arrays import above
for (String name : metadata.names()) {
    String[] values = metadata.getValues(name);
    System.out.println(name + ": " + Arrays.toString(values));
}

Standard property definitions organized by metadata schemas and document types.
/**
* Interface defining property constants.
*
* Only the two accessors below are listed in this section.
* NOTE(review): the constant definitions elsewhere in this document call static
* factories on this type (Property.internalText, internalTextBag, internalDate,
* internalInteger, internalBoolean, internalIntegerSequence, internalRational,
* internalReal) that are not declared here — confirm the full declaration
* against the Tika Javadoc.
*/
public interface Property {
/**
* Gets the property name
* @return String name of the property
*/
String getName();
/**
* Checks if this property allows multiple values
* @return true if multiple values are allowed
*/
boolean isMultiValuePermitted();
}
/**
* Core Tika metadata properties.
*
* Each constant is built through a Property factory call: internalText for the
* text properties, internalDate for CREATED and MODIFIED, and internalInteger
* for the three count properties. The string argument is presumably the name
* returned by Property.getName() — confirm.
* NOTE(review): CREATOR is declared here with internalText("dc:creator"), while
* DublinCore.CREATOR in this document uses internalTextBag("dc:creator") for the
* same key — confirm which declaration is authoritative.
*/
public interface TikaCoreProperties {
/** Document title */
Property TITLE = Property.internalText("title");
/** Document creator/author */
Property CREATOR = Property.internalText("dc:creator");
/** Document subject/description */
Property SUBJECT = Property.internalText("subject");
/** Document creation date */
Property CREATED = Property.internalDate("dcterms:created");
/** Document modification date */
Property MODIFIED = Property.internalDate("dcterms:modified");
/** Content type/MIME type */
Property CONTENT_TYPE = Property.internalText("Content-Type");
/** Character encoding */
Property CONTENT_ENCODING = Property.internalText("Content-Encoding");
/** Document language */
Property LANGUAGE = Property.internalText("language");
/** Resource name (filename) */
Property RESOURCE_NAME_KEY = Property.internalText("resourceName");
/** Number of pages */
Property PAGE_COUNT = Property.internalInteger("xmpTPg:NPages");
/** Number of words */
Property WORD_COUNT = Property.internalInteger("meta:word-count");
/** Number of characters */
Property CHARACTER_COUNT = Property.internalInteger("meta:character-count");
}

Standard Dublin Core metadata elements for bibliographic information.
/**
* Dublin Core metadata properties.
*
* All keys carry the "dc:" prefix. CONTRIBUTOR, CREATOR and SUBJECT are declared
* with internalTextBag, DATE with internalDate, and every other constant with
* internalText.
*/
public interface DublinCore {
/** Document contributor (text bag) */
Property CONTRIBUTOR = Property.internalTextBag("dc:contributor");
/** Document coverage */
Property COVERAGE = Property.internalText("dc:coverage");
/** Document creator (text bag) */
Property CREATOR = Property.internalTextBag("dc:creator");
/** Document date */
Property DATE = Property.internalDate("dc:date");
/** Document description */
Property DESCRIPTION = Property.internalText("dc:description");
/** Document format */
Property FORMAT = Property.internalText("dc:format");
/** Document identifier */
Property IDENTIFIER = Property.internalText("dc:identifier");
/** Document language */
Property LANGUAGE = Property.internalText("dc:language");
/** Document publisher */
Property PUBLISHER = Property.internalText("dc:publisher");
/** Document relation */
Property RELATION = Property.internalText("dc:relation");
/** Document rights */
Property RIGHTS = Property.internalText("dc:rights");
/** Document source */
Property SOURCE = Property.internalText("dc:source");
/** Document subject (text bag) */
Property SUBJECT = Property.internalTextBag("dc:subject");
/** Document title */
Property TITLE = Property.internalText("dc:title");
/** Document type */
Property TYPE = Property.internalText("dc:type");
}

Properties specific to office documents (Microsoft Office, LibreOffice, etc.).
/**
* Generic office document properties.
*
* Keys are unprefixed, hyphenated names. KEYWORDS is the only text bag; the
* counts, TOTAL_TIME and SECURITY are declared with internalInteger; the rest
* (including REVISION_NUMBER) are plain text.
* NOTE(review): the mapping list later in this document refers to Office.TITLE,
* Office.AUTHOR and Office.CREATION_DATE, none of which is declared in this
* listing — confirm against the Tika Javadoc.
*/
public interface Office {
/** Application name that created the document */
Property APPLICATION = Property.internalText("Application-Name");
/** Application version */
Property APPLICATION_VERSION = Property.internalText("Application-Version");
/** Document category */
Property CATEGORY = Property.internalText("Category");
/** Document company */
Property COMPANY = Property.internalText("Company");
/** Document keywords (text bag) */
Property KEYWORDS = Property.internalTextBag("Keywords");
/** Document manager */
Property MANAGER = Property.internalText("Manager");
/** Document comments */
Property COMMENTS = Property.internalText("Comments");
/** Document template */
Property TEMPLATE = Property.internalText("Template");
/** Total editing time (integer; unit is not stated in this document — TODO confirm) */
Property TOTAL_TIME = Property.internalInteger("Total-Time");
/** Document revision number (declared as text, not integer) */
Property REVISION_NUMBER = Property.internalText("Revision-Number");
/** Document security level */
Property SECURITY = Property.internalInteger("Security");
/** Number of slides (presentations) */
Property SLIDE_COUNT = Property.internalInteger("Slide-Count");
/** Number of paragraphs */
Property PARAGRAPH_COUNT = Property.internalInteger("Paragraph-Count");
/** Number of lines */
Property LINE_COUNT = Property.internalInteger("Line-Count");
}

Properties specific to PDF documents.
/**
* PDF document properties.
*
* Most keys use the "pdf:" prefix; PRODUCER, PERMISSIONS, PDFA_VERSION and
* PDFUA_VERSION use other key forms, as shown on each constant.
* NOTE(review): the mapping list later in this document refers to PDF.TITLE and
* PDF.AUTHOR, which are not declared in this listing — confirm against the
* Tika Javadoc.
*/
public interface PDF {
/** PDF version */
Property PDF_VERSION = Property.internalText("pdf:PDFVersion");
/** PDF producer (key has no "pdf:" prefix) */
Property PRODUCER = Property.internalText("producer");
/** PDF encryption status */
Property ENCRYPTED = Property.internalBoolean("pdf:encrypted");
/** PDF permissions. NOTE(review): the key names a single permission (extract_content) yet is typed as integer — confirm */
Property PERMISSIONS = Property.internalInteger("access_permission:extract_content");
/** PDF optimization */
Property OPTIMIZED = Property.internalBoolean("pdf:optimized");
/** PDF tagged */
Property TAGGED = Property.internalBoolean("pdf:tagged");
/** Number of characters with spaces */
Property CHARACTERS_WITH_SPACES = Property.internalInteger("pdf:charsWithSpaces");
/** PDF/A conformance */
Property PDFA_VERSION = Property.internalText("pdfa:version");
/** PDF/UA compliance */
Property PDFUA_VERSION = Property.internalText("pdfua:version");
/** Document ID */
Property DOC_INFO_ID_1 = Property.internalText("pdf:docinfo:id1");
/** Modification date from PDF info */
Property DOC_INFO_MODIFICATION_DATE = Property.internalDate("pdf:docinfo:modified");
/** Creation date from PDF info */
Property DOC_INFO_CREATION_DATE = Property.internalDate("pdf:docinfo:created");
}

Properties for image documents and embedded images.
/**
* TIFF image properties.
*
* Keys use the "tiff:" prefix except COLOR_SPACE, whose key is "ColorSpace".
* The resolutions are declared with internalRational and BITS_PER_SAMPLE with
* internalIntegerSequence; the remaining numeric properties are plain integers.
*/
public interface TIFF {
/** Image width in pixels */
Property IMAGE_WIDTH = Property.internalInteger("tiff:ImageWidth");
/** Image height in pixels (named "length" in the key) */
Property IMAGE_LENGTH = Property.internalInteger("tiff:ImageLength");
/** Bits per sample (integer sequence) */
Property BITS_PER_SAMPLE = Property.internalIntegerSequence("tiff:BitsPerSample");
/** Compression type */
Property COMPRESSION = Property.internalInteger("tiff:Compression");
/** Color space (key has no "tiff:" prefix) */
Property COLOR_SPACE = Property.internalText("ColorSpace");
/** Resolution unit */
Property RESOLUTION_UNIT = Property.internalInteger("tiff:ResolutionUnit");
/** X resolution (rational) */
Property X_RESOLUTION = Property.internalRational("tiff:XResolution");
/** Y resolution (rational) */
Property Y_RESOLUTION = Property.internalRational("tiff:YResolution");
/** Orientation */
Property ORIENTATION = Property.internalInteger("tiff:Orientation");
}
/**
* JPEG image properties.
*
* Keys are human-readable labels containing spaces rather than prefixed names.
* COMPRESSION_QUALITY is declared with internalReal; the others are integers.
*/
public interface JPEG {
/** JPEG compression quality (real number) */
Property COMPRESSION_QUALITY = Property.internalReal("JPEG Compression Quality");
/** Color components */
Property COLOR_COMPONENTS = Property.internalInteger("Number of Components");
/** Image width */
Property IMAGE_WIDTH = Property.internalInteger("Image Width");
/** Image height */
Property IMAGE_HEIGHT = Property.internalInteger("Image Height");
}

System for filtering and transforming metadata during extraction and processing.
/**
* Interface for filtering metadata.
*
* The single method returns void, so implementations presumably modify the
* supplied Metadata object in place — confirm.
*/
public interface MetadataFilter {
/**
* Filters the given metadata
* @param metadata Metadata to filter
* @param context Parse context for configuration
* @throws TikaException declared by the signature; the failure conditions are not described here
*/
void filter(Metadata metadata, ParseContext context) throws TikaException;
}
/**
* Composite metadata filter combining multiple filters.
*
* NOTE(review): the class implements MetadataFilter, but its filter(...) method
* is not listed here, and the order in which the combined filters run is not
* stated — confirm against the Tika Javadoc.
*/
public class CompositeMetadataFilter implements MetadataFilter {
/**
* Creates a CompositeMetadataFilter with the specified filters
* @param filters Array of MetadataFilter instances to combine (varargs)
*/
public CompositeMetadataFilter(MetadataFilter... filters);
/**
* Gets the list of filters
* @return List of MetadataFilter instances
*/
public List<MetadataFilter> getFilters();
}
/**
* Filter that normalizes date formats.
*
* NOTE(review): neither the target date format nor the set of properties that
* are normalized is stated in this document — confirm before relying on it.
*/
public class DateNormalizingMetadataFilter implements MetadataFilter {
/**
* Creates a DateNormalizingMetadataFilter with default configuration
*/
public DateNormalizingMetadataFilter();
/**
* Filters metadata by normalizing date formats
* @param metadata Metadata to process
* @param context Parse context (unused)
* @throws TikaException declared by the signature; the failure conditions are not described here
*/
public void filter(Metadata metadata, ParseContext context) throws TikaException;
}
/**
* Filter that clears metadata based on MIME type.
*
* The MIME types to match are supplied once, at construction time.
*/
public class ClearByMimeMetadataFilter implements MetadataFilter {
/**
* Creates a filter that clears metadata for specified MIME types
* @param mimeTypes Set of MediaType objects to clear metadata for
*/
public ClearByMimeMetadataFilter(Set<MediaType> mimeTypes);
/**
* Filters metadata by clearing it for matching MIME types
* @param metadata Metadata to process
* @param context Parse context containing MIME type information
* @throws TikaException declared by the signature; the failure conditions are not described here
*/
public void filter(Metadata metadata, ParseContext context) throws TikaException;
}

System for filtering metadata during write operations to prevent sensitive information leakage.
/**
* Interface for filtering metadata during write operations.
*
* Unlike MetadataFilter.filter, which returns void, this method returns a
* Metadata object and declares no checked exception.
* NOTE(review): WriteContext is not described anywhere else in this document —
* confirm the type and this signature against the Tika Javadoc.
*/
public interface MetadataWriteFilter {
/**
* Filters metadata before writing
* @param metadata Metadata to filter
* @param context Write context
* @return Filtered metadata safe for writing
*/
Metadata filterMetadata(Metadata metadata, WriteContext context);
}
/**
* Standard write filter with common filtering rules.
*
* Exclusions can be registered either per Property or by a name pattern.
* NOTE(review): the default rules themselves are not listed in this document —
* confirm what is excluded out of the box.
*/
public class StandardWriteFilter implements MetadataWriteFilter {
/**
* Creates a StandardWriteFilter with default rules
*/
public StandardWriteFilter();
/**
* Filters sensitive metadata before writing
* @param metadata Original metadata
* @param context Write context
* @return Filtered metadata
*/
public Metadata filterMetadata(Metadata metadata, WriteContext context);
/**
* Adds a property to the exclusion list
* @param property Property to exclude from output
*/
public void excludeProperty(Property property);
/**
* Adds a property pattern to the exclusion list
* @param pattern Regular expression pattern for property names to exclude
*/
public void excludePattern(String pattern);
}

Specialized filtering for metadata containing list values.
/**
* Interface for filtering metadata lists.
*
* Takes and returns a List of Metadata objects, in contrast to
* MetadataFilter.filter, which takes a single Metadata and returns void.
*/
public interface MetadataListFilter {
/**
* Filters a list of metadata objects
* @param metadataList List of Metadata objects to filter
* @param context Processing context
* @return Filtered list of Metadata objects
* @throws TikaException declared by the signature; the failure conditions are not described here
*/
List<Metadata> filter(List<Metadata> metadataList, ParseContext context) throws TikaException;
}

Common metadata property mappings across different standards:
// This is a reference list of related constants, not executable code.
// NOTE(review): Office.TITLE, PDF.TITLE, Office.AUTHOR, PDF.AUTHOR and
// Office.CREATION_DATE do not appear among the constants listed in the Office
// and PDF sections of this document — confirm these names against the Tika
// Javadoc before relying on them.
// Document title mappings
TikaCoreProperties.TITLE // Generic title
DublinCore.TITLE // Dublin Core title
Office.TITLE // Office document title
PDF.TITLE // PDF document title
// Author/Creator mappings
TikaCoreProperties.CREATOR // Generic creator
DublinCore.CREATOR // Dublin Core creator
Office.AUTHOR // Office document author
PDF.AUTHOR // PDF document author
// Date mappings
TikaCoreProperties.CREATED // Generic creation date
TikaCoreProperties.MODIFIED // Generic modification date
DublinCore.DATE // Dublin Core date
Office.CREATION_DATE // Office creation date
PDF.DOC_INFO_CREATION_DATE // PDF creation date

// Working with custom properties
Metadata metadata = new Metadata();
// Set custom properties by plain string name; set(...) replaces any existing value
metadata.set("custom:department", "Engineering");
metadata.set("custom:project", "Atlas");
// add(...) preserves existing values, so "custom:tags" ends up with two entries
metadata.add("custom:tags", "important");
metadata.add("custom:tags", "review-needed");
// Define custom property interfaces
// The keys below are the same strings used in the set/add calls above; TAGS is a
// text bag because it holds several values. This is a type declaration shown
// alongside the statements for brevity.
public interface CustomProperties {
Property DEPARTMENT = Property.internalText("custom:department");
Property PROJECT = Property.internalText("custom:project");
Property TAGS = Property.internalTextBag("custom:tags");
}

// Merge metadata from multiple sources
// Start from an empty container and append every value of every property found in
// sourceMetadata; add(...) is used so multi-valued properties keep all their entries.
Metadata combined = new Metadata();
for (String propertyName : sourceMetadata.names()) {
    for (String entry : sourceMetadata.getValues(propertyName)) {
        combined.add(propertyName, entry);
    }
}

// Type-safe property operations using Property interfaces
Metadata metadata = new Metadata();
// Set using Property constants
metadata.set(TikaCoreProperties.TITLE, "Document Title");
// PAGE_COUNT is declared with internalInteger; the value is still passed as text here
metadata.set(TikaCoreProperties.PAGE_COUNT, "150");
// Get with type conversion
String title = metadata.get(TikaCoreProperties.TITLE);
// getInt yields a boxed Integer, as the declared variable type shows
Integer pageCount = metadata.getInt(TikaCoreProperties.PAGE_COUNT);
Date created = metadata.getDate(TikaCoreProperties.CREATED);

Install with Tessl CLI
npx tessl i tessl/maven-org-apache-tika--tika-core