CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/maven-org-apache-tika--tika-core

Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.

Pending
Overview
Eval results
Files

docs/embedding.md

Document Embedding

Framework for embedding metadata into documents, allowing modification and insertion of metadata properties into existing files. This system provides both programmatic interfaces and external tool integration for embedding metadata into various document formats while preserving document structure and content.

Capabilities

Embedder Interface

Core interface for embedding metadata into documents with support for different document formats and metadata containers.

/**
 * Interface for embedding metadata into documents.
 *
 * <p>An implementation declares which media types it can write metadata into
 * and performs the embedding by reading the original document from one stream
 * and writing the updated document to another. The interface extends
 * {@link Serializable}, so implementing classes are serializable types.
 */
public interface Embedder extends Serializable {
    /**
     * Returns supported media types for embedding operations
     * @param context parse context for embedder configuration
     * @return immutable set of supported media types
     */
    Set<MediaType> getSupportedEmbedTypes(ParseContext context);
    
    /**
     * Embeds metadata from Metadata object into document stream.
     *
     * <p>NOTE(review): this listing does not state whether implementations
     * close either stream; the usage examples on this page close both with
     * try-with-resources on the caller's side.
     *
     * @param metadata document metadata to embed (input and output)
     * @param originalStream source document stream
     * @param outputStream target stream for document with embedded metadata
     * @param context parse context for embedding configuration
     * @throws IOException if document cannot be read or written
     * @throws TikaException if embedding operation fails
     */
    void embed(Metadata metadata, InputStream originalStream, OutputStream outputStream,
               ParseContext context) throws IOException, TikaException;
}

External Embedder

Implementation that uses external command-line tools for embedding metadata into documents.

/**
 * Embedder using external programs for metadata embedding.
 *
 * <p>Configuration is done through the setters below: the command to run, the
 * media types it applies to, and how each metadata property is turned into a
 * command-line argument. Because the class implements {@link Embedder}, it
 * also exposes the two interface methods, which are listed here alongside the
 * configuration accessors.
 */
public class ExternalEmbedder implements Embedder {
    /** Token replaced with metadata command arguments in command templates */
    public static final String METADATA_COMMAND_ARGUMENTS_TOKEN = "${METADATA}";

    /** Token replaced with serialized metadata arguments in command templates */
    public static final String METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN = "${METADATA_SERIALIZED}";

    /**
     * Creates external embedder with default settings
     */
    public ExternalEmbedder();

    /**
     * Gets supported embed types (the {@link Embedder} interface method)
     * @param context parse context for embedder configuration
     * @return set of supported media types
     */
    public Set<MediaType> getSupportedEmbedTypes(ParseContext context);

    /**
     * Gets supported embed types
     * @return set of supported media types
     */
    public Set<MediaType> getSupportedEmbedTypes();

    /**
     * Sets supported embed types for this embedder
     * @param supportedEmbedTypes set of media types to support
     */
    public void setSupportedEmbedTypes(Set<MediaType> supportedEmbedTypes);

    /**
     * Gets external command to execute
     * @return command array with tokens for file paths
     */
    public String[] getCommand();

    /**
     * Sets external command to execute for embedding.
     *
     * <p>Each array element is one command-line argument. Prefer the token
     * constants over hand-typed placeholder strings: the file tokens are
     * declared on {@code ExternalParser} ({@code INPUT_FILE_TOKEN},
     * {@code OUTPUT_FILE_TOKEN}) and the metadata tokens on this class.
     *
     * @param command command array supporting INPUT_FILE_TOKEN and OUTPUT_FILE_TOKEN
     */
    public void setCommand(String... command);

    /**
     * Gets assignment operator for metadata (e.g., "=")
     * @return assignment operator string
     */
    public String getCommandAssignmentOperator();

    /**
     * Sets assignment operator for metadata
     * @param commandAssignmentOperator operator string
     */
    public void setCommandAssignmentOperator(String commandAssignmentOperator);

    /**
     * Gets delimiter for multiple metadata assignments (e.g., ", ").
     * Note that the method name is spelled "Delimeter".
     * @return assignment delimiter string
     */
    public String getCommandAssignmentDelimeter();

    /**
     * Sets delimiter for multiple metadata assignments.
     * Note that the method name is spelled "Delimeter".
     * @param commandAssignmentDelimeter delimiter string
     */
    public void setCommandAssignmentDelimeter(String commandAssignmentDelimeter);

    /**
     * Gets append operator for metadata (e.g., "+=")
     * @return append operator string
     */
    public String getCommandAppendOperator();

    /**
     * Sets append operator for multi-valued metadata
     * @param commandAppendOperator append operator string
     */
    public void setCommandAppendOperator(String commandAppendOperator);

    /**
     * Gets whether to quote assignment values
     * @return true if values should be quoted
     */
    public boolean isQuoteAssignmentValues();

    /**
     * Sets whether to quote assignment values (e.g., tag='value')
     * @param quoteAssignmentValues true to quote values
     */
    public void setQuoteAssignmentValues(boolean quoteAssignmentValues);

    /**
     * Gets metadata property to command line parameter mapping
     * @return mapping of Tika properties to command arguments
     */
    public Map<Property, String[]> getMetadataCommandArguments();

    /**
     * Sets metadata property to command line parameter mapping
     * @param arguments mapping of properties to command line parameters
     */
    public void setMetadataCommandArguments(Map<Property, String[]> arguments);

    /**
     * Embeds metadata by running the configured external command
     * (the {@link Embedder} interface method)
     * @param metadata document metadata to embed
     * @param originalStream source document stream
     * @param outputStream target stream for document with embedded metadata
     * @param context parse context for embedding configuration
     * @throws IOException if document cannot be read or written
     * @throws TikaException if embedding operation fails
     */
    public void embed(Metadata metadata, InputStream originalStream, OutputStream outputStream,
                      ParseContext context) throws IOException, TikaException;
}

Utility Methods

Static utility methods for working with external embedders.

/**
 * Utility methods for external embedder operations.
 *
 * <p>These are the static members of {@code ExternalEmbedder}, listed
 * separately from its instance API.
 */
public class ExternalEmbedder {
    /**
     * Checks if external command is available and functional.
     *
     * <p>The second parameter is a list of error codes, not command arguments;
     * to test a command together with arguments (for example exiftool with
     * "-ver"), use the {@code String[]} overload.
     * NOTE(review): confirm whether this overload splits checkCmd on
     * whitespace before relying on a value such as "exiftool -ver".
     *
     * @param checkCmd command to test (e.g., "exiftool")
     * @param errorValue error codes that indicate failure
     * @return true if command executes successfully
     */
    public static boolean check(String checkCmd, int... errorValue);
    
    /**
     * Checks if external command array is available and functional
     * @param checkCmd command array to test: the program followed by its
     *                 arguments, e.g. {"exiftool", "-ver"}
     * @param errorValue error codes that indicate failure
     * @return true if command executes successfully
     */
    public static boolean check(String[] checkCmd, int... errorValue);
}

Usage Examples

Basic Metadata Embedding with ExifTool:

import org.apache.tika.embedder.*;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.*;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

// Check if exiftool is available. The program and its argument are passed as
// one String[]: the check(String, int...) overload takes error codes after the
// command, so check("exiftool", "-ver") matches neither overload.
if (ExternalEmbedder.check(new String[] {"exiftool", "-ver"})) {
    // Create embedder for JPEG images
    ExternalEmbedder embedder = new ExternalEmbedder();

    // Configure supported types
    Set<MediaType> supportedTypes = new HashSet<>();
    supportedTypes.add(MediaType.image("jpeg"));
    supportedTypes.add(MediaType.image("tiff"));
    embedder.setSupportedEmbedTypes(supportedTypes);

    // Configure exiftool command.
    // - "-o", "-" makes exiftool write the updated image to standard output,
    //   which the embedder forwards to the output stream when the command
    //   contains no output-file token. With "-overwrite_original" only
    //   exiftool's status message would reach the output stream.
    // - The tokens are taken from the library constants so they always match
    //   the placeholders the embedder actually substitutes.
    embedder.setCommand("exiftool",
                        "-o", "-",
                        ExternalEmbedder.METADATA_COMMAND_ARGUMENTS_TOKEN,
                        ExternalParser.INPUT_FILE_TOKEN);

    // Map Tika metadata to exiftool parameters
    Map<Property, String[]> metadataMapping = new HashMap<>();
    metadataMapping.put(TikaCoreProperties.TITLE, new String[]{"-Title"});
    metadataMapping.put(TikaCoreProperties.CREATOR, new String[]{"-Artist", "-Author"});
    metadataMapping.put(TikaCoreProperties.SUBJECT, new String[]{"-Subject"});
    metadataMapping.put(TikaCoreProperties.DESCRIPTION, new String[]{"-Description"});
    embedder.setMetadataCommandArguments(metadataMapping);

    // Prepare metadata to embed
    Metadata metadata = new Metadata();
    metadata.set(TikaCoreProperties.TITLE, "Sunset Over Mountains");
    metadata.set(TikaCoreProperties.CREATOR, "John Photographer");
    metadata.set(TikaCoreProperties.SUBJECT, "Nature Photography");
    metadata.set(TikaCoreProperties.DESCRIPTION, "Beautiful sunset captured in the Rocky Mountains");

    // Embed metadata into image; try-with-resources closes both files even if
    // embed() throws
    try (InputStream input = new FileInputStream("original.jpg");
         OutputStream output = new FileOutputStream("with_metadata.jpg")) {

        embedder.embed(metadata, input, output, new ParseContext());
        System.out.println("Metadata successfully embedded");
    }
}

PDF Metadata Embedding with pdftk:

// Configure embedder for PDF documents
ExternalEmbedder pdfEmbedder = new ExternalEmbedder();

// Set supported type
pdfEmbedder.setSupportedEmbedTypes(Set.of(MediaType.application("pdf")));

// Configure pdftk command with metadata file approach.
// The file tokens come from ExternalParser's constants rather than hand-typed
// literals, so they always match the placeholders the embedder substitutes
// with the real input and output paths.
// NOTE(review): nothing in this snippet creates metadata.txt; it has to exist
// already, in the info-file format pdftk expects for update_info_utf8.
pdfEmbedder.setCommand("pdftk", ExternalParser.INPUT_FILE_TOKEN,
                       "update_info_utf8", "metadata.txt",
                       "output", ExternalParser.OUTPUT_FILE_TOKEN);

// Configure metadata mapping for PDF
// NOTE(review): these entries are command-argument prefixes and the command
// above contains no metadata token. Verify how the embedder hands them to
// pdftk, since pdftk reads its info data from the file named after
// update_info_utf8 rather than from command-line arguments.
Map<Property, String[]> pdfMapping = new HashMap<>();
pdfMapping.put(TikaCoreProperties.TITLE, new String[]{"InfoKey: Title\nInfoValue: "});
pdfMapping.put(TikaCoreProperties.CREATOR, new String[]{"InfoKey: Author\nInfoValue: "});
pdfMapping.put(TikaCoreProperties.SUBJECT, new String[]{"InfoKey: Subject\nInfoValue: "});
pdfEmbedder.setMetadataCommandArguments(pdfMapping);

// Prepare PDF metadata
Metadata pdfMetadata = new Metadata();
pdfMetadata.set(TikaCoreProperties.TITLE, "Technical Documentation");
pdfMetadata.set(TikaCoreProperties.CREATOR, "Engineering Team");
pdfMetadata.set(TikaCoreProperties.SUBJECT, "API Reference Manual");

// Embed metadata; try-with-resources closes both files even if embed() throws
try (InputStream input = new FileInputStream("document.pdf");
     OutputStream output = new FileOutputStream("document_with_metadata.pdf")) {

    pdfEmbedder.embed(pdfMetadata, input, output, new ParseContext());
}

Custom Embedder Implementation:

/**
 * Custom embedder for a specific document format.
 *
 * <p>The output consists of a plain-text metadata header followed by the
 * unmodified bytes of the original document. The header is a
 * {@code METADATA_START} line, one {@code name=value} line per metadata value,
 * and a {@code METADATA_END} line, all encoded as UTF-8.
 */
public class CustomDocumentEmbedder implements Embedder {
    // Embedder extends Serializable, so pin the serialized form's version.
    private static final long serialVersionUID = 1L;

    private final Set<MediaType> supportedTypes;

    /** Creates an embedder that supports only {@code application/x-custom}. */
    public CustomDocumentEmbedder() {
        this.supportedTypes = Set.of(MediaType.parse("application/x-custom"));
    }

    /**
     * Returns the media types this embedder can write metadata into.
     * @param context parse context (not used by this implementation)
     * @return the fixed set created in the constructor
     */
    @Override
    public Set<MediaType> getSupportedEmbedTypes(ParseContext context) {
        return supportedTypes;
    }

    /**
     * Writes the metadata header followed by the original document bytes.
     * @param metadata metadata whose names and values are written to the header
     * @param originalStream source document; read completely, not closed here
     * @param outputStream target stream; written to, not closed here
     * @param context parse context (not used by this implementation)
     * @throws IOException if the source cannot be read or the target cannot be written
     * @throws TikaException declared by the interface; not thrown by this implementation
     */
    @Override
    public void embed(Metadata metadata, InputStream originalStream,
                      OutputStream outputStream, ParseContext context)
            throws IOException, TikaException {

        // Read the original document up front so that a read failure happens
        // before anything has been written to the output.
        byte[] originalData = originalStream.readAllBytes();

        // Create metadata section: one line per value, so multi-valued
        // properties produce several lines with the same name.
        StringBuilder metadataSection = new StringBuilder();
        for (String name : metadata.names()) {
            for (String value : metadata.getValues(name)) {
                metadataSection.append(name).append('=').append(value).append('\n');
            }
        }

        // Write document with embedded metadata. The charset is explicit so the
        // header bytes do not depend on the platform default encoding.
        outputStream.write("METADATA_START\n".getBytes(StandardCharsets.UTF_8));
        outputStream.write(metadataSection.toString().getBytes(StandardCharsets.UTF_8));
        outputStream.write("METADATA_END\n".getBytes(StandardCharsets.UTF_8));
        outputStream.write(originalData);

        System.out.println("Custom metadata embedding completed");
    }
}

// Usage: embed one free-form field plus the title into a custom-format file
CustomDocumentEmbedder docEmbedder = new CustomDocumentEmbedder();

Metadata fields = new Metadata();
fields.set("custom-field", "custom-value");
fields.set(TikaCoreProperties.TITLE, "Custom Document");

// Both files are closed automatically, even if embed() throws
try (InputStream source = new FileInputStream("custom.doc");
     OutputStream target = new FileOutputStream("custom_with_metadata.doc")) {
    ParseContext parseContext = new ParseContext();
    docEmbedder.embed(fields, source, target, parseContext);
}

Advanced External Tool Configuration:

// Configure embedder with complex command structure
ExternalEmbedder advancedEmbedder = new ExternalEmbedder();

// Set multiple supported formats
Set<MediaType> formats = new HashSet<>();
formats.add(MediaType.image("jpeg"));
formats.add(MediaType.image("png"));
formats.add(MediaType.image("tiff"));
advancedEmbedder.setSupportedEmbedTypes(formats);

// Configure advanced exiftool command with serialized metadata.
// - "-config" stays first, as in the original command.
// - "-o", "-" makes exiftool write the updated image to standard output,
//   which the embedder forwards to the output stream when the command
//   contains no output-file token. With "-overwrite_original" only
//   exiftool's status message would reach the output stream.
// - The tokens are taken from the library constants so they always match
//   the placeholders the embedder actually substitutes.
// NOTE(review): verify that exiftool accepts the serialized metadata form as
// a single argument; METADATA_COMMAND_ARGUMENTS_TOKEN is the alternative.
advancedEmbedder.setCommand("exiftool",
                            "-config", "custom.config",
                            "-o", "-",
                            "-charset", "utf8",
                            ExternalEmbedder.METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
                            ExternalParser.INPUT_FILE_TOKEN);

// Configure quote handling and operators
advancedEmbedder.setQuoteAssignmentValues(true);
advancedEmbedder.setCommandAssignmentOperator("=");
advancedEmbedder.setCommandAppendOperator("+=");
advancedEmbedder.setCommandAssignmentDelimeter(" ");

// Create comprehensive metadata mapping; a property may map to several
// exiftool tags, each of which receives the same value
Map<Property, String[]> comprehensiveMapping = new HashMap<>();
comprehensiveMapping.put(TikaCoreProperties.TITLE, new String[]{"-Title", "-XMP:Title"});
comprehensiveMapping.put(TikaCoreProperties.CREATOR, new String[]{"-Artist", "-XMP:Creator"});
comprehensiveMapping.put(TikaCoreProperties.KEYWORDS, new String[]{"-Keywords", "-XMP:Keywords"});
comprehensiveMapping.put(Geographic.LATITUDE, new String[]{"-GPSLatitude"});
comprehensiveMapping.put(Geographic.LONGITUDE, new String[]{"-GPSLongitude"});
advancedEmbedder.setMetadataCommandArguments(comprehensiveMapping);

// Embed comprehensive metadata; add() is used for KEYWORDS so that all three
// values are kept as a multi-valued property
Metadata richMetadata = new Metadata();
richMetadata.set(TikaCoreProperties.TITLE, "Mountain Landscape");
richMetadata.set(TikaCoreProperties.CREATOR, "Nature Photographer");
richMetadata.add(TikaCoreProperties.KEYWORDS, "mountain");
richMetadata.add(TikaCoreProperties.KEYWORDS, "landscape");
richMetadata.add(TikaCoreProperties.KEYWORDS, "nature");
richMetadata.set(Geographic.LATITUDE, "40.7128");
richMetadata.set(Geographic.LONGITUDE, "-74.0060");

try (InputStream input = new FileInputStream("landscape.jpg");
     OutputStream output = new FileOutputStream("landscape_enriched.jpg")) {

    advancedEmbedder.embed(richMetadata, input, output, new ParseContext());
    System.out.println("Rich metadata embedding completed");
}

The embedding framework provides flexible metadata insertion capabilities with support for external tools, custom implementations, and comprehensive metadata mapping strategies while preserving document integrity and supporting various file formats.

Install with Tessl CLI

npx tessl i tessl/maven-org-apache-tika--tika-core

docs

configuration.md

content-processing.md

detection.md

embedded-extraction.md

embedding.md

exceptions.md

index.md

io-utilities.md

language.md

metadata.md

mime-types.md

parsing.md

pipes.md

process-forking.md

rendering.md

tile.json