Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.
—
Advanced infrastructure for running document parsing operations in separate JVM processes to provide isolation, memory management, and fault tolerance. This system enables robust document processing by isolating potentially problematic parsers, managing memory limits, and providing timeout controls while maintaining seamless integration with the main Tika parsing pipeline.
Main parser implementation that delegates parsing operations to separate JVM processes with connection pooling and resource management.
/**
 * Parser that runs the actual parsing in separate JVM processes for isolation.
 *
 * <p>This listing shows only the configuration surface. The {@code parse(...)}
 * and {@code close()} operations come from the implemented {@code Parser} and
 * {@code Closeable} interfaces. Call {@code close()} when finished so the
 * forked processes are cleaned up.
 */
public class ForkParser implements Parser, Closeable {

    /**
     * Creates a fork parser with default configuration.
     */
    public ForkParser();

    /**
     * Creates a fork parser with a specific class loader and delegate parser.
     *
     * @param loader class loader for parser resources
     * @param parser delegate parser to run in the forked process
     */
    public ForkParser(ClassLoader loader, Parser parser);

    /**
     * Creates a fork parser from a Tika binary path and a parser factory.
     *
     * @param tikaBin path to the Tika binary/jars directory
     * @param parserFactoryFactory factory for creating parser instances
     */
    public ForkParser(Path tikaBin, ParserFactoryFactory parserFactoryFactory);

    /**
     * Gets the Java command line used to launch forked processes.
     *
     * @return the command as a list: the executable (for example {@code "java"})
     *         followed by JVM arguments and options
     */
    public List<String> getJava();

    /**
     * Sets the Java command line used to launch forked processes.
     *
     * @param java the command as a list: the executable (for example
     *             {@code "java"}) followed by JVM arguments such as {@code "-Xmx512m"}
     */
    public void setJava(List<String> java);

    /**
     * Gets the connection pool size for forked processes.
     *
     * @return number of processes in the pool
     */
    public int getPoolSize();

    /**
     * Sets the connection pool size for forked processes.
     *
     * @param poolSize number of processes to maintain in the pool
     */
    public void setPoolSize(int poolSize);

    /**
     * Gets the server pulse interval used for health checks.
     *
     * @return pulse interval in milliseconds
     */
    public long getServerPulseMillis();

    /**
     * Sets the server pulse interval used for health checks.
     *
     * @param serverPulseMillis pulse interval in milliseconds
     */
    public void setServerPulseMillis(long serverPulseMillis);

    /**
     * Gets the parsing timeout for forked operations.
     *
     * @return parsing timeout in milliseconds
     */
    public long getServerParseTimeoutMillis();

    /**
     * Sets the parsing timeout for forked operations.
     *
     * @param serverParseTimeoutMillis timeout in milliseconds
     */
    public void setServerParseTimeoutMillis(long serverParseTimeoutMillis);

    /**
     * Gets the wait timeout for process communication.
     *
     * @return wait timeout in milliseconds
     */
    public long getServerWaitTimeoutMillis();

    /**
     * Sets the wait timeout for process communication.
     *
     * @param serverWaitTimeoutMillis timeout in milliseconds
     */
    public void setServerWaitTimeoutMillis(long serverWaitTimeoutMillis);
}

Internal components for managing the client-server communication between main process and forked processes.
/**
* Server running in forked process to handle parsing requests.
* Declared without an access modifier, so it is package-private; it
* implements {@link Runnable}.
*/
class ForkServer implements Runnable {
/**
* Protocol constants for client-server communication. Each constant is a
* single {@code byte} with a distinct value; {@code ERROR} is the only
* negative one.
* NOTE(review): this listing does not show when each value is sent or who
* sends it; confirm the exchange order against the ForkServer source.
*/
public static final byte ERROR = -1;
public static final byte DONE = 0;
public static final byte CALL = 1;
public static final byte PING = 2;
public static final byte RESOURCE = 3;
public static final byte READY = 4;
public static final byte FAILED_TO_START = 5;
// The INIT_* values presumably select how the forked server is initialized
// (parser factory vs. class loader + parser) -- TODO confirm.
public static final byte INIT_PARSER_FACTORY_FACTORY = 6;
public static final byte INIT_LOADER_PARSER = 7;
public static final byte INIT_PARSER_FACTORY_FACTORY_LOADER = 8;
}
/**
 * Client in the main process for communicating with forked server processes.
 * Declared without an access modifier, so it is package-private.
 */
class ForkClient {

    /**
     * Acquires a fork client from the pool or creates a new one.
     *
     * @return fork client instance for parsing operations
     */
    public static ForkClient acquire();

    /**
     * Releases a fork client back to the pool.
     *
     * @param client client to release
     */
    public static void release(ForkClient client);

    /**
     * Executes a parsing operation in the forked process.
     *
     * @param resource fork resource representing the operation
     * @return result of the parsing operation
     */
    public Object execute(ForkResource resource);
}

Interfaces and implementations for resource management and proxy objects in forked processes.
/**
* Interface for resources that can be processed in forked processes.
* It declares a single method, {@link #process}.
*/
public interface ForkResource {
/**
* Processes resource in forked process context.
* A processing failure is reported through the return value rather than
* by throwing; only I/O communication problems surface as an exception.
* @param input data input stream from main process
* @param output data output stream to main process
* @return throwable if processing fails, null on success
* @throws IOException if I/O communication fails
*/
Throwable process(DataInputStream input, DataOutputStream output) throws IOException;
}
/**
* Interface for proxy objects that can communicate across process boundaries.
* Extends {@link Serializable}, so every implementation is serializable.
*/
public interface ForkProxy extends Serializable {
/**
* Initializes proxy with communication streams.
* @param input input stream for receiving data
* @param output output stream for sending data
*/
void init(DataInputStream input, DataOutputStream output);
}
/**
* Proxy for content handlers in forked processes.
* Implements {@code ForkProxy}; only the constructor is shown in this listing.
*/
public class ContentHandlerProxy implements ForkProxy {
/**
* Creates content handler proxy.
* @param handler target content handler
*/
public ContentHandlerProxy(ContentHandler handler);
}
/**
* Proxy for input streams in forked processes.
* Implements {@code ForkProxy}; only the constructor is shown in this listing.
*/
public class InputStreamProxy implements ForkProxy {
/**
* Creates input stream proxy.
* @param stream target input stream
*/
public InputStreamProxy(InputStream stream);
}
/**
 * Proxy for class loaders in forked processes.
 * Implements {@code ForkProxy}; only the constructor is shown in this listing.
 */
public class ClassLoaderProxy implements ForkProxy {

    /**
     * Creates a class loader proxy.
     *
     * @param loader target class loader
     */
    public ClassLoaderProxy(ClassLoader loader);
}

Classes for managing timeouts and configuration in the forking infrastructure.
/**
* Timeout configuration for forked operations.
* The class and its constructor are declared without an access modifier
* (package-private); the three getters are public.
*/
class TimeoutLimits {
/**
* Creates timeout limits configuration. All three values are in
* milliseconds, matching the units documented on the getters.
* @param pulseMS pulse interval for health checks
* @param parseTimeoutMS timeout for parsing operations
* @param waitTimeoutMS timeout for process communication
*/
TimeoutLimits(long pulseMS, long parseTimeoutMS, long waitTimeoutMS);
/** @return pulse interval in milliseconds */
public long getPulseMS();
/** @return parse timeout in milliseconds */
public long getParseTimeoutMS();
/** @return wait timeout in milliseconds */
public long getWaitTimeoutMS();
}
/**
 * Factory interface for creating parser factories in forked processes.
 * NOTE(review): confirm against the Tika release in use that this type is an
 * interface with a no-argument {@code create()} method, as listed here.
 */
public interface ParserFactoryFactory {

    /**
     * Creates a parser factory instance.
     *
     * @return parser factory for creating parsers
     */
    ParserFactory create();
}

Specialized components for memory management and resource handling in forked processes.
/**
* URL connection for memory-based resources in forked processes.
* NOTE(review): no supertype is shown in this listing; presumably this
* extends java.net.URLConnection -- confirm against the source.
*/
public class MemoryURLConnection {
/**
* Creates memory URL connection. This is a static factory method that
* takes no arguments.
* @return connection for memory-based resources
*/
public static MemoryURLConnection create();
}
/**
* URL stream handler for memory resources.
* NOTE(review): no supertype is shown in this listing; presumably this
* extends java.net.URLStreamHandler -- confirm against the source.
*/
public class MemoryURLStreamHandler {
/**
* Creates memory URL stream handler (no-argument constructor).
*/
public MemoryURLStreamHandler();
}
/**
* Factory for memory URL stream handlers.
* NOTE(review): no supertype is shown in this listing; presumably this
* implements java.net.URLStreamHandlerFactory -- confirm against the source.
*/
public class MemoryURLStreamHandlerFactory {
/**
* Creates stream handler factory (no-argument constructor).
*/
public MemoryURLStreamHandlerFactory();
}
/**
 * Record for memory URL stream data, pairing a URL with the bytes of the
 * resource it identifies.
 */
public class MemoryURLStreamRecord {

    /**
     * Creates a memory URL stream record.
     *
     * @param url URL for the resource
     * @param data resource data
     */
    public MemoryURLStreamRecord(URL url, byte[] data);
}

Basic Fork Parser Usage:
import org.apache.tika.fork.ForkParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.BodyContentHandler;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Arrays;

// Create fork parser with memory-limited JVM settings
ForkParser forkParser = new ForkParser();

// Configure the command used to launch forked processes:
// the executable first, then its JVM options
forkParser.setJava(Arrays.asList(
    "java",
    "-Xmx512m",                  // Limit memory to 512MB
    "-Djava.awt.headless=true",  // Headless mode
    "-XX:+UseG1GC",              // Use G1 garbage collector
    "-XX:MaxGCPauseMillis=200"   // Limit GC pause time
));

// Configure process pool and timeouts (all timeouts are in milliseconds)
forkParser.setPoolSize(3);                       // 3 processes in pool
forkParser.setServerParseTimeoutMillis(120000);  // 2-minute parse timeout
forkParser.setServerWaitTimeoutMillis(30000);    // 30-second wait timeout
forkParser.setServerPulseMillis(5000);           // 5-second pulse interval

// Parse document in isolated process
try (InputStream stream = new FileInputStream("large_document.pdf")) {
    // -1 disables the handler's write limit, so the full text is buffered
    // in the main JVM
    BodyContentHandler handler = new BodyContentHandler(-1);
    Metadata metadata = new Metadata();
    forkParser.parse(stream, handler, metadata, new ParseContext());
    System.out.println("Content: " + handler.toString());
    System.out.println("Title: " + metadata.get(TikaCoreProperties.TITLE));
} finally {
    // Close fork parser and clean up processes
    forkParser.close();
}

Custom Parser in Forked Process:
import org.apache.tika.fork.ForkParser;
import org.apache.tika.fork.ParserFactoryFactory;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ParserFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.BodyContentHandler;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;

// Custom parser factory for forked processes.
// CustomParserFactory is not defined in this snippet; supply your own
// ParserFactory implementation.
// NOTE(review): verify against the Tika release in use that
// ParserFactoryFactory can be implemented like this.
public class CustomParserFactoryFactory implements ParserFactoryFactory {
    @Override
    public ParserFactory create() {
        return new CustomParserFactory();
    }
}

// Create fork parser with custom parser factory
Path tikaBinPath = Paths.get("/path/to/tika/jars");
CustomParserFactoryFactory factory = new CustomParserFactoryFactory();
ForkParser customForkParser = new ForkParser(tikaBinPath, factory);

// Configure for high-security parsing environment
customForkParser.setJava(Arrays.asList(
    "java",
    "-Xmx256m",                            // Very limited memory
    "-Djava.awt.headless=true",
    // The Security Manager is deprecated for removal since Java 17 (JEP 411)
    // and permanently disabled in Java 24 (JEP 486); drop the next two
    // options when the forked JVM runs on such a release.
    "-Djava.security.manager",             // Enable security manager
    "-Djava.security.policy=tika.policy",  // Custom security policy
    "-XX:+DisableAttachMechanism"          // Disable JVM attach
));
customForkParser.setPoolSize(1);                      // Single process for security
customForkParser.setServerParseTimeoutMillis(60000);  // Short timeout

// Use for parsing untrusted documents
try (InputStream stream = new FileInputStream("untrusted_document.doc")) {
    // The argument caps how many characters the handler accepts (1,000,000 here)
    BodyContentHandler handler = new BodyContentHandler(1000000);
    Metadata metadata = new Metadata();
    customForkParser.parse(stream, handler, metadata, new ParseContext());
    // Process extracted content safely
    String content = handler.toString();
    if (content.length() > 0) {
        System.out.println("Successfully parsed untrusted document");
    }
} finally {
    customForkParser.close();
}

Batch Processing with Fork Parser:
import org.apache.tika.fork.ForkParser;
import java.io.File;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
// Configure fork parser for batch processing
ForkParser batchForkParser = new ForkParser();
batchForkParser.setPoolSize(5); // Pool of 5 processes
batchForkParser.setServerParseTimeoutMillis(300000); // 5-minute timeout
batchForkParser.setJava(Arrays.asList(
"java",
"-Xmx1g", // 1GB per process
"-XX:+UseParallelGC", // Parallel GC for throughput
"-Djava.awt.headless=true"
));
// Process multiple documents concurrently
ExecutorService executor = Executors.newFixedThreadPool(10);
List<File> documentsToProcess = getDocumentList();
List<Future<String>> futures = new ArrayList<>();
for (File document : documentsToProcess) {
Future<String> future = executor.submit(() -> {
try (InputStream stream = new FileInputStream(document)) {
BodyContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
// Each parsing operation runs in isolated forked process
batchForkParser.parse(stream, handler, metadata, new ParseContext());
return "Processed: " + document.getName() +
" (Title: " + metadata.get(TikaCoreProperties.TITLE) + ")";
} catch (Exception e) {
return "Failed: " + document.getName() + " - " + e.getMessage();
}
});
futures.add(future);
}
// Collect results
for (Future<String> future : futures) {
try {
String result = future.get();
System.out.println(result);
} catch (Exception e) {
System.err.println("Processing error: " + e.getMessage());
}
}
executor.shutdown();
batchForkParser.close();Error Handling and Recovery:
import org.apache.tika.fork.ForkParser;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import java.io.FileInputStream;
import java.io.InputStream;

// Create resilient fork parser configuration
ForkParser resilientParser = new ForkParser();
resilientParser.setPoolSize(2);
resilientParser.setServerParseTimeoutMillis(60000);  // 1-minute timeout
resilientParser.setServerWaitTimeoutMillis(10000);   // 10-second wait

try (InputStream stream = new FileInputStream("problematic_document.pdf")) {
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try {
        resilientParser.parse(stream, handler, metadata, new ParseContext());
        System.out.println("Successfully parsed document");
    } catch (TikaException e) {
        // getMessage() may return null; normalize it so the substring checks
        // below cannot throw NullPointerException
        String message = e.getMessage() == null ? "" : e.getMessage();
        if (message.contains("timeout")) {
            System.err.println("Document parsing timed out - document may be corrupted");
            // Fork parser automatically kills hung processes and creates new ones
        } else if (message.contains("OutOfMemoryError")) {
            System.err.println("Document too large for current memory settings");
            // Process was isolated, main JVM unaffected
        } else {
            System.err.println("Parsing failed: " + e.getMessage());
            // Other parsing errors handled gracefully
        }
    }
} finally {
    resilientParser.close();
}

The process forking infrastructure provides robust isolation and resource management for document parsing operations, enabling safe processing of potentially problematic documents while maintaining system stability and providing comprehensive timeout and memory management capabilities.
Install with Tessl CLI
npx tessl i tessl/maven-org-apache-tika--tika-core