Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.
—
I/O utilities providing enhanced input stream capabilities, temporary resource management, filename manipulation, and endian conversion for robust document processing.
Enhanced input stream wrapper providing file backing, mark/reset capabilities, and metadata extraction support.
/**
* Enhanced input stream with file backing and metadata extraction capabilities
*/
public class TikaInputStream extends ProxyInputStream {
/**
* Wraps existing InputStream with TikaInputStream capabilities
* @param stream InputStream to wrap
* @return TikaInputStream wrapping the input stream
*/
public static TikaInputStream get(InputStream stream);
/**
* Creates TikaInputStream from File
* @param file File to create stream from
* @return TikaInputStream backed by the file
* @throws FileNotFoundException if file doesn't exist
*/
public static TikaInputStream get(File file) throws FileNotFoundException;
/**
* Creates TikaInputStream from Path
* @param path Path to create stream from
* @return TikaInputStream backed by the path
* @throws IOException if path cannot be accessed
*/
public static TikaInputStream get(Path path) throws IOException;
/**
* Creates TikaInputStream from URL
* @param url URL to create stream from
* @return TikaInputStream backed by URL content
* @throws IOException if URL cannot be accessed
*/
public static TikaInputStream get(URL url) throws IOException;
/**
* Creates TikaInputStream from byte array
* @param data Byte array containing data
* @return TikaInputStream backed by byte array
*/
public static TikaInputStream get(byte[] data);
/**
* Checks if stream is backed by a file
* @return true if stream has file backing
*/
public boolean hasFile();
/**
* Gets the backing file if available
* @return File backing this stream, or null if no file backing
*/
public File getFile();
/**
* Gets the file path if available
* @return Path backing this stream, or null if no path backing
*/
public Path getPath();
/**
* Gets or creates a temporary file containing stream data
* @param threshold NOTE(review): this parameter is not described in the original
*        documentation; presumably a size threshold in bytes that controls when the
*        stream is spooled to a file - confirm against the Tika API before relying on it
* @return File containing all stream data
* @throws IOException if temporary file cannot be created
*/
public File getFileThreshold(int threshold) throws IOException;
/**
* Gets the length of the stream if known
* @return Stream length in bytes, or -1 if unknown
*/
public long getLength();
/**
* Gets the current position in the stream
* @return Current position in bytes from start
*/
public long getPosition();
/**
* Sets mark supported flag
* @param markSupported Whether mark/reset should be supported
*/
public void setMarkSupported(boolean markSupported);
/**
* Checks if mark/reset is supported
* @return true if mark/reset operations are supported
*/
@Override
public boolean markSupported();
/**
* Marks current position in stream
* @param readLimit Maximum bytes that can be read before mark becomes invalid
*/
@Override
public void mark(int readLimit);
/**
* Resets stream to marked position
* @throws IOException if reset is not supported or mark is invalid
*/
@Override
public void reset() throws IOException;
/**
* Reads specified number of bytes from current position
* @param buffer Buffer to read into
* @param offset Offset in buffer to start writing
* @param length Maximum number of bytes to read
* @return Number of bytes actually read, or -1 if end of stream
* @throws IOException if read operation fails
*/
@Override
public int read(byte[] buffer, int offset, int length) throws IOException;
/**
* Skips specified number of bytes
* @param n Number of bytes to skip
* @return Number of bytes actually skipped
* @throws IOException if skip operation fails
*/
@Override
public long skip(long n) throws IOException;
/**
* Closes stream and releases resources
* @throws IOException if close operation fails
*/
@Override
public void close() throws IOException;
}

Input stream wrapper that limits the number of bytes that can be read from the underlying stream.
/**
* Input stream that limits reading to specified number of bytes
*/
public class BoundedInputStream extends ProxyInputStream {
/**
* Creates bounded input stream with maximum read limit
* @param stream Underlying input stream
* @param maxBytes Maximum number of bytes to read
*/
public BoundedInputStream(InputStream stream, long maxBytes);
/**
* Gets the maximum number of bytes that can be read
* @return Maximum byte limit for this stream
*/
public long getMaxBytes();
/**
* Gets the number of bytes read so far
* @return Number of bytes read from start
*/
public long getBytesRead();
/**
* Gets the number of remaining bytes that can be read
* @return Remaining bytes before limit is reached
*/
public long getRemainingBytes();
/**
* Checks if byte limit has been reached
* @return true if no more bytes can be read due to limit
*/
public boolean isLimitReached();
/**
* Reads single byte from stream
* @return Byte value (0-255) or -1 if end of stream or limit reached
* @throws IOException if read operation fails
*/
@Override
public int read() throws IOException;
/**
* Reads bytes into buffer
* @param buffer Buffer to read into
* @param offset Offset in buffer to start writing
* @param length Maximum number of bytes to read
* @return Number of bytes read, or -1 if end of stream or limit reached
* @throws IOException if read operation fails
*/
@Override
public int read(byte[] buffer, int offset, int length) throws IOException;
/**
* Skips bytes in stream up to remaining limit
* @param n Number of bytes to skip
* @return Number of bytes actually skipped
* @throws IOException if skip operation fails
*/
@Override
public long skip(long n) throws IOException;
}

Manager for temporary files and resources with automatic cleanup capabilities.
/**
* Manager for temporary files and resources with automatic cleanup
*/
public class TemporaryResources implements Closeable {
/**
* Creates new temporary resources manager
*/
public TemporaryResources();
/**
* Creates temporary file with optional prefix and suffix
* @param prefix Prefix for temporary file name
* @param suffix Suffix for temporary file name
* @return File object for created temporary file
* @throws IOException if temporary file cannot be created
*/
public File createTemporaryFile(String prefix, String suffix) throws IOException;
/**
* Creates temporary file with default naming
* @return File object for created temporary file
* @throws IOException if temporary file cannot be created
*/
public File createTemporaryFile() throws IOException;
/**
* Creates temporary directory
* @param prefix Prefix for temporary directory name
* @return File object for created temporary directory
* @throws IOException if temporary directory cannot be created
*/
public File createTemporaryDirectory(String prefix) throws IOException;
/**
* Registers existing file for cleanup when resources are closed
* @param file File to register for automatic cleanup
*/
public void addToCleanupQueue(File file);
/**
* Creates TikaInputStream with temporary file backing
* @param stream Input stream to wrap
* @return TikaInputStream with temporary file backing
* @throws IOException if temporary file cannot be created
*/
public TikaInputStream createTikaInputStream(InputStream stream) throws IOException;
/**
* Copies input stream to temporary file
* @param stream Input stream to copy
* @param prefix Prefix for temporary file name
* @param suffix Suffix for temporary file name
* @return File containing copied stream data
* @throws IOException if copy operation fails
*/
public File copyToTemporaryFile(InputStream stream, String prefix, String suffix) throws IOException;
/**
* Gets list of all temporary files created
* @return List of File objects representing temporary files
*/
public List<File> getTemporaryFiles();
/**
* Gets total size of all temporary files
* @return Total size in bytes of all temporary files
*/
public long getTotalSize();
/**
* Cleans up all temporary resources
* @throws IOException if cleanup fails
*/
@Override
public void close() throws IOException;
}

Collection of static utility methods for common I/O operations and stream handling.
/**
* Static utility methods for I/O operations and stream handling
*/
public class IOUtils {
/**
* Copies all bytes from input stream to output stream
* @param input Source input stream
* @param output Destination output stream
* @return Number of bytes copied
* @throws IOException if copy operation fails
*/
public static long copy(InputStream input, OutputStream output) throws IOException;
/**
* Copies input stream to output stream with buffer size control
* @param input Source input stream
* @param output Destination output stream
* @param bufferSize Size of copy buffer in bytes
* @return Number of bytes copied
* @throws IOException if copy operation fails
*/
public static long copy(InputStream input, OutputStream output, int bufferSize) throws IOException;
/**
* Copies input stream to writer using specified encoding
* @param input Source input stream
* @param writer Destination writer
* @param encoding Character encoding to use
* @throws IOException if copy operation fails
*/
public static void copy(InputStream input, Writer writer, String encoding) throws IOException;
/**
* Reads all bytes from input stream into byte array
* @param input Input stream to read
* @return Byte array containing all stream data
* @throws IOException if read operation fails
*/
public static byte[] toByteArray(InputStream input) throws IOException;
/**
* Reads all characters from reader into string
* @param reader Reader to read from
* @return String containing all reader data
* @throws IOException if read operation fails
*/
public static String toString(Reader reader) throws IOException;
/**
* Reads input stream into string using specified encoding
* @param input Input stream to read
* @param encoding Character encoding to use
* @return String containing stream data
* @throws IOException if read operation fails
*/
public static String toString(InputStream input, String encoding) throws IOException;
/**
* Quietly closes closeable object without throwing exceptions
* @param closeable Object to close (may be null)
*/
public static void closeQuietly(Closeable closeable);
/**
* Quietly closes multiple closeable objects
* @param closeables Array of objects to close
*/
public static void closeQuietly(Closeable... closeables);
/**
* Skips exactly the specified number of bytes from input stream
* @param input Input stream to skip from
* @param toSkip Number of bytes to skip
* @throws IOException if skip operation fails or reaches end of stream
*/
public static void skipFully(InputStream input, long toSkip) throws IOException;
/**
* Reads exactly the specified number of bytes from input stream
* @param input Input stream to read from
* @param buffer Buffer to read into
* @param offset Offset in buffer to start writing
* @param length Number of bytes to read
* @throws IOException if read fails or reaches end of stream prematurely
*/
public static void readFully(InputStream input, byte[] buffer, int offset, int length) throws IOException;
}

Utilities for filename manipulation, extension extraction, and path handling.
/**
* Utilities for filename and path manipulation
*/
public class FilenameUtils {
/**
* Extracts file extension from filename
* @param filename Filename to extract extension from
* @return File extension without dot, or empty string if no extension
*/
public static String getExtension(String filename);
/**
* Gets basename of file without extension
* @param filename Filename to get basename from
* @return Filename without extension
*/
public static String getBaseName(String filename);
/**
* Gets filename without path components
* @param path Full path string
* @return Filename component only
*/
public static String getName(String path);
/**
* Gets parent directory path
* @param path Full path string
* @return Parent directory path, or null if no parent
*/
public static String getParent(String path);
/**
* Normalizes path separators to system format
* @param path Path to normalize
* @return Path with normalized separators
*/
public static String normalize(String path);
/**
* Removes extension from filename
* @param filename Filename to remove extension from
* @return Filename without extension
*/
public static String removeExtension(String filename);
/**
* Checks if path is absolute
* @param path Path to check
* @return true if path is absolute
*/
public static boolean isAbsolute(String path);
/**
* Concatenates paths with proper separators
* @param basePath Base path
* @param relativePath Relative path to append
* @return Combined path string
*/
public static String concat(String basePath, String relativePath);
/**
* Splits filename into name and extension parts
* @param filename Filename to split
* @return Array containing [basename, extension]
*/
public static String[] splitExtension(String filename);
}

Utilities for converting between little-endian and big-endian byte representations.
/**
* Utilities for endian conversion and byte order manipulation
*/
public class EndianUtils {
/**
* Reads little-endian short from byte array
* @param data Byte array containing data
* @param offset Offset to start reading from
* @return Short value in host byte order
*/
public static short readSwappedShort(byte[] data, int offset);
/**
* Reads little-endian int from byte array
* @param data Byte array containing data
* @param offset Offset to start reading from
* @return Int value in host byte order
*/
public static int readSwappedInteger(byte[] data, int offset);
/**
* Reads little-endian long from byte array
* @param data Byte array containing data
* @param offset Offset to start reading from
* @return Long value in host byte order
*/
public static long readSwappedLong(byte[] data, int offset);
/**
* Reads little-endian float from byte array
* @param data Byte array containing data
* @param offset Offset to start reading from
* @return Float value in host byte order
*/
public static float readSwappedFloat(byte[] data, int offset);
/**
* Reads little-endian double from byte array
* @param data Byte array containing data
* @param offset Offset to start reading from
* @return Double value in host byte order
*/
public static double readSwappedDouble(byte[] data, int offset);
/**
* Writes short to byte array in little-endian format
* @param data Byte array to write to
* @param offset Offset to start writing at
* @param value Short value to write
*/
public static void writeSwappedShort(byte[] data, int offset, short value);
/**
* Writes int to byte array in little-endian format
* @param data Byte array to write to
* @param offset Offset to start writing at
* @param value Int value to write
*/
public static void writeSwappedInteger(byte[] data, int offset, int value);
/**
* Writes long to byte array in little-endian format
* @param data Byte array to write to
* @param offset Offset to start writing at
* @param value Long value to write
*/
public static void writeSwappedLong(byte[] data, int offset, long value);
/**
* Swaps byte order of short value
* @param value Short value to swap
* @return Short with swapped byte order
*/
public static short swapShort(short value);
/**
* Swaps byte order of int value
* @param value Int value to swap
* @return Int with swapped byte order
*/
public static int swapInteger(int value);
/**
* Swaps byte order of long value
* @param value Long value to swap
* @return Long with swapped byte order
*/
public static long swapLong(long value);
}

// Create TikaInputStream from various sources
try (TikaInputStream tis = TikaInputStream.get(new FileInputStream("document.pdf"))) {
// Check if backed by file
if (tis.hasFile()) {
File file = tis.getFile();
System.out.println("File size: " + file.length());
}
// Use mark/reset capabilities
if (tis.markSupported()) {
tis.mark(1024);
byte[] header = new byte[10];
tis.read(header);
tis.reset(); // Return to marked position
}
// Get current position and length
System.out.println("Position: " + tis.getPosition());
System.out.println("Length: " + tis.getLength());
}
// Create from URL with temporary file backing
try (TikaInputStream tis = TikaInputStream.get(new URL("http://example.com/doc.pdf"))) {
// Stream content is downloaded to temporary file
File tempFile = tis.getFileThreshold(0);
System.out.println("Downloaded to: " + tempFile.getAbsolutePath());
}

// Use TemporaryResources for automatic cleanup
try (TemporaryResources tmp = new TemporaryResources()) {
// Create temporary files
File tempFile1 = tmp.createTemporaryFile("tika", ".tmp");
File tempDir = tmp.createTemporaryDirectory("tika-work");
// Process documents with temporary storage
try (InputStream input = new FileInputStream("large-document.pdf")) {
File workFile = tmp.copyToTemporaryFile(input, "work", ".pdf");
// Use workFile for processing
processDocument(workFile);
System.out.println("Total temp space: " + tmp.getTotalSize() + " bytes");
}
// All temporary files automatically cleaned up when closed
}

// Copy streams efficiently
try (InputStream input = new FileInputStream("source.txt");
OutputStream output = new FileOutputStream("destination.txt")) {
long bytesCopied = IOUtils.copy(input, output);
System.out.println("Copied " + bytesCopied + " bytes");
}
// Convert stream to string with encoding
try (InputStream input = new FileInputStream("text-file.txt")) {
String content = IOUtils.toString(input, "UTF-8");
System.out.println("Content: " + content);
}
// Read entire stream into byte array
try (InputStream input = new FileInputStream("binary-file.dat")) {
byte[] data = IOUtils.toByteArray(input);
System.out.println("Read " + data.length + " bytes");
}

// Limit stream reading to prevent memory issues
try (InputStream input = new FileInputStream("huge-file.dat");
BoundedInputStream bounded = new BoundedInputStream(input, 1024 * 1024)) { // 1MB limit
byte[] buffer = new byte[8192];
int totalRead = 0;
while (true) {
int read = bounded.read(buffer);
if (read == -1 || bounded.isLimitReached()) {
break;
}
totalRead += read;
// Process buffer data
processData(buffer, 0, read);
}
System.out.println("Read " + totalRead + " bytes (limit: " + bounded.getMaxBytes() + ")");
}

// Extract filename components
String filename = "document.backup.pdf";
String extension = FilenameUtils.getExtension(filename); // "pdf"
String basename = FilenameUtils.getBaseName(filename); // "document.backup"
String nameOnly = FilenameUtils.removeExtension(filename); // "document.backup"
// Path manipulation
String fullPath = "/home/user/documents/file.txt";
String name = FilenameUtils.getName(fullPath); // "file.txt"
String parent = FilenameUtils.getParent(fullPath); // "/home/user/documents"
// Split extension
String[] parts = FilenameUtils.splitExtension(filename); // ["document.backup", "pdf"]
// Path concatenation
String combined = FilenameUtils.concat("/home/user", "documents/file.txt");

// Read little-endian data from byte array
byte[] data = new byte[] {0x12, 0x34, 0x56, 0x78};
int littleEndianInt = EndianUtils.readSwappedInteger(data, 0);
System.out.println("Value: " + Integer.toHexString(littleEndianInt));
// Write values in little-endian format
byte[] output = new byte[8];
EndianUtils.writeSwappedInteger(output, 0, 0x12345678);
EndianUtils.writeSwappedInteger(output, 4, 0xABCDEF00);
// Swap byte order
short hostValue = 0x1234;
short swapped = EndianUtils.swapShort(hostValue);
System.out.println("Original: " + Integer.toHexString(hostValue));
System.out.println("Swapped: " + Integer.toHexString(swapped));

public class DocumentReader {
/**
* Reads at most 50MB from the given stream and decodes it as UTF-8 text.
* The bytes are decoded in a single pass rather than chunk by chunk, so a
* multi-byte UTF-8 sequence that straddles a read-buffer boundary is not corrupted.
* @param input Stream to read document content from
* @return Decoded text content, cut off at the 50MB limit if the stream is longer
* @throws IOException if the stream cannot be wrapped or read
*/
public String readDocument(InputStream input) throws IOException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        // Create TikaInputStream with temporary backing
        TikaInputStream tis = tmp.createTikaInputStream(input);
        // Limit reading to reasonable size
        BoundedInputStream bounded = new BoundedInputStream(tis, 50 * 1024 * 1024); // 50MB
        // Decode the whole bounded stream at once; the bounded stream reports
        // end of stream when the limit is reached, which ends the read.
        return IOUtils.toString(bounded, "UTF-8");
    } finally {
        IOUtils.closeQuietly(tmp); // Cleanup all temporary resources
    }
}
}

Install with Tessl CLI:
npx tessl i tessl/maven-org-apache-tika--tika-core