CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/maven-org-apache-tika--tika-core

Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.

Pending
Overview
Eval results
Files

docs/io-utilities.md

I/O Utilities

I/O utilities that provide enhanced input streams, temporary resource management, stream copy and read helpers, filename manipulation, and endian conversion for robust document processing.

Capabilities

Enhanced Input Streams

TikaInputStream

Enhanced input stream wrapper providing file backing, mark/reset capabilities, and metadata extraction support.

/**
 * Enhanced input stream with file backing and metadata extraction capabilities
 */
public class TikaInputStream extends ProxyInputStream {
    /**
     * Wraps existing InputStream with TikaInputStream capabilities
     * @param stream InputStream to wrap
     * @return TikaInputStream wrapping the input stream
     */
    public static TikaInputStream get(InputStream stream);
    
    /**
     * Creates TikaInputStream from File
     * @param file File to create stream from
     * @return TikaInputStream backed by the file
     * @throws FileNotFoundException if file doesn't exist
     */
    public static TikaInputStream get(File file) throws FileNotFoundException;
    
    /**
     * Creates TikaInputStream from Path
     * @param path Path to create stream from
     * @return TikaInputStream backed by the path
     * @throws IOException if path cannot be accessed
     */
    public static TikaInputStream get(Path path) throws IOException;
    
    /**
     * Creates TikaInputStream from URL
     * @param url URL to create stream from
     * @return TikaInputStream backed by URL content
     * @throws IOException if URL cannot be accessed
     */
    public static TikaInputStream get(URL url) throws IOException;
    
    /**
     * Creates TikaInputStream from byte array
     * @param data Byte array containing data
     * @return TikaInputStream backed by byte array
     */
    public static TikaInputStream get(byte[] data);
    
    /**
     * Checks if stream is backed by a file
     * @return true if stream has file backing
     */
    public boolean hasFile();
    
    /**
     * Gets the backing file if available
     * @return File backing this stream, or null if no file backing
     */
    public File getFile();
    
    /**
     * Gets the file path if available
     * @return Path backing this stream, or null if no path backing
     */
    public Path getPath();
    
    /**
     * Gets or creates a temporary file containing stream data
     * @param threshold NOTE(review): not described on this page; presumably a size
     *                  threshold in bytes that influences when the stream is spooled to
     *                  a temporary file. Confirm against the Tika Javadoc before relying on it
     * @return File containing all stream data
     * @throws IOException if temporary file cannot be created
     */
    public File getFileThreshold(int threshold) throws IOException;
    
    /**
     * Gets the length of the stream if known
     * @return Stream length in bytes, or -1 if unknown
     */
    public long getLength();
    
    /**
     * Gets the current position in the stream
     * @return Current position in bytes from start
     */
    public long getPosition();
    
    /**
     * Sets mark supported flag
     * @param markSupported Whether mark/reset should be supported
     */
    public void setMarkSupported(boolean markSupported);
    
    /**
     * Checks if mark/reset is supported
     * @return true if mark/reset operations are supported
     */
    @Override
    public boolean markSupported();
    
    /**
     * Marks current position in stream
     * @param readLimit Maximum bytes that can be read before mark becomes invalid
     */
    @Override
    public void mark(int readLimit);
    
    /**
     * Resets stream to marked position
     * @throws IOException if reset is not supported or mark is invalid
     */
    @Override
    public void reset() throws IOException;
    
    /**
     * Reads specified number of bytes from current position
     * @param buffer Buffer to read into
     * @param offset Offset in buffer to start writing
     * @param length Maximum number of bytes to read
     * @return Number of bytes actually read, or -1 if end of stream
     * @throws IOException if read operation fails
     */
    @Override
    public int read(byte[] buffer, int offset, int length) throws IOException;
    
    /**
     * Skips specified number of bytes
     * @param n Number of bytes to skip
     * @return Number of bytes actually skipped
     * @throws IOException if skip operation fails
     */
    @Override
    public long skip(long n) throws IOException;
    
    /**
     * Closes stream and releases resources
     * @throws IOException if close operation fails
     */
    @Override
    public void close() throws IOException;
}

BoundedInputStream

Input stream wrapper that limits the number of bytes that can be read from the underlying stream.

/**
 * Input stream that limits reading to specified number of bytes
 */
public class BoundedInputStream extends ProxyInputStream {
    /**
     * Creates bounded input stream with maximum read limit
     * @param stream Underlying input stream
     * @param maxBytes Maximum number of bytes to read
     */
    public BoundedInputStream(InputStream stream, long maxBytes);
    
    /**
     * Gets the maximum number of bytes that can be read
     * @return Maximum byte limit for this stream
     */
    public long getMaxBytes();
    
    /**
     * Gets the number of bytes read so far
     * @return Number of bytes read from start
     */
    public long getBytesRead();
    
    /**
     * Gets the number of remaining bytes that can be read
     * @return Remaining bytes before limit is reached
     */
    public long getRemainingBytes();
    
    /**
     * Checks if byte limit has been reached
     * @return true if no more bytes can be read due to limit
     */
    public boolean isLimitReached();
    
    /**
     * Reads single byte from stream
     * @return Byte value (0-255) or -1 if end of stream or limit reached
     * @throws IOException if read operation fails
     */
    @Override
    public int read() throws IOException;
    
    /**
     * Reads bytes into buffer
     * @param buffer Buffer to read into
     * @param offset Offset in buffer to start writing
     * @param length Maximum number of bytes to read
     * @return Number of bytes read, or -1 if end of stream or limit reached
     * @throws IOException if read operation fails
     */
    @Override
    public int read(byte[] buffer, int offset, int length) throws IOException;
    
    /**
     * Skips bytes in stream up to remaining limit
     * @param n Number of bytes to skip
     * @return Number of bytes actually skipped
     * @throws IOException if skip operation fails
     */
    @Override
    public long skip(long n) throws IOException;
}

Temporary Resource Management

TemporaryResources

Manager for temporary files and resources with automatic cleanup capabilities.

/**
 * Manager for temporary files and resources with automatic cleanup
 */
public class TemporaryResources implements Closeable {
    /**
     * Creates new temporary resources manager
     */
    public TemporaryResources();
    
    /**
     * Creates temporary file with optional prefix and suffix
     * @param prefix Prefix for temporary file name
     * @param suffix Suffix for temporary file name
     * @return File object for created temporary file
     * @throws IOException if temporary file cannot be created
     */
    public File createTemporaryFile(String prefix, String suffix) throws IOException;
    
    /**
     * Creates temporary file with default naming
     * @return File object for created temporary file
     * @throws IOException if temporary file cannot be created
     */
    public File createTemporaryFile() throws IOException;
    
    /**
     * Creates temporary directory
     * @param prefix Prefix for temporary directory name
     * @return File object for created temporary directory
     * @throws IOException if temporary directory cannot be created
     */
    public File createTemporaryDirectory(String prefix) throws IOException;
    
    /**
     * Registers existing file for cleanup when resources are closed
     * @param file File to register for automatic cleanup
     */
    public void addToCleanupQueue(File file);
    
    /**
     * Creates TikaInputStream with temporary file backing
     * @param stream Input stream to wrap
     * @return TikaInputStream with temporary file backing
     * @throws IOException if temporary file cannot be created
     */
    public TikaInputStream createTikaInputStream(InputStream stream) throws IOException;
    
    /**
     * Copies input stream to temporary file
     * @param stream Input stream to copy
     * @param prefix Prefix for temporary file name
     * @param suffix Suffix for temporary file name
     * @return File containing copied stream data
     * @throws IOException if copy operation fails
     */
    public File copyToTemporaryFile(InputStream stream, String prefix, String suffix) throws IOException;
    
    /**
     * Gets list of all temporary files created
     * @return List of File objects representing temporary files
     */
    public List<File> getTemporaryFiles();
    
    /**
     * Gets total size of all temporary files
     * @return Total size in bytes of all temporary files
     */
    public long getTotalSize();
    
    /**
     * Cleans up all temporary resources
     * @throws IOException if cleanup fails
     */
    @Override
    public void close() throws IOException;
}

I/O Utility Methods

IOUtils

Collection of static utility methods for common I/O operations and stream handling.

/**
 * Static utility methods for I/O operations and stream handling
 */
public class IOUtils {
    /**
     * Copies all bytes from input stream to output stream
     * @param input Source input stream
     * @param output Destination output stream
     * @return Number of bytes copied
     * @throws IOException if copy operation fails
     */
    public static long copy(InputStream input, OutputStream output) throws IOException;
    
    /**
     * Copies input stream to output stream with buffer size control
     * @param input Source input stream
     * @param output Destination output stream  
     * @param bufferSize Size of copy buffer in bytes
     * @return Number of bytes copied
     * @throws IOException if copy operation fails
     */
    public static long copy(InputStream input, OutputStream output, int bufferSize) throws IOException;
    
    /**
     * Copies input stream to writer using specified encoding
     * @param input Source input stream
     * @param writer Destination writer
     * @param encoding Character encoding to use
     * @throws IOException if copy operation fails
     */
    public static void copy(InputStream input, Writer writer, String encoding) throws IOException;
    
    /**
     * Reads all bytes from input stream into byte array
     * @param input Input stream to read
     * @return Byte array containing all stream data
     * @throws IOException if read operation fails
     */
    public static byte[] toByteArray(InputStream input) throws IOException;
    
    /**
     * Reads all characters from reader into string
     * @param reader Reader to read from
     * @return String containing all reader data
     * @throws IOException if read operation fails
     */
    public static String toString(Reader reader) throws IOException;
    
    /**
     * Reads input stream into string using specified encoding
     * @param input Input stream to read
     * @param encoding Character encoding to use
     * @return String containing stream data
     * @throws IOException if read operation fails
     */
    public static String toString(InputStream input, String encoding) throws IOException;
    
    /**
     * Quietly closes closeable object without throwing exceptions
     * @param closeable Object to close (may be null)
     */
    public static void closeQuietly(Closeable closeable);
    
    /**
     * Quietly closes multiple closeable objects
     * @param closeables Array of objects to close
     */
    public static void closeQuietly(Closeable... closeables);
    
    /**
     * Skips exactly the specified number of bytes from input stream
     * @param input Input stream to skip from
     * @param toSkip Number of bytes to skip
     * @throws IOException if skip operation fails or reaches end of stream
     */
    public static void skipFully(InputStream input, long toSkip) throws IOException;
    
    /**
     * Reads exactly the specified number of bytes from input stream
     * @param input Input stream to read from
     * @param buffer Buffer to read into
     * @param offset Offset in buffer to start writing
     * @param length Number of bytes to read
     * @throws IOException if read fails or reaches end of stream prematurely
     */
    public static void readFully(InputStream input, byte[] buffer, int offset, int length) throws IOException;
}

Filename Utilities

FilenameUtils

Utilities for filename manipulation, extension extraction, and path handling.

/**
 * Utilities for filename and path manipulation
 */
public class FilenameUtils {
    /**
     * Extracts file extension from filename
     * @param filename Filename to extract extension from
     * @return File extension without dot, or empty string if no extension
     */
    public static String getExtension(String filename);
    
    /**
     * Gets basename of file without extension
     * @param filename Filename to get basename from
     * @return Filename without extension
     */
    public static String getBaseName(String filename);
    
    /**
     * Gets filename without path components
     * @param path Full path string
     * @return Filename component only
     */
    public static String getName(String path);
    
    /**
     * Gets parent directory path
     * @param path Full path string
     * @return Parent directory path, or null if no parent
     */
    public static String getParent(String path);
    
    /**
     * Normalizes path separators to system format
     * @param path Path to normalize
     * @return Path with normalized separators
     */
    public static String normalize(String path);
    
    /**
     * Removes extension from filename
     * @param filename Filename to remove extension from
     * @return Filename without extension
     */
    public static String removeExtension(String filename);
    
    /**
     * Checks if path is absolute
     * @param path Path to check
     * @return true if path is absolute
     */
    public static boolean isAbsolute(String path);
    
    /**
     * Concatenates paths with proper separators
     * @param basePath Base path
     * @param relativePath Relative path to append
     * @return Combined path string
     */
    public static String concat(String basePath, String relativePath);
    
    /**
     * Splits filename into name and extension parts
     * @param filename Filename to split
     * @return Array containing [basename, extension]
     */
    public static String[] splitExtension(String filename);
}

Endian Conversion Utilities

EndianUtils

Utilities for converting between little-endian and big-endian byte representations.

/**
 * Utilities for endian conversion and byte order manipulation
 */
public class EndianUtils {
    /**
     * Reads little-endian short from byte array
     * @param data Byte array containing data
     * @param offset Offset to start reading from
     * @return Short value in host byte order
     */
    public static short readSwappedShort(byte[] data, int offset);
    
    /**
     * Reads little-endian int from byte array
     * @param data Byte array containing data
     * @param offset Offset to start reading from
     * @return Int value in host byte order
     */
    public static int readSwappedInteger(byte[] data, int offset);
    
    /**
     * Reads little-endian long from byte array
     * @param data Byte array containing data
     * @param offset Offset to start reading from
     * @return Long value in host byte order
     */
    public static long readSwappedLong(byte[] data, int offset);
    
    /**
     * Reads little-endian float from byte array
     * @param data Byte array containing data
     * @param offset Offset to start reading from
     * @return Float value in host byte order
     */
    public static float readSwappedFloat(byte[] data, int offset);
    
    /**
     * Reads little-endian double from byte array
     * @param data Byte array containing data
     * @param offset Offset to start reading from
     * @return Double value in host byte order
     */
    public static double readSwappedDouble(byte[] data, int offset);
    
    /**
     * Writes short to byte array in little-endian format
     * @param data Byte array to write to
     * @param offset Offset to start writing at
     * @param value Short value to write
     */
    public static void writeSwappedShort(byte[] data, int offset, short value);
    
    /**
     * Writes int to byte array in little-endian format
     * @param data Byte array to write to
     * @param offset Offset to start writing at
     * @param value Int value to write
     */
    public static void writeSwappedInteger(byte[] data, int offset, int value);
    
    /**
     * Writes long to byte array in little-endian format
     * @param data Byte array to write to
     * @param offset Offset to start writing at
     * @param value Long value to write
     */
    public static void writeSwappedLong(byte[] data, int offset, long value);
    
    /**
     * Swaps byte order of short value
     * @param value Short value to swap
     * @return Short with swapped byte order
     */
    public static short swapShort(short value);
    
    /**
     * Swaps byte order of int value
     * @param value Int value to swap
     * @return Int with swapped byte order
     */
    public static int swapInteger(int value);
    
    /**
     * Swaps byte order of long value
     * @param value Long value to swap
     * @return Long with swapped byte order
     */
    public static long swapLong(long value);
}

Usage Examples

Working with TikaInputStream

// Create TikaInputStream from various sources
try (TikaInputStream tis = TikaInputStream.get(new FileInputStream("document.pdf"))) {
    // Check if backed by file
    if (tis.hasFile()) {
        File file = tis.getFile();
        System.out.println("File size: " + file.length());
    }
    
    // Use mark/reset capabilities
    if (tis.markSupported()) {
        tis.mark(1024);
        byte[] header = new byte[10];
        tis.read(header);
        tis.reset(); // Return to marked position
    }
    
    // Get current position and length
    System.out.println("Position: " + tis.getPosition());
    System.out.println("Length: " + tis.getLength());
}

// Create from URL with temporary file backing
try (TikaInputStream tis = TikaInputStream.get(new URL("http://example.com/doc.pdf"))) {
    // Stream content is downloaded to temporary file
    File tempFile = tis.getFileThreshold(0);
    System.out.println("Downloaded to: " + tempFile.getAbsolutePath());
}

Temporary Resource Management

// Let TemporaryResources own every scratch file created in this scope
try (TemporaryResources resources = new TemporaryResources()) {
    // A scratch file and a scratch directory
    File scratchFile = resources.createTemporaryFile("tika", ".tmp");
    File scratchDir = resources.createTemporaryDirectory("tika-work");

    // Spool the document into temporary storage, then work on the copy
    try (InputStream source = new FileInputStream("large-document.pdf")) {
        File spooled = resources.copyToTemporaryFile(source, "work", ".pdf");

        // Hand the temporary copy to the processing step
        processDocument(spooled);

        long tempBytes = resources.getTotalSize();
        System.out.println("Total temp space: " + tempBytes + " bytes");
    }

    // Closing the manager cleans up all temporary resources
}

Stream Copying and Conversion

// Stream-to-stream copy; the return value is the number of bytes transferred
try (InputStream source = new FileInputStream("source.txt");
     OutputStream sink = new FileOutputStream("destination.txt")) {

    System.out.println("Copied " + IOUtils.copy(source, sink) + " bytes");
}

// Decode a whole stream into a String using an explicit encoding
try (InputStream textStream = new FileInputStream("text-file.txt")) {
    String text = IOUtils.toString(textStream, "UTF-8");
    System.out.println("Content: " + text);
}

// Slurp a whole stream into memory as raw bytes
try (InputStream binaryStream = new FileInputStream("binary-file.dat")) {
    byte[] bytes = IOUtils.toByteArray(binaryStream);
    int byteCount = bytes.length;
    System.out.println("Read " + byteCount + " bytes");
}

Bounded Stream Processing

// Limit stream reading to prevent memory issues
try (InputStream input = new FileInputStream("huge-file.dat");
     BoundedInputStream bounded = new BoundedInputStream(input, 1024 * 1024)) { // 1MB limit

    byte[] buffer = new byte[8192];
    // long, to match the long byte limit the stream is configured with
    long totalRead = 0;

    // read() returns -1 at end of stream or once the limit is reached, so it is the
    // only exit condition needed. Checking isLimitReached() before processing would
    // discard the chunk whose read happened to hit the limit.
    int read;
    while ((read = bounded.read(buffer)) != -1) {
        totalRead += read;

        // Process buffer data
        processData(buffer, 0, read);
    }

    System.out.println("Read " + totalRead + " bytes (limit: " + bounded.getMaxBytes() + ")");
}

Filename and Path Utilities

// Break a filename into its parts
String sample = "document.backup.pdf";
String ext = FilenameUtils.getExtension(sample);            // "pdf"
String base = FilenameUtils.getBaseName(sample);            // "document.backup"
String stripped = FilenameUtils.removeExtension(sample);    // "document.backup"

// Pull the last component and the parent directory out of a full path
String absolutePath = "/home/user/documents/file.txt";
String leaf = FilenameUtils.getName(absolutePath);          // "file.txt"
String directory = FilenameUtils.getParent(absolutePath);   // "/home/user/documents"

// Basename and extension in one call
String[] nameAndExt = FilenameUtils.splitExtension(sample); // ["document.backup", "pdf"]

// Join a base path with a relative path
String joined = FilenameUtils.concat("/home/user", "documents/file.txt");

Endian Conversion

// Read little-endian data from byte array
byte[] data = new byte[] {0x12, 0x34, 0x56, 0x78};
int littleEndianInt = EndianUtils.readSwappedInteger(data, 0);
// Little-endian means the first byte is the least significant: expected "Value: 78563412"
System.out.println("Value: " + Integer.toHexString(littleEndianInt));

// Write values in little-endian format
byte[] output = new byte[8];
EndianUtils.writeSwappedInteger(output, 0, 0x12345678);
EndianUtils.writeSwappedInteger(output, 4, 0xABCDEF00);

// Swap byte order
short hostValue = 0x1234;
short swapped = EndianUtils.swapShort(hostValue);
// Mask to 16 bits before formatting: a short is sign-extended when widened to int,
// so a value with the high bit set would otherwise print with a leading "ffff"
System.out.println("Original: " + Integer.toHexString(hostValue & 0xFFFF)); // expected "1234"
System.out.println("Swapped: " + Integer.toHexString(swapped & 0xFFFF));    // expected "3412"

Robust Stream Handling

/**
 * Example reader that pulls a document's content into a String while bounding how
 * much data is read and cleaning up temporary resources afterwards.
 */
public class DocumentReader {

    /**
     * Reads up to 50MB from the given stream and decodes the bytes as UTF-8.
     *
     * <p>If the content is longer than the limit it is truncated; a multi-byte
     * character cut off exactly at the limit may decode as a replacement character.
     *
     * @param input stream to read the document from
     * @return the decoded text
     * @throws IOException if reading from the stream fails
     */
    public String readDocument(InputStream input) throws IOException {
        TemporaryResources tmp = new TemporaryResources();

        try {
            // Create TikaInputStream with temporary backing
            TikaInputStream tis = tmp.createTikaInputStream(input);

            // Limit reading to reasonable size
            BoundedInputStream bounded = new BoundedInputStream(tis, 50 * 1024 * 1024); // 50MB

            // Collect raw bytes and decode once at the end. Decoding each 8KB chunk on
            // its own corrupts any multi-byte UTF-8 character that straddles two chunks.
            java.io.ByteArrayOutputStream content = new java.io.ByteArrayOutputStream();
            byte[] buffer = new byte[8192];

            // read() returns -1 at end of stream or once the byte limit is reached
            int read;
            while ((read = bounded.read(buffer)) != -1) {
                content.write(buffer, 0, read);
            }

            return content.toString("UTF-8");

        } finally {
            IOUtils.closeQuietly(tmp); // Cleanup all temporary resources
        }
    }
}

Install with Tessl CLI

npx tessl i tessl/maven-org-apache-tika--tika-core

docs

configuration.md

content-processing.md

detection.md

embedded-extraction.md

embedding.md

exceptions.md

index.md

io-utilities.md

language.md

metadata.md

mime-types.md

parsing.md

pipes.md

process-forking.md

rendering.md

tile.json