CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/maven-org-apache-tika--tika-core

Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.

Pending
Overview
Eval results
Files

docs/language.md

Language Processing

This module covers language processing: automatic language detection, text profiling, and translation services for multilingual document processing and content analysis.

Capabilities

Language Detection

LanguageIdentifier

Classic language identification using n-gram analysis and statistical models for detecting document language.

/**
 * Statistical language identifier using n-gram analysis.
 *
 * <p>As listed here, every analysis method receives the text as an argument:
 * {@code identify} returns only a language code, {@code identifyWithConfidence}
 * returns a {@code LanguageResult}, and {@code isReasonablyCertain} /
 * {@code getConfidence} report how trustworthy the detection is. The remaining
 * methods describe the set of languages that can be detected.
 *
 * <p>NOTE(review): this is a signature listing without method bodies; confirm
 * each member against the Apache Tika Javadoc for the version in use. Behavior
 * for {@code null} or empty content is not specified in this listing.
 */
public class LanguageIdentifier {
    /**
     * Creates a LanguageIdentifier with the default language profiles.
     */
    public LanguageIdentifier();
    
    /**
     * Creates a LanguageIdentifier that uses a custom profile directory.
     * @param profileDirectory Directory containing language profile files
     */
    public LanguageIdentifier(String profileDirectory);
    
    /**
     * Identifies the language of the given text.
     * @param content Text content to analyze
     * @return Language code (ISO 639-1) of the detected language
     */
    public String identify(String content);
    
    /**
     * Identifies the language of the given text and reports the confidence.
     * @param content Text content to analyze
     * @return LanguageResult containing language and confidence
     */
    public LanguageResult identifyWithConfidence(String content);
    
    /**
     * Checks whether the language of the given text can be reliably identified.
     * @param content Text content to check
     * @return true if language detection confidence is high enough
     */
    public boolean isReasonablyCertain(String content);
    
    /**
     * Gets the confidence score for the language detected in the given text.
     * Note the {@code double} return type; {@code LanguageResult.getConfidence()}
     * in this document returns {@code float}.
     * @param content Text content to analyze
     * @return Confidence score between 0.0 and 1.0
     */
    public double getConfidence(String content);
    
    /**
     * Gets all supported language codes.
     * @return Set of supported ISO 639-1 language codes
     */
    public Set<String> getSupportedLanguages();
    
    /**
     * Checks whether a specific language is supported.
     * @param language ISO 639-1 language code to check
     * @return true if language detection is supported
     */
    public boolean isLanguageSupported(String language);
}

ProfilingHandler

Content handler for building language profiles during document parsing for improved detection accuracy.

/**
 * Content handler that builds language profiles for detection.
 *
 * <p>Because it extends {@code DefaultHandler}, an instance is handed to the
 * parser (directly or through a tee handler) and queried after parsing has
 * finished, as the usage examples in this document do.
 *
 * <p>NOTE(review): this is a signature listing without method bodies; confirm
 * each member against the Apache Tika Javadoc for the version in use. What
 * {@code getLanguage()} returns before any content has been seen is not
 * specified here.
 */
public class ProfilingHandler extends DefaultHandler {
    /**
     * Creates a ProfilingHandler for language profiling.
     */
    public ProfilingHandler();
    
    /**
     * Creates a ProfilingHandler that uses a custom LanguageIdentifier.
     * @param identifier LanguageIdentifier to use for profiling
     */
    public ProfilingHandler(LanguageIdentifier identifier);
    
    /**
     * Gets the detected language after profiling.
     * @return ISO 639-1 language code of detected language
     */
    public String getLanguage();
    
    /**
     * Gets the confidence score of the language detection.
     * @return Confidence score between 0.0 and 1.0
     */
    public double getConfidence();
    
    /**
     * Checks whether enough content has been processed for reliable detection.
     * Callers can use this to decide whether to trust {@code getLanguage()}.
     * @return true if sufficient content analyzed
     */
    public boolean hasEnoughData();
    
    /**
     * Gets the amount of text content processed.
     * @return Number of characters analyzed
     */
    public int getContentLength();
}

Modern Language Detection

LanguageDetector Interface

Modern interface for pluggable language detection implementations with support for multiple algorithms.

/**
 * Interface for modern language detection implementations.
 *
 * <p>The interface declares six abstract methods, so it is not a functional
 * interface: an implementation must be a named or anonymous class, not a
 * lambda expression.
 *
 * <p>NOTE(review): this is a signature listing; confirm each member against
 * the Apache Tika Javadoc for the version in use.
 */
public interface LanguageDetector {
    /**
     * Detects the language of the input text.
     * @param text Text content to analyze
     * @return LanguageResult containing detected language and confidence
     * @throws IOException if detection process fails
     */
    LanguageResult detect(String text) throws IOException;
    
    /**
     * Detects multiple possible languages with probabilities.
     * @param text Text content to analyze
     * @return List of LanguageResult objects sorted by confidence
     * @throws IOException if detection process fails
     */
    List<LanguageResult> detectAll(String text) throws IOException;
    
    /**
     * Checks whether the detector supports a specific language.
     * @param language ISO 639-1 language code
     * @return true if language is supported for detection
     */
    boolean isSupported(String language);
    
    /**
     * Gets all supported languages.
     * @return Set of supported ISO 639-1 language codes
     */
    Set<String> getSupportedLanguages();
    
    /**
     * Loads the detector from configuration.
     * NOTE(review): the accepted configuration keys are not listed in this document.
     * @param config Configuration parameters for detector
     * @throws IOException if loading fails
     */
    void loadModels(Map<String, Object> config) throws IOException;
    
    /**
     * Checks whether the detector is ready for use.
     * @return true if detector is loaded and ready
     */
    boolean isAvailable();
}

LanguageResult

Result object containing detected language information and confidence metrics.

/**
 * Result of language detection containing language and confidence information.
 *
 * <p>Only constructors and getters are listed, so instances appear to be
 * read-only once created.
 *
 * <p>NOTE(review): {@code compareTo} is declared below, but the class header
 * shown here does not declare {@code implements Comparable<LanguageResult>};
 * confirm against the Apache Tika Javadoc whether results can be sorted with
 * their natural ordering. This is a signature listing without method bodies.
 */
public class LanguageResult {
    /**
     * Creates a LanguageResult with language and confidence.
     * NOTE(review): the raw score reported for a result built this way is not
     * specified in this listing.
     * @param language ISO 639-1 language code
     * @param confidence Confidence score (0.0 to 1.0)
     */
    public LanguageResult(String language, float confidence);
    
    /**
     * Creates a LanguageResult with additional properties.
     * @param language ISO 639-1 language code
     * @param confidence Confidence score
     * @param rawScore Raw detection score from algorithm
     */
    public LanguageResult(String language, float confidence, double rawScore);
    
    /**
     * Gets the detected language code.
     * @return ISO 639-1 language code (e.g., "en", "fr", "de")
     */
    public String getLanguage();
    
    /**
     * Gets the confidence score of the detection.
     * @return Confidence between 0.0 (lowest) and 1.0 (highest)
     */
    public float getConfidence();
    
    /**
     * Gets the raw algorithm score.
     * @return Raw score from detection algorithm
     */
    public double getRawScore();
    
    /**
     * Checks whether the detection confidence is above a threshold.
     * @param threshold Minimum confidence threshold
     * @return true if confidence exceeds threshold
     */
    public boolean isReliable(float threshold);
    
    /**
     * Gets the human-readable language name.
     * @return Full language name in English
     */
    public String getLanguageName();
    
    /**
     * Compares results by confidence (descending order), so the most
     * confident result sorts first.
     * @param other LanguageResult to compare with
     * @return Comparison result for sorting
     */
    public int compareTo(LanguageResult other);
}

LanguageWriter

Writer wrapper that performs language detection on written content for streaming analysis.

/**
 * Writer that performs language detection on content as it's written.
 *
 * <p>Per the member descriptions below, characters are passed through to an
 * underlying {@code Writer} and also fed to a {@code LanguageDetector}; the
 * detection state can be queried at any time between writes.
 *
 * <p>NOTE(review): this is a signature listing without method bodies; confirm
 * each member against the Apache Tika Javadoc for the version in use.
 */
public class LanguageWriter extends Writer {
    /**
     * Creates a LanguageWriter with an underlying writer and detector.
     * @param writer Underlying Writer to delegate to
     * @param detector LanguageDetector for analysis
     */
    public LanguageWriter(Writer writer, LanguageDetector detector);
    
    /**
     * Creates a LanguageWriter with a detector and minimum content threshold.
     * @param writer Underlying Writer
     * @param detector LanguageDetector for analysis
     * @param minLength Minimum content length before detection
     */
    public LanguageWriter(Writer writer, LanguageDetector detector, int minLength);
    
    /**
     * Gets the currently detected language.
     * Callers must handle a {@code null} return.
     * @return LanguageResult with current detection, or null if insufficient data
     */
    public LanguageResult getDetectedLanguage();
    
    /**
     * Gets all possible languages detected.
     * @return List of LanguageResult objects sorted by confidence
     */
    public List<LanguageResult> getAllDetectedLanguages();
    
    /**
     * Checks whether enough content has been written for reliable detection.
     * @return true if sufficient content for detection
     */
    public boolean hasMinimumContent();
    
    /**
     * Gets the length of content analyzed so far.
     * @return Number of characters written and analyzed
     */
    public int getContentLength();
    
    /**
     * Writes a character array to the underlying writer and updates detection.
     * @param cbuf Character array to write
     * @param off Offset in character array
     * @param len Number of characters to write
     * @throws IOException if write operation fails
     */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException;
    
    /**
     * Writes a string to the underlying writer and updates detection.
     * @param str String to write
     * @throws IOException if write operation fails
     */
    @Override
    public void write(String str) throws IOException;
    
    /**
     * Flushes the underlying writer.
     * @throws IOException if flush operation fails
     */
    @Override
    public void flush() throws IOException;
    
    /**
     * Closes the underlying writer.
     * @throws IOException if close operation fails
     */
    @Override
    public void close() throws IOException;
}

Translation Services

Translator Interface

Interface for text translation services supporting multiple translation backends and language pairs.

/**
 * Interface for text translation services.
 *
 * <p>Two {@code translate} overloads are declared: one takes only a target
 * language, the other names the source language explicitly. Both declare
 * {@code TikaException} and {@code IOException}, so callers must handle or
 * propagate these checked exceptions.
 *
 * <p>NOTE(review): this is a signature listing; confirm each member against
 * the Apache Tika Javadoc for the version in use.
 */
public interface Translator {
    /**
     * Translates text to the target language.
     * @param text Text to translate
     * @param targetLanguage Target language code (ISO 639-1)
     * @return Translated text
     * @throws TikaException if translation fails
     * @throws IOException if communication with translation service fails
     */
    String translate(String text, String targetLanguage) throws TikaException, IOException;
    
    /**
     * Translates text from the source language to the target language.
     * @param text Text to translate
     * @param sourceLanguage Source language code (ISO 639-1)
     * @param targetLanguage Target language code (ISO 639-1)
     * @return Translated text
     * @throws TikaException if translation fails
     * @throws IOException if communication fails
     */
    String translate(String text, String sourceLanguage, String targetLanguage) 
            throws TikaException, IOException;
    
    /**
     * Gets all supported source languages.
     * @return Set of supported source language codes
     */
    Set<String> getSupportedSourceLanguages();
    
    /**
     * Gets all supported target languages.
     * @return Set of supported target language codes
     */
    Set<String> getSupportedTargetLanguages();
    
    /**
     * Checks whether translation from the source to the target language is supported.
     * @param sourceLanguage Source language code
     * @param targetLanguage Target language code
     * @return true if translation pair is supported
     */
    boolean isSupported(String sourceLanguage, String targetLanguage);
    
    /**
     * Checks whether the translator service is available.
     * @return true if translator can be used
     */
    boolean isAvailable();
    
    /**
     * Gets the maximum text length supported for translation.
     * NOTE(review): how an "unlimited" length would be reported is not
     * specified in this listing.
     * @return Maximum characters per translation request
     */
    int getMaxTextLength();
}

DefaultTranslator

Default implementation of Translator interface providing basic translation capabilities.

/**
 * Default translator implementation with configurable backends.
 *
 * <p>Besides the {@code Translator} methods, this listing exposes setters for
 * the service URL, API key, maximum text length and request timeout. Only the
 * service URL and the maximum text length have a matching getter here.
 *
 * <p>NOTE(review): the property keys accepted by the {@code Properties}
 * constructor are not documented in this listing (the usage example in this
 * document uses keys prefixed with {@code translator.}). This is a signature
 * listing without method bodies; confirm each member against the Apache Tika
 * Javadoc for the version in use.
 */
public class DefaultTranslator implements Translator {
    /**
     * Creates a DefaultTranslator with the default configuration.
     */
    public DefaultTranslator();
    
    /**
     * Creates a DefaultTranslator with a custom configuration.
     * @param config Configuration properties for translator
     */
    public DefaultTranslator(Properties config);
    
    /**
     * Sets the translation service endpoint URL.
     * @param serviceUrl URL of translation service
     */
    public void setServiceUrl(String serviceUrl);
    
    /**
     * Gets the current service endpoint URL.
     * @return URL of translation service
     */
    public String getServiceUrl();
    
    /**
     * Sets the API key for the translation service.
     * @param apiKey API key for service authentication
     */
    public void setApiKey(String apiKey);
    
    /**
     * Sets the maximum text length for a single translation request.
     * @param maxLength Maximum characters per request
     */
    public void setMaxTextLength(int maxLength);
    
    /**
     * Sets the timeout for translation requests.
     * @param timeoutMs Timeout in milliseconds
     */
    public void setTimeout(int timeoutMs);
    
    /**
     * Translates text to the target language with source auto-detection.
     * @param text Text to translate
     * @param targetLanguage Target language code
     * @return Translated text
     * @throws TikaException if translation fails
     * @throws IOException if service communication fails
     */
    @Override
    public String translate(String text, String targetLanguage) throws TikaException, IOException;
    
    /**
     * Translates text with an explicit source language.
     * @param text Text to translate
     * @param sourceLanguage Source language code
     * @param targetLanguage Target language code
     * @return Translated text
     * @throws TikaException if translation fails
     * @throws IOException if service communication fails
     */
    @Override
    public String translate(String text, String sourceLanguage, String targetLanguage) 
            throws TikaException, IOException;
    
    /**
     * Gets the supported source languages from the service.
     * @return Set of source language codes
     */
    @Override
    public Set<String> getSupportedSourceLanguages();
    
    /**
     * Gets the supported target languages from the service.
     * @return Set of target language codes
     */
    @Override
    public Set<String> getSupportedTargetLanguages();
    
    /**
     * Checks whether a language pair is supported.
     * @param sourceLanguage Source language code
     * @param targetLanguage Target language code
     * @return true if translation is supported
     */
    @Override
    public boolean isSupported(String sourceLanguage, String targetLanguage);
    
    /**
     * Checks whether the translation service is available.
     * @return true if service can be reached
     */
    @Override
    public boolean isAvailable();
    
    /**
     * Gets the maximum text length per request.
     * @return Maximum characters per translation
     */
    @Override
    public int getMaxTextLength();
}

Usage Examples

Basic Language Detection

// Identify the language of a short English sample
LanguageIdentifier identifier = new LanguageIdentifier();
String englishText = "This is a sample document written in English.";

System.out.println("Detected language: " + identifier.identify(englishText)); // "en"

// Report the confidence only when the identifier trusts its own result
boolean certain = identifier.isReasonablyCertain(englishText);
if (certain) {
    System.out.println("Confidence: " + identifier.getConfidence(englishText));
}

// List every language code the identifier can detect
System.out.println("Supported languages: " + identifier.getSupportedLanguages());

Advanced Language Detection with Results

// Detailed detection result (code, confidence, name) from the classic LanguageIdentifier
String mixedText = "Bonjour, this is a mixed language document with français.";
LanguageResult result = new LanguageIdentifier().identifyWithConfidence(mixedText);

System.out.println("Language: " + result.getLanguage());
System.out.println("Confidence: " + result.getConfidence());
System.out.println("Language name: " + result.getLanguageName());

// Treat a confidence above 0.8 as a high-confidence detection
boolean highConfidence = result.isReliable(0.8f);
if (highConfidence) {
    System.out.println("High confidence detection");
}

Language Detection During Parsing

// Detect language while parsing a document.
// `inputStream` is the caller-supplied InputStream of the document to parse.
try {
    AutoDetectParser parser = new AutoDetectParser();
    ProfilingHandler langHandler = new ProfilingHandler();
    BodyContentHandler textHandler = new BodyContentHandler();

    // Use TeeContentHandler so a single parse pass feeds both handlers
    TeeContentHandler teeHandler = new TeeContentHandler(langHandler, textHandler);

    Metadata metadata = new Metadata();
    parser.parse(inputStream, teeHandler, metadata, new ParseContext());

    // Extracted body text, available for further processing
    String content = textHandler.toString();
    System.out.println("Content length: " + langHandler.getContentLength());

    // Only report a language when the handler has seen enough text to be reliable
    if (langHandler.hasEnoughData()) {
        String language = langHandler.getLanguage();
        double confidence = langHandler.getConfidence();
        System.out.println("Document language: " + language + " (" + confidence + ")");
    } else {
        System.out.println("Not enough text for reliable language detection");
    }

} catch (IOException | SAXException | TikaException e) {
    // Catch only the checked exceptions that parsing declares, so that
    // programming errors are not silently reduced to a log line
    System.err.println("Language detection failed: " + e.getMessage());
}

Streaming Language Detection

// Detect language as content is written
LanguageIdentifier identifier = new LanguageIdentifier();

// LanguageDetector declares several abstract methods, so it cannot be supplied
// as a lambda. Adapt the classic LanguageIdentifier with an anonymous class.
LanguageDetector detector = new LanguageDetector() {
    @Override
    public LanguageResult detect(String text) {
        try {
            return identifier.identifyWithConfidence(text);
        } catch (RuntimeException e) {
            // Best effort: a failed detection is reported as an unknown language
            return new LanguageResult("unknown", 0.0f);
        }
    }

    @Override
    public List<LanguageResult> detectAll(String text) {
        // The classic identifier yields a single best guess
        return Collections.singletonList(detect(text));
    }

    @Override
    public boolean isSupported(String language) {
        return identifier.isLanguageSupported(language);
    }

    @Override
    public Set<String> getSupportedLanguages() {
        return identifier.getSupportedLanguages();
    }

    @Override
    public void loadModels(Map<String, Object> config) {
        // Nothing to load: the identifier was created with its default profiles
    }

    @Override
    public boolean isAvailable() {
        return true;
    }
};

StringWriter stringWriter = new StringWriter();

// try-with-resources closes the LanguageWriter even if a write fails.
// Detection starts only after at least 100 characters have been written.
try (LanguageWriter langWriter = new LanguageWriter(stringWriter, detector, 100)) {

    // Write content progressively
    langWriter.write("Ceci est un document en français. ");
    langWriter.write("Il contient plusieurs phrases pour la détection. ");
    langWriter.write("La détection devrait identifier le français.");

    // Check detection results
    if (langWriter.hasMinimumContent()) {
        LanguageResult detected = langWriter.getDetectedLanguage();
        if (detected != null) {
            System.out.println("Detected: " + detected.getLanguage());
            System.out.println("Confidence: " + detected.getConfidence());
        }
    }

} catch (IOException e) {
    System.err.println("Language detection error: " + e.getMessage());
}

// Everything written through the LanguageWriter was also passed to stringWriter
String fullText = stringWriter.toString();

Text Translation

// Basic text translation
DefaultTranslator translator = new DefaultTranslator();

boolean serviceReady = translator.isAvailable();
if (serviceReady) {
    try {
        // No source language given: the translator detects it
        String frenchText = "Bonjour, comment allez-vous?";
        String englishText = translator.translate(frenchText, "en");
        System.out.println("Translation: " + englishText);

        // Source language stated explicitly
        String germanText = translator.translate(englishText, "en", "de");
        System.out.println("German: " + germanText);

    } catch (TikaException | IOException e) {
        System.err.println("Translation failed: " + e.getMessage());
    }
}

// Report how many source and target languages the translator supports
int sourceCount = translator.getSupportedSourceLanguages().size();
int targetCount = translator.getSupportedTargetLanguages().size();
System.out.println("Source languages: " + sourceCount);
System.out.println("Target languages: " + targetCount);

Configured Translation Service

// Configure translation service
Properties config = new Properties();
config.setProperty("translator.service.url", "https://api.translate.service.com");
// Placeholder only: load the real key from secure configuration, never from source
config.setProperty("translator.api.key", "your-api-key");
config.setProperty("translator.timeout", "30000");
config.setProperty("translator.maxLength", "5000");

DefaultTranslator translator = new DefaultTranslator(config);
// NOTE(review): these setters run after construction and use different values
// from the properties above; presumably they take precedence — confirm
translator.setMaxTextLength(10000);
translator.setTimeout(60000);

// Check if specific translation is supported
if (translator.isSupported("fr", "en")) {
    try {
        String translation = translator.translate("Texte français", "fr", "en");
        System.out.println("Translated: " + translation);
    } catch (TikaException | IOException e) {
        // translate(...) declares both checked exceptions, so they must be handled here
        System.err.println("Translation failed: " + e.getMessage());
    }
}

Multilingual Document Processing

/**
 * Parses a document once, detects its language during that parse, and attaches
 * an English translation when the document is not already in English.
 */
public class MultilingualProcessor {

    /** Language code that every document is translated into. */
    private static final String ENGLISH = "en";

    private final LanguageIdentifier detector;
    private final Translator translator;

    /**
     * Creates a processor backed by a default LanguageIdentifier and DefaultTranslator.
     */
    public MultilingualProcessor() {
        this.detector = new LanguageIdentifier();
        this.translator = new DefaultTranslator();
    }

    /**
     * Extracts text from the input, detects its language, and translates it to
     * English when a translation is possible.
     *
     * <p>A translation failure does not fail the document; it is recorded as a
     * warning on the result instead.
     *
     * @param input Stream of the document to process
     * @return ProcessedDocument holding the text, the detected language with its
     *         confidence, and the English translation when one was produced
     * @throws IOException if the document cannot be read
     * @throws SAXException if the content handlers fail while parsing
     * @throws TikaException if the document cannot be parsed
     */
    public ProcessedDocument processDocument(InputStream input)
            throws IOException, SAXException, TikaException {

        AutoDetectParser parser = new AutoDetectParser();
        BodyContentHandler textHandler = new BodyContentHandler();
        ProfilingHandler langHandler = new ProfilingHandler(detector);

        // A single parse pass feeds both the text extractor and the language profiler
        TeeContentHandler teeHandler = new TeeContentHandler(textHandler, langHandler);

        Metadata metadata = new Metadata();
        parser.parse(input, teeHandler, metadata, new ParseContext());

        String content = textHandler.toString();
        String language = langHandler.getLanguage();
        double confidence = langHandler.getConfidence();

        ProcessedDocument result = new ProcessedDocument();
        result.setOriginalContent(content);
        result.setDetectedLanguage(language);
        result.setLanguageConfidence(confidence);

        // Translate to English if not already English
        if (shouldTranslate(language)) {
            try {
                String translation = translator.translate(content, language, ENGLISH);
                result.setEnglishTranslation(translation);
            } catch (TikaException | IOException e) {
                // Only the failures that translate(...) declares are downgraded to a
                // warning; unexpected runtime errors still propagate to the caller
                result.addWarning("Translation failed: " + e.getMessage());
            }
        }

        return result;
    }

    /** Decides whether a document in the given language can and should be translated. */
    private boolean shouldTranslate(String language) {
        return language != null
                && !ENGLISH.equals(language)
                && translator.isAvailable()
                && translator.isSupported(language, ENGLISH);
    }
}

Language Detection Comparison

// Compare the two ways of querying the classic LanguageIdentifier
public class LanguageDetectionComparison {

    /**
     * Prints the outcome of the separate per-call queries, followed by the
     * combined LanguageResult, for the same text and the same identifier.
     *
     * @param text Text whose language is to be detected
     */
    public void compareDetectors(String text) {
        // Separate queries: one call per piece of information
        LanguageIdentifier classic = new LanguageIdentifier();
        String languageCode = classic.identify(text);
        double score = classic.getConfidence(text);

        System.out.println("Classic detector:");
        System.out.println("  Language: " + languageCode);
        System.out.println("  Confidence: " + score);
        boolean certain = classic.isReasonablyCertain(text);
        System.out.println("  Certain: " + certain);

        // Combined query: a single LanguageResult from the same identifier
        LanguageResult detailed = classic.identifyWithConfidence(text);
        System.out.println("\nDetailed result:");
        System.out.println("  Language: " + detailed.getLanguage());
        System.out.println("  Confidence: " + detailed.getConfidence());
        System.out.println("  Raw score: " + detailed.getRawScore());
        System.out.println("  Reliable (>0.8): " + detailed.isReliable(0.8f));
        System.out.println("  Language name: " + detailed.getLanguageName());
    }
}

Install with Tessl CLI

npx tessl i tessl/maven-org-apache-tika--tika-core

docs

configuration.md

content-processing.md

detection.md

embedded-extraction.md

embedding.md

exceptions.md

index.md

io-utilities.md

language.md

metadata.md

mime-types.md

parsing.md

pipes.md

process-forking.md

rendering.md

tile.json