Apache Tika Core provides the foundational APIs for detecting and extracting metadata and structured text content from various document formats.
—
Language processing capabilities including automatic language detection, text profiling, and translation services for multilingual document processing and content analysis.
Classic language identification using n-gram analysis and statistical models for detecting document language.
/**
* Statistical language identifier using n-gram analysis
*/
public class LanguageIdentifier {
/**
* Creates LanguageIdentifier with default language profiles
*/
public LanguageIdentifier();
/**
* Creates LanguageIdentifier with custom profile directory
* @param profileDirectory Directory containing language profile files
*/
public LanguageIdentifier(String profileDirectory);
/**
* Identifies language of text content
* @param content Text content to analyze
* @return Language code (ISO 639-1) of detected language
*/
public String identify(String content);
/**
* Identifies language with confidence score
* @param content Text content to analyze
* @return LanguageResult containing language and confidence
*/
public LanguageResult identifyWithConfidence(String content);
/**
* Checks if language can be reliably identified
* @param content Text content to check
* @return true if language detection confidence is high enough
*/
public boolean isReasonablyCertain(String content);
/**
* Gets confidence score for detected language
* @param content Text content to analyze
* @return Confidence score between 0.0 and 1.0
*/
public double getConfidence(String content);
/**
* Gets all supported language codes
* @return Set of supported ISO 639-1 language codes
*/
public Set<String> getSupportedLanguages();
/**
* Checks if specific language is supported
* @param language ISO 639-1 language code to check
* @return true if language detection is supported
*/
public boolean isLanguageSupported(String language);
}

Content handler for building language profiles during document parsing for improved detection accuracy.
/**
* Content handler that builds language profiles for detection
*/
public class ProfilingHandler extends DefaultHandler {
/**
* Creates ProfilingHandler for language profiling
*/
public ProfilingHandler();
/**
* Creates ProfilingHandler with custom LanguageIdentifier
* @param identifier LanguageIdentifier to use for profiling
*/
public ProfilingHandler(LanguageIdentifier identifier);
/**
* Gets the detected language after profiling
* @return ISO 639-1 language code of detected language
*/
public String getLanguage();
/**
* Gets confidence score of language detection
* @return Confidence score between 0.0 and 1.0
*/
public double getConfidence();
/**
* Checks if enough content has been processed for reliable detection
* @return true if sufficient content analyzed
*/
public boolean hasEnoughData();
/**
* Gets the amount of text content processed
* @return Number of characters analyzed
*/
public int getContentLength();
}

Modern interface for pluggable language detection implementations with support for multiple algorithms.
/**
* Interface for modern language detection implementations
*/
public interface LanguageDetector {
/**
* Detects language of input text
* @param text Text content to analyze
* @return LanguageResult containing detected language and confidence
* @throws IOException if detection process fails
*/
LanguageResult detect(String text) throws IOException;
/**
* Detects multiple possible languages with probabilities
* @param text Text content to analyze
* @return List of LanguageResult objects sorted by confidence
* @throws IOException if detection process fails
*/
List<LanguageResult> detectAll(String text) throws IOException;
/**
* Checks if detector supports specific language
* @param language ISO 639-1 language code
* @return true if language is supported for detection
*/
boolean isSupported(String language);
/**
* Gets all supported languages
* @return Set of supported ISO 639-1 language codes
*/
Set<String> getSupportedLanguages();
/**
* Loads detector from configuration
* @param config Configuration parameters for detector
* @throws IOException if loading fails
*/
void loadModels(Map<String, Object> config) throws IOException;
/**
* Checks if detector is ready for use
* @return true if detector is loaded and ready
*/
boolean isAvailable();
}

Result object containing detected language information and confidence metrics.
/**
* Result of language detection containing language and confidence information
*/
public class LanguageResult {
/**
* Creates LanguageResult with language and confidence
* @param language ISO 639-1 language code
* @param confidence Confidence score (0.0 to 1.0)
*/
public LanguageResult(String language, float confidence);
/**
* Creates LanguageResult with additional properties
* @param language ISO 639-1 language code
* @param confidence Confidence score
* @param rawScore Raw detection score from algorithm
*/
public LanguageResult(String language, float confidence, double rawScore);
/**
* Gets detected language code
* @return ISO 639-1 language code (e.g., "en", "fr", "de")
*/
public String getLanguage();
/**
* Gets confidence score of detection
* @return Confidence between 0.0 (lowest) and 1.0 (highest)
*/
public float getConfidence();
/**
* Gets raw algorithm score
* @return Raw score from detection algorithm
*/
public double getRawScore();
/**
* Checks if detection confidence is above threshold
* @param threshold Minimum confidence threshold
* @return true if confidence exceeds threshold
*/
public boolean isReliable(float threshold);
/**
* Gets human-readable language name
* @return Full language name in English
*/
public String getLanguageName();
/**
* Compares results by confidence (descending order)
* @param other LanguageResult to compare with
* @return Comparison result for sorting
*/
public int compareTo(LanguageResult other);
}

Writer wrapper that performs language detection on written content for streaming analysis.
/**
* Writer that performs language detection on content as it's written
*/
public class LanguageWriter extends Writer {
/**
* Creates LanguageWriter with underlying writer and detector
* @param writer Underlying Writer to delegate to
* @param detector LanguageDetector for analysis
*/
public LanguageWriter(Writer writer, LanguageDetector detector);
/**
* Creates LanguageWriter with detector and minimum content threshold
* @param writer Underlying Writer
* @param detector LanguageDetector for analysis
* @param minLength Minimum content length before detection
*/
public LanguageWriter(Writer writer, LanguageDetector detector, int minLength);
/**
* Gets current detected language
* @return LanguageResult with current detection, or null if insufficient data
*/
public LanguageResult getDetectedLanguage();
/**
* Gets all possible languages detected
* @return List of LanguageResult objects sorted by confidence
*/
public List<LanguageResult> getAllDetectedLanguages();
/**
* Checks if enough content has been written for reliable detection
* @return true if sufficient content for detection
*/
public boolean hasMinimumContent();
/**
* Gets length of content analyzed so far
* @return Number of characters written and analyzed
*/
public int getContentLength();
/**
* Writes character array to underlying writer and updates detection
* @param cbuf Character array to write
* @param off Offset in character array
* @param len Number of characters to write
* @throws IOException if write operation fails
*/
@Override
public void write(char[] cbuf, int off, int len) throws IOException;
/**
* Writes string to underlying writer and updates detection
* @param str String to write
* @throws IOException if write operation fails
*/
@Override
public void write(String str) throws IOException;
/**
* Flushes underlying writer
* @throws IOException if flush operation fails
*/
@Override
public void flush() throws IOException;
/**
* Closes underlying writer
* @throws IOException if close operation fails
*/
@Override
public void close() throws IOException;
}

Interface for text translation services supporting multiple translation backends and language pairs.
/**
* Interface for text translation services
*/
public interface Translator {
/**
* Translates text to target language
* @param text Text to translate
* @param targetLanguage Target language code (ISO 639-1)
* @return Translated text
* @throws TikaException if translation fails
* @throws IOException if communication with translation service fails
*/
String translate(String text, String targetLanguage) throws TikaException, IOException;
/**
* Translates text from source to target language
* @param text Text to translate
* @param sourceLanguage Source language code (ISO 639-1)
* @param targetLanguage Target language code (ISO 639-1)
* @return Translated text
* @throws TikaException if translation fails
* @throws IOException if communication fails
*/
String translate(String text, String sourceLanguage, String targetLanguage)
throws TikaException, IOException;
/**
* Gets all supported source languages
* @return Set of supported source language codes
*/
Set<String> getSupportedSourceLanguages();
/**
* Gets all supported target languages
* @return Set of supported target language codes
*/
Set<String> getSupportedTargetLanguages();
/**
* Checks if translation from source to target language is supported
* @param sourceLanguage Source language code
* @param targetLanguage Target language code
* @return true if translation pair is supported
*/
boolean isSupported(String sourceLanguage, String targetLanguage);
/**
* Checks if translator service is available
* @return true if translator can be used
*/
boolean isAvailable();
/**
* Gets maximum text length supported for translation
* @return Maximum characters per translation request
*/
int getMaxTextLength();
}

Default implementation of the Translator interface providing basic translation capabilities.
/**
* Default translator implementation with configurable backends
*/
public class DefaultTranslator implements Translator {
/**
* Creates DefaultTranslator with default configuration
*/
public DefaultTranslator();
/**
* Creates DefaultTranslator with custom configuration
* @param config Configuration properties for translator
*/
public DefaultTranslator(Properties config);
/**
* Sets translation service endpoint URL
* @param serviceUrl URL of translation service
*/
public void setServiceUrl(String serviceUrl);
/**
* Gets current service endpoint URL
* @return URL of translation service
*/
public String getServiceUrl();
/**
* Sets API key for translation service
* @param apiKey API key for service authentication
*/
public void setApiKey(String apiKey);
/**
* Sets maximum text length for single translation request
* @param maxLength Maximum characters per request
*/
public void setMaxTextLength(int maxLength);
/**
* Sets timeout for translation requests
* @param timeoutMs Timeout in milliseconds
*/
public void setTimeout(int timeoutMs);
/**
* Translates text to target language with auto-detection
* @param text Text to translate
* @param targetLanguage Target language code
* @return Translated text
* @throws TikaException if translation fails
* @throws IOException if service communication fails
*/
@Override
public String translate(String text, String targetLanguage) throws TikaException, IOException;
/**
* Translates text with explicit source language
* @param text Text to translate
* @param sourceLanguage Source language code
* @param targetLanguage Target language code
* @return Translated text
* @throws TikaException if translation fails
* @throws IOException if service communication fails
*/
@Override
public String translate(String text, String sourceLanguage, String targetLanguage)
throws TikaException, IOException;
/**
* Gets supported source languages from service
* @return Set of source language codes
*/
@Override
public Set<String> getSupportedSourceLanguages();
/**
* Gets supported target languages from service
* @return Set of target language codes
*/
@Override
public Set<String> getSupportedTargetLanguages();
/**
* Checks if language pair is supported
* @param sourceLanguage Source language code
* @param targetLanguage Target language code
* @return true if translation is supported
*/
@Override
public boolean isSupported(String sourceLanguage, String targetLanguage);
/**
* Checks if translation service is available
* @return true if service can be reached
*/
@Override
public boolean isAvailable();
/**
* Gets maximum text length per request
* @return Maximum characters per translation
*/
@Override
public int getMaxTextLength();
}

// Simple language identification
String sampleText = "This is a sample document written in English.";
LanguageIdentifier langId = new LanguageIdentifier();

// Identify the language of the sample; the expected output is "en"
String languageCode = langId.identify(sampleText);
System.out.println("Detected language: " + languageCode);

// Only report the confidence score when the identifier considers the result reliable
boolean certain = langId.isReasonablyCertain(sampleText);
if (certain) {
    System.out.println("Confidence: " + langId.getConfidence(sampleText));
}

// List every language code the identifier can recognise
Set<String> languageCodes = langId.getSupportedLanguages();
System.out.println("Supported languages: " + languageCodes);

// Language detection with detailed results
String mixedText = "Bonjour, this is a mixed language document with français.";
LanguageIdentifier langId = new LanguageIdentifier();

// Ask for the full result object rather than just the language code
LanguageResult detection = langId.identifyWithConfidence(mixedText);
System.out.println("Language: " + detection.getLanguage());
System.out.println("Confidence: " + detection.getConfidence());
System.out.println("Language name: " + detection.getLanguageName());

// Treat the detection as trustworthy only above the chosen threshold
final float reliabilityThreshold = 0.8f;
boolean reliable = detection.isReliable(reliabilityThreshold);
if (reliable) {
    System.out.println("High confidence detection");
}

// Detect language while parsing document
try {
    AutoDetectParser parser = new AutoDetectParser();
    ProfilingHandler langHandler = new ProfilingHandler();
    BodyContentHandler textHandler = new BodyContentHandler();

    // Use TeeContentHandler so that a single parse feeds both handlers
    TeeContentHandler teeHandler = new TeeContentHandler(langHandler, textHandler);
    Metadata metadata = new Metadata();
    parser.parse(inputStream, teeHandler, metadata, new ParseContext());

    // Get detected language and extracted content
    String language = langHandler.getLanguage();
    double confidence = langHandler.getConfidence();
    String content = textHandler.toString(); // extracted text, available for further processing

    System.out.println("Document language: " + language + " (" + confidence + ")");
    System.out.println("Content length: " + langHandler.getContentLength());
} catch (IOException | SAXException | TikaException e) {
    // Catch only the checked exceptions that parsing declares, so that
    // programming errors (unchecked exceptions) are not silently swallowed.
    System.err.println("Language detection failed: " + e.getMessage());
}

// Detect language as content is written
try (StringWriter stringWriter = new StringWriter()) {
LanguageIdentifier detector = new LanguageIdentifier();
LanguageWriter langWriter = new LanguageWriter(stringWriter,
text -> {
try {
return detector.identifyWithConfidence(text);
} catch (Exception e) {
return new LanguageResult("unknown", 0.0f);
}
}, 100); // Minimum 100 characters before detection
// Write content progressively
langWriter.write("Ceci est un document en français. ");
langWriter.write("Il contient plusieurs phrases pour la détection. ");
langWriter.write("La détection devrait identifier le français.");
// Check detection results
if (langWriter.hasMinimumContent()) {
LanguageResult detected = langWriter.getDetectedLanguage();
if (detected != null) {
System.out.println("Detected: " + detected.getLanguage());
System.out.println("Confidence: " + detected.getConfidence());
}
}
langWriter.close();
String fullText = stringWriter.toString();
} catch (IOException e) {
System.err.println("Language detection error: " + e.getMessage());
}// Basic text translation
DefaultTranslator translator = new DefaultTranslator();

// Only attempt translation when the backing service can be used
boolean serviceReady = translator.isAvailable();
if (serviceReady) {
    try {
        // Target language only: the source language is auto-detected
        String french = "Bonjour, comment allez-vous?";
        String english = translator.translate(french, "en");
        System.out.println("Translation: " + english);

        // Source and target languages given explicitly
        String german = translator.translate(english, "en", "de");
        System.out.println("German: " + german);
    } catch (TikaException | IOException e) {
        System.err.println("Translation failed: " + e.getMessage());
    }
}

// Report how many languages are supported in each direction
Set<String> fromLanguages = translator.getSupportedSourceLanguages();
Set<String> toLanguages = translator.getSupportedTargetLanguages();
int sourceCount = fromLanguages.size();
int targetCount = toLanguages.size();
System.out.println("Source languages: " + sourceCount);
System.out.println("Target languages: " + targetCount);

// Configure translation service
Properties settings = new Properties();
settings.setProperty("translator.service.url", "https://api.translate.service.com");
settings.setProperty("translator.api.key", "your-api-key");
settings.setProperty("translator.timeout", "30000");
settings.setProperty("translator.maxLength", "5000");

// Build the translator from the properties, then adjust limits through the setters
DefaultTranslator translator = new DefaultTranslator(settings);
translator.setMaxTextLength(10000);
translator.setTimeout(60000);

// Translate only when the French-to-English pair is supported
if (translator.isSupported("fr", "en")) {
    String english = translator.translate("Texte français", "fr", "en");
    System.out.println("Translated: " + english);
}

/**
 * Example pipeline that extracts text from a document, detects its language,
 * and attaches an English translation when one can be produced.
 */
public class MultilingualProcessor {

    private final LanguageIdentifier detector;
    private final Translator translator;

    /** Creates a processor backed by the default identifier and translator. */
    public MultilingualProcessor() {
        this.detector = new LanguageIdentifier();
        this.translator = new DefaultTranslator();
    }

    /**
     * Parses the stream once, collecting body text and language statistics together.
     *
     * @param input document stream to parse
     * @return document carrying the text, detected language, confidence, and
     *         (when available) an English translation
     * @throws IOException if reading the stream fails
     * @throws SAXException if SAX event handling fails
     * @throws TikaException if the document cannot be parsed
     */
    public ProcessedDocument processDocument(InputStream input)
            throws IOException, SAXException, TikaException {
        BodyContentHandler bodyText = new BodyContentHandler();
        ProfilingHandler profiler = new ProfilingHandler(detector);
        TeeContentHandler combined = new TeeContentHandler(bodyText, profiler);

        new AutoDetectParser().parse(input, combined, new Metadata(), new ParseContext());

        String text = bodyText.toString();
        String languageCode = profiler.getLanguage();

        ProcessedDocument document = new ProcessedDocument();
        document.setOriginalContent(text);
        document.setDetectedLanguage(languageCode);
        document.setLanguageConfidence(profiler.getConfidence());

        // Nothing to translate when the text is already English or the pair is unsupported
        if ("en".equals(languageCode) || !translator.isSupported(languageCode, "en")) {
            return document;
        }

        try {
            document.setEnglishTranslation(translator.translate(text, languageCode, "en"));
        } catch (Exception e) {
            // Translation is best-effort: record the failure and still return the document
            document.addWarning("Translation failed: " + e.getMessage());
        }
        return document;
    }
}

// Compare different detection methods
/**
 * Prints the output of the simple identification calls next to the detailed
 * {@code LanguageResult} obtained from the same identifier.
 */
public class LanguageDetectionComparison {

    /**
     * Runs both styles of detection on the given text and prints the results.
     *
     * @param text text whose language should be detected
     */
    public void compareDetectors(String text) {
        LanguageIdentifier identifier = new LanguageIdentifier();

        // Simple calls: language code and confidence are queried separately
        String code = identifier.identify(text);
        double score = identifier.getConfidence(text);
        System.out.println("Classic detector:");
        System.out.println(" Language: " + code);
        System.out.println(" Confidence: " + score);
        boolean certain = identifier.isReasonablyCertain(text);
        System.out.println(" Certain: " + certain);

        // Detailed call: one result object carries language, confidence and raw score
        LanguageResult detailed = identifier.identifyWithConfidence(text);
        System.out.println("\nDetailed result:");
        System.out.println(" Language: " + detailed.getLanguage());
        System.out.println(" Confidence: " + detailed.getConfidence());
        System.out.println(" Raw score: " + detailed.getRawScore());
        System.out.println(" Reliable (>0.8): " + detailed.isReliable(0.8f));
        System.out.println(" Language name: " + detailed.getLanguageName());
    }
}

Install with Tessl CLI
npx tessl i tessl/maven-org-apache-tika--tika-core