JavaCPP platform aggregator for Tesseract OCR native libraries providing cross-platform OCR capabilities in Java applications
—
Multi-language OCR with support for 100+ languages, custom language models, and language detection capabilities. Tesseract provides comprehensive support for different scripts, writing systems, and language-specific recognition optimizations.
Initialize Tesseract with one or more languages for recognition.
public class TessBaseAPI {
// Language initialization
// Init returns an int status; the examples in this document treat 0 as success.
// `language` is one code ("eng") or several codes joined with "+" ("eng+fra+deu").
// The examples pass null for `datapath`.
// `oem` is presumably the OCR engine mode selector -- confirm against the binding.
public int Init(String datapath, String language, int oem);
public int Init(String datapath, String language);
// Language information
// Returns the initialized language string as a native byte pointer;
// the usage example reads it with getString().
public native @Cast("const char*") BytePointer GetInitLanguagesAsString();
// Both getters take a caller-created StringVector; the usage example passes an
// empty vector and then iterates over it, so the call fills it in place.
public void GetLoadedLanguagesAsVector(StringVector langs);
public void GetAvailableLanguagesAsVector(StringVector langs);
}

Language Code Format:
- Single language: "eng" (English), "fra" (French), "deu" (German)
- Multiple languages joined with "+": "eng+fra+deu" (English + French + German)
- Script-specific codes: "chi_sim" (Simplified Chinese), "ara" (Arabic)

TessBaseAPI api = new TessBaseAPI();
// Initialize with single language (Init returns 0 on success)
int result = api.Init(null, "eng");
// Initialize with multiple languages
int result2 = api.Init(null, "eng+fra+deu");
// Initialize with mixed scripts
int result3 = api.Init(null, "eng+ara+chi_sim");
// Check which languages were loaded.
// The pointer refers to a string held by the API object, so it is only read
// here; it is not ours to deallocate.
BytePointer loadedLangsPtr = api.GetInitLanguagesAsString();
String loadedLangs = loadedLangsPtr.getString();
System.out.println("Loaded languages: " + loadedLangs);
// Get available languages as vector
StringVector availableLangs = new StringVector();
api.GetAvailableLanguagesAsVector(availableLangs);
System.out.println("Available languages:");
for (long i = 0; i < availableLangs.size(); i++) {
    // get(i) yields a native string pointer; getString() converts it to a Java String
    System.out.println(" " + availableLangs.get(i).getString());
}

Latin Script Languages:
// Western European
"eng" // English
"fra" // French
"deu" // German
"ita" // Italian
"spa" // Spanish
"por" // Portuguese
"nld" // Dutch
"dan" // Danish
"nor" // Norwegian
"swe" // Swedish
"fin" // Finnish
// Eastern European
"pol" // Polish
"ces" // Czech
"slk" // Slovak
"hun" // Hungarian
"ron" // Romanian
"hrv" // Croatian
"slv" // Slovenian
"est" // Estonian
"lav" // Latvian
"lit" // Lithuanian

Non-Latin Script Languages:
// Cyrillic
"rus" // Russian
"ukr" // Ukrainian
"bul" // Bulgarian
"srp" // Serbian
"mkd" // Macedonian
"bel" // Belarusian
// Arabic Script
"ara" // Arabic
"fas" // Persian (Farsi)
"urd" // Urdu
"pus" // Pashto
// Asian Scripts
"chi_sim" // Simplified Chinese
"chi_tra" // Traditional Chinese
"jpn" // Japanese
"kor" // Korean
"tha" // Thai
"vie" // Vietnamese
"khm" // Khmer (Cambodian)
"lao" // Lao
// Indic Scripts
"hin" // Hindi
"ben" // Bengali
"guj" // Gujarati
"pan" // Punjabi
"tel" // Telugu
"kan" // Kannada
"mal" // Malayalam
"tam" // Tamil
"ori" // Odia
"san" // Sanskrit
// Other Scripts
"heb" // Hebrew
"ell" // Greek
"amh" // Amharic
"kat" // Georgian
"hye" // Armenian

public class MultiLanguageOCR {
/**
 * Runs OCR on {@code image} once per candidate language set and keeps the text
 * from the attempt with the highest mean confidence.
 *
 * @param image image to recognize
 * @return text of the most confident attempt, or an empty string when no
 *         language set could be initialized or no attempt scored above 0
 */
public static String recognizeWithLanguageDetection(PIX image) {
    // Try common language combinations based on context
    String[] languageSets = {
        "eng", // English only
        "eng+fra+deu", // Western European
        "eng+spa+por", // English + Iberian
        "eng+rus+ukr", // English + Slavic Cyrillic
        "eng+ara", // English + Arabic
        "eng+chi_sim+jpn+kor" // English + East Asian
    };
    TessBaseAPI api = new TessBaseAPI();
    try {
        String bestResult = "";
        int bestConfidence = 0;
        for (String langs : languageSets) {
            api.End(); // Clean up previous initialization
            if (api.Init(null, langs) != 0) {
                continue; // this language set could not be loaded
            }
            api.SetImage(image);
            // GetUTF8Text hands back a native buffer: copy it into a Java String,
            // then release the buffer.
            BytePointer textPtr = api.GetUTF8Text();
            if (textPtr == null) {
                continue; // nothing was recognized with this set
            }
            String text;
            try {
                text = textPtr.getString();
            } finally {
                textPtr.deallocate();
            }
            int confidence = api.MeanTextConf();
            System.out.println("Languages: " + langs + ", Confidence: " + confidence);
            if (confidence > bestConfidence) {
                bestConfidence = confidence;
                bestResult = text;
            }
        }
        return bestResult;
    } finally {
        api.End();
    }
}
}

Automatic detection of script types and text direction for proper processing.
// Layout iterator sketch: only the orientation query is listed here.
public class PageIterator {
// Orientation and script information
// Takes four caller-supplied arrays and returns void, so the values are
// presumably written into the arrays as out-parameters -- confirm against the binding.
public void Orientation(int[] orientation, int[] writing_direction,
int[] textline_order, float[] deskew_angle);
}
public class ResultIterator {
// Language detection per word
public String WordRecognitionLanguage();
public int WordDirection();
public boolean ParagraphIsLtr();
}
// Writing direction constants
// Presumably the values reported through the writing_direction argument of
// PageIterator.Orientation -- confirm against the binding.
public static final int WRITING_DIRECTION_LEFT_TO_RIGHT = 0;
public static final int WRITING_DIRECTION_RIGHT_TO_LEFT = 1;
public static final int WRITING_DIRECTION_TOP_TO_BOTTOM = 2;
// Script direction constants
// The usage example compares these against ResultIterator.WordDirection().
public static final int DIR_NEUTRAL = 0; // Neutral characters
public static final int DIR_LEFT_TO_RIGHT = 1; // LTR scripts (Latin, Cyrillic)
public static final int DIR_RIGHT_TO_LEFT = 2; // RTL scripts (Arabic, Hebrew)
public static final int DIR_MIX = 3; // Mixed direction text

// Example: report language and direction for every recognized word.
TessBaseAPI api = new TessBaseAPI();
api.Init(null, "eng+ara+heb"); // Mixed LTR/RTL languages
api.SetImage(image);
// Run recognition first: the iterator walks recognition results, and
// GetIterator() has nothing to return until they exist.
api.Recognize(null);
ResultIterator resultIt = api.GetIterator();
if (resultIt != null) {
    resultIt.Begin();
    // Analyze text direction and language per word
    do {
        // The word text comes back as a native buffer: copy it, then release it.
        BytePointer wordPtr = resultIt.GetUTF8Text(RIL_WORD);
        String word = null;
        if (wordPtr != null) {
            word = wordPtr.getString();
            wordPtr.deallocate();
        }
        // The language name is a native string pointer and may be absent; it is
        // only read here.
        BytePointer langPtr = resultIt.WordRecognitionLanguage();
        String wordLang = (langPtr != null) ? langPtr.getString() : null;
        int direction = resultIt.WordDirection();
        String directionName = switch (direction) {
            case DIR_LEFT_TO_RIGHT -> "LTR";
            case DIR_RIGHT_TO_LEFT -> "RTL";
            case DIR_MIX -> "Mixed";
            default -> "Neutral";
        };
        System.out.printf("Word: '%s' Language: %s Direction: %s\n",
            word, wordLang, directionName);
    } while (resultIt.Next(RIL_WORD));
    // Check paragraph direction
    resultIt.Begin();
    if (resultIt.IsAtBeginningOf(RIL_PARA)) {
        boolean isLtr = resultIt.ParagraphIsLtr();
        System.out.println("Paragraph direction: " +
            (isLtr ? "Left-to-Right" : "Right-to-Left"));
    }
}

Optimize recognition for specific languages and scripts.
// --- Arabic ---
// Each example uses its own variable so the three listings can sit in one scope.
TessBaseAPI arabicApi = new TessBaseAPI();
arabicApi.Init(null, "ara");
// Arabic-specific optimizations
// NOTE(review): "textord_arabic_text" could not be confirmed as a Tesseract
// parameter name; SetVariable signals an unknown name through its return
// value -- verify before relying on it.
arabicApi.SetVariable("textord_arabic_text", "1");
arabicApi.SetVariable("textord_use_cjk_fp_model", "0");
arabicApi.SetVariable("preserve_interword_spaces", "1");
// Enable bidirectional text support
arabicApi.SetPageSegMode(PSM_AUTO);

// --- Chinese / Japanese / Korean ---
TessBaseAPI cjkApi = new TessBaseAPI();
cjkApi.Init(null, "chi_sim+jpn+kor");
// CJK-specific optimizations
cjkApi.SetVariable("textord_use_cjk_fp_model", "1");
cjkApi.SetVariable("language_model_penalty_non_dict_word", "0.25");
cjkApi.SetVariable("language_model_penalty_non_freq_dict_word", "0.15");
// Vertical text support
cjkApi.SetPageSegMode(PSM_AUTO);
cjkApi.SetVariable("textord_tabfind_vertical_text", "1");

// --- Indic scripts ---
TessBaseAPI indicApi = new TessBaseAPI();
indicApi.Init(null, "hin+ben+guj");
// Indic script optimizations
indicApi.SetVariable("textord_use_cjk_fp_model", "0");
indicApi.SetVariable("preserve_interword_spaces", "1");
indicApi.SetVariable("segment_penalty_dict_nonword", "1.25");

Work with custom trained language models and specialized vocabularies.
// --- Custom models ---
// Custom language models are placed in tessdata directory
// with naming convention: <lang>.traineddata
TessBaseAPI api = new TessBaseAPI();
// Load custom model (place custom_eng.traineddata in tessdata)
int result = api.Init("/path/to/custom/tessdata", "custom_eng");
// Combine custom with standard models
int result2 = api.Init("/path/to/tessdata", "eng+custom_domain");
// Use specialized models for specific domains
int result3 = api.Init("/path/to/tessdata", "eng_medical"); // Medical terminology
int result4 = api.Init("/path/to/tessdata", "eng_legal"); // Legal documents

// --- Custom vocabulary ---
// Separate instance: this declaration was previously swallowed by the trailing
// comment on the line above.
TessBaseAPI vocabApi = new TessBaseAPI();
vocabApi.Init(null, "eng");
// NOTE(review): Tesseract documents SetVariable as effective only for non-init
// parameters. The dictionary switches and the user-words / user-patterns
// suffixes below look like init-time parameters, so they may need to be passed
// to Init instead -- confirm against the Tesseract API documentation.
// Load custom word list (one word per line in tessdata/eng.user-words)
vocabApi.SetVariable("load_system_dawg", "1");
vocabApi.SetVariable("load_freq_dawg", "1");
vocabApi.SetVariable("load_unambig_dawg", "1");
vocabApi.SetVariable("user_words_suffix", "user-words");
// Adjust language model penalties for custom vocabulary
vocabApi.SetVariable("language_model_penalty_non_dict_word", "0.3");
vocabApi.SetVariable("language_model_penalty_non_freq_dict_word", "0.2");
// Enable user patterns (tessdata/eng.user-patterns)
vocabApi.SetVariable("user_patterns_suffix", "user-patterns");

Handle documents with mixed languages and scripts.
/**
 * Example pipeline for documents that mix languages: probe the image with a
 * list of single-language models, keep the ones that score well, then run a
 * final pass with the best-scoring languages combined.
 */
public class MultilingualProcessor {
    /** Language used for the final pass when no probed language scored well enough. */
    private static final String DEFAULT_LANGUAGE = "eng";

    /** Bounding box and confidence of a region attributed to one language. */
    public static class LanguageRegion {
        public String language;
        public int left, top, right, bottom;
        public double confidence;
    }

    /**
     * Recognizes {@code image} with the combination of languages that scored
     * best when tried individually.
     *
     * @param image image to recognize
     * @return recognized text; empty when the final language set could not be
     *         initialized or produced no text
     */
    public static String processMultilingualDocument(PIX image) {
        TessBaseAPI api = new TessBaseAPI();
        StringBuilder result = new StringBuilder();
        try {
            // Step 1: Detect layout and potential language regions
            if (api.Init(null, "osd") == 0) { // Orientation and Script Detection
                api.SetPageSegMode(PSM_OSD_ONLY);
                api.SetImage(image);
                // Get orientation info
                PageIterator pageIt = api.AnalyseLayout();
                // ... orientation detection logic ...
            }
            api.End();

            // Step 2: Process with multiple language models
            String[] languageTests = {
                "eng", "fra", "deu", "spa", "ita", // Latin scripts
                "rus", "ukr", "bul", // Cyrillic
                "ara", "fas", // Arabic
                "chi_sim", "jpn", "kor" // CJK
            };
            // Test each language and find best matches per region
            Map<String, Double> languageConfidences = new HashMap<>();
            for (String lang : languageTests) {
                // Skip languages whose model cannot be loaded instead of running
                // recognition on an uninitialized engine.
                if (api.Init(null, lang) == 0) {
                    api.SetImage(image);
                    String text = readText(api);
                    int confidence = api.MeanTextConf();
                    if (confidence > 70 && text != null && !text.trim().isEmpty()) {
                        languageConfidences.put(lang, (double) confidence);
                    }
                }
                api.End();
            }

            // Step 3: Use best language combination
            String bestLanguages = determineBestLanguageSet(languageConfidences);
            if (bestLanguages.isEmpty()) {
                // No candidate passed the threshold: name the fallback explicitly
                // rather than initializing with an empty language string.
                bestLanguages = DEFAULT_LANGUAGE;
            }
            if (api.Init(null, bestLanguages) == 0) {
                api.SetPageSegMode(PSM_AUTO);
                api.SetImage(image);
                String text = readText(api);
                if (text != null) {
                    result.append(text);
                }
            }
        } finally {
            api.End();
        }
        return result.toString();
    }

    /** Copies the recognized text into a Java String and releases the native buffer. */
    private static String readText(TessBaseAPI api) {
        BytePointer textPtr = api.GetUTF8Text();
        if (textPtr == null) {
            return null;
        }
        try {
            return textPtr.getString();
        } finally {
            textPtr.deallocate();
        }
    }

    /**
     * Joins the (at most) three highest-scoring languages with "+".
     *
     * @param confidences confidence per language code
     * @return language string for Init, or an empty string when the map is empty
     */
    private static String determineBestLanguageSet(Map<String, Double> confidences) {
        // Logic to combine compatible languages based on confidence scores
        List<String> topLanguages = confidences.entrySet().stream()
            .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
            .limit(3)
            .map(Map.Entry::getKey)
            .collect(Collectors.toList());
        return String.join("+", topLanguages);
    }
}

Access information about loaded language models and their capabilities.
/** Language-information subset of the base API. */
public class TessBaseAPI {
    // Language information

    /**
     * Languages of the current initialization as a native string pointer; read
     * it with {@code getString()}. Declared as in the earlier listing of this
     * method.
     */
    public native @Cast("const char*") BytePointer GetInitLanguagesAsString();

    /** Fills {@code langs} with the loaded languages. */
    public void GetLoadedLanguagesAsVector(StringVector langs);

    /** Fills {@code langs} with the available languages. */
    public void GetAvailableLanguagesAsVector(StringVector langs);
}
/** Per-word language query on the result iterator. */
public class ResultIterator {
    // Per-word language detection

    /** Language of the current word as a native string pointer; read it with {@code getString()}. */
    public BytePointer WordRecognitionLanguage();
}

TessBaseAPI api = new TessBaseAPI();
api.Init(null, "eng+fra+deu+ara+chi_sim");
// Get comprehensive language information
// GetInitLanguagesAsString returns a native string pointer; convert it before
// concatenating, otherwise the pointer object itself is printed.
System.out.println("Initialized languages: " + api.GetInitLanguagesAsString().getString());
StringVector loaded = new StringVector();
api.GetLoadedLanguagesAsVector(loaded);
System.out.println("Loaded language models:");
for (long i = 0; i < loaded.size(); i++) {
    System.out.println(" " + loaded.get(i).getString());
}
StringVector available = new StringVector();
api.GetAvailableLanguagesAsVector(available);
System.out.println("Available language models:");
for (long i = 0; i < available.size(); i++) {
    System.out.println(" " + available.get(i).getString());
}
// Analyze language detection per word
api.SetImage(multilingualImage);
// Recognition must run before an iterator over its results can be obtained.
api.Recognize(null);
ResultIterator resultIt = api.GetIterator();
Map<String, Integer> langCounts = new HashMap<>();
if (resultIt != null) {
    resultIt.Begin();
    do {
        // The language name may be absent for a word; it is only read here.
        BytePointer langPtr = resultIt.WordRecognitionLanguage();
        String wordLang = (langPtr != null) ? langPtr.getString() : null;
        langCounts.merge(wordLang, 1, Integer::sum);
    } while (resultIt.Next(RIL_WORD));
}
System.out.println("Language distribution in document:");
langCounts.forEach((lang, count) ->
    System.out.println(" " + lang + ": " + count + " words"));

Handle missing language models and provide fallback strategies.
/**
 * OCR helpers that keep working when a requested language model is missing.
 */
public class RobustLanguageOCR {
    /** Mean confidence above which the preferred-language result is accepted as is. */
    private static final int GOOD_CONFIDENCE = 60;

    /**
     * Recognizes {@code image} with {@code preferredLangs}, retrying with English
     * when the preferred languages cannot be loaded or score poorly.
     *
     * @param image image to recognize
     * @param preferredLangs language string tried first, e.g. "eng+fra"
     * @return the recognized text
     * @throws RuntimeException if neither the preferred languages nor English
     *         could be loaded
     */
    public static String recognizeWithFallback(PIX image, String preferredLangs) {
        TessBaseAPI api = new TessBaseAPI();
        try {
            String lowConfidenceResult = null;
            // Try preferred languages first
            if (api.Init(null, preferredLangs) == 0) {
                api.SetImage(image);
                String result = readText(api);
                int confidence = api.MeanTextConf();
                if (confidence > GOOD_CONFIDENCE) { // Good confidence
                    return result;
                }
                lowConfidenceResult = result;
            }
            // Fallback to English if preferred languages fail
            api.End();
            if (api.Init(null, "eng") == 0) {
                api.SetImage(image);
                String result = readText(api);
                System.out.println("Fell back to English recognition");
                return result;
            }
            // English is unavailable, but the preferred languages did load: a
            // low-confidence result beats reporting that nothing could be loaded.
            if (lowConfidenceResult != null) {
                return lowConfidenceResult;
            }
            throw new RuntimeException("No language models could be loaded");
        } finally {
            api.End();
        }
    }

    /**
     * Reports whether {@code language} can be initialized.
     *
     * @param language language string to probe
     * @return {@code true} when Init succeeds (returns 0)
     */
    public static boolean isLanguageAvailable(String language) {
        TessBaseAPI api = new TessBaseAPI();
        try {
            return api.Init(null, language) == 0;
        } finally {
            api.End();
        }
    }

    /**
     * Filters {@code candidates} down to the languages that can be initialized,
     * printing a message for each one that cannot.
     *
     * @param candidates language codes to probe
     * @return the usable codes, in input order
     */
    public static List<String> getWorkingLanguages(String[] candidates) {
        List<String> working = new ArrayList<>();
        for (String lang : candidates) {
            if (isLanguageAvailable(lang)) {
                working.add(lang);
            } else {
                System.out.println("Language model not available: " + lang);
            }
        }
        return working;
    }

    /** Copies the recognized text into a Java String and releases the native buffer. */
    private static String readText(TessBaseAPI api) {
        BytePointer textPtr = api.GetUTF8Text();
        if (textPtr == null) {
            return null;
        }
        try {
            return textPtr.getString();
        } finally {
            textPtr.deallocate();
        }
    }
}

// Common language codes (ISO 639-3)
// Language codes as passed to TessBaseAPI.Init
public static final String LANG_ENGLISH = "eng";
public static final String LANG_FRENCH = "fra";
public static final String LANG_GERMAN = "deu";
public static final String LANG_SPANISH = "spa";
public static final String LANG_ITALIAN = "ita";
public static final String LANG_PORTUGUESE = "por";
public static final String LANG_RUSSIAN = "rus";
public static final String LANG_ARABIC = "ara";
public static final String LANG_CHINESE_SIMPLIFIED = "chi_sim";
public static final String LANG_CHINESE_TRADITIONAL = "chi_tra";
public static final String LANG_JAPANESE = "jpn";
public static final String LANG_KOREAN = "kor";
public static final String LANG_HINDI = "hin";
public static final String LANG_HEBREW = "heb";

// Writing direction constants
public static final int WRITING_DIRECTION_LEFT_TO_RIGHT = 0;
public static final int WRITING_DIRECTION_RIGHT_TO_LEFT = 1;
public static final int WRITING_DIRECTION_TOP_TO_BOTTOM = 2;

// Script direction constants
public static final int DIR_NEUTRAL = 0;
public static final int DIR_LEFT_TO_RIGHT = 1;
public static final int DIR_RIGHT_TO_LEFT = 2;
public static final int DIR_MIX = 3;

// String vector for language lists
/** Native string list, filled by the language-list getters. */
public class StringVector {
    /** Number of entries. */
    public long size();

    /** Entry {@code i} as a native string pointer; read it with {@code getString()}. */
    public BytePointer get(long i);
    // Used by GetLoadedLanguagesAsVector and GetAvailableLanguagesAsVector
}

Install with Tessl CLI
npx tessl i tessl/maven-org-bytedeco--tesseract-platform