0
# Language Support
1
2
Multi-language OCR with support for 100+ languages, custom language models, and language detection capabilities. Tesseract provides comprehensive support for different scripts, writing systems, and language-specific recognition optimizations.
3
4
## Capabilities
5
6
### Language Initialization
7
8
Initialize Tesseract with one or more languages for recognition.
9
10
```java { .api }
11
public class TessBaseAPI {
12
// Language initialization
13
public int Init(String datapath, String language, int oem);
14
public int Init(String datapath, String language);
15
16
// Language information
17
public native @Cast("const char*") BytePointer GetInitLanguagesAsString();
18
public void GetLoadedLanguagesAsVector(StringVector langs);
19
public void GetAvailableLanguagesAsVector(StringVector langs);
20
}
21
```
22
23
**Language Code Format:**
24
- **Single language**: `"eng"` (English), `"fra"` (French), `"deu"` (German)
25
- **Multiple languages**: `"eng+fra+deu"` (English + French + German)
26
- **Non-Latin scripts**: `"chi_sim"` (Simplified Chinese), `"ara"` (Arabic)
27
28
#### Usage Example
29
30
```java
31
TessBaseAPI api = new TessBaseAPI();
32
33
// Initialize with single language
34
int result = api.Init(null, "eng");
35
36
// Initialize with multiple languages
37
int result2 = api.Init(null, "eng+fra+deu");
38
39
// Initialize with mixed scripts
40
int result3 = api.Init(null, "eng+ara+chi_sim");
41
42
// Check which languages were loaded
43
BytePointer loadedLangsPtr = api.GetInitLanguagesAsString();
44
String loadedLangs = loadedLangsPtr.getString();
45
System.out.println("Loaded languages: " + loadedLangs);
46
loadedLangsPtr.deallocate();
47
48
// Get available languages as vector
49
StringVector availableLangs = new StringVector();
50
api.GetAvailableLanguagesAsVector(availableLangs);
51
52
System.out.println("Available languages:");
53
for (int i = 0; i < availableLangs.size(); i++) {
54
System.out.println(" " + availableLangs.get(i));
55
}
56
```
57
58
### Common Language Codes
59
60
**Latin Script Languages:**
61
```java
62
// Western European
63
"eng" // English
64
"fra" // French
65
"deu" // German
66
"ita" // Italian
67
"spa" // Spanish
68
"por" // Portuguese
69
"nld" // Dutch
70
"dan" // Danish
71
"nor" // Norwegian
72
"swe" // Swedish
73
"fin" // Finnish
74
75
// Eastern European
76
"pol" // Polish
77
"ces" // Czech
78
"slk" // Slovak
79
"hun" // Hungarian
80
"ron" // Romanian
81
"hrv" // Croatian
82
"slv" // Slovenian
83
"est" // Estonian
84
"lav" // Latvian
85
"lit" // Lithuanian
86
```
87
88
**Non-Latin Script Languages:**
89
```java
90
// Cyrillic
91
"rus" // Russian
92
"ukr" // Ukrainian
93
"bul" // Bulgarian
94
"srp" // Serbian
95
"mkd" // Macedonian
96
"bel" // Belarusian
97
98
// Arabic Script
99
"ara" // Arabic
100
"fas" // Persian (Farsi)
101
"urd" // Urdu
102
"pus" // Pashto
103
104
// Asian Scripts
105
"chi_sim" // Simplified Chinese
106
"chi_tra" // Traditional Chinese
107
"jpn" // Japanese
108
"kor" // Korean
109
"tha" // Thai
110
"vie" // Vietnamese
111
"khm" // Khmer (Cambodian)
112
"lao" // Lao
113
114
// Indic Scripts
115
"hin" // Hindi
116
"ben" // Bengali
117
"guj" // Gujarati
118
"pan" // Punjabi
119
"tel" // Telugu
120
"kan" // Kannada
121
"mal" // Malayalam
122
"tam" // Tamil
123
"ori" // Odia
124
"san" // Sanskrit
125
126
// Other Scripts
127
"heb" // Hebrew
128
"ell" // Greek
129
"amh" // Amharic
130
"kat" // Georgian
131
"hye" // Armenian
132
```
133
134
#### Language Selection Example
135
136
```java
137
public class MultiLanguageOCR {
138
139
public static String recognizeWithLanguageDetection(PIX image) {
140
TessBaseAPI api = new TessBaseAPI();
141
142
try {
143
// Try common language combinations based on context
144
String[] languageSets = {
145
"eng", // English only
146
"eng+fra+deu", // Western European
147
"eng+spa+por", // English + Iberian
148
"eng+rus+ukr", // English + Slavic Cyrillic
149
"eng+ara", // English + Arabic
150
"eng+chi_sim+jpn+kor" // English + East Asian
151
};
152
153
String bestResult = "";
154
int bestConfidence = 0;
155
156
for (String langs : languageSets) {
157
api.End(); // Clean up previous initialization
158
159
if (api.Init(null, langs) == 0) {
160
api.SetImage(image);
161
String text = api.GetUTF8Text();
162
int confidence = api.MeanTextConf();
163
164
System.out.println("Languages: " + langs + ", Confidence: " + confidence);
165
166
if (confidence > bestConfidence) {
167
bestConfidence = confidence;
168
bestResult = text;
169
}
170
}
171
}
172
173
return bestResult;
174
175
} finally {
176
api.End();
177
}
178
}
179
}
180
```
181
182
### Script and Writing Direction Detection
183
184
Automatic detection of script types and text direction for proper processing.
185
186
```java { .api }
187
public class PageIterator {
188
// Orientation and script information
189
public void Orientation(int[] orientation, int[] writing_direction,
190
int[] textline_order, float[] deskew_angle);
191
}
192
193
public class ResultIterator {
194
// Language detection per word
195
public String WordRecognitionLanguage();
196
public int WordDirection();
197
public boolean ParagraphIsLtr();
198
}
199
200
// Writing direction constants
201
public static final int WRITING_DIRECTION_LEFT_TO_RIGHT = 0;
202
public static final int WRITING_DIRECTION_RIGHT_TO_LEFT = 1;
203
public static final int WRITING_DIRECTION_TOP_TO_BOTTOM = 2;
204
205
// Script direction constants
206
public static final int DIR_NEUTRAL = 0; // Neutral characters
207
public static final int DIR_LEFT_TO_RIGHT = 1; // LTR scripts (Latin, Cyrillic)
208
public static final int DIR_RIGHT_TO_LEFT = 2; // RTL scripts (Arabic, Hebrew)
209
public static final int DIR_MIX = 3; // Mixed direction text
210
```
211
212
#### Usage Example
213
214
```java
215
TessBaseAPI api = new TessBaseAPI();
216
api.Init(null, "eng+ara+heb"); // Mixed LTR/RTL languages
217
api.SetImage(image);
218
219
ResultIterator resultIt = api.GetIterator();
220
resultIt.Begin();
221
222
// Analyze text direction and language per word
223
do {
224
String word = resultIt.GetUTF8Text(RIL_WORD);
225
String wordLang = resultIt.WordRecognitionLanguage();
226
int direction = resultIt.WordDirection();
227
228
String directionName = switch (direction) {
229
case DIR_LEFT_TO_RIGHT -> "LTR";
230
case DIR_RIGHT_TO_LEFT -> "RTL";
231
case DIR_MIX -> "Mixed";
232
default -> "Neutral";
233
};
234
235
System.out.printf("Word: '%s' Language: %s Direction: %s\n",
236
word, wordLang, directionName);
237
238
} while (resultIt.Next(RIL_WORD));
239
240
// Check paragraph direction
241
resultIt.Begin();
242
if (resultIt.IsAtBeginningOf(RIL_PARA)) {
243
boolean isLtr = resultIt.ParagraphIsLtr();
244
System.out.println("Paragraph direction: " +
245
(isLtr ? "Left-to-Right" : "Right-to-Left"));
246
}
247
```
248
249
### Language-Specific Configuration
250
251
Optimize recognition for specific languages and scripts.
252
253
#### Arabic Script Configuration
254
255
```java
256
TessBaseAPI api = new TessBaseAPI();
257
api.Init(null, "ara");
258
259
// Arabic-specific optimizations
260
api.SetVariable("textord_arabic_text", "1");
261
api.SetVariable("textord_use_cjk_fp_model", "0");
262
api.SetVariable("preserve_interword_spaces", "1");
263
264
// Enable bidirectional text support
265
api.SetPageSegMode(PSM_AUTO);
266
```
267
268
#### Chinese/Japanese/Korean (CJK) Configuration
269
270
```java
271
TessBaseAPI api = new TessBaseAPI();
272
api.Init(null, "chi_sim+jpn+kor");
273
274
// CJK-specific optimizations
275
api.SetVariable("textord_use_cjk_fp_model", "1");
276
api.SetVariable("language_model_penalty_non_dict_word", "0.25");
277
api.SetVariable("language_model_penalty_non_freq_dict_word", "0.15");
278
279
// Vertical text support
280
api.SetPageSegMode(PSM_AUTO);
281
api.SetVariable("textord_tabfind_vertical_text", "1");
282
```
283
284
#### Indic Script Configuration
285
286
```java
287
TessBaseAPI api = new TessBaseAPI();
288
api.Init(null, "hin+ben+guj");
289
290
// Indic script optimizations
291
api.SetVariable("textord_use_cjk_fp_model", "0");
292
api.SetVariable("preserve_interword_spaces", "1");
293
api.SetVariable("segment_penalty_dict_nonword", "1.25");
294
```
295
296
### Custom Language Models
297
298
Work with custom trained language models and specialized vocabularies.
299
300
#### Loading Custom Models
301
302
```java
303
// Custom language models are placed in tessdata directory
304
// with naming convention: <lang>.traineddata
305
306
TessBaseAPI api = new TessBaseAPI();
307
308
// Load custom model (place custom_eng.traineddata in tessdata)
309
int result = api.Init("/path/to/custom/tessdata", "custom_eng");
310
311
// Combine custom with standard models
312
int result2 = api.Init("/path/to/tessdata", "eng+custom_domain");
313
314
// Use specialized models for specific domains
315
int result3 = api.Init("/path/to/tessdata", "eng_medical"); // Medical terminology
316
int result4 = api.Init("/path/to/tessdata", "eng_legal"); // Legal documents
317
```
318
319
#### Custom Vocabulary Configuration
320
321
```java
322
TessBaseAPI api = new TessBaseAPI();
api.Init(null, "eng");

// NOTE(review): Tesseract documents SetVariable() as effective only for non-init
// parameters. The load_*_dawg and user_*_suffix settings below look like init-time
// parameters, so confirm whether they must be supplied to Init() (or via a config
// file) instead of being set afterwards.

// Keep the built-in dictionaries enabled alongside the custom vocabulary
api.SetVariable("load_system_dawg", "1");
api.SetVariable("load_freq_dawg", "1");
api.SetVariable("load_unambig_dawg", "1");

// Load custom word list (one word per line in tessdata/eng.user-words)
api.SetVariable("user_words_suffix", "user-words");

// Adjust language model penalties for custom vocabulary
api.SetVariable("language_model_penalty_non_dict_word", "0.3");
api.SetVariable("language_model_penalty_non_freq_dict_word", "0.2");

// Enable user patterns (tessdata/eng.user-patterns)
api.SetVariable("user_patterns_suffix", "user-patterns");
336
```
337
338
### Multilingual Document Processing
339
340
Handle documents with mixed languages and scripts.
341
342
#### Language Switching Strategy
343
344
```java
345
public class MultilingualProcessor {
346
347
public static class LanguageRegion {
348
public String language;
349
public int left, top, right, bottom;
350
public double confidence;
351
}
352
353
public static String processMultilingualDocument(PIX image) {
354
TessBaseAPI api = new TessBaseAPI();
355
StringBuilder result = new StringBuilder();
356
357
try {
358
// Step 1: Detect layout and potential language regions
359
api.Init(null, "osd"); // Orientation and Script Detection
360
api.SetPageSegMode(PSM_OSD_ONLY);
361
api.SetImage(image);
362
363
// Get orientation info
364
PageIterator pageIt = api.AnalyseLayout();
365
// ... orientation detection logic ...
366
367
// Step 2: Process with multiple language models
368
String[] languageTests = {
369
"eng", "fra", "deu", "spa", "ita", // Latin scripts
370
"rus", "ukr", "bul", // Cyrillic
371
"ara", "fas", // Arabic
372
"chi_sim", "jpn", "kor" // CJK
373
};
374
375
api.End();
376
377
// Test each language and find best matches per region
378
Map<String, Double> languageConfidences = new HashMap<>();
379
380
for (String lang : languageTests) {
381
api.Init(null, lang);
382
api.SetImage(image);
383
384
String text = api.GetUTF8Text();
385
int confidence = api.MeanTextConf();
386
387
if (confidence > 70 && !text.trim().isEmpty()) {
388
languageConfidences.put(lang, (double) confidence);
389
}
390
391
api.End();
392
}
393
394
// Step 3: Use best language combination
395
String bestLanguages = determineBestLanguageSet(languageConfidences);
396
397
api.Init(null, bestLanguages);
398
api.SetPageSegMode(PSM_AUTO);
399
api.SetImage(image);
400
401
result.append(api.GetUTF8Text());
402
403
} finally {
404
api.End();
405
}
406
407
return result.toString();
408
}
409
410
private static String determineBestLanguageSet(Map<String, Double> confidences) {
411
// Logic to combine compatible languages based on confidence scores
412
List<String> topLanguages = confidences.entrySet().stream()
413
.sorted(Map.Entry.<String, Double>comparingByValue().reversed())
414
.limit(3)
415
.map(Map.Entry::getKey)
416
.collect(Collectors.toList());
417
418
return String.join("+", topLanguages);
419
}
420
}
421
```
422
423
### Language Model Information
424
425
Access information about loaded language models and their capabilities.
426
427
```java { .api }
428
public class TessBaseAPI {
429
// Language information
430
public native @Cast("const char*") BytePointer GetInitLanguagesAsString();
431
public void GetLoadedLanguagesAsVector(StringVector langs);
432
public void GetAvailableLanguagesAsVector(StringVector langs);
433
}
434
435
public class ResultIterator {
436
// Per-word language detection
437
public String WordRecognitionLanguage();
438
}
439
```
440
441
#### Language Introspection Example
442
443
```java
444
TessBaseAPI api = new TessBaseAPI();
api.Init(null, "eng+fra+deu+ara+chi_sim");

// Get comprehensive language information.
// GetInitLanguagesAsString() hands back a BytePointer, so convert it with
// getString(); concatenating the pointer itself would not print the text.
System.out.println("Initialized languages: " + api.GetInitLanguagesAsString().getString());

StringVector loaded = new StringVector();
api.GetLoadedLanguagesAsVector(loaded);
System.out.println("Loaded language models:");
for (int i = 0; i < loaded.size(); i++) {
    System.out.println("  " + loaded.get(i));
}

StringVector available = new StringVector();
api.GetAvailableLanguagesAsVector(available);
System.out.println("Available language models:");
for (int i = 0; i < available.size(); i++) {
    System.out.println("  " + available.get(i));
}

// Analyze language detection per word
api.SetImage(multilingualImage);
ResultIterator resultIt = api.GetIterator();
resultIt.Begin();

// Count how many words were recognized with each language
Map<String, Integer> langCounts = new HashMap<>();
do {
    String wordLang = resultIt.WordRecognitionLanguage();
    langCounts.merge(wordLang, 1, Integer::sum);
} while (resultIt.Next(RIL_WORD));

System.out.println("Language distribution in document:");
langCounts.forEach((lang, count) ->
    System.out.println("  " + lang + ": " + count + " words"));
478
```
479
480
### Error Handling and Language Fallbacks
481
482
Handle missing language models and provide fallback strategies.
483
484
```java
485
public class RobustLanguageOCR {
486
487
public static String recognizeWithFallback(PIX image, String preferredLangs) {
488
TessBaseAPI api = new TessBaseAPI();
489
490
try {
491
// Try preferred languages first
492
if (api.Init(null, preferredLangs) == 0) {
493
api.SetImage(image);
494
String result = api.GetUTF8Text();
495
int confidence = api.MeanTextConf();
496
497
if (confidence > 60) { // Good confidence
498
return result;
499
}
500
}
501
502
// Fallback to English if preferred languages fail
503
api.End();
504
if (api.Init(null, "eng") == 0) {
505
api.SetImage(image);
506
String result = api.GetUTF8Text();
507
System.out.println("Fell back to English recognition");
508
return result;
509
}
510
511
throw new RuntimeException("No language models could be loaded");
512
513
} finally {
514
api.End();
515
}
516
}
517
518
public static boolean isLanguageAvailable(String language) {
519
TessBaseAPI api = new TessBaseAPI();
520
try {
521
int result = api.Init(null, language);
522
return (result == 0);
523
} finally {
524
api.End();
525
}
526
}
527
528
public static List<String> getWorkingLanguages(String[] candidates) {
529
List<String> working = new ArrayList<>();
530
531
for (String lang : candidates) {
532
if (isLanguageAvailable(lang)) {
533
working.add(lang);
534
} else {
535
System.out.println("Language model not available: " + lang);
536
}
537
}
538
539
return working;
540
}
541
}
542
```
543
544
## Types
545
546
### Language Code Examples
547
548
```java { .api }
549
// Common Tesseract language codes (ISO 639-style three-letter codes; "chi_sim"/"chi_tra" are Tesseract-specific)
550
public static final String LANG_ENGLISH = "eng";
551
public static final String LANG_FRENCH = "fra";
552
public static final String LANG_GERMAN = "deu";
553
public static final String LANG_SPANISH = "spa";
554
public static final String LANG_ITALIAN = "ita";
555
public static final String LANG_PORTUGUESE = "por";
556
public static final String LANG_RUSSIAN = "rus";
557
public static final String LANG_ARABIC = "ara";
558
public static final String LANG_CHINESE_SIMPLIFIED = "chi_sim";
559
public static final String LANG_CHINESE_TRADITIONAL = "chi_tra";
560
public static final String LANG_JAPANESE = "jpn";
561
public static final String LANG_KOREAN = "kor";
562
public static final String LANG_HINDI = "hin";
563
public static final String LANG_HEBREW = "heb";
564
```
565
566
### Text Direction Constants
567
568
```java { .api }
569
public static final int WRITING_DIRECTION_LEFT_TO_RIGHT = 0;
570
public static final int WRITING_DIRECTION_RIGHT_TO_LEFT = 1;
571
public static final int WRITING_DIRECTION_TOP_TO_BOTTOM = 2;
572
573
public static final int DIR_NEUTRAL = 0;
574
public static final int DIR_LEFT_TO_RIGHT = 1;
575
public static final int DIR_RIGHT_TO_LEFT = 2;
576
public static final int DIR_MIX = 3;
577
```
578
579
### Language Information Structures
580
581
```java { .api }
582
// String vector for language lists
583
public class StringVector {
584
public long size();
585
public String get(long i);
586
// Used by GetLoadedLanguagesAsVector and GetAvailableLanguagesAsVector
587
}
588
```