or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md core-ocr-engine.md index.md language-support.md layout-analysis.md output-renderers.md result-navigation.md

language-support.mddocs/

0

# Language Support

1

2

Multi-language OCR with support for 100+ languages, custom language models, and language detection capabilities. Tesseract provides comprehensive support for different scripts, writing systems, and language-specific recognition optimizations.

3

4

## Capabilities

5

6

### Language Initialization

7

8

Initialize Tesseract with one or more languages for recognition.

9

10

```java { .api }

11

public class TessBaseAPI {

12

// Language initialization

13

public int Init(String datapath, String language, int oem);

14

public int Init(String datapath, String language);

15

16

// Language information

17

public native @Cast("const char*") BytePointer GetInitLanguagesAsString();

18

public void GetLoadedLanguagesAsVector(StringVector langs);

19

public void GetAvailableLanguagesAsVector(StringVector langs);

20

}

21

```

22

23

**Language Code Format:**

24

- **Single language**: `"eng"` (English), `"fra"` (French), `"deu"` (German)

25

- **Multiple languages**: `"eng+fra+deu"` (English + French + German)

26

- **Script-based**: `"chi_sim"` (Simplified Chinese), `"ara"` (Arabic)

27

28

#### Usage Example

29

30

```java

31

TessBaseAPI api = new TessBaseAPI();

// Init returns 0 on success and a non-zero value when a model cannot be loaded.

// Initialize with single language
int result = api.Init(null, "eng");

// Initialize with multiple languages
int result2 = api.Init(null, "eng+fra+deu");

// Initialize with mixed scripts
int result3 = api.Init(null, "eng+ara+chi_sim");

// Check which languages were loaded.
// The returned pointer refers to a string held by the API object, so it is
// only read here and is not deallocated by the caller.
BytePointer loadedLangsPtr = api.GetInitLanguagesAsString();
String loadedLangs = loadedLangsPtr.getString();
System.out.println("Loaded languages: " + loadedLangs);

// Get available languages as vector
StringVector availableLangs = new StringVector();
api.GetAvailableLanguagesAsVector(availableLangs);

System.out.println("Available languages:");
for (int i = 0; i < availableLangs.size(); i++) {
    System.out.println(" " + availableLangs.get(i));
}

56

```

57

58

### Common Language Codes

59

60

**Latin Script Languages:**

61

```java

62

// Western European

63

"eng" // English

64

"fra" // French

65

"deu" // German

66

"ita" // Italian

67

"spa" // Spanish

68

"por" // Portuguese

69

"nld" // Dutch

70

"dan" // Danish

71

"nor" // Norwegian

72

"swe" // Swedish

73

"fin" // Finnish

74

75

// Eastern European

76

"pol" // Polish

77

"ces" // Czech

78

"slk" // Slovak

79

"hun" // Hungarian

80

"ron" // Romanian

81

"hrv" // Croatian

82

"slv" // Slovenian

83

"est" // Estonian

84

"lav" // Latvian

85

"lit" // Lithuanian

86

```

87

88

**Non-Latin Script Languages:**

89

```java

90

// Cyrillic

91

"rus" // Russian

92

"ukr" // Ukrainian

93

"bul" // Bulgarian

94

"srp" // Serbian

95

"mkd" // Macedonian

96

"bel" // Belarusian

97

98

// Arabic Script

99

"ara" // Arabic

100

"fas" // Persian (Farsi)

101

"urd" // Urdu

102

"pus" // Pashto

103

104

// Asian Scripts

105

"chi_sim" // Simplified Chinese

106

"chi_tra" // Traditional Chinese

107

"jpn" // Japanese

108

"kor" // Korean

109

"tha" // Thai

110

"vie" // Vietnamese

111

"khm" // Khmer (Cambodian)

112

"lao" // Lao

113

114

// Indic Scripts

115

"hin" // Hindi

116

"ben" // Bengali

117

"guj" // Gujarati

118

"pan" // Punjabi

119

"tel" // Telugu

120

"kan" // Kannada

121

"mal" // Malayalam

122

"tam" // Tamil

123

"ori" // Odia

124

"san" // Sanskrit

125

126

// Other Scripts

127

"heb" // Hebrew

128

"ell" // Greek

129

"amh" // Amharic

130

"kat" // Georgian

131

"hye" // Armenian

132

```

133

134

#### Language Selection Example

135

136

```java

137

public class MultiLanguageOCR {

138

139

public static String recognizeWithLanguageDetection(PIX image) {

140

TessBaseAPI api = new TessBaseAPI();

141

142

try {

143

// Try common language combinations based on context

144

String[] languageSets = {

145

"eng", // English only

146

"eng+fra+deu", // Western European

147

"eng+spa+por", // English + Iberian

148

"eng+rus+ukr", // English + Slavic Cyrillic

149

"eng+ara", // English + Arabic

150

"eng+chi_sim+jpn+kor" // English + East Asian

151

};

152

153

String bestResult = "";

154

int bestConfidence = 0;

155

156

for (String langs : languageSets) {

157

api.End(); // Clean up previous initialization

158

159

if (api.Init(null, langs) == 0) {

160

api.SetImage(image);

161

String text = api.GetUTF8Text();

162

int confidence = api.MeanTextConf();

163

164

System.out.println("Languages: " + langs + ", Confidence: " + confidence);

165

166

if (confidence > bestConfidence) {

167

bestConfidence = confidence;

168

bestResult = text;

169

}

170

}

171

}

172

173

return bestResult;

174

175

} finally {

176

api.End();

177

}

178

}

179

}

180

```

181

182

### Script and Writing Direction Detection

183

184

Automatic detection of script types and text direction for proper processing.

185

186

```java { .api }

187

public class PageIterator {

188

// Orientation and script information

189

public void Orientation(int[] orientation, int[] writing_direction,

190

int[] textline_order, float[] deskew_angle);

191

}

192

193

public class ResultIterator {

194

// Language detection per word

195

public String WordRecognitionLanguage();

196

public int WordDirection();

197

public boolean ParagraphIsLtr();

198

}

199

200

// Writing direction constants

201

public static final int WRITING_DIRECTION_LEFT_TO_RIGHT = 0;

202

public static final int WRITING_DIRECTION_RIGHT_TO_LEFT = 1;

203

public static final int WRITING_DIRECTION_TOP_TO_BOTTOM = 2;

204

205

// Script direction constants

206

public static final int DIR_NEUTRAL = 0; // Neutral characters

207

public static final int DIR_LEFT_TO_RIGHT = 1; // LTR scripts (Latin, Cyrillic)

208

public static final int DIR_RIGHT_TO_LEFT = 2; // RTL scripts (Arabic, Hebrew)

209

public static final int DIR_MIX = 3; // Mixed direction text

210

```

211

212

#### Usage Example

213

214

```java

215

TessBaseAPI api = new TessBaseAPI();
api.Init(null, "eng+ara+heb"); // Mixed LTR/RTL languages
api.SetImage(image);

// The result iterator walks recognition results, so recognition has to run
// before GetIterator() is called. Recognize returns 0 on success.
if (api.Recognize(null) != 0) {
    throw new IllegalStateException("Recognition failed");
}

ResultIterator resultIt = api.GetIterator();
if (resultIt == null) {
    throw new IllegalStateException("No recognition results available");
}
resultIt.Begin();

// Analyze text direction and language per word
do {
    String word = resultIt.GetUTF8Text(RIL_WORD);
    String wordLang = resultIt.WordRecognitionLanguage();
    int direction = resultIt.WordDirection();

    // Anything other than the three listed constants is reported as neutral
    String directionName = switch (direction) {
        case DIR_LEFT_TO_RIGHT -> "LTR";
        case DIR_RIGHT_TO_LEFT -> "RTL";
        case DIR_MIX -> "Mixed";
        default -> "Neutral";
    };

    System.out.printf("Word: '%s' Language: %s Direction: %s\n",
            word, wordLang, directionName);

} while (resultIt.Next(RIL_WORD));

// Check paragraph direction (rewind to the start of the results first)
resultIt.Begin();
if (resultIt.IsAtBeginningOf(RIL_PARA)) {
    boolean isLtr = resultIt.ParagraphIsLtr();
    System.out.println("Paragraph direction: " +
            (isLtr ? "Left-to-Right" : "Right-to-Left"));
}

247

```

248

249

### Language-Specific Configuration

250

251

Optimize recognition for specific languages and scripts.

252

253

#### Arabic Script Configuration

254

255

```java

256

TessBaseAPI api = new TessBaseAPI();

257

api.Init(null, "ara");

258

259

// Arabic-specific optimizations

260

api.SetVariable("textord_arabic_text", "1");

261

api.SetVariable("textord_use_cjk_fp_model", "0");

262

api.SetVariable("preserve_interword_spaces", "1");

263

264

// Enable bidirectional text support

265

api.SetPageSegMode(PSM_AUTO);

266

```

267

268

#### Chinese/Japanese/Korean (CJK) Configuration

269

270

```java

271

TessBaseAPI api = new TessBaseAPI();

272

api.Init(null, "chi_sim+jpn+kor");

273

274

// CJK-specific optimizations

275

api.SetVariable("textord_use_cjk_fp_model", "1");

276

api.SetVariable("language_model_penalty_non_dict_word", "0.25");

277

api.SetVariable("language_model_penalty_non_freq_dict_word", "0.15");

278

279

// Vertical text support

280

api.SetPageSegMode(PSM_AUTO);

281

api.SetVariable("textord_tabfind_vertical_text", "1");

282

```

283

284

#### Indic Script Configuration

285

286

```java

287

TessBaseAPI api = new TessBaseAPI();

288

api.Init(null, "hin+ben+guj");

289

290

// Indic script optimizations

291

api.SetVariable("textord_use_cjk_fp_model", "0");

292

api.SetVariable("preserve_interword_spaces", "1");

293

api.SetVariable("segment_penalty_dict_nonword", "1.25");

294

```

295

296

### Custom Language Models

297

298

Work with custom trained language models and specialized vocabularies.

299

300

#### Loading Custom Models

301

302

```java

303

// Custom language models are placed in tessdata directory

304

// with naming convention: <lang>.traineddata

305

306

TessBaseAPI api = new TessBaseAPI();

307

308

// Load custom model (place custom_eng.traineddata in tessdata)

309

int result = api.Init("/path/to/custom/tessdata", "custom_eng");

310

311

// Combine custom with standard models

312

int result2 = api.Init("/path/to/tessdata", "eng+custom_domain");

313

314

// Use specialized models for specific domains

315

int result3 = api.Init("/path/to/tessdata", "eng_medical"); // Medical terminology

316

int result4 = api.Init("/path/to/tessdata", "eng_legal"); // Legal documents

317

```

318

319

#### Custom Vocabulary Configuration

320

321

```java

322

TessBaseAPI api = new TessBaseAPI();

323

api.Init(null, "eng");

324

325

// Load custom word list (one word per line in tessdata/eng.user-words)

326

api.SetVariable("load_system_dawg", "1");

327

api.SetVariable("load_freq_dawg", "1");

328

api.SetVariable("load_unambig_dawg", "1");

329

330

// Adjust language model penalties for custom vocabulary

331

api.SetVariable("language_model_penalty_non_dict_word", "0.3");

332

api.SetVariable("language_model_penalty_non_freq_dict_word", "0.2");

333

334

// Enable user patterns (tessdata/eng.user-patterns)

335

api.SetVariable("user_patterns_suffix", "user-patterns");

336

```

337

338

### Multilingual Document Processing

339

340

Handle documents with mixed languages and scripts.

341

342

#### Language Switching Strategy

343

344

```java

345

public class MultilingualProcessor {

346

347

public static class LanguageRegion {

348

public String language;

349

public int left, top, right, bottom;

350

public double confidence;

351

}

352

353

public static String processMultilingualDocument(PIX image) {

354

TessBaseAPI api = new TessBaseAPI();

355

StringBuilder result = new StringBuilder();

356

357

try {

358

// Step 1: Detect layout and potential language regions

359

api.Init(null, "osd"); // Orientation and Script Detection

360

api.SetPageSegMode(PSM_OSD_ONLY);

361

api.SetImage(image);

362

363

// Get orientation info

364

PageIterator pageIt = api.AnalyseLayout();

365

// ... orientation detection logic ...

366

367

// Step 2: Process with multiple language models

368

String[] languageTests = {

369

"eng", "fra", "deu", "spa", "ita", // Latin scripts

370

"rus", "ukr", "bul", // Cyrillic

371

"ara", "fas", // Arabic

372

"chi_sim", "jpn", "kor" // CJK

373

};

374

375

api.End();

376

377

// Test each language and find best matches per region

378

Map<String, Double> languageConfidences = new HashMap<>();

379

380

for (String lang : languageTests) {

381

api.Init(null, lang);

382

api.SetImage(image);

383

384

String text = api.GetUTF8Text();

385

int confidence = api.MeanTextConf();

386

387

if (confidence > 70 && !text.trim().isEmpty()) {

388

languageConfidences.put(lang, (double) confidence);

389

}

390

391

api.End();

392

}

393

394

// Step 3: Use best language combination

395

String bestLanguages = determineBestLanguageSet(languageConfidences);

396

397

api.Init(null, bestLanguages);

398

api.SetPageSegMode(PSM_AUTO);

399

api.SetImage(image);

400

401

result.append(api.GetUTF8Text());

402

403

} finally {

404

api.End();

405

}

406

407

return result.toString();

408

}

409

410

private static String determineBestLanguageSet(Map<String, Double> confidences) {

411

// Logic to combine compatible languages based on confidence scores

412

List<String> topLanguages = confidences.entrySet().stream()

413

.sorted(Map.Entry.<String, Double>comparingByValue().reversed())

414

.limit(3)

415

.map(Map.Entry::getKey)

416

.collect(Collectors.toList());

417

418

return String.join("+", topLanguages);

419

}

420

}

421

```

422

423

### Language Model Information

424

425

Access information about loaded language models and their capabilities.

426

427

```java { .api }

428

public class TessBaseAPI {

429

// Language information

430

public String GetInitLanguagesAsString();

431

public void GetLoadedLanguagesAsVector(StringVector langs);

432

public void GetAvailableLanguagesAsVector(StringVector langs);

433

}

434

435

public class ResultIterator {

436

// Per-word language detection

437

public String WordRecognitionLanguage();

438

}

439

```

440

441

#### Language Introspection Example

442

443

```java

444

TessBaseAPI api = new TessBaseAPI();
api.Init(null, "eng+fra+deu+ara+chi_sim");

// Get comprehensive language information.
// GetInitLanguagesAsString() returns a BytePointer; convert it with getString()
// so the language string is printed rather than the pointer object.
System.out.println("Initialized languages: " + api.GetInitLanguagesAsString().getString());

StringVector loaded = new StringVector();
api.GetLoadedLanguagesAsVector(loaded);
System.out.println("Loaded language models:");
for (int i = 0; i < loaded.size(); i++) {
    System.out.println(" " + loaded.get(i));
}

StringVector available = new StringVector();
api.GetAvailableLanguagesAsVector(available);
System.out.println("Available language models:");
for (int i = 0; i < available.size(); i++) {
    System.out.println(" " + available.get(i));
}

// Analyze language detection per word
api.SetImage(multilingualImage);

// The result iterator walks recognition results, so recognition has to run
// before GetIterator() is called. Recognize returns 0 on success.
if (api.Recognize(null) != 0) {
    throw new IllegalStateException("Recognition failed");
}

ResultIterator resultIt = api.GetIterator();
if (resultIt == null) {
    throw new IllegalStateException("No recognition results available");
}
resultIt.Begin();

// Count how many words were recognized with each language
Map<String, Integer> langCounts = new HashMap<>();
do {
    String wordLang = resultIt.WordRecognitionLanguage();
    langCounts.merge(wordLang, 1, Integer::sum);
} while (resultIt.Next(RIL_WORD));

System.out.println("Language distribution in document:");
langCounts.forEach((lang, count) ->
        System.out.println(" " + lang + ": " + count + " words"));

478

```

479

480

### Error Handling and Language Fallbacks

481

482

Handle missing language models and provide fallback strategies.

483

484

```java

485

public class RobustLanguageOCR {

486

487

public static String recognizeWithFallback(PIX image, String preferredLangs) {

488

TessBaseAPI api = new TessBaseAPI();

489

490

try {

491

// Try preferred languages first

492

if (api.Init(null, preferredLangs) == 0) {

493

api.SetImage(image);

494

String result = api.GetUTF8Text();

495

int confidence = api.MeanTextConf();

496

497

if (confidence > 60) { // Good confidence

498

return result;

499

}

500

}

501

502

// Fallback to English if preferred languages fail

503

api.End();

504

if (api.Init(null, "eng") == 0) {

505

api.SetImage(image);

506

String result = api.GetUTF8Text();

507

System.out.println("Fell back to English recognition");

508

return result;

509

}

510

511

throw new RuntimeException("No language models could be loaded");

512

513

} finally {

514

api.End();

515

}

516

}

517

518

public static boolean isLanguageAvailable(String language) {

519

TessBaseAPI api = new TessBaseAPI();

520

try {

521

int result = api.Init(null, language);

522

return (result == 0);

523

} finally {

524

api.End();

525

}

526

}

527

528

public static List<String> getWorkingLanguages(String[] candidates) {

529

List<String> working = new ArrayList<>();

530

531

for (String lang : candidates) {

532

if (isLanguageAvailable(lang)) {

533

working.add(lang);

534

} else {

535

System.out.println("Language model not available: " + lang);

536

}

537

}

538

539

return working;

540

}

541

}

542

```

543

544

## Types

545

546

### Language Code Examples

547

548

```java { .api }

549

// Common language codes (ISO 639-3)

550

public static final String LANG_ENGLISH = "eng";

551

public static final String LANG_FRENCH = "fra";

552

public static final String LANG_GERMAN = "deu";

553

public static final String LANG_SPANISH = "spa";

554

public static final String LANG_ITALIAN = "ita";

555

public static final String LANG_PORTUGUESE = "por";

556

public static final String LANG_RUSSIAN = "rus";

557

public static final String LANG_ARABIC = "ara";

558

public static final String LANG_CHINESE_SIMPLIFIED = "chi_sim";

559

public static final String LANG_CHINESE_TRADITIONAL = "chi_tra";

560

public static final String LANG_JAPANESE = "jpn";

561

public static final String LANG_KOREAN = "kor";

562

public static final String LANG_HINDI = "hin";

563

public static final String LANG_HEBREW = "heb";

564

```

565

566

### Text Direction Constants

567

568

```java { .api }

569

public static final int WRITING_DIRECTION_LEFT_TO_RIGHT = 0;

570

public static final int WRITING_DIRECTION_RIGHT_TO_LEFT = 1;

571

public static final int WRITING_DIRECTION_TOP_TO_BOTTOM = 2;

572

573

public static final int DIR_NEUTRAL = 0;

574

public static final int DIR_LEFT_TO_RIGHT = 1;

575

public static final int DIR_RIGHT_TO_LEFT = 2;

576

public static final int DIR_MIX = 3;

577

```

578

579

### Language Information Structures

580

581

```java { .api }

582

// String vector for language lists

583

public class StringVector {

584

public long size();

585

public String get(long i);

586

// Used by GetLoadedLanguagesAsVector and GetAvailableLanguagesAsVector

587

}

588

```