# Configuration and Constants

Tesseract.js provides extensive configuration options through constants for language selection, OCR engine modes, page segmentation modes, and logging control.

## Capabilities

### Language Constants

Complete set of language codes for all supported languages in Tesseract.js.

```javascript { .api }
/**
 * Language codes for all supported OCR languages.
 *
 * Each property maps a descriptive, upper-case name to the Tesseract language
 * code. The values are plain strings, so they can be passed to `createWorker`
 * either individually or as an array of codes.
 */
const languages: {
  AFR: 'afr'; // Afrikaans
  AMH: 'amh'; // Amharic
  ARA: 'ara'; // Arabic
  ASM: 'asm'; // Assamese
  AZE: 'aze'; // Azerbaijani
  AZE_CYRL: 'aze_cyrl'; // Azerbaijani - Cyrillic
  BEL: 'bel'; // Belarusian
  BEN: 'ben'; // Bengali
  BOD: 'bod'; // Tibetan
  BOS: 'bos'; // Bosnian
  BUL: 'bul'; // Bulgarian
  CAT: 'cat'; // Catalan; Valencian
  CEB: 'ceb'; // Cebuano
  CES: 'ces'; // Czech
  CHI_SIM: 'chi_sim'; // Chinese - Simplified
  CHI_TRA: 'chi_tra'; // Chinese - Traditional
  CHR: 'chr'; // Cherokee
  CYM: 'cym'; // Welsh
  DAN: 'dan'; // Danish
  DEU: 'deu'; // German
  DZO: 'dzo'; // Dzongkha
  ELL: 'ell'; // Greek, Modern (1453-)
  ENG: 'eng'; // English
  ENM: 'enm'; // English, Middle (1100-1500)
  EPO: 'epo'; // Esperanto
  EST: 'est'; // Estonian
  EUS: 'eus'; // Basque
  FAS: 'fas'; // Persian
  FIN: 'fin'; // Finnish
  FRA: 'fra'; // French
  FRK: 'frk'; // German Fraktur
  FRM: 'frm'; // French, Middle (ca. 1400-1600)
  GLE: 'gle'; // Irish
  GLG: 'glg'; // Galician
  GRC: 'grc'; // Greek, Ancient (-1453)
  GUJ: 'guj'; // Gujarati
  HAT: 'hat'; // Haitian; Haitian Creole
  HEB: 'heb'; // Hebrew
  HIN: 'hin'; // Hindi
  HRV: 'hrv'; // Croatian
  HUN: 'hun'; // Hungarian
  IKU: 'iku'; // Inuktitut
  IND: 'ind'; // Indonesian
  ISL: 'isl'; // Icelandic
  ITA: 'ita'; // Italian
  ITA_OLD: 'ita_old'; // Italian - Old
  JAV: 'jav'; // Javanese
  JPN: 'jpn'; // Japanese
  KAN: 'kan'; // Kannada
  KAT: 'kat'; // Georgian
  KAT_OLD: 'kat_old'; // Georgian - Old
  KAZ: 'kaz'; // Kazakh
  KHM: 'khm'; // Central Khmer
  KIR: 'kir'; // Kirghiz; Kyrgyz
  KOR: 'kor'; // Korean
  KUR: 'kur'; // Kurdish
  LAO: 'lao'; // Lao
  LAT: 'lat'; // Latin
  LAV: 'lav'; // Latvian
  LIT: 'lit'; // Lithuanian
  MAL: 'mal'; // Malayalam
  MAR: 'mar'; // Marathi
  MKD: 'mkd'; // Macedonian
  MLT: 'mlt'; // Maltese
  MSA: 'msa'; // Malay
  MYA: 'mya'; // Burmese
  NEP: 'nep'; // Nepali
  NLD: 'nld'; // Dutch; Flemish
  NOR: 'nor'; // Norwegian
  ORI: 'ori'; // Oriya
  PAN: 'pan'; // Panjabi; Punjabi
  POL: 'pol'; // Polish
  POR: 'por'; // Portuguese
  PUS: 'pus'; // Pushto; Pashto
  RON: 'ron'; // Romanian; Moldavian; Moldovan
  RUS: 'rus'; // Russian
  SAN: 'san'; // Sanskrit
  SIN: 'sin'; // Sinhala; Sinhalese
  SLK: 'slk'; // Slovak
  SLV: 'slv'; // Slovenian
  SPA: 'spa'; // Spanish; Castilian
  SPA_OLD: 'spa_old'; // Spanish; Castilian - Old
  SQI: 'sqi'; // Albanian
  SRP: 'srp'; // Serbian
  SRP_LATN: 'srp_latn'; // Serbian - Latin
  SWA: 'swa'; // Swahili
  SWE: 'swe'; // Swedish
  SYR: 'syr'; // Syriac
  TAM: 'tam'; // Tamil
  TEL: 'tel'; // Telugu
  TGK: 'tgk'; // Tajik
  TGL: 'tgl'; // Tagalog
  THA: 'tha'; // Thai
  TIR: 'tir'; // Tigrinya
  TUR: 'tur'; // Turkish
  UIG: 'uig'; // Uighur; Uyghur
  UKR: 'ukr'; // Ukrainian
  URD: 'urd'; // Urdu
  UZB: 'uzb'; // Uzbek
  UZB_CYRL: 'uzb_cyrl'; // Uzbek - Cyrillic
  VIE: 'vie'; // Vietnamese
  YID: 'yid'; // Yiddish
};
```

**Usage Examples:**

```javascript
import { createWorker, languages } from 'tesseract.js';

// Use language constants for better readability
const worker = await createWorker(languages.ENG);
const multiWorker = await createWorker([languages.ENG, languages.FRA, languages.DEU]);

// Useful for dynamic language selection
const userLanguage = 'french';
const langCode = userLanguage === 'french' ? languages.FRA : languages.ENG;
const dynamicWorker = await createWorker(langCode);

// Release every worker once it is no longer needed
await Promise.all([worker, multiWorker, dynamicWorker].map((w) => w.terminate()));
```

### OCR Engine Mode (OEM) Constants

Constants for selecting OCR engine modes with different accuracy/speed tradeoffs.

```javascript { .api }
/**
 * OCR Engine Mode constants.
 *
 * Controls which OCR engine is used for recognition. Modes that involve the
 * legacy engine (TESSERACT_ONLY, TESSERACT_LSTM_COMBINED) additionally need the
 * `legacyCore: true` and `legacyLang: true` worker options, because the legacy
 * engine and its language data are not loaded by default.
 */
enum OEM {
  TESSERACT_ONLY = 0,          // Legacy Tesseract engine only
  LSTM_ONLY = 1,               // LSTM neural networks only (createWorker default, best accuracy)
  TESSERACT_LSTM_COMBINED = 2, // Legacy + LSTM combined
  DEFAULT = 3                  // Let Tesseract choose, based on what is available
}
```

**Usage Examples:**

```javascript
import { createWorker, OEM } from 'tesseract.js';

// Use LSTM for best accuracy (default)
const lstmWorker = await createWorker('eng', OEM.LSTM_ONLY);

// The legacy engine is not loaded by default: any mode that uses it must
// explicitly request legacy-capable core code and language data
const legacyOptions = { legacyCore: true, legacyLang: true };

// Use legacy engine for compatibility
const legacyWorker = await createWorker('eng', OEM.TESSERACT_ONLY, legacyOptions);

// Use combined mode for maximum coverage
const combinedWorker = await createWorker('eng', OEM.TESSERACT_LSTM_COMBINED, legacyOptions);
```

### Page Segmentation Mode (PSM) Constants

Constants for controlling how Tesseract segments the page before recognition.

```javascript { .api }
/**
 * Page Segmentation Mode constants.
 *
 * Controls how the page is analyzed and segmented for OCR. The values are
 * strings and are applied through the `tessedit_pageseg_mode` parameter of
 * `worker.setParameters`.
 */
enum PSM {
  OSD_ONLY = '0',               // Orientation and script detection (OSD) only
  AUTO_OSD = '1',               // Automatic page segmentation with OSD
  AUTO_ONLY = '2',              // Automatic page segmentation, but no OSD or OCR
  AUTO = '3',                   // Fully automatic page segmentation, but no OSD (Tesseract CLI default)
  SINGLE_COLUMN = '4',          // Single column of text of variable sizes
  SINGLE_BLOCK_VERT_TEXT = '5', // Single uniform block of vertically aligned text
  SINGLE_BLOCK = '6',           // Single uniform block of text (Tesseract.js default)
  SINGLE_LINE = '7',            // Single text line
  SINGLE_WORD = '8',            // Single word
  CIRCLE_WORD = '9',            // Single word in a circle
  SINGLE_CHAR = '10',           // Single character
  SPARSE_TEXT = '11',           // Sparse text, find as much text as possible in no particular order
  SPARSE_TEXT_OSD = '12',       // Sparse text with OSD
  RAW_LINE = '13'               // Raw line, treat image as single text line
}
```

**Usage Examples:**

```javascript
import { createWorker, PSM } from 'tesseract.js';

const worker = await createWorker('eng');

// Set page segmentation mode for single line of text
await worker.setParameters({
  tessedit_pageseg_mode: PSM.SINGLE_LINE,
});

// For single word recognition
await worker.setParameters({
  tessedit_pageseg_mode: PSM.SINGLE_WORD,
});

// For documents with sparse text
await worker.setParameters({
  tessedit_pageseg_mode: PSM.SPARSE_TEXT,
});

// Release the worker once it is no longer needed
await worker.terminate();
```

### Logging Configuration

Enable or disable debug logging for OCR operations.

```javascript { .api }
/**
 * Enables or disables debug logging for OCR operations.
 *
 * The switch is global: it is not tied to a particular worker.
 *
 * @param logging - True to enable logging, false to disable
 */
function setLogging(logging: boolean): void;
```

**Usage Examples:**

```javascript
import { setLogging, createWorker } from 'tesseract.js';

// Enable global logging
setLogging(true);

// All OCR operations will now log debug information
const worker = await createWorker('eng');
const result = await worker.recognize('image.png');
await worker.terminate();

// Disable logging
setLogging(false);
```

## Configuration Patterns

### Language Selection Strategies

```javascript
251
import { languages, createWorker } from 'tesseract.js';
252
253
// Multi-language document processing
/**
 * Creates a worker loaded with the languages associated with the given regions.
 *
 * @param {string[]} [regions=[]] - Region keys; 'europe', 'asia' and
 *   'middle-east' are recognized, any other key is ignored.
 * @returns {Promise<object>} The worker returned by `createWorker`.
 */
async function createMultiLanguageWorker(regions = []) {
  // Built per call so that `languages` is only read when a worker is requested
  const regionLanguages = {
    europe: [languages.ENG, languages.FRA, languages.DEU, languages.SPA],
    asia: [languages.JPN, languages.KOR, languages.CHI_SIM],
    'middle-east': [languages.ARA, languages.HEB, languages.FAS],
  };

  // Walk the table (not `regions`) so the language order is always
  // europe -> asia -> middle-east, whatever order the caller used
  const langCodes = Object.keys(regionLanguages)
    .filter((region) => regions.includes(region))
    .flatMap((region) => regionLanguages[region]);

  // Never hand createWorker an empty list: fall back to English when no
  // recognized region was requested
  return createWorker(langCodes.length > 0 ? langCodes : [languages.ENG]);
}

// Usage
const europeanWorker = await createMultiLanguageWorker(['europe']);
const globalWorker = await createMultiLanguageWorker(['europe', 'asia', 'middle-east']);
```

### OCR Optimization Settings

```javascript
280
import { createWorker, OEM, PSM } from 'tesseract.js';
281
282
/**
 * Creates an English worker tuned for a particular kind of document.
 *
 * @param {string} documentType - 'receipt', 'license-plate', 'form-field',
 *   'book-page' or 'legacy-document'. Any other value keeps the defaults
 *   (LSTM engine, automatic page segmentation, no character whitelist).
 * @returns {Promise<object>} The worker with the selected parameters applied.
 * @throws Rethrows any error raised while applying the parameters, after
 *   terminating the worker.
 */
async function createOptimizedWorker(documentType) {
  let oem = OEM.LSTM_ONLY;
  let psm = PSM.AUTO;
  const params = {};

  switch (documentType) {
    case 'receipt':
      psm = PSM.SPARSE_TEXT;
      params.tessedit_char_whitelist = '0123456789.$';
      break;

    case 'license-plate':
      psm = PSM.SINGLE_LINE;
      params.tessedit_char_whitelist = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
      break;

    case 'form-field':
      psm = PSM.SINGLE_WORD;
      break;

    case 'book-page':
      psm = PSM.SINGLE_COLUMN;
      break;

    case 'legacy-document':
      oem = OEM.TESSERACT_ONLY;
      break;

    default:
      // Unknown document types keep the defaults chosen above
      break;
  }

  // The legacy engine is not loaded by default, so the legacy-capable core
  // and language data must be requested explicitly when it is selected
  const worker = oem === OEM.TESSERACT_ONLY
    ? await createWorker('eng', oem, { legacyCore: true, legacyLang: true })
    : await createWorker('eng', oem);

  try {
    await worker.setParameters({
      tessedit_pageseg_mode: psm,
      ...params,
    });
  } catch (err) {
    // Do not leak the worker when it cannot be configured
    await worker.terminate();
    throw err;
  }

  return worker;
}

// Usage
const receiptWorker = await createOptimizedWorker('receipt');
const plateWorker = await createOptimizedWorker('license-plate');
```

### Progressive Language Detection

```javascript
328
import { detect, createWorker, languages } from 'tesseract.js';
329
330
/**
 * Detects the script of an image, then recognizes it with the languages
 * most likely to use that script.
 *
 * @param {string} imagePath - Image passed to both `detect` and `recognize`.
 * @returns {Promise<{detectedScript: string, usedLanguages: string[], text: string, confidence: number}>}
 *   The detected script, the language codes that were loaded, and the
 *   recognized text with its confidence.
 */
async function smartLanguageRecognition(imagePath) {
  // First detect the script
  const detection = await detect(imagePath);
  const { script } = detection.data;

  // Map scripts to likely languages
  const scriptLanguageMap = {
    Latin: [languages.ENG, languages.FRA, languages.DEU, languages.SPA],
    Han: [languages.CHI_SIM, languages.CHI_TRA],
    Hiragana: [languages.JPN],
    Arabic: [languages.ARA],
    Cyrillic: [languages.RUS, languages.UKR, languages.BUL],
    Devanagari: [languages.HIN, languages.NEP],
  };

  // Unmapped (or missing) scripts fall back to English
  const candidateLanguages = scriptLanguageMap[script] || [languages.ENG];

  // Try recognition with script-appropriate languages
  const worker = await createWorker(candidateLanguages);
  try {
    const result = await worker.recognize(imagePath);

    return {
      detectedScript: script,
      usedLanguages: candidateLanguages,
      text: result.data.text,
      confidence: result.data.confidence,
    };
  } finally {
    // Runs on success and on failure, so a failed recognition cannot leak the worker
    await worker.terminate();
  }
}

// Usage
const smartResult = await smartLanguageRecognition('multilingual-doc.png');
console.log(`Detected ${smartResult.detectedScript} script`);
console.log(`Used languages: ${smartResult.usedLanguages.join(', ')}`);
```

### Environment-Specific Configuration

```javascript
369
import { createWorker } from 'tesseract.js';
370
371
/**
 * Creates an English worker whose options depend on the runtime:
 * blob-URL worker and gzip in a browser, no blob URL and no caching in Node.js.
 * Progress is logged to the console in both cases.
 *
 * @returns {Promise<object>} The worker returned by `createWorker`.
 */
async function createEnvironmentOptimizedWorker() {
  const isBrowser = typeof window !== 'undefined';
  const isNode = typeof process !== 'undefined' && process.versions?.node;

  const options = {
    // `progress` is reported as a 0-1 fraction, so scale it before
    // labelling it as a percentage
    logger: (m) => console.log(`OCR: ${m.status} - ${Math.round(m.progress * 100)}%`),
  };

  if (isBrowser) {
    // Browser-specific optimizations
    options.workerBlobURL = true;
    options.gzip = true;
  } else if (isNode) {
    // Node.js-specific optimizations
    options.workerBlobURL = false;
    options.cacheMethod = 'none'; // Disable caching in server environments
  }

  // `undefined` keeps createWorker's own default engine mode
  return createWorker('eng', undefined, options);
}

// Usage
const worker = await createEnvironmentOptimizedWorker();
```

## Advanced Parameter Configuration

```javascript
399
import { createWorker, PSM } from 'tesseract.js';
400
401
/**
 * Creates an English worker and applies a fixed set of recognition parameters
 * aimed at accuracy.
 *
 * @returns {Promise<object>} The configured worker; the caller is expected to
 *   terminate it when done.
 * @throws Rethrows any error raised while applying the parameters, after
 *   terminating the worker.
 */
async function createHighPrecisionWorker() {
  const worker = await createWorker('eng');

  try {
    // Configure for maximum accuracy
    await worker.setParameters({
      // Page segmentation
      tessedit_pageseg_mode: PSM.AUTO,

      // Character recognition
      tessedit_char_whitelist: '', // Allow all characters
      tessedit_char_blacklist: '', // Block no characters

      // Word recognition
      preserve_interword_spaces: '1',

      // Quality settings
      user_defined_dpi: '300',

      // Advanced Tesseract parameters
      tessedit_do_invert: '0',
      tessedit_create_hocr: '1',
      tessedit_create_tsv: '1',
    });
  } catch (err) {
    // Do not leak the worker when it cannot be configured
    await worker.terminate();
    throw err;
  }

  return worker;
}

// Usage for high-accuracy document processing
const precisionWorker = await createHighPrecisionWorker();
const result = await precisionWorker.recognize('high-quality-document.png', {}, {
  text: true,
  hocr: true,
  tsv: true,
  pdf: true,
});

// Release the worker once the result has been produced
await precisionWorker.terminate();
```