Pure JavaScript multilingual OCR library that brings the powerful Tesseract OCR engine to both browser and Node.js environments through WebAssembly
—
Tesseract.js provides extensive configuration options through constants for language selection, OCR engine modes, page segmentation modes, and logging control.
Complete set of language codes for all supported languages in Tesseract.js.
/**
* Language codes for all supported OCR languages
* Each property maps a descriptive name to the Tesseract language code
*/
const languages: {
AFR: 'afr'; // Afrikaans
AMH: 'amh'; // Amharic
ARA: 'ara'; // Arabic
ASM: 'asm'; // Assamese
AZE: 'aze'; // Azerbaijani
AZE_CYRL: 'aze_cyrl'; // Azerbaijani - Cyrillic
BEL: 'bel'; // Belarusian
BEN: 'ben'; // Bengali
BOD: 'bod'; // Tibetan
BOS: 'bos'; // Bosnian
BUL: 'bul'; // Bulgarian
CAT: 'cat'; // Catalan; Valencian
CEB: 'ceb'; // Cebuano
CES: 'ces'; // Czech
CHI_SIM: 'chi_sim'; // Chinese - Simplified
CHI_TRA: 'chi_tra'; // Chinese - Traditional
CHR: 'chr'; // Cherokee
CYM: 'cym'; // Welsh
DAN: 'dan'; // Danish
DEU: 'deu'; // German
DZO: 'dzo'; // Dzongkha
ELL: 'ell'; // Greek, Modern (1453-)
ENG: 'eng'; // English
ENM: 'enm'; // English, Middle (1100-1500)
EPO: 'epo'; // Esperanto
EST: 'est'; // Estonian
EUS: 'eus'; // Basque
FAS: 'fas'; // Persian
FIN: 'fin'; // Finnish
FRA: 'fra'; // French
FRK: 'frk'; // German Fraktur
FRM: 'frm'; // French, Middle (ca. 1400-1600)
GLE: 'gle'; // Irish
GLG: 'glg'; // Galician
GRC: 'grc'; // Greek, Ancient (-1453)
GUJ: 'guj'; // Gujarati
HAT: 'hat'; // Haitian; Haitian Creole
HEB: 'heb'; // Hebrew
HIN: 'hin'; // Hindi
HRV: 'hrv'; // Croatian
HUN: 'hun'; // Hungarian
IKU: 'iku'; // Inuktitut
IND: 'ind'; // Indonesian
ISL: 'isl'; // Icelandic
ITA: 'ita'; // Italian
ITA_OLD: 'ita_old'; // Italian - Old
JAV: 'jav'; // Javanese
JPN: 'jpn'; // Japanese
KAN: 'kan'; // Kannada
KAT: 'kat'; // Georgian
KAT_OLD: 'kat_old'; // Georgian - Old
KAZ: 'kaz'; // Kazakh
KHM: 'khm'; // Central Khmer
KIR: 'kir'; // Kirghiz; Kyrgyz
KOR: 'kor'; // Korean
KUR: 'kur'; // Kurdish
LAO: 'lao'; // Lao
LAT: 'lat'; // Latin
LAV: 'lav'; // Latvian
LIT: 'lit'; // Lithuanian
MAL: 'mal'; // Malayalam
MAR: 'mar'; // Marathi
MKD: 'mkd'; // Macedonian
MLT: 'mlt'; // Maltese
MSA: 'msa'; // Malay
MYA: 'mya'; // Burmese
NEP: 'nep'; // Nepali
NLD: 'nld'; // Dutch; Flemish
NOR: 'nor'; // Norwegian
ORI: 'ori'; // Oriya
PAN: 'pan'; // Panjabi; Punjabi
POL: 'pol'; // Polish
POR: 'por'; // Portuguese
PUS: 'pus'; // Pushto; Pashto
RON: 'ron'; // Romanian; Moldavian; Moldovan
RUS: 'rus'; // Russian
SAN: 'san'; // Sanskrit
SIN: 'sin'; // Sinhala; Sinhalese
SLK: 'slk'; // Slovak
SLV: 'slv'; // Slovenian
SPA: 'spa'; // Spanish; Castilian
SPA_OLD: 'spa_old'; // Spanish; Castilian - Old
SQI: 'sqi'; // Albanian
SRP: 'srp'; // Serbian
SRP_LATN: 'srp_latn'; // Serbian - Latin
SWA: 'swa'; // Swahili
SWE: 'swe'; // Swedish
SYR: 'syr'; // Syriac
TAM: 'tam'; // Tamil
TEL: 'tel'; // Telugu
TGK: 'tgk'; // Tajik
TGL: 'tgl'; // Tagalog
THA: 'tha'; // Thai
TIR: 'tir'; // Tigrinya
TUR: 'tur'; // Turkish
UIG: 'uig'; // Uighur; Uyghur
UKR: 'ukr'; // Ukrainian
URD: 'urd'; // Urdu
UZB: 'uzb'; // Uzbek
UZB_CYRL: 'uzb_cyrl'; // Uzbek - Cyrillic
VIE: 'vie'; // Vietnamese
YID: 'yid'; // Yiddish
};

Usage Examples:
import { createWorker, languages } from 'tesseract.js';
// Use language constants for better readability
const worker = await createWorker(languages.ENG);
const multiWorker = await createWorker([languages.ENG, languages.FRA, languages.DEU]);
// Useful for dynamic language selection
const userLanguage = 'french';
const langCode = userLanguage === 'french' ? languages.FRA : languages.ENG;
const dynamicWorker = await createWorker(langCode);

Constants for selecting OCR engine modes with different accuracy/speed tradeoffs.
/**
 * OCR Engine Mode (OEM) constants.
 *
 * Controls which OCR engine is used for recognition. The members are numeric,
 * so a reverse lookup such as `OEM[1]` yields the member name.
 */
enum OEM {
  /** Legacy Tesseract engine only. */
  TESSERACT_ONLY = 0,
  /** LSTM neural networks only (default, best accuracy). */
  LSTM_ONLY = 1,
  /** Legacy + LSTM combined. */
  TESSERACT_LSTM_COMBINED = 2,
  /** Default (currently LSTM). */
  DEFAULT = 3,
}

// Usage Examples:
import { createWorker, OEM } from 'tesseract.js';
// Use LSTM for best accuracy (default)
const lstmWorker = await createWorker('eng', OEM.LSTM_ONLY);
// Use legacy engine for compatibility
const legacyWorker = await createWorker('eng', OEM.TESSERACT_ONLY);
// Use combined mode for maximum coverage
const combinedWorker = await createWorker('eng', OEM.TESSERACT_LSTM_COMBINED);

Constants for controlling how Tesseract segments the page before recognition.
/**
 * Page Segmentation Mode (PSM) constants.
 *
 * Controls how the page is analyzed and segmented for OCR. Values are the
 * string form of Tesseract's numeric mode, suitable for passing as the
 * `tessedit_pageseg_mode` parameter.
 */
enum PSM {
  /** Orientation and script detection (OSD) only. */
  OSD_ONLY = '0',
  /** Automatic page segmentation with OSD. */
  AUTO_OSD = '1',
  /** Automatic page segmentation, no OSD. */
  AUTO_ONLY = '2',
  /**
   * Fully automatic page segmentation (Tesseract's default).
   * NOTE(review): tesseract.js documents PSM.SINGLE_BLOCK as its own default
   * for `tessedit_pageseg_mode` — confirm before relying on "default" here.
   */
  AUTO = '3',
  /** Single uniform column. */
  SINGLE_COLUMN = '4',
  /** Single uniform block of vertically aligned text. */
  SINGLE_BLOCK_VERT_TEXT = '5',
  /** Single uniform block. */
  SINGLE_BLOCK = '6',
  /** Single text line. */
  SINGLE_LINE = '7',
  /** Single word. */
  SINGLE_WORD = '8',
  /** Single word in a circle. */
  CIRCLE_WORD = '9',
  /** Single character. */
  SINGLE_CHAR = '10',
  /** Sparse text, find as much text as possible. */
  SPARSE_TEXT = '11',
  /** Sparse text with OSD. */
  SPARSE_TEXT_OSD = '12',
  /** Raw line, treat image as single text line. */
  RAW_LINE = '13',
}

// Usage Examples:
import { createWorker, PSM } from 'tesseract.js';
const worker = await createWorker('eng');
// Set page segmentation mode for single line of text
await worker.setParameters({
tessedit_pageseg_mode: PSM.SINGLE_LINE
});
// For single word recognition
await worker.setParameters({
tessedit_pageseg_mode: PSM.SINGLE_WORD
});
// For documents with sparse text
await worker.setParameters({
tessedit_pageseg_mode: PSM.SPARSE_TEXT
});

Enable or disable debug logging for OCR operations.
/**
* Enables or disables debug logging for OCR operations
* @param logging - True to enable logging, false to disable
*/
function setLogging(logging: boolean): void;

Usage Examples:
import { setLogging, createWorker } from 'tesseract.js';
// Enable global logging
setLogging(true);
// All OCR operations will now log debug information
const worker = await createWorker('eng');
const result = await worker.recognize('image.png');
// Disable logging
setLogging(false);

import { languages, createWorker } from 'tesseract.js';
// Multi-language document processing
async function createMultiLanguageWorker(regions) {
const langCodes = [];
if (regions.includes('europe')) {
langCodes.push(languages.ENG, languages.FRA, languages.DEU, languages.SPA);
}
if (regions.includes('asia')) {
langCodes.push(languages.JPN, languages.KOR, languages.CHI_SIM);
}
if (regions.includes('middle-east')) {
langCodes.push(languages.ARA, languages.HEB, languages.FAS);
}
return await createWorker(langCodes);
}
// Usage
const europeanWorker = await createMultiLanguageWorker(['europe']);
const globalWorker = await createMultiLanguageWorker(['europe', 'asia', 'middle-east']);

import { createWorker, OEM, PSM } from 'tesseract.js';
async function createOptimizedWorker(documentType) {
let oem = OEM.LSTM_ONLY;
let psm = PSM.AUTO;
let params = {};
switch (documentType) {
case 'receipt':
psm = PSM.SPARSE_TEXT;
params.tessedit_char_whitelist = '0123456789.$';
break;
case 'license-plate':
psm = PSM.SINGLE_LINE;
params.tessedit_char_whitelist = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
break;
case 'form-field':
psm = PSM.SINGLE_WORD;
break;
case 'book-page':
psm = PSM.SINGLE_COLUMN;
break;
case 'legacy-document':
oem = OEM.TESSERACT_ONLY;
break;
}
const worker = await createWorker('eng', oem);
await worker.setParameters({
tessedit_pageseg_mode: psm,
...params
});
return worker;
}
// Usage
const receiptWorker = await createOptimizedWorker('receipt');
const plateWorker = await createOptimizedWorker('license-plate');

import { detect, createWorker, languages } from 'tesseract.js';
async function smartLanguageRecognition(imagePath) {
// First detect the script
const detection = await detect(imagePath);
const script = detection.data.script;
// Map scripts to likely languages
const scriptLanguageMap = {
'Latin': [languages.ENG, languages.FRA, languages.DEU, languages.SPA],
'Han': [languages.CHI_SIM, languages.CHI_TRA],
'Hiragana': [languages.JPN],
'Arabic': [languages.ARA],
'Cyrillic': [languages.RUS, languages.UKR, languages.BUL],
'Devanagari': [languages.HIN, languages.NEP]
};
const candidateLanguages = scriptLanguageMap[script] || [languages.ENG];
// Try recognition with script-appropriate languages
const worker = await createWorker(candidateLanguages);
const result = await worker.recognize(imagePath);
await worker.terminate();
return {
detectedScript: script,
usedLanguages: candidateLanguages,
text: result.data.text,
confidence: result.data.confidence
};
}
// Usage
const smartResult = await smartLanguageRecognition('multilingual-doc.png');
console.log(`Detected ${smartResult.detectedScript} script`);
console.log(`Used languages: ${smartResult.usedLanguages.join(', ')}`);

import { createWorker } from 'tesseract.js';
async function createEnvironmentOptimizedWorker() {
const isBrowser = typeof window !== 'undefined';
const isNode = typeof process !== 'undefined' && process.versions?.node;
const options = {
logger: (m) => console.log(`OCR: ${m.status} - ${m.progress}%`)
};
if (isBrowser) {
// Browser-specific optimizations
options.workerBlobURL = true;
options.gzip = true;
} else if (isNode) {
// Node.js-specific optimizations
options.workerBlobURL = false;
options.cacheMethod = 'none'; // Disable caching in server environments
}
return await createWorker('eng', undefined, options);
}
// Usage
const worker = await createEnvironmentOptimizedWorker();

import { createWorker, PSM } from 'tesseract.js';
async function createHighPrecisionWorker() {
const worker = await createWorker('eng');
// Configure for maximum accuracy
await worker.setParameters({
// Page segmentation
tessedit_pageseg_mode: PSM.AUTO,
// Character recognition
tessedit_char_whitelist: '', // Allow all characters
tessedit_char_blacklist: '', // Block no characters
// Word recognition
preserve_interword_spaces: '1',
// Quality settings
user_defined_dpi: '300',
// Advanced Tesseract parameters
tessedit_do_invert: '0',
tessedit_create_hocr: '1',
tessedit_create_tsv: '1'
});
return worker;
}
// Usage for high-accuracy document processing
const precisionWorker = await createHighPrecisionWorker();
const result = await precisionWorker.recognize('high-quality-document.png', {}, {
text: true,
hocr: true,
tsv: true,
pdf: true
});

Install with Tessl CLI
npx tessl i tessl/npm-tesseract-js