or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration-constants.md · high-level-functions.md · index.md · scheduler-api.md · worker-api.md

docs/configuration-constants.md

0

# Configuration and Constants

1

2

Tesseract.js provides extensive configuration options through constants for language selection, OCR engine modes, page segmentation modes, and logging control.

3

4

## Capabilities

5

6

### Language Constants

7

8

Complete set of language codes for all supported languages in Tesseract.js.

9

10

```javascript { .api }

11

/**

12

* Language codes for all supported OCR languages

13

* Each property maps a descriptive name to the Tesseract language code

14

*/

15

const languages: {

16

AFR: 'afr'; // Afrikaans

17

AMH: 'amh'; // Amharic

18

ARA: 'ara'; // Arabic

19

ASM: 'asm'; // Assamese

20

AZE: 'aze'; // Azerbaijani

21

AZE_CYRL: 'aze_cyrl'; // Azerbaijani - Cyrillic

22

BEL: 'bel'; // Belarusian

23

BEN: 'ben'; // Bengali

24

BOD: 'bod'; // Tibetan

25

BOS: 'bos'; // Bosnian

26

BUL: 'bul'; // Bulgarian

27

CAT: 'cat'; // Catalan; Valencian

28

CEB: 'ceb'; // Cebuano

29

CES: 'ces'; // Czech

30

CHI_SIM: 'chi_sim'; // Chinese - Simplified

31

CHI_TRA: 'chi_tra'; // Chinese - Traditional

32

CHR: 'chr'; // Cherokee

33

CYM: 'cym'; // Welsh

34

DAN: 'dan'; // Danish

35

DEU: 'deu'; // German

36

DZO: 'dzo'; // Dzongkha

37

ELL: 'ell'; // Greek, Modern (1453-)

38

ENG: 'eng'; // English

39

ENM: 'enm'; // English, Middle (1100-1500)

40

EPO: 'epo'; // Esperanto

41

EST: 'est'; // Estonian

42

EUS: 'eus'; // Basque

43

FAS: 'fas'; // Persian

44

FIN: 'fin'; // Finnish

45

FRA: 'fra'; // French

46

FRK: 'frk'; // German Fraktur

47

FRM: 'frm'; // French, Middle (ca. 1400-1600)

48

GLE: 'gle'; // Irish

49

GLG: 'glg'; // Galician

50

GRC: 'grc'; // Greek, Ancient (-1453)

51

GUJ: 'guj'; // Gujarati

52

HAT: 'hat'; // Haitian; Haitian Creole

53

HEB: 'heb'; // Hebrew

54

HIN: 'hin'; // Hindi

55

HRV: 'hrv'; // Croatian

56

HUN: 'hun'; // Hungarian

57

IKU: 'iku'; // Inuktitut

58

IND: 'ind'; // Indonesian

59

ISL: 'isl'; // Icelandic

60

ITA: 'ita'; // Italian

61

ITA_OLD: 'ita_old'; // Italian - Old

62

JAV: 'jav'; // Javanese

63

JPN: 'jpn'; // Japanese

64

KAN: 'kan'; // Kannada

65

KAT: 'kat'; // Georgian

66

KAT_OLD: 'kat_old'; // Georgian - Old

67

KAZ: 'kaz'; // Kazakh

68

KHM: 'khm'; // Central Khmer

69

KIR: 'kir'; // Kirghiz; Kyrgyz

70

KOR: 'kor'; // Korean

71

KUR: 'kur'; // Kurdish

72

LAO: 'lao'; // Lao

73

LAT: 'lat'; // Latin

74

LAV: 'lav'; // Latvian

75

LIT: 'lit'; // Lithuanian

76

MAL: 'mal'; // Malayalam

77

MAR: 'mar'; // Marathi

78

MKD: 'mkd'; // Macedonian

79

MLT: 'mlt'; // Maltese

80

MSA: 'msa'; // Malay

81

MYA: 'mya'; // Burmese

82

NEP: 'nep'; // Nepali

83

NLD: 'nld'; // Dutch; Flemish

84

NOR: 'nor'; // Norwegian

85

ORI: 'ori'; // Oriya

86

PAN: 'pan'; // Panjabi; Punjabi

87

POL: 'pol'; // Polish

88

POR: 'por'; // Portuguese

89

PUS: 'pus'; // Pushto; Pashto

90

RON: 'ron'; // Romanian; Moldavian; Moldovan

91

RUS: 'rus'; // Russian

92

SAN: 'san'; // Sanskrit

93

SIN: 'sin'; // Sinhala; Sinhalese

94

SLK: 'slk'; // Slovak

95

SLV: 'slv'; // Slovenian

96

SPA: 'spa'; // Spanish; Castilian

97

SPA_OLD: 'spa_old'; // Spanish; Castilian - Old

98

SQI: 'sqi'; // Albanian

99

SRP: 'srp'; // Serbian

100

SRP_LATN: 'srp_latn'; // Serbian - Latin

101

SWA: 'swa'; // Swahili

102

SWE: 'swe'; // Swedish

103

SYR: 'syr'; // Syriac

104

TAM: 'tam'; // Tamil

105

TEL: 'tel'; // Telugu

106

TGK: 'tgk'; // Tajik

107

TGL: 'tgl'; // Tagalog

108

THA: 'tha'; // Thai

109

TIR: 'tir'; // Tigrinya

110

TUR: 'tur'; // Turkish

111

UIG: 'uig'; // Uighur; Uyghur

112

UKR: 'ukr'; // Ukrainian

113

URD: 'urd'; // Urdu

114

UZB: 'uzb'; // Uzbek

115

UZB_CYRL: 'uzb_cyrl'; // Uzbek - Cyrillic

116

VIE: 'vie'; // Vietnamese

117

YID: 'yid'; // Yiddish

118

};

119

```

120

121

**Usage Examples:**

122

123

```javascript

124

import { createWorker, languages } from 'tesseract.js';

125

126

// Use language constants for better readability

127

const worker = await createWorker(languages.ENG);

128

const multiWorker = await createWorker([languages.ENG, languages.FRA, languages.DEU]);

129

130

// Useful for dynamic language selection

131

const userLanguage = 'french';

132

const langCode = userLanguage === 'french' ? languages.FRA : languages.ENG;

133

const dynamicWorker = await createWorker(langCode);

134

```

135

136

### OCR Engine Mode (OEM) Constants

137

138

Constants for selecting OCR engine modes with different accuracy/speed tradeoffs.

139

140

```javascript { .api }

141

/**

142

* OCR Engine Mode constants

143

* Controls which OCR engine is used for recognition

144

*/

145

enum OEM {

146

TESSERACT_ONLY = 0, // Legacy Tesseract engine only

147

LSTM_ONLY = 1, // LSTM neural networks only (default, best accuracy)

148

TESSERACT_LSTM_COMBINED = 2, // Legacy + LSTM combined

149

DEFAULT = 3 // Default (currently LSTM)

150

}

151

```

152

153

**Usage Examples:**

154

155

```javascript

156

import { createWorker, OEM } from 'tesseract.js';

157

158

// Use LSTM for best accuracy (default)
const lstmWorker = await createWorker('eng', OEM.LSTM_ONLY);

// Use legacy engine for compatibility.
// legacyCore / legacyLang request a core build and language data that
// include the legacy model; without them only the LSTM model is loaded.
const legacyWorker = await createWorker('eng', OEM.TESSERACT_ONLY, {
  legacyCore: true,
  legacyLang: true,
});

// Use combined mode for maximum coverage (also needs the legacy model)
const combinedWorker = await createWorker('eng', OEM.TESSERACT_LSTM_COMBINED, {
  legacyCore: true,
  legacyLang: true,
});

166

```

167

168

### Page Segmentation Mode (PSM) Constants

169

170

Constants for controlling how Tesseract segments the page before recognition.

171

172

```javascript { .api }

173

/**

174

* Page Segmentation Mode constants

175

* Controls how the page is analyzed and segmented for OCR

176

*/

177

enum PSM {

178

OSD_ONLY = '0', // Orientation and script detection only

179

AUTO_OSD = '1', // Automatic page segmentation with OSD

180

AUTO_ONLY = '2', // Automatic page segmentation, no OSD

181

AUTO = '3', // Fully automatic page segmentation (default)

182

SINGLE_COLUMN = '4', // Single uniform column

183

SINGLE_BLOCK_VERT_TEXT = '5', // Single uniform block of vertically aligned text

184

SINGLE_BLOCK = '6', // Single uniform block

185

SINGLE_LINE = '7', // Single text line

186

SINGLE_WORD = '8', // Single word

187

CIRCLE_WORD = '9', // Single word in a circle

188

SINGLE_CHAR = '10', // Single character

189

SPARSE_TEXT = '11', // Sparse text, find as much text as possible

190

SPARSE_TEXT_OSD = '12', // Sparse text with OSD

191

RAW_LINE = '13' // Raw line, treat image as single text line

192

}

193

```

194

195

**Usage Examples:**

196

197

```javascript

198

import { createWorker, PSM } from 'tesseract.js';

199

200

const worker = await createWorker('eng');

201

202

// Set page segmentation mode for single line of text

203

await worker.setParameters({

204

tessedit_pageseg_mode: PSM.SINGLE_LINE

205

});

206

207

// For single word recognition

208

await worker.setParameters({

209

tessedit_pageseg_mode: PSM.SINGLE_WORD

210

});

211

212

// For documents with sparse text

213

await worker.setParameters({

214

tessedit_pageseg_mode: PSM.SPARSE_TEXT

215

});

216

```

217

218

### Logging Configuration

219

220

Enable or disable debug logging for OCR operations.

221

222

```javascript { .api }

223

/**

224

* Enables or disables debug logging for OCR operations

225

* @param logging - True to enable logging, false to disable

226

*/

227

function setLogging(logging: boolean): void;

228

```

229

230

**Usage Examples:**

231

232

```javascript

233

import { setLogging, createWorker } from 'tesseract.js';

234

235

// Enable global logging

236

setLogging(true);

237

238

// All OCR operations will now log debug information

239

const worker = await createWorker('eng');

240

const result = await worker.recognize('image.png');

241

242

// Disable logging

243

setLogging(false);

244

```

245

246

## Configuration Patterns

247

248

### Language Selection Strategies

249

250

```javascript

251

import { languages, createWorker } from 'tesseract.js';

252

253

// Multi-language document processing

254

/**
 * Creates a worker loaded with the languages associated with the given regions.
 *
 * Recognized region names are 'europe', 'asia' and 'middle-east'; any other
 * name is ignored. Languages are always ordered europe, asia, middle-east,
 * regardless of the order of `regions`.
 *
 * @param {string[]} [regions=[]] - Region names to load languages for.
 * @returns {Promise<object>} The worker returned by createWorker.
 */
async function createMultiLanguageWorker(regions = []) {
  const regionLanguages = {
    europe: [languages.ENG, languages.FRA, languages.DEU, languages.SPA],
    asia: [languages.JPN, languages.KOR, languages.CHI_SIM],
    'middle-east': [languages.ARA, languages.HEB, languages.FAS],
  };

  const langCodes = Object.entries(regionLanguages)
    .filter(([region]) => regions.includes(region))
    .flatMap(([, codes]) => codes);

  // Without this, createWorker would be called with an empty language list
  // whenever no recognized region was requested.
  if (langCodes.length === 0) {
    langCodes.push(languages.ENG);
  }

  return createWorker(langCodes);
}

271

272

// Usage

273

const europeanWorker = await createMultiLanguageWorker(['europe']);

274

const globalWorker = await createMultiLanguageWorker(['europe', 'asia', 'middle-east']);

275

```

276

277

### OCR Optimization Settings

278

279

```javascript

280

import { createWorker, OEM, PSM } from 'tesseract.js';

281

282

/**
 * Creates an English worker tuned for a given kind of document.
 *
 * Known document types: 'receipt', 'license-plate', 'form-field',
 * 'book-page' and 'legacy-document'. Any other value keeps the defaults
 * (OEM.LSTM_ONLY engine, PSM.AUTO segmentation, no extra parameters).
 *
 * @param {string} documentType - Kind of document the worker will process.
 * @returns {Promise<object>} The configured worker.
 * @throws Rethrows any error from setParameters after terminating the worker.
 */
async function createOptimizedWorker(documentType) {
  // Per-type overrides; fields that are left out fall back to the defaults below.
  const profiles = {
    receipt: {
      psm: PSM.SPARSE_TEXT,
      params: { tessedit_char_whitelist: '0123456789.$' },
    },
    'license-plate': {
      psm: PSM.SINGLE_LINE,
      params: { tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' },
    },
    'form-field': { psm: PSM.SINGLE_WORD },
    'book-page': { psm: PSM.SINGLE_COLUMN },
    'legacy-document': {
      oem: OEM.TESSERACT_ONLY,
      // The legacy engine needs a core build and language data that include
      // the legacy model, so both must be requested explicitly.
      options: { legacyCore: true, legacyLang: true },
    },
  };

  const {
    oem = OEM.LSTM_ONLY,
    psm = PSM.AUTO,
    params = {},
    options = {},
  } = profiles[documentType] ?? {};

  const worker = await createWorker('eng', oem, options);

  try {
    await worker.setParameters({
      tessedit_pageseg_mode: psm,
      ...params,
    });
  } catch (err) {
    // The caller never receives the worker on failure, so release it here.
    await worker.terminate();
    throw err;
  }

  return worker;
}

319

320

// Usage

321

const receiptWorker = await createOptimizedWorker('receipt');

322

const plateWorker = await createOptimizedWorker('license-plate');

323

```

324

325

### Progressive Language Detection

326

327

```javascript

328

import { detect, createWorker, languages } from 'tesseract.js';

329

330

/**
 * Detects the script of an image, then recognizes it with languages that
 * are likely for that script.
 *
 * @param {string} imagePath - Image passed to both detect() and recognize().
 * @returns {Promise<{detectedScript: *, usedLanguages: string[], text: *, confidence: *}>}
 *   The detected script, the language codes used, and the recognized text
 *   with its confidence.
 */
async function smartLanguageRecognition(imagePath) {
  // First detect the script
  const { data: { script } } = await detect(imagePath);

  // Map scripts to likely languages.
  // NOTE(review): confirm these keys against the script names detect()
  // actually reports; any script not listed here falls back to English.
  const scriptLanguageMap = {
    Latin: [languages.ENG, languages.FRA, languages.DEU, languages.SPA],
    Han: [languages.CHI_SIM, languages.CHI_TRA],
    Hiragana: [languages.JPN],
    Arabic: [languages.ARA],
    Cyrillic: [languages.RUS, languages.UKR, languages.BUL],
    Devanagari: [languages.HIN, languages.NEP],
  };

  const candidateLanguages = scriptLanguageMap[script] ?? [languages.ENG];

  // Try recognition with script-appropriate languages
  const worker = await createWorker(candidateLanguages);
  try {
    const { data: { text, confidence } } = await worker.recognize(imagePath);
    return {
      detectedScript: script,
      usedLanguages: candidateLanguages,
      text,
      confidence,
    };
  } finally {
    // Release the worker even when recognition fails.
    await worker.terminate();
  }
}

359

360

// Usage

361

const smartResult = await smartLanguageRecognition('multilingual-doc.png');

362

console.log(`Detected ${smartResult.detectedScript} script`);

363

console.log(`Used languages: ${smartResult.usedLanguages.join(', ')}`);

364

```

365

366

### Environment-Specific Configuration

367

368

```javascript

369

import { createWorker } from 'tesseract.js';

370

371

/**
 * Creates an English worker whose options depend on the runtime.
 *
 * In a browser (a global `window` exists) it sets `workerBlobURL` and `gzip`
 * to true. Otherwise, under Node.js, it sets `workerBlobURL` to false and
 * `cacheMethod` to 'none'. A progress logger is attached in both cases.
 *
 * @returns {Promise<object>} The worker returned by createWorker.
 */
async function createEnvironmentOptimizedWorker() {
  const isBrowser = typeof window !== 'undefined';
  const isNode = typeof process !== 'undefined' && Boolean(process.versions?.node);

  const options = {
    // progress is reported as a fraction between 0 and 1, so scale it
    // before printing it next to a percent sign.
    logger: (m) => console.log(`OCR: ${m.status} - ${Math.round(m.progress * 100)}%`),
  };

  if (isBrowser) {
    // Browser-specific optimizations
    options.workerBlobURL = true;
    options.gzip = true;
  } else if (isNode) {
    // Node.js-specific optimizations
    options.workerBlobURL = false;
    options.cacheMethod = 'none'; // Disable caching in server environments
  }

  return createWorker('eng', undefined, options);
}

391

392

// Usage

393

const worker = await createEnvironmentOptimizedWorker();

394

```

395

396

## Advanced Parameter Configuration

397

398

```javascript

399

import { createWorker, PSM } from 'tesseract.js';

400

401

/**
 * Creates an English worker and applies a fixed set of recognition parameters.
 *
 * @returns {Promise<object>} The configured worker.
 * @throws Rethrows any error from setParameters after terminating the worker.
 */
async function createHighPrecisionWorker() {
  const worker = await createWorker('eng');

  // Configure for maximum accuracy
  const precisionParams = {
    // Page segmentation
    tessedit_pageseg_mode: PSM.AUTO,

    // Character recognition
    tessedit_char_whitelist: '', // Allow all characters
    tessedit_char_blacklist: '', // Block no characters

    // Word recognition
    preserve_interword_spaces: '1',

    // Quality settings
    user_defined_dpi: '300',

    // Advanced Tesseract parameters
    // NOTE(review): confirm the two tessedit_create_* flags have an effect
    // here; output formats are also requested per recognize() call.
    tessedit_do_invert: '0',
    tessedit_create_hocr: '1',
    tessedit_create_tsv: '1',
  };

  try {
    await worker.setParameters(precisionParams);
  } catch (err) {
    // The caller never receives the worker on failure, so release it here.
    await worker.terminate();
    throw err;
  }

  return worker;
}

427

428

// Usage for high-accuracy document processing

429

const precisionWorker = await createHighPrecisionWorker();

430

const result = await precisionWorker.recognize('high-quality-document.png', {}, {

431

text: true,

432

hocr: true,

433

tsv: true,

434

pdf: true

435

});

436

```