or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md content-processing.md detection.md embedded-extraction.md embedding.md exceptions.md index.md io-utilities.md language.md metadata.md mime-types.md parsing.md pipes.md process-forking.md rendering.md

docs/language.md

0

# Language Processing

1

2

Language processing capabilities including automatic language detection, text profiling, and translation services for multilingual document processing and content analysis.

3

4

## Capabilities

5

6

### Language Detection

7

8

#### LanguageIdentifier

9

10

Classic language identification using n-gram analysis and statistical models for detecting document language.

11

12

```java { .api }

13

/**

14

* Statistical language identifier using n-gram analysis

15

*/

16

public class LanguageIdentifier {

17

/**

18

* Creates LanguageIdentifier with default language profiles

19

*/

20

public LanguageIdentifier();

21

22

/**

23

* Creates LanguageIdentifier with custom profile directory

24

* @param profileDirectory Directory containing language profile files

25

*/

26

public LanguageIdentifier(String profileDirectory);

27

28

/**

29

* Identifies language of text content

30

* @param content Text content to analyze

31

* @return Language code (ISO 639-1) of detected language

32

*/

33

public String identify(String content);

34

35

/**

36

* Identifies language with confidence score

37

* @param content Text content to analyze

38

* @return LanguageResult containing language and confidence

39

*/

40

public LanguageResult identifyWithConfidence(String content);

41

42

/**

43

* Checks if language can be reliably identified

44

* @param content Text content to check

45

* @return true if language detection confidence is high enough

46

*/

47

public boolean isReasonablyCertain(String content);

48

49

/**

50

* Gets confidence score for detected language

51

* @param content Text content to analyze

52

* @return Confidence score between 0.0 and 1.0

53

*/

54

public double getConfidence(String content);

55

56

/**

57

* Gets all supported language codes

58

* @return Set of supported ISO 639-1 language codes

59

*/

60

public Set<String> getSupportedLanguages();

61

62

/**

63

* Checks if specific language is supported

64

* @param language ISO 639-1 language code to check

65

* @return true if language detection is supported

66

*/

67

public boolean isLanguageSupported(String language);

68

}

69

```

70

71

#### ProfilingHandler

72

73

Content handler for building language profiles during document parsing for improved detection accuracy.

74

75

```java { .api }

76

/**

77

* Content handler that builds language profiles for detection

78

*/

79

public class ProfilingHandler extends DefaultHandler {

80

/**

81

* Creates ProfilingHandler for language profiling

82

*/

83

public ProfilingHandler();

84

85

/**

86

* Creates ProfilingHandler with custom LanguageIdentifier

87

* @param identifier LanguageIdentifier to use for profiling

88

*/

89

public ProfilingHandler(LanguageIdentifier identifier);

90

91

/**

92

* Gets the detected language after profiling

93

* @return ISO 639-1 language code of detected language

94

*/

95

public String getLanguage();

96

97

/**

98

* Gets confidence score of language detection

99

* @return Confidence score between 0.0 and 1.0

100

*/

101

public double getConfidence();

102

103

/**

104

* Checks if enough content has been processed for reliable detection

105

* @return true if sufficient content analyzed

106

*/

107

public boolean hasEnoughData();

108

109

/**

110

* Gets the amount of text content processed

111

* @return Number of characters analyzed

112

*/

113

public int getContentLength();

114

}

115

```

116

117

### Modern Language Detection

118

119

#### LanguageDetector Interface

120

121

Modern interface for pluggable language detection implementations with support for multiple algorithms.

122

123

```java { .api }

124

/**

125

* Interface for modern language detection implementations

126

*/

127

public interface LanguageDetector {

128

/**

129

* Detects language of input text

130

* @param text Text content to analyze

131

* @return LanguageResult containing detected language and confidence

132

* @throws IOException if detection process fails

133

*/

134

LanguageResult detect(String text) throws IOException;

135

136

/**

137

* Detects multiple possible languages with probabilities

138

* @param text Text content to analyze

139

* @return List of LanguageResult objects sorted by confidence

140

* @throws IOException if detection process fails

141

*/

142

List<LanguageResult> detectAll(String text) throws IOException;

143

144

/**

145

* Checks if detector supports specific language

146

* @param language ISO 639-1 language code

147

* @return true if language is supported for detection

148

*/

149

boolean isSupported(String language);

150

151

/**

152

* Gets all supported languages

153

* @return Set of supported ISO 639-1 language codes

154

*/

155

Set<String> getSupportedLanguages();

156

157

/**

158

* Loads detector from configuration

159

* @param config Configuration parameters for detector

160

* @throws IOException if loading fails

161

*/

162

void loadModels(Map<String, Object> config) throws IOException;

163

164

/**

165

* Checks if detector is ready for use

166

* @return true if detector is loaded and ready

167

*/

168

boolean isAvailable();

169

}

170

```

171

172

#### LanguageResult

173

174

Result object containing detected language information and confidence metrics.

175

176

```java { .api }

177

/**

178

* Result of language detection containing language and confidence information

179

*/

180

public class LanguageResult {

181

/**

182

* Creates LanguageResult with language and confidence

183

* @param language ISO 639-1 language code

184

* @param confidence Confidence score (0.0 to 1.0)

185

*/

186

public LanguageResult(String language, float confidence);

187

188

/**

189

* Creates LanguageResult with additional properties

190

* @param language ISO 639-1 language code

191

* @param confidence Confidence score

192

* @param rawScore Raw detection score from algorithm

193

*/

194

public LanguageResult(String language, float confidence, double rawScore);

195

196

/**

197

* Gets detected language code

198

* @return ISO 639-1 language code (e.g., "en", "fr", "de")

199

*/

200

public String getLanguage();

201

202

/**

203

* Gets confidence score of detection

204

* @return Confidence between 0.0 (lowest) and 1.0 (highest)

205

*/

206

public float getConfidence();

207

208

/**

209

* Gets raw algorithm score

210

* @return Raw score from detection algorithm

211

*/

212

public double getRawScore();

213

214

/**

215

* Checks if detection confidence is above threshold

216

* @param threshold Minimum confidence threshold

217

* @return true if confidence exceeds threshold

218

*/

219

public boolean isReliable(float threshold);

220

221

/**

222

* Gets human-readable language name

223

* @return Full language name in English

224

*/

225

public String getLanguageName();

226

227

/**

228

* Compares results by confidence (descending order)

229

* @param other LanguageResult to compare with

230

* @return Comparison result for sorting

231

*/

232

public int compareTo(LanguageResult other);

233

}

234

```

235

236

#### LanguageWriter

237

238

Writer wrapper that performs language detection on written content for streaming analysis.

239

240

```java { .api }

241

/**

242

* Writer that performs language detection on content as it's written

243

*/

244

public class LanguageWriter extends Writer {

245

/**

246

* Creates LanguageWriter with underlying writer and detector

247

* @param writer Underlying Writer to delegate to

248

* @param detector LanguageDetector for analysis

249

*/

250

public LanguageWriter(Writer writer, LanguageDetector detector);

251

252

/**

253

* Creates LanguageWriter with detector and minimum content threshold

254

* @param writer Underlying Writer

255

* @param detector LanguageDetector for analysis

256

* @param minLength Minimum content length before detection

257

*/

258

public LanguageWriter(Writer writer, LanguageDetector detector, int minLength);

259

260

/**

261

* Gets current detected language

262

* @return LanguageResult with current detection, or null if insufficient data

263

*/

264

public LanguageResult getDetectedLanguage();

265

266

/**

267

* Gets all possible languages detected

268

* @return List of LanguageResult objects sorted by confidence

269

*/

270

public List<LanguageResult> getAllDetectedLanguages();

271

272

/**

273

* Checks if enough content has been written for reliable detection

274

* @return true if sufficient content for detection

275

*/

276

public boolean hasMinimumContent();

277

278

/**

279

* Gets length of content analyzed so far

280

* @return Number of characters written and analyzed

281

*/

282

public int getContentLength();

283

284

/**

285

* Writes character array to underlying writer and updates detection

286

* @param cbuf Character array to write

287

* @param off Offset in character array

288

* @param len Number of characters to write

289

* @throws IOException if write operation fails

290

*/

291

@Override

292

public void write(char[] cbuf, int off, int len) throws IOException;

293

294

/**

295

* Writes string to underlying writer and updates detection

296

* @param str String to write

297

* @throws IOException if write operation fails

298

*/

299

@Override

300

public void write(String str) throws IOException;

301

302

/**

303

* Flushes underlying writer

304

* @throws IOException if flush operation fails

305

*/

306

@Override

307

public void flush() throws IOException;

308

309

/**

310

* Closes underlying writer

311

* @throws IOException if close operation fails

312

*/

313

@Override

314

public void close() throws IOException;

315

}

316

```

317

318

### Translation Services

319

320

#### Translator Interface

321

322

Interface for text translation services supporting multiple translation backends and language pairs.

323

324

```java { .api }

325

/**

326

* Interface for text translation services

327

*/

328

public interface Translator {

329

/**

330

* Translates text to target language

331

* @param text Text to translate

332

* @param targetLanguage Target language code (ISO 639-1)

333

* @return Translated text

334

* @throws TikaException if translation fails

335

* @throws IOException if communication with translation service fails

336

*/

337

String translate(String text, String targetLanguage) throws TikaException, IOException;

338

339

/**

340

* Translates text from source to target language

341

* @param text Text to translate

342

* @param sourceLanguage Source language code (ISO 639-1)

343

* @param targetLanguage Target language code (ISO 639-1)

344

* @return Translated text

345

* @throws TikaException if translation fails

346

* @throws IOException if communication fails

347

*/

348

String translate(String text, String sourceLanguage, String targetLanguage)

349

throws TikaException, IOException;

350

351

/**

352

* Gets all supported source languages

353

* @return Set of supported source language codes

354

*/

355

Set<String> getSupportedSourceLanguages();

356

357

/**

358

* Gets all supported target languages

359

* @return Set of supported target language codes

360

*/

361

Set<String> getSupportedTargetLanguages();

362

363

/**

364

* Checks if translation from source to target language is supported

365

* @param sourceLanguage Source language code

366

* @param targetLanguage Target language code

367

* @return true if translation pair is supported

368

*/

369

boolean isSupported(String sourceLanguage, String targetLanguage);

370

371

/**

372

* Checks if translator service is available

373

* @return true if translator can be used

374

*/

375

boolean isAvailable();

376

377

/**

378

* Gets maximum text length supported for translation

379

* @return Maximum characters per translation request

380

*/

381

int getMaxTextLength();

382

}

383

```

384

385

#### DefaultTranslator

386

387

Default implementation of Translator interface providing basic translation capabilities.

388

389

```java { .api }

390

/**

391

* Default translator implementation with configurable backends

392

*/

393

public class DefaultTranslator implements Translator {

394

/**

395

* Creates DefaultTranslator with default configuration

396

*/

397

public DefaultTranslator();

398

399

/**

400

* Creates DefaultTranslator with custom configuration

401

* @param config Configuration properties for translator

402

*/

403

public DefaultTranslator(Properties config);

404

405

/**

406

* Sets translation service endpoint URL

407

* @param serviceUrl URL of translation service

408

*/

409

public void setServiceUrl(String serviceUrl);

410

411

/**

412

* Gets current service endpoint URL

413

* @return URL of translation service

414

*/

415

public String getServiceUrl();

416

417

/**

418

* Sets API key for translation service

419

* @param apiKey API key for service authentication

420

*/

421

public void setApiKey(String apiKey);

422

423

/**

424

* Sets maximum text length for single translation request

425

* @param maxLength Maximum characters per request

426

*/

427

public void setMaxTextLength(int maxLength);

428

429

/**

430

* Sets timeout for translation requests

431

* @param timeoutMs Timeout in milliseconds

432

*/

433

public void setTimeout(int timeoutMs);

434

435

/**

436

* Translates text to target language with auto-detection

437

* @param text Text to translate

438

* @param targetLanguage Target language code

439

* @return Translated text

440

* @throws TikaException if translation fails

441

* @throws IOException if service communication fails

442

*/

443

@Override

444

public String translate(String text, String targetLanguage) throws TikaException, IOException;

445

446

/**

447

* Translates text with explicit source language

448

* @param text Text to translate

449

* @param sourceLanguage Source language code

450

* @param targetLanguage Target language code

451

* @return Translated text

452

* @throws TikaException if translation fails

453

* @throws IOException if service communication fails

454

*/

455

@Override

456

public String translate(String text, String sourceLanguage, String targetLanguage)

457

throws TikaException, IOException;

458

459

/**

460

* Gets supported source languages from service

461

* @return Set of source language codes

462

*/

463

@Override

464

public Set<String> getSupportedSourceLanguages();

465

466

/**

467

* Gets supported target languages from service

468

* @return Set of target language codes

469

*/

470

@Override

471

public Set<String> getSupportedTargetLanguages();

472

473

/**

474

* Checks if language pair is supported

475

* @param sourceLanguage Source language code

476

* @param targetLanguage Target language code

477

* @return true if translation is supported

478

*/

479

@Override

480

public boolean isSupported(String sourceLanguage, String targetLanguage);

481

482

/**

483

* Checks if translation service is available

484

* @return true if service can be reached

485

*/

486

@Override

487

public boolean isAvailable();

488

489

/**

490

* Gets maximum text length per request

491

* @return Maximum characters per translation

492

*/

493

@Override

494

public int getMaxTextLength();

495

}

496

```

497

498

## Usage Examples

499

500

### Basic Language Detection

501

502

```java { .api }

503

// Simple language identification

504

LanguageIdentifier identifier = new LanguageIdentifier();

505

506

String englishText = "This is a sample document written in English.";

507

String detectedLang = identifier.identify(englishText);

508

System.out.println("Detected language: " + detectedLang); // "en"

509

510

// Check detection confidence

511

if (identifier.isReasonablyCertain(englishText)) {

512

double confidence = identifier.getConfidence(englishText);

513

System.out.println("Confidence: " + confidence);

514

}

515

516

// Get all supported languages

517

Set<String> supported = identifier.getSupportedLanguages();

518

System.out.println("Supported languages: " + supported);

519

```

520

521

### Advanced Language Detection with Results

522

523

```java { .api }

524

// Detailed detection results (language, confidence, name) from the classic LanguageIdentifier

525

LanguageIdentifier identifier = new LanguageIdentifier();

526

527

String mixedText = "Bonjour, this is a mixed language document with français.";

528

LanguageResult result = identifier.identifyWithConfidence(mixedText);

529

530

System.out.println("Language: " + result.getLanguage());

531

System.out.println("Confidence: " + result.getConfidence());

532

System.out.println("Language name: " + result.getLanguageName());

533

534

// Check reliability

535

if (result.isReliable(0.8f)) {

536

System.out.println("High confidence detection");

537

}

538

```

539

540

### Language Detection During Parsing

541

542

```java { .api }

543

// Detect language while parsing document

544

try {

545

AutoDetectParser parser = new AutoDetectParser();

546

ProfilingHandler langHandler = new ProfilingHandler();

547

BodyContentHandler textHandler = new BodyContentHandler();

548

549

// Use TeeContentHandler to process with both handlers

550

TeeContentHandler teeHandler = new TeeContentHandler(langHandler, textHandler);

551

552

Metadata metadata = new Metadata();

553

parser.parse(inputStream, teeHandler, metadata, new ParseContext());

554

555

// Get detected language and content

556

String language = langHandler.getLanguage();

557

double confidence = langHandler.getConfidence();

558

String content = textHandler.toString();

559

560

System.out.println("Document language: " + language + " (" + confidence + ")");

561

System.out.println("Content length: " + langHandler.getContentLength());

562

563

} catch (Exception e) {

564

System.err.println("Language detection failed: " + e.getMessage());

565

}

566

```

567

568

### Streaming Language Detection

569

570

```java { .api }

571

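// Detect language as content is written.
// NOTE: the lambda passed below compiles only if LanguageDetector is a functional interface.
// As declared in this document it has six abstract methods, so in practice pass a concrete
// LanguageDetector implementation (e.g. an adapter that delegates to LanguageIdentifier).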

572

try (StringWriter stringWriter = new StringWriter()) {

573

LanguageIdentifier detector = new LanguageIdentifier();

574

LanguageWriter langWriter = new LanguageWriter(stringWriter,

575

text -> {

576

try {

577

return detector.identifyWithConfidence(text);

578

} catch (Exception e) {

579

return new LanguageResult("unknown", 0.0f);

580

}

581

}, 100); // Minimum 100 characters before detection

582

583

// Write content progressively

584

langWriter.write("Ceci est un document en français. ");

585

langWriter.write("Il contient plusieurs phrases pour la détection. ");

586

langWriter.write("La détection devrait identifier le français.");

587

588

// Check detection results

589

if (langWriter.hasMinimumContent()) {

590

LanguageResult detected = langWriter.getDetectedLanguage();

591

if (detected != null) {

592

System.out.println("Detected: " + detected.getLanguage());

593

System.out.println("Confidence: " + detected.getConfidence());

594

}

595

}

596

597

langWriter.close();

598

String fullText = stringWriter.toString();

599

600

} catch (IOException e) {

601

System.err.println("Language detection error: " + e.getMessage());

602

}

603

```

604

605

### Text Translation

606

607

```java { .api }

608

// Basic text translation

609

DefaultTranslator translator = new DefaultTranslator();

610

611

if (translator.isAvailable()) {

612

try {

613

// Translate to English (auto-detect source)

614

String frenchText = "Bonjour, comment allez-vous?";

615

String englishText = translator.translate(frenchText, "en");

616

System.out.println("Translation: " + englishText);

617

618

// Translate with explicit source language

619

String germanText = translator.translate(englishText, "en", "de");

620

System.out.println("German: " + germanText);

621

622

} catch (TikaException | IOException e) {

623

System.err.println("Translation failed: " + e.getMessage());

624

}

625

}

626

627

// Check supported languages

628

Set<String> sourceLanguages = translator.getSupportedSourceLanguages();

629

Set<String> targetLanguages = translator.getSupportedTargetLanguages();

630

System.out.println("Source languages: " + sourceLanguages.size());

631

System.out.println("Target languages: " + targetLanguages.size());

632

```

633

634

### Configured Translation Service

635

636

```java { .api }

637

// Configure translation service

638

Properties config = new Properties();

639

config.setProperty("translator.service.url", "https://api.translate.service.com");

640

config.setProperty("translator.api.key", "your-api-key");

641

config.setProperty("translator.timeout", "30000");

642

config.setProperty("translator.maxLength", "5000");

643

644

DefaultTranslator translator = new DefaultTranslator(config);

645

translator.setMaxTextLength(10000);

646

translator.setTimeout(60000);

647

648

// Check if specific translation is supported

649

boolean canTranslate = translator.isSupported("fr", "en");

650

if (canTranslate) {

651

String translation = translator.translate("Texte français", "fr", "en");

652

System.out.println("Translated: " + translation);

653

}

654

```

655

656

### Multilingual Document Processing

657

658

```java { .api }

659

public class MultilingualProcessor {

660

661

private final LanguageIdentifier detector;

662

private final Translator translator;

663

664

public MultilingualProcessor() {

665

this.detector = new LanguageIdentifier();

666

this.translator = new DefaultTranslator();

667

}

668

669

public ProcessedDocument processDocument(InputStream input)

670

throws IOException, SAXException, TikaException {

671

672

AutoDetectParser parser = new AutoDetectParser();

673

BodyContentHandler textHandler = new BodyContentHandler();

674

ProfilingHandler langHandler = new ProfilingHandler(detector);

675

676

TeeContentHandler teeHandler = new TeeContentHandler(textHandler, langHandler);

677

678

Metadata metadata = new Metadata();

679

parser.parse(input, teeHandler, metadata, new ParseContext());

680

681

String content = textHandler.toString();

682

String language = langHandler.getLanguage();

683

double confidence = langHandler.getConfidence();

684

685

ProcessedDocument result = new ProcessedDocument();

686

result.setOriginalContent(content);

687

result.setDetectedLanguage(language);

688

result.setLanguageConfidence(confidence);

689

690

// Translate to English if not already English

691

if (!"en".equals(language) && translator.isSupported(language, "en")) {

692

try {

693

String translation = translator.translate(content, language, "en");

694

result.setEnglishTranslation(translation);

695

} catch (Exception e) {

696

result.addWarning("Translation failed: " + e.getMessage());

697

}

698

}

699

700

return result;

701

}

702

}

703

```

704

705

### Language Detection Comparison

706

707

```java { .api }

708

// Compare the simple and detailed result APIs of the classic LanguageIdentifier

709

public class LanguageDetectionComparison {

710

711

public void compareDetectors(String text) {

712

// Classic detector

713

LanguageIdentifier classic = new LanguageIdentifier();

714

String classicResult = classic.identify(text);

715

double classicConfidence = classic.getConfidence(text);

716

717

System.out.println("Classic detector:");

718

System.out.println(" Language: " + classicResult);

719

System.out.println(" Confidence: " + classicConfidence);

720

System.out.println(" Certain: " + classic.isReasonablyCertain(text));

721

722

// Detailed result (LanguageResult) from the same classic identifier

723

LanguageResult detailedResult = classic.identifyWithConfidence(text);

724

System.out.println("\nDetailed result:");

725

System.out.println(" Language: " + detailedResult.getLanguage());

726

System.out.println(" Confidence: " + detailedResult.getConfidence());

727

System.out.println(" Raw score: " + detailedResult.getRawScore());

728

System.out.println(" Reliable (>0.8): " + detailedResult.isReliable(0.8f));

729

System.out.println(" Language name: " + detailedResult.getLanguageName());

730

}

731

}

732

```