# Language Processing

Language processing capabilities including automatic language detection, text profiling, and translation services for multilingual document processing and content analysis.

## Capabilities

### Language Detection

#### LanguageIdentifier

Classic language identification using n-gram analysis and statistical models for detecting document language.

```java { .api }
/**
14
* Statistical language identifier using n-gram analysis
15
*/
16
public class LanguageIdentifier {
17
/**
18
* Creates LanguageIdentifier with default language profiles
19
*/
20
public LanguageIdentifier();
21
22
/**
23
* Creates LanguageIdentifier with custom profile directory
24
* @param profileDirectory Directory containing language profile files
25
*/
26
public LanguageIdentifier(String profileDirectory);
27
28
/**
29
* Identifies language of text content
30
* @param content Text content to analyze
31
* @return Language code (ISO 639-1) of detected language
32
*/
33
public String identify(String content);
34
35
/**
36
* Identifies language with confidence score
37
* @param content Text content to analyze
38
* @return LanguageResult containing language and confidence
39
*/
40
public LanguageResult identifyWithConfidence(String content);
41
42
/**
43
* Checks if language can be reliably identified
44
* @param content Text content to check
45
* @return true if language detection confidence is high enough
46
*/
47
public boolean isReasonablyCertain(String content);
48
49
/**
50
* Gets confidence score for detected language
51
* @param content Text content to analyze
52
* @return Confidence score between 0.0 and 1.0
53
*/
54
public double getConfidence(String content);
55
56
/**
57
* Gets all supported language codes
58
* @return Set of supported ISO 639-1 language codes
59
*/
60
public Set<String> getSupportedLanguages();
61
62
/**
63
* Checks if specific language is supported
64
* @param language ISO 639-1 language code to check
65
* @return true if language detection is supported
66
*/
67
public boolean isLanguageSupported(String language);
68
}
```

#### ProfilingHandler

Content handler for building language profiles during document parsing for improved detection accuracy.

```java { .api }
/**
77
* Content handler that builds language profiles for detection
78
*/
79
public class ProfilingHandler extends DefaultHandler {
80
/**
81
* Creates ProfilingHandler for language profiling
82
*/
83
public ProfilingHandler();
84
85
/**
86
* Creates ProfilingHandler with custom LanguageIdentifier
87
* @param identifier LanguageIdentifier to use for profiling
88
*/
89
public ProfilingHandler(LanguageIdentifier identifier);
90
91
/**
92
* Gets the detected language after profiling
93
* @return ISO 639-1 language code of detected language
94
*/
95
public String getLanguage();
96
97
/**
98
* Gets confidence score of language detection
99
* @return Confidence score between 0.0 and 1.0
100
*/
101
public double getConfidence();
102
103
/**
104
* Checks if enough content has been processed for reliable detection
105
* @return true if sufficient content analyzed
106
*/
107
public boolean hasEnoughData();
108
109
/**
110
* Gets the amount of text content processed
111
* @return Number of characters analyzed
112
*/
113
public int getContentLength();
114
}
```

### Modern Language Detection

#### LanguageDetector Interface

Modern interface for pluggable language detection implementations with support for multiple algorithms.

```java { .api }
/**
125
* Interface for modern language detection implementations
126
*/
127
public interface LanguageDetector {
128
/**
129
* Detects language of input text
130
* @param text Text content to analyze
131
* @return LanguageResult containing detected language and confidence
132
* @throws IOException if detection process fails
133
*/
134
LanguageResult detect(String text) throws IOException;
135
136
/**
137
* Detects multiple possible languages with probabilities
138
* @param text Text content to analyze
139
* @return List of LanguageResult objects sorted by confidence
140
* @throws IOException if detection process fails
141
*/
142
List<LanguageResult> detectAll(String text) throws IOException;
143
144
/**
145
* Checks if detector supports specific language
146
* @param language ISO 639-1 language code
147
* @return true if language is supported for detection
148
*/
149
boolean isSupported(String language);
150
151
/**
152
* Gets all supported languages
153
* @return Set of supported ISO 639-1 language codes
154
*/
155
Set<String> getSupportedLanguages();
156
157
/**
158
* Loads detector from configuration
159
* @param config Configuration parameters for detector
160
* @throws IOException if loading fails
161
*/
162
void loadModels(Map<String, Object> config) throws IOException;
163
164
/**
165
* Checks if detector is ready for use
166
* @return true if detector is loaded and ready
167
*/
168
boolean isAvailable();
169
}
```

#### LanguageResult

Result object containing detected language information and confidence metrics.

```java { .api }
/**
178
* Result of language detection containing language and confidence information
179
*/
180
public class LanguageResult {
181
/**
182
* Creates LanguageResult with language and confidence
183
* @param language ISO 639-1 language code
184
* @param confidence Confidence score (0.0 to 1.0)
185
*/
186
public LanguageResult(String language, float confidence);
187
188
/**
189
* Creates LanguageResult with additional properties
190
* @param language ISO 639-1 language code
191
* @param confidence Confidence score
192
* @param rawScore Raw detection score from algorithm
193
*/
194
public LanguageResult(String language, float confidence, double rawScore);
195
196
/**
197
* Gets detected language code
198
* @return ISO 639-1 language code (e.g., "en", "fr", "de")
199
*/
200
public String getLanguage();
201
202
/**
203
* Gets confidence score of detection
204
* @return Confidence between 0.0 (lowest) and 1.0 (highest)
205
*/
206
public float getConfidence();
207
208
/**
209
* Gets raw algorithm score
210
* @return Raw score from detection algorithm
211
*/
212
public double getRawScore();
213
214
/**
215
* Checks if detection confidence is above threshold
216
* @param threshold Minimum confidence threshold
217
* @return true if confidence exceeds threshold
218
*/
219
public boolean isReliable(float threshold);
220
221
/**
222
* Gets human-readable language name
223
* @return Full language name in English
224
*/
225
public String getLanguageName();
226
227
/**
228
* Compares results by confidence (descending order)
229
* @param other LanguageResult to compare with
230
* @return Comparison result for sorting
231
*/
232
public int compareTo(LanguageResult other);
233
}
```

#### LanguageWriter

Writer wrapper that performs language detection on written content for streaming analysis.

```java { .api }
/**
242
* Writer that performs language detection on content as it's written
243
*/
244
public class LanguageWriter extends Writer {
245
/**
246
* Creates LanguageWriter with underlying writer and detector
247
* @param writer Underlying Writer to delegate to
248
* @param detector LanguageDetector for analysis
249
*/
250
public LanguageWriter(Writer writer, LanguageDetector detector);
251
252
/**
253
* Creates LanguageWriter with detector and minimum content threshold
254
* @param writer Underlying Writer
255
* @param detector LanguageDetector for analysis
256
* @param minLength Minimum content length before detection
257
*/
258
public LanguageWriter(Writer writer, LanguageDetector detector, int minLength);
259
260
/**
261
* Gets current detected language
262
* @return LanguageResult with current detection, or null if insufficient data
263
*/
264
public LanguageResult getDetectedLanguage();
265
266
/**
267
* Gets all possible languages detected
268
* @return List of LanguageResult objects sorted by confidence
269
*/
270
public List<LanguageResult> getAllDetectedLanguages();
271
272
/**
273
* Checks if enough content has been written for reliable detection
274
* @return true if sufficient content for detection
275
*/
276
public boolean hasMinimumContent();
277
278
/**
279
* Gets length of content analyzed so far
280
* @return Number of characters written and analyzed
281
*/
282
public int getContentLength();
283
284
/**
285
* Writes character array to underlying writer and updates detection
286
* @param cbuf Character array to write
287
* @param off Offset in character array
288
* @param len Number of characters to write
289
* @throws IOException if write operation fails
290
*/
291
@Override
292
public void write(char[] cbuf, int off, int len) throws IOException;
293
294
/**
295
* Writes string to underlying writer and updates detection
296
* @param str String to write
297
* @throws IOException if write operation fails
298
*/
299
@Override
300
public void write(String str) throws IOException;
301
302
/**
303
* Flushes underlying writer
304
* @throws IOException if flush operation fails
305
*/
306
@Override
307
public void flush() throws IOException;
308
309
/**
310
* Closes underlying writer
311
* @throws IOException if close operation fails
312
*/
313
@Override
314
public void close() throws IOException;
315
}
```

### Translation Services

#### Translator Interface

Interface for text translation services supporting multiple translation backends and language pairs.

```java { .api }
/**
326
* Interface for text translation services
327
*/
328
public interface Translator {
329
/**
330
* Translates text to target language
331
* @param text Text to translate
332
* @param targetLanguage Target language code (ISO 639-1)
333
* @return Translated text
334
* @throws TikaException if translation fails
335
* @throws IOException if communication with translation service fails
336
*/
337
String translate(String text, String targetLanguage) throws TikaException, IOException;
338
339
/**
340
* Translates text from source to target language
341
* @param text Text to translate
342
* @param sourceLanguage Source language code (ISO 639-1)
343
* @param targetLanguage Target language code (ISO 639-1)
344
* @return Translated text
345
* @throws TikaException if translation fails
346
* @throws IOException if communication fails
347
*/
348
String translate(String text, String sourceLanguage, String targetLanguage)
349
throws TikaException, IOException;
350
351
/**
352
* Gets all supported source languages
353
* @return Set of supported source language codes
354
*/
355
Set<String> getSupportedSourceLanguages();
356
357
/**
358
* Gets all supported target languages
359
* @return Set of supported target language codes
360
*/
361
Set<String> getSupportedTargetLanguages();
362
363
/**
364
* Checks if translation from source to target language is supported
365
* @param sourceLanguage Source language code
366
* @param targetLanguage Target language code
367
* @return true if translation pair is supported
368
*/
369
boolean isSupported(String sourceLanguage, String targetLanguage);
370
371
/**
372
* Checks if translator service is available
373
* @return true if translator can be used
374
*/
375
boolean isAvailable();
376
377
/**
378
* Gets maximum text length supported for translation
379
* @return Maximum characters per translation request
380
*/
381
int getMaxTextLength();
382
}
```

#### DefaultTranslator

Default implementation of Translator interface providing basic translation capabilities.

```java { .api }
/**
391
* Default translator implementation with configurable backends
392
*/
393
public class DefaultTranslator implements Translator {
394
/**
395
* Creates DefaultTranslator with default configuration
396
*/
397
public DefaultTranslator();
398
399
/**
400
* Creates DefaultTranslator with custom configuration
401
* @param config Configuration properties for translator
402
*/
403
public DefaultTranslator(Properties config);
404
405
/**
406
* Sets translation service endpoint URL
407
* @param serviceUrl URL of translation service
408
*/
409
public void setServiceUrl(String serviceUrl);
410
411
/**
412
* Gets current service endpoint URL
413
* @return URL of translation service
414
*/
415
public String getServiceUrl();
416
417
/**
418
* Sets API key for translation service
419
* @param apiKey API key for service authentication
420
*/
421
public void setApiKey(String apiKey);
422
423
/**
424
* Sets maximum text length for single translation request
425
* @param maxLength Maximum characters per request
426
*/
427
public void setMaxTextLength(int maxLength);
428
429
/**
430
* Sets timeout for translation requests
431
* @param timeoutMs Timeout in milliseconds
432
*/
433
public void setTimeout(int timeoutMs);
434
435
/**
436
* Translates text to target language with auto-detection
437
* @param text Text to translate
438
* @param targetLanguage Target language code
439
* @return Translated text
440
* @throws TikaException if translation fails
441
* @throws IOException if service communication fails
442
*/
443
@Override
444
public String translate(String text, String targetLanguage) throws TikaException, IOException;
445
446
/**
447
* Translates text with explicit source language
448
* @param text Text to translate
449
* @param sourceLanguage Source language code
450
* @param targetLanguage Target language code
451
* @return Translated text
452
* @throws TikaException if translation fails
453
* @throws IOException if service communication fails
454
*/
455
@Override
456
public String translate(String text, String sourceLanguage, String targetLanguage)
457
throws TikaException, IOException;
458
459
/**
460
* Gets supported source languages from service
461
* @return Set of source language codes
462
*/
463
@Override
464
public Set<String> getSupportedSourceLanguages();
465
466
/**
467
* Gets supported target languages from service
468
* @return Set of target language codes
469
*/
470
@Override
471
public Set<String> getSupportedTargetLanguages();
472
473
/**
474
* Checks if language pair is supported
475
* @param sourceLanguage Source language code
476
* @param targetLanguage Target language code
477
* @return true if translation is supported
478
*/
479
@Override
480
public boolean isSupported(String sourceLanguage, String targetLanguage);
481
482
/**
483
* Checks if translation service is available
484
* @return true if service can be reached
485
*/
486
@Override
487
public boolean isAvailable();
488
489
/**
490
* Gets maximum text length per request
491
* @return Maximum characters per translation
492
*/
493
@Override
494
public int getMaxTextLength();
495
}
```

## Usage Examples

### Basic Language Detection

```java { .api }
// Simple language identification
504
LanguageIdentifier identifier = new LanguageIdentifier();
505
506
String englishText = "This is a sample document written in English.";
507
String detectedLang = identifier.identify(englishText);
508
System.out.println("Detected language: " + detectedLang); // "en"
509
510
// Check detection confidence
511
if (identifier.isReasonablyCertain(englishText)) {
512
double confidence = identifier.getConfidence(englishText);
513
System.out.println("Confidence: " + confidence);
514
}
515
516
// Get all supported languages
517
Set<String> supported = identifier.getSupportedLanguages();
518
System.out.println("Supported languages: " + supported);
```

### Advanced Language Detection with Results

```java { .api }
// Modern language detection with detailed results
525
LanguageIdentifier identifier = new LanguageIdentifier();
526
527
String mixedText = "Bonjour, this is a mixed language document with français.";
528
LanguageResult result = identifier.identifyWithConfidence(mixedText);
529
530
System.out.println("Language: " + result.getLanguage());
531
System.out.println("Confidence: " + result.getConfidence());
532
System.out.println("Language name: " + result.getLanguageName());
533
534
// Check reliability
535
if (result.isReliable(0.8f)) {
536
System.out.println("High confidence detection");
537
}
```

### Language Detection During Parsing

```java { .api }
// Detect language while parsing document
544
try {
545
AutoDetectParser parser = new AutoDetectParser();
546
ProfilingHandler langHandler = new ProfilingHandler();
547
BodyContentHandler textHandler = new BodyContentHandler();
548
549
// Use TeeContentHandler to process with both handlers
550
TeeContentHandler teeHandler = new TeeContentHandler(langHandler, textHandler);
551
552
Metadata metadata = new Metadata();
553
parser.parse(inputStream, teeHandler, metadata, new ParseContext());
554
555
// Get detected language and content
556
String language = langHandler.getLanguage();
557
double confidence = langHandler.getConfidence();
558
String content = textHandler.toString();
559
560
System.out.println("Document language: " + language + " (" + confidence + ")");
561
System.out.println("Content length: " + langHandler.getContentLength());
562
563
} catch (Exception e) {
564
System.err.println("Language detection failed: " + e.getMessage());
565
}
```

### Streaming Language Detection

```java { .api }
// Detect language as content is written
572
try (StringWriter stringWriter = new StringWriter()) {
573
LanguageIdentifier detector = new LanguageIdentifier();
574
LanguageWriter langWriter = new LanguageWriter(stringWriter,
575
text -> {
576
try {
577
return detector.identifyWithConfidence(text);
578
} catch (Exception e) {
579
return new LanguageResult("unknown", 0.0f);
580
}
581
}, 100); // Minimum 100 characters before detection
582
583
// Write content progressively
584
langWriter.write("Ceci est un document en français. ");
585
langWriter.write("Il contient plusieurs phrases pour la détection. ");
586
langWriter.write("La détection devrait identifier le français.");
587
588
// Check detection results
589
if (langWriter.hasMinimumContent()) {
590
LanguageResult detected = langWriter.getDetectedLanguage();
591
if (detected != null) {
592
System.out.println("Detected: " + detected.getLanguage());
593
System.out.println("Confidence: " + detected.getConfidence());
594
}
595
}
596
597
langWriter.close();
598
String fullText = stringWriter.toString();
599
600
} catch (IOException e) {
601
System.err.println("Language detection error: " + e.getMessage());
602
}
```

### Text Translation

```java { .api }
// Basic text translation
609
DefaultTranslator translator = new DefaultTranslator();
610
611
if (translator.isAvailable()) {
612
try {
613
// Translate to English (auto-detect source)
614
String frenchText = "Bonjour, comment allez-vous?";
615
String englishText = translator.translate(frenchText, "en");
616
System.out.println("Translation: " + englishText);
617
618
// Translate with explicit source language
619
String germanText = translator.translate(englishText, "en", "de");
620
System.out.println("German: " + germanText);
621
622
} catch (TikaException | IOException e) {
623
System.err.println("Translation failed: " + e.getMessage());
624
}
625
}
626
627
// Check supported languages
628
Set<String> sourceLanguages = translator.getSupportedSourceLanguages();
629
Set<String> targetLanguages = translator.getSupportedTargetLanguages();
630
System.out.println("Source languages: " + sourceLanguages.size());
631
System.out.println("Target languages: " + targetLanguages.size());
```

### Configured Translation Service

```java { .api }
// Configure translation service
638
Properties config = new Properties();
639
config.setProperty("translator.service.url", "https://api.translate.service.com");
640
config.setProperty("translator.api.key", "your-api-key");
641
config.setProperty("translator.timeout", "30000");
642
config.setProperty("translator.maxLength", "5000");
643
644
DefaultTranslator translator = new DefaultTranslator(config);
645
translator.setMaxTextLength(10000);
646
translator.setTimeout(60000);
647
648
// Check if specific translation is supported
649
boolean canTranslate = translator.isSupported("fr", "en");
650
if (canTranslate) {
651
String translation = translator.translate("Texte français", "fr", "en");
652
System.out.println("Translated: " + translation);
653
}
```

### Multilingual Document Processing

```java { .api }
public class MultilingualProcessor {
660
661
private final LanguageIdentifier detector;
662
private final Translator translator;
663
664
public MultilingualProcessor() {
665
this.detector = new LanguageIdentifier();
666
this.translator = new DefaultTranslator();
667
}
668
669
public ProcessedDocument processDocument(InputStream input)
670
throws IOException, SAXException, TikaException {
671
672
AutoDetectParser parser = new AutoDetectParser();
673
BodyContentHandler textHandler = new BodyContentHandler();
674
ProfilingHandler langHandler = new ProfilingHandler(detector);
675
676
TeeContentHandler teeHandler = new TeeContentHandler(textHandler, langHandler);
677
678
Metadata metadata = new Metadata();
679
parser.parse(input, teeHandler, metadata, new ParseContext());
680
681
String content = textHandler.toString();
682
String language = langHandler.getLanguage();
683
double confidence = langHandler.getConfidence();
684
685
ProcessedDocument result = new ProcessedDocument();
686
result.setOriginalContent(content);
687
result.setDetectedLanguage(language);
688
result.setLanguageConfidence(confidence);
689
690
// Translate to English if not already English
691
if (!"en".equals(language) && translator.isSupported(language, "en")) {
692
try {
693
String translation = translator.translate(content, language, "en");
694
result.setEnglishTranslation(translation);
695
} catch (Exception e) {
696
result.addWarning("Translation failed: " + e.getMessage());
697
}
698
}
699
700
return result;
701
}
702
}
```

### Language Detection Comparison

```java { .api }
// Compare different detection methods
709
public class LanguageDetectionComparison {
710
711
public void compareDetectors(String text) {
712
// Classic detector
713
LanguageIdentifier classic = new LanguageIdentifier();
714
String classicResult = classic.identify(text);
715
double classicConfidence = classic.getConfidence(text);
716
717
System.out.println("Classic detector:");
718
System.out.println(" Language: " + classicResult);
719
System.out.println(" Confidence: " + classicConfidence);
720
System.out.println(" Certain: " + classic.isReasonablyCertain(text));
721
722
// Modern detector with detailed results
723
LanguageResult detailedResult = classic.identifyWithConfidence(text);
724
System.out.println("\nDetailed result:");
725
System.out.println(" Language: " + detailedResult.getLanguage());
726
System.out.println(" Confidence: " + detailedResult.getConfidence());
727
System.out.println(" Raw score: " + detailedResult.getRawScore());
728
System.out.println(" Reliable (>0.8): " + detailedResult.isReliable(0.8f));
729
System.out.println(" Language name: " + detailedResult.getLanguageName());
730
}
731
}
```