0
# Text Processing
1
2
Text recognition, natural language processing, and document analysis capabilities through Tesseract OCR, Leptonica image processing, and SentencePiece tokenization.
3
4
## Capabilities
5
6
### Optical Character Recognition (OCR)
7
8
Tesseract OCR engine for extracting text from images and documents.
9
10
```java { .api }
11
/**
12
* Tesseract OCR API base class
13
*/
14
public class TessBaseAPI extends Pointer {
15
/**
16
* Create Tesseract API instance
17
*/
18
public TessBaseAPI();
19
20
/**
21
* Initialize Tesseract with language and data path
22
* @param datapath Path to tessdata directory
23
* @param language Language code (e.g., "eng", "spa", "fra")
24
* @return true if initialization successful
25
*/
26
public native boolean Init(String datapath, String language);
27
28
/**
29
* Initialize with language, OCR engine mode, and config variables
30
* @param datapath Path to tessdata directory
31
* @param language Language code
32
* @param mode OCR Engine Mode (OEM_TESSERACT_ONLY, OEM_LSTM_ONLY, etc.)
33
* @param configs Config files to load
34
* @param configs_size Number of config files
35
* @param vars_vec Variable names to set
36
* @param vars_values Variable values to set
37
* @param vars_vec_size Number of variables
38
* @param set_only_non_debug_params Only set non-debug parameters
39
* @return true if initialization successful
40
*/
41
public native boolean Init(String datapath, String language, int mode,
42
PointerPointer configs, int configs_size, StringVector vars_vec,
43
StringVector vars_values, long vars_vec_size, boolean set_only_non_debug_params);
44
45
/**
46
* Set image from memory buffer
47
* @param imagedata Image data buffer
48
* @param width Image width in pixels
49
* @param height Image height in pixels
50
* @param bytes_per_pixel Bytes per pixel (1, 3, or 4)
51
* @param bytes_per_line Bytes per line (width * bytes_per_pixel if no padding)
52
*/
53
public native void SetImage(BytePointer imagedata, int width, int height,
54
int bytes_per_pixel, int bytes_per_line);
55
56
/**
57
* Set image from PIX (Leptonica image format)
58
* @param pix Leptonica PIX image
59
*/
60
public native void SetImage(PIX pix);
61
62
/**
63
* Get recognized text as UTF-8 string
64
* @return Recognized text (caller must free with delete[])
65
*/
66
public native String GetUTF8Text();
67
68
/**
69
* Get recognition confidence (0-100)
70
* @return Mean confidence value
71
*/
72
public native int MeanTextConf();
73
74
/**
75
* Get word-level recognition results
76
* @return Array of word confidence values
77
*/
78
public native IntPointer AllWordConfidences();
79
80
/**
81
* Set variable value
82
* @param name Variable name
83
* @param value Variable value
84
* @return true if variable was set
85
*/
86
public native boolean SetVariable(String name, String value);
87
88
/**
89
* Get variable value
90
* @param name Variable name
91
* @return Variable value or null if not found
92
*/
93
public native String GetStringVariable(String name);
94
95
/**
96
* Set page segmentation mode
97
* @param mode Page segmentation mode
98
*/
99
public native void SetPageSegMode(int mode);
100
101
/**
102
* Get current page segmentation mode
103
* @return Current PSM
104
*/
105
public native int GetPageSegMode();
106
107
/**
108
* Set rectangle to restrict recognition area
109
* @param left Left boundary
110
* @param top Top boundary
111
* @param width Rectangle width
112
* @param height Rectangle height
113
*/
114
public native void SetRectangle(int left, int top, int width, int height);
115
116
/**
117
* Clear recognition results and free memory
118
*/
119
public native void Clear();
120
121
/**
122
* End recognition and free resources
123
*/
124
public native void End();
125
}
126
127
/**
128
* Result iterator for detailed OCR results
129
*/
130
public class ResultIterator extends Pointer {
131
/**
132
* Get text at current position
133
* @param level Text level (word, line, paragraph, block)
134
* @return Text string
135
*/
136
public native String GetUTF8Text(int level);
137
138
/**
139
* Get confidence at current position
140
* @param level Text level
141
* @return Confidence value (0-100)
142
*/
143
public native float Confidence(int level);
144
145
/**
146
* Get bounding box at current position
147
* @param level Text level
148
* @param left Output left coordinate
149
* @param top Output top coordinate
150
* @param right Output right coordinate
151
* @param bottom Output bottom coordinate
152
* @return true if bounding box available
153
*/
154
public native boolean BoundingBox(int level, IntPointer left, IntPointer top,
155
IntPointer right, IntPointer bottom);
156
157
/**
158
* Move to next element at specified level
159
* @param level Text level
160
* @return true if moved successfully
161
*/
162
public native boolean Next(int level);
163
164
/**
165
* Check if iterator is at beginning of element
166
* @param level Text level
167
* @return true if at beginning
168
*/
169
public native boolean IsAtBeginningOf(int level);
170
171
/**
172
* Check if iterator is at final element
173
* @param level Text level
174
* @param element Element type
175
* @return true if at final element
176
*/
177
public native boolean IsAtFinalElement(int level, int element);
178
}
179
180
/**
181
* Page segmentation modes
182
*/
183
public static final int PSM_OSD_ONLY = 0; // Orientation and script detection only
184
public static final int PSM_AUTO_OSD = 1; // Automatic page segmentation with OSD
185
public static final int PSM_AUTO_ONLY = 2; // Automatic page segmentation without OSD
186
public static final int PSM_AUTO = 3; // Fully automatic page segmentation (default)
187
public static final int PSM_SINGLE_COLUMN = 4; // Single uniform column of text
188
public static final int PSM_SINGLE_BLOCK_VERT_TEXT = 5; // Single uniform block of vertically aligned text
189
public static final int PSM_SINGLE_BLOCK = 6; // Single uniform block of text
190
public static final int PSM_SINGLE_LINE = 7; // Single text line
191
public static final int PSM_SINGLE_WORD = 8; // Single word
192
public static final int PSM_CIRCLE_WORD = 9; // Single word in a circle
193
public static final int PSM_SINGLE_CHAR = 10; // Single character
194
public static final int PSM_SPARSE_TEXT = 11; // Sparse text (find as much text as possible)
195
public static final int PSM_SPARSE_TEXT_OSD = 12; // Sparse text with orientation and script detection
196
public static final int PSM_RAW_LINE = 13; // Raw line (no assumptions about text layout)
197
198
/**
199
* OCR Engine modes
200
*/
201
public static final int OEM_TESSERACT_ONLY = 0; // Legacy Tesseract engine only
202
public static final int OEM_LSTM_ONLY = 1; // Neural nets LSTM engine only
203
public static final int OEM_TESSERACT_LSTM_COMBINED = 2; // Both engines combined
204
public static final int OEM_DEFAULT = 3; // Default (whatever is available)
205
```
206
207
### Image Processing for OCR
208
209
Leptonica library providing image processing operations optimized for document analysis and OCR preprocessing.
210
211
```java { .api }
212
/**
213
* PIX - Leptonica image structure
214
*/
215
public class PIX extends Pointer {
216
/**
217
* Get image width
218
* @return Image width in pixels
219
*/
220
public native int getWidth();
221
222
/**
223
* Get image height
224
* @return Image height in pixels
225
*/
226
public native int getHeight();
227
228
/**
229
* Get image depth (bits per pixel)
230
* @return Image depth
231
*/
232
public native int getDepth();
233
234
/**
235
* Get image data pointer
236
* @return Pointer to image data
237
*/
238
public native IntPointer getData();
239
240
/**
241
* Get words per line
242
* @return Words per line
243
*/
244
public native int getWpl();
245
246
/**
247
* Get input format
248
* @return Input file format
249
*/
250
public native int getInputFormat();
251
252
/**
253
* Get X resolution (DPI)
254
* @return X resolution
255
*/
256
public native int getXRes();
257
258
/**
259
* Get Y resolution (DPI)
260
* @return Y resolution
261
*/
262
public native int getYRes();
263
264
/**
265
* Clone PIX image
266
* @return Cloned image
267
*/
268
public native PIX pixClone();
269
270
/**
271
* Copy PIX image
272
* @return Copied image
273
*/
274
public native PIX pixCopy();
275
}
276
277
/**
278
* Image I/O operations
279
*/
280
public static class LeptonicaIO {
281
/**
282
* Read image from file
283
* @param filename Image file path
284
* @return PIX image or null on error
285
*/
286
public static native PIX pixRead(String filename);
287
288
/**
289
* Write image to file
290
* @param filename Output file path
291
* @param pix Image to write
292
* @param format Output format (IFF_PNG, IFF_JPEG, etc.)
293
* @return 0 on success, 1 on error
294
*/
295
public static native int pixWrite(String filename, PIX pix, int format);
296
297
/**
298
* Read image from memory
299
* @param data Image data buffer
300
* @param size Buffer size
301
* @return PIX image or null on error
302
*/
303
public static native PIX pixReadMem(BytePointer data, long size);
304
305
/**
306
* Write image to memory
307
* @param pdata Output data buffer pointer
308
* @param psize Output buffer size
309
* @param pix Image to write
310
* @param format Output format
311
* @return 0 on success, 1 on error
312
*/
313
public static native int pixWriteMem(PointerPointer pdata, SizeTPointer psize,
314
PIX pix, int format);
315
316
/**
317
* Display image (X11 or other display)
318
* @param pix Image to display
319
* @param x X position
320
* @param y Y position
321
* @return 0 on success, 1 on error
322
*/
323
public static native int pixDisplay(PIX pix, int x, int y);
324
}
325
326
/**
327
* Image enhancement and preprocessing
328
*/
329
public static class LeptonicaEnhancement {
330
/**
331
* Convert to grayscale
332
* @param pixs Source image
333
* @return Grayscale image
334
*/
335
public static native PIX pixConvertTo8(PIX pixs);
336
337
/**
338
* Scale image
339
* @param pixs Source image
340
* @param scalex X scale factor
341
* @param scaley Y scale factor
342
* @return Scaled image
343
*/
344
public static native PIX pixScale(PIX pixs, float scalex, float scaley);
345
346
/**
347
* Rotate image
348
* @param pixs Source image
349
* @param angle Rotation angle in radians
350
* @param type Rotation type (L_ROTATE_AREA_MAP, etc.)
351
* @param incolor Fill color for background
352
* @param width Output width (0 for auto)
353
* @param height Output height (0 for auto)
354
* @return Rotated image
355
*/
356
public static native PIX pixRotate(PIX pixs, float angle, int type, int incolor,
357
int width, int height);
358
359
/**
360
* Deskew image (correct skew angle)
361
* @param pixs Source image
362
* @param redsearch Reduction factor for search
363
* @return Deskewed image
364
*/
365
public static native PIX pixDeskew(PIX pixs, int redsearch);
366
367
/**
368
* Unsharp mask filter for sharpening
369
* @param pixs Source image
370
* @param halfwidth Half-width of convolution kernel
371
* @param fract Fraction for mixing
372
* @return Sharpened image
373
*/
374
public static native PIX pixUnsharpMasking(PIX pixs, int halfwidth, float fract);
375
376
/**
377
* Otsu thresholding for binarization
378
* @param pixs Source grayscale image
379
* @param sx Tile width for adaptive threshold
380
* @param sy Tile height for adaptive threshold
381
* @param smoothx Smoothing width
382
* @param smoothy Smoothing height
383
* @param scorefract Fraction of max score
384
* @param pthresh Output threshold value
385
* @return Binary image
386
*/
387
public static native PIX pixOtsuAdaptiveThreshold(PIX pixs, int sx, int sy,
388
int smoothx, int smoothy, float scorefract, IntPointer pthresh);
389
390
/**
391
* Remove noise using morphological operations
392
* @param pixs Source binary image
393
* @param removal Type of removal (L_REMOVE_SMALL_CC, etc.)
394
* @param minsize Minimum component size to keep
395
* @param connectivity Connectivity (4 or 8)
396
* @return Denoised image
397
*/
398
public static native PIX pixRemoveNoise(PIX pixs, int removal, int minsize, int connectivity);
399
}
400
401
/**
402
* Morphological operations
403
*/
404
public static class LeptonicaMorphology {
405
/**
406
* Morphological erosion
407
* @param pixs Source image
408
* @param sel Structuring element
409
* @return Eroded image
410
*/
411
public static native PIX pixErode(PIX pixs, SEL sel);
412
413
/**
414
* Morphological dilation
415
* @param pixs Source image
416
* @param sel Structuring element
417
* @return Dilated image
418
*/
419
public static native PIX pixDilate(PIX pixs, SEL sel);
420
421
/**
422
* Morphological opening (erosion followed by dilation)
423
* @param pixs Source image
424
* @param sel Structuring element
425
* @return Opened image
426
*/
427
public static native PIX pixOpen(PIX pixs, SEL sel);
428
429
/**
430
* Morphological closing (dilation followed by erosion)
431
* @param pixs Source image
432
* @param sel Structuring element
433
* @return Closed image
434
*/
435
public static native PIX pixClose(PIX pixs, SEL sel);
436
}
437
```
438
439
### Text Tokenization
440
441
SentencePiece library for neural text processing and tokenization.
442
443
```java { .api }
444
/**
445
* SentencePiece processor for text tokenization
446
*/
447
public class SentencePieceProcessor extends Pointer {
448
/**
449
* Create SentencePiece processor
450
*/
451
public SentencePieceProcessor();
452
453
/**
454
* Load model from file
455
* @param filename Path to SentencePiece model file
456
* @return Status object indicating success/failure
457
*/
458
public native Status Load(String filename);
459
460
/**
461
* Load model from serialized data
462
* @param serialized_model_proto Serialized model data
463
* @return Status object
464
*/
465
public native Status LoadFromSerializedProto(String serialized_model_proto);
466
467
/**
468
* Encode text to pieces
469
* @param input Input text
470
* @param pieces Output token pieces
471
* @return Status object
472
*/
473
public native Status Encode(String input, StringVector pieces);
474
475
/**
476
* Encode text to IDs
477
* @param input Input text
478
* @param ids Output token IDs
479
* @return Status object
480
*/
481
public native Status Encode(String input, IntVector ids);
482
483
/**
484
* Decode pieces to text
485
* @param pieces Input token pieces
486
* @param output Output text
487
* @return Status object
488
*/
489
public native Status Decode(StringVector pieces, StringPointer output);
490
491
/**
492
* Decode IDs to text
493
* @param ids Input token IDs
494
* @param output Output text
495
* @return Status object
496
*/
497
public native Status Decode(IntVector ids, StringPointer output);
498
499
/**
500
* Sample encode with multiple possible segmentations
501
* @param input Input text
502
* @param nbest_size Number of best segmentations
503
* @param alpha Smoothing parameter
504
* @param pieces Output token pieces
505
* @return Status object
506
*/
507
public native Status SampleEncode(String input, int nbest_size, float alpha,
508
StringVector pieces);
509
510
/**
511
* Get vocabulary size
512
* @return Vocabulary size
513
*/
514
public native int GetPieceSize();
515
516
/**
517
* Get piece from ID
518
* @param id Token ID
519
* @return Token piece string
520
*/
521
public native String IdToPiece(int id);
522
523
/**
524
* Get ID from piece
525
* @param piece Token piece string
526
* @return Token ID
527
*/
528
public native int PieceToId(String piece);
529
530
/**
531
* Check if token is unknown
532
* @param id Token ID
533
* @return true if unknown token
534
*/
535
public native boolean IsUnknown(int id);
536
537
/**
538
* Check if token is control symbol
539
* @param id Token ID
540
* @return true if control symbol
541
*/
542
public native boolean IsControl(int id);
543
544
/**
545
* Set encoding extra options
546
* @param extra_option Extra options string
547
* @return Status object
548
*/
549
public native Status SetEncodeExtraOptions(String extra_option);
550
551
/**
552
* Set decoding extra options
553
* @param extra_option Extra options string
554
* @return Status object
555
*/
556
public native Status SetDecodeExtraOptions(String extra_option);
557
}
558
559
/**
560
* Status object for operation results
561
*/
562
public class Status extends Pointer {
563
/**
564
* Check if operation was successful
565
* @return true if successful
566
*/
567
public native boolean ok();
568
569
/**
570
* Get error code
571
* @return Error code
572
*/
573
public native int code();
574
575
/**
576
* Get error message
577
* @return Error message string
578
*/
579
public native String error_message();
580
581
/**
582
* Convert to string representation
583
* @return Status string
584
*/
585
public native String ToString();
586
}
587
588
/**
589
* SentencePiece trainer for creating custom models
590
*/
591
public static class SentencePieceTrainer {
592
/**
593
* Train SentencePiece model
594
* @param args Training arguments
595
* @return Status object
596
*/
597
public static native Status Train(String args);
598
599
/**
600
* Train from arguments map
601
* @param kwargs Training arguments as key-value pairs
602
* @return Status object
603
*/
604
public static native Status Train(StringStringMap kwargs);
605
}
606
```
607
608
## Usage Examples
609
610
### Basic OCR with Tesseract
611
612
```java
613
import org.bytedeco.tesseract.*;
614
import org.bytedeco.leptonica.*;
615
import static org.bytedeco.tesseract.global.tesseract.*;
616
import static org.bytedeco.leptonica.global.leptonica.*;
617
618
public class TesseractOCR {
619
static {
620
Loader.load(tesseract.class);
621
Loader.load(leptonica.class);
622
}
623
624
public static void basicOCR(String imagePath) {
625
try (PointerScope scope = new PointerScope()) {
626
// Initialize Tesseract API
627
TessBaseAPI api = new TessBaseAPI();
628
629
// Initialize with English language
630
// Note: tessdata directory must be available
631
if (!api.Init(null, "eng")) {
632
System.err.println("Could not initialize Tesseract API");
633
return;
634
}
635
636
// Load image using Leptonica
637
PIX image = pixRead(imagePath);
638
if (image == null) {
639
System.err.println("Could not load image: " + imagePath);
640
api.End();
641
return;
642
}
643
644
// Set image for OCR
645
api.SetImage(image);
646
647
// Get OCR result
648
String ocrResult = api.GetUTF8Text();
649
int confidence = api.MeanTextConf();
650
651
System.out.println("OCR Result:");
652
System.out.println(ocrResult);
653
System.out.println("Mean confidence: " + confidence + "%");
654
655
// Cleanup
656
pixDestroy(image);
657
api.End();
658
}
659
}
660
661
public static void configuredOCR(String imagePath) {
662
try (PointerScope scope = new PointerScope()) {
663
TessBaseAPI api = new TessBaseAPI();
664
665
// Initialize with specific configurations
666
if (!api.Init(null, "eng")) {
667
System.err.println("Could not initialize Tesseract");
668
return;
669
}
670
671
// Configure OCR parameters
672
api.SetVariable("tessedit_char_whitelist", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ");
673
api.SetPageSegMode(PSM_SINGLE_BLOCK);
674
675
PIX image = pixRead(imagePath);
676
api.SetImage(image);
677
678
// Set recognition area (optional)
679
api.SetRectangle(50, 50, 400, 200);
680
681
String text = api.GetUTF8Text();
682
System.out.println("Configured OCR Result: " + text);
683
684
pixDestroy(image);
685
api.End();
686
}
687
}
688
689
public static void detailedOCR(String imagePath) {
690
try (PointerScope scope = new PointerScope()) {
691
TessBaseAPI api = new TessBaseAPI();
692
api.Init(null, "eng");
693
694
PIX image = pixRead(imagePath);
695
api.SetImage(image);
696
697
// Get detailed results with iterator
698
ResultIterator ri = api.GetIterator();
699
if (ri != null) {
700
int level = RIL_WORD; // Word level
701
702
do {
703
String word = ri.GetUTF8Text(level);
704
float conf = ri.Confidence(level);
705
706
// Get bounding box
707
IntPointer left = new IntPointer(1);
708
IntPointer top = new IntPointer(1);
709
IntPointer right = new IntPointer(1);
710
IntPointer bottom = new IntPointer(1);
711
712
if (ri.BoundingBox(level, left, top, right, bottom)) {
713
System.out.printf("Word: '%s' (conf: %.2f) at (%d,%d)-(%d,%d)\n",
714
word, conf, left.get(), top.get(), right.get(), bottom.get());
715
}
716
717
} while (ri.Next(level));
718
}
719
720
pixDestroy(image);
721
api.End();
722
}
723
}
724
}
725
```
726
727
### Image Preprocessing with Leptonica
728
729
```java
730
import org.bytedeco.leptonica.*;
731
import static org.bytedeco.leptonica.global.leptonica.*;
732
733
public class ImagePreprocessing {
734
static {
735
Loader.load(leptonica.class);
736
}
737
738
public static void preprocessForOCR(String inputPath, String outputPath) {
739
try (PointerScope scope = new PointerScope()) {
740
// Load image
741
PIX original = pixRead(inputPath);
742
if (original == null) {
743
System.err.println("Could not load image");
744
return;
745
}
746
747
System.out.printf("Original image: %dx%d, depth: %d\n",
748
original.getWidth(), original.getHeight(), original.getDepth());
749
750
// Convert to 8-bit grayscale
751
PIX gray = pixConvertTo8(original);
752
753
// Scale up if image is small (improves OCR accuracy)
754
PIX scaled = gray;
755
if (gray.getWidth() < 300 || gray.getHeight() < 300) {
756
float scale = Math.max(300.0f / gray.getWidth(), 300.0f / gray.getHeight());
757
scaled = pixScale(gray, scale, scale);
758
pixDestroy(gray);
759
}
760
761
// Deskew the image
762
PIX deskewed = pixDeskew(scaled, 2);
763
if (deskewed != null) {
764
pixDestroy(scaled);
765
scaled = deskewed;
766
}
767
768
// Unsharp masking for better text definition
769
PIX sharpened = pixUnsharpMasking(scaled, 5, 0.3f);
770
771
// Adaptive binarization using Otsu
772
IntPointer threshold = new IntPointer(1);
773
PIX binary = pixOtsuAdaptiveThreshold(sharpened, 32, 32, 0, 0, 0.1f, threshold);
774
775
System.out.println("Adaptive threshold: " + threshold.get());
776
777
// Remove small noise components
778
PIX denoised = pixRemoveNoise(binary, L_REMOVE_SMALL_CC, 3, 8);
779
780
// Save preprocessed image
781
pixWrite(outputPath, denoised, IFF_PNG);
782
783
System.out.printf("Preprocessed image saved: %dx%d\n",
784
denoised.getWidth(), denoised.getHeight());
785
786
// Cleanup
787
pixDestroy(original);
788
pixDestroy(sharpened);
789
pixDestroy(binary);
790
pixDestroy(denoised);
791
}
792
}
793
794
public static void morphologicalOperations(String imagePath) {
795
try (PointerScope scope = new PointerScope()) {
796
PIX original = pixRead(imagePath);
797
PIX binary = pixConvertTo1(original, 128); // Convert to binary
798
799
// Create structuring elements
800
SEL sel3x3 = selCreateBrick(3, 3, 1, 1, SEL_HIT);
801
SEL sel5x1 = selCreateBrick(5, 1, 2, 0, SEL_HIT);
802
803
// Morphological operations
804
PIX eroded = pixErode(binary, sel3x3);
805
PIX dilated = pixDilate(binary, sel3x3);
806
PIX opened = pixOpen(binary, sel3x3);
807
PIX closed = pixClose(binary, sel3x3);
808
809
// Horizontal line detection
810
PIX horizontal = pixOpen(binary, sel5x1);
811
812
// Save results
813
pixWrite("eroded.png", eroded, IFF_PNG);
814
pixWrite("dilated.png", dilated, IFF_PNG);
815
pixWrite("opened.png", opened, IFF_PNG);
816
pixWrite("closed.png", closed, IFF_PNG);
817
pixWrite("horizontal.png", horizontal, IFF_PNG);
818
819
// Cleanup
820
pixDestroy(original);
821
pixDestroy(binary);
822
pixDestroy(eroded);
823
pixDestroy(dilated);
824
pixDestroy(opened);
825
pixDestroy(closed);
826
pixDestroy(horizontal);
827
selDestroy(sel3x3);
828
selDestroy(sel5x1);
829
}
830
}
831
}
832
```
833
834
### Text Tokenization with SentencePiece
835
836
```java
837
import org.bytedeco.sentencepiece.*;
838
import static org.bytedeco.sentencepiece.global.sentencepiece.*;
839
840
public class TextTokenization {
841
static {
842
Loader.load(sentencepiece.class);
843
}
844
845
public static void basicTokenization(String modelPath) {
846
try (PointerScope scope = new PointerScope()) {
847
// Create processor
848
SentencePieceProcessor processor = new SentencePieceProcessor();
849
850
// Load pre-trained model
851
Status status = processor.Load(modelPath);
852
if (!status.ok()) {
853
System.err.println("Failed to load model: " + status.error_message());
854
return;
855
}
856
857
String text = "This is a sample text for tokenization.";
858
859
// Encode to pieces
860
StringVector pieces = new StringVector();
861
status = processor.Encode(text, pieces);
862
863
if (status.ok()) {
864
System.out.println("Input text: " + text);
865
System.out.print("Pieces: ");
866
for (int i = 0; i < pieces.size(); i++) {
867
System.out.print("'" + pieces.get(i).getString() + "' ");
868
}
869
System.out.println();
870
}
871
872
// Encode to IDs
873
IntVector ids = new IntVector();
874
status = processor.Encode(text, ids);
875
876
if (status.ok()) {
877
System.out.print("IDs: ");
878
for (int i = 0; i < ids.size(); i++) {
879
System.out.print(ids.get(i) + " ");
880
}
881
System.out.println();
882
}
883
884
// Decode back to text
885
StringPointer decoded = new StringPointer();
886
status = processor.Decode(pieces, decoded);
887
888
if (status.ok()) {
889
System.out.println("Decoded: " + decoded.getString());
890
}
891
892
// Vocabulary info
893
System.out.println("Vocabulary size: " + processor.GetPieceSize());
894
System.out.println("First 10 pieces:");
895
for (int i = 0; i < Math.min(10, processor.GetPieceSize()); i++) {
896
System.out.println(" " + i + ": '" + processor.IdToPiece(i) + "'");
897
}
898
}
899
}
900
901
public static void samplingTokenization(String modelPath) {
902
try (PointerScope scope = new PointerScope()) {
903
SentencePieceProcessor processor = new SentencePieceProcessor();
904
processor.Load(modelPath);
905
906
String text = "Neural machine translation with attention mechanism.";
907
908
// Sample multiple segmentations
909
System.out.println("Input: " + text);
910
System.out.println("Sample segmentations:");
911
912
for (int i = 0; i < 5; i++) {
913
StringVector pieces = new StringVector();
914
Status status = processor.SampleEncode(text, -1, 0.1f, pieces);
915
916
if (status.ok()) {
917
System.out.print("Sample " + (i+1) + ": ");
918
for (int j = 0; j < pieces.size(); j++) {
919
System.out.print("'" + pieces.get(j).getString() + "' ");
920
}
921
System.out.println();
922
}
923
}
924
}
925
}
926
927
public static void trainCustomModel() {
928
try (PointerScope scope = new PointerScope()) {
929
// Training arguments
930
String args = "--input=training_data.txt " +
931
"--model_prefix=custom_model " +
932
"--vocab_size=8000 " +
933
"--character_coverage=0.9995 " +
934
"--model_type=bpe";
935
936
Status status = SentencePieceTrainer.Train(args);
937
938
if (status.ok()) {
939
System.out.println("Model training completed successfully!");
940
System.out.println("Model files: custom_model.model, custom_model.vocab");
941
} else {
942
System.err.println("Training failed: " + status.error_message());
943
}
944
}
945
}
946
}
947
```
948
949
### Complete OCR Pipeline
950
951
```java
952
import org.bytedeco.tesseract.*;
953
import org.bytedeco.leptonica.*;
954
import org.bytedeco.sentencepiece.*;
955
956
public class OCRPipeline {
957
public static void processDocument(String imagePath, String modelPath) {
958
try (PointerScope scope = new PointerScope()) {
959
// Step 1: Preprocess image
960
PIX original = pixRead(imagePath);
961
PIX gray = pixConvertTo8(original);
962
PIX deskewed = pixDeskew(gray, 2);
963
PIX sharpened = pixUnsharpMasking(deskewed != null ? deskewed : gray, 5, 0.3f);
964
965
IntPointer threshold = new IntPointer(1);
966
PIX binary = pixOtsuAdaptiveThreshold(sharpened, 32, 32, 0, 0, 0.1f, threshold);
967
PIX denoised = pixRemoveNoise(binary, L_REMOVE_SMALL_CC, 3, 8);
968
969
// Step 2: OCR with Tesseract
970
TessBaseAPI api = new TessBaseAPI();
971
api.Init(null, "eng");
972
api.SetImage(denoised);
973
974
String rawText = api.GetUTF8Text();
975
int confidence = api.MeanTextConf();
976
977
System.out.println("OCR Confidence: " + confidence + "%");
978
System.out.println("Raw OCR Text:\n" + rawText);
979
980
// Step 3: Post-process with SentencePiece (if model available)
981
if (modelPath != null) {
982
SentencePieceProcessor processor = new SentencePieceProcessor();
983
Status status = processor.Load(modelPath);
984
985
if (status.ok()) {
986
StringVector pieces = new StringVector();
987
processor.Encode(rawText, pieces);
988
989
System.out.println("\nTokenized into " + pieces.size() + " pieces:");
990
for (int i = 0; i < Math.min(pieces.size(), 20); i++) {
991
System.out.print("'" + pieces.get(i).getString() + "' ");
992
}
993
System.out.println();
994
}
995
}
996
997
// Cleanup
998
pixDestroy(original);
999
pixDestroy(gray);
1000
if (deskewed != null) pixDestroy(deskewed);
1001
pixDestroy(sharpened);
1002
pixDestroy(binary);
1003
pixDestroy(denoised);
1004
api.End();
1005
}
1006
}
1007
}
1008
```