or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

computer-vision.mdgpu-computing.mdindex.mdmachine-learning.mdmultimedia.mdscientific-computing.mdtext-processing.md

text-processing.mddocs/

0

# Text Processing

1

2

Text recognition, natural language processing, and document analysis capabilities through Tesseract OCR, Leptonica image processing, and SentencePiece tokenization.

3

4

## Capabilities

5

6

### Optical Character Recognition (OCR)

7

8

Tesseract OCR engine for extracting text from images and documents.

9

10

```java { .api }

11

/**

12

* Tesseract OCR API base class

13

*/

14

public class TessBaseAPI extends Pointer {

15

/**

16

* Create Tesseract API instance

17

*/

18

public TessBaseAPI();

19

20

/**

21

* Initialize Tesseract with language and data path

22

* @param datapath Path to tessdata directory

23

* @param language Language code (e.g., "eng", "spa", "fra")

24

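* @return true if initialization successful
*         (NOTE(review): the native TessBaseAPI::Init returns 0 on success and -1 on failure; confirm how the binding maps that before relying on a boolean test)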

25

*/

26

public native boolean Init(String datapath, String language);

27

28

/**

29

* Initialize with language, OCR engine mode, and config variables

30

* @param datapath Path to tessdata directory

31

* @param language Language code

32

* @param mode OCR Engine Mode (OEM_TESSERACT_ONLY, OEM_LSTM_ONLY, etc.)

33

* @param configs Config files to load

34

* @param configs_size Number of config files

35

* @param vars_vec Variable names to set

36

* @param vars_values Variable values to set

37

* @param vars_vec_size Number of variables

38

* @param set_only_non_debug_params Only set non-debug parameters

39

* @return true if initialization successful

40

*/

41

public native boolean Init(String datapath, String language, int mode,

42

PointerPointer configs, int configs_size, StringVector vars_vec,

43

StringVector vars_values, long vars_vec_size, boolean set_only_non_debug_params);

44

45

/**

46

* Set image from memory buffer

47

* @param imagedata Image data buffer

48

* @param width Image width in pixels

49

* @param height Image height in pixels

50

* @param bytes_per_pixel Bytes per pixel (1, 3, or 4)

51

* @param bytes_per_line Bytes per line (width * bytes_per_pixel if no padding)

52

*/

53

public native void SetImage(BytePointer imagedata, int width, int height,

54

int bytes_per_pixel, int bytes_per_line);

55

56

/**

57

* Set image from PIX (Leptonica image format)

58

* @param pix Leptonica PIX image

59

*/

60

public native void SetImage(PIX pix);

61

62

/**

63

* Get recognized text as UTF-8 string

64

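* @return Recognized text for the current image or rectangle (the C++ delete[] rule applies to the native buffer, not to a Java String; TODO confirm how the binding releases it)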

65

*/

66

public native String GetUTF8Text();

67

68

/**

69

* Get recognition confidence (0-100)

70

* @return Mean confidence value

71

*/

72

public native int MeanTextConf();

73

74

/**

75

* Get word-level recognition results

76

* @return Array of word confidence values (0-100), terminated by -1

77

*/

78

public native IntPointer AllWordConfidences();

79

80

/**

81

* Set variable value

82

* @param name Variable name

83

* @param value Variable value

84

* @return true if variable was set

85

*/

86

public native boolean SetVariable(String name, String value);

87

88

/**

89

* Get variable value

90

* @param name Variable name

91

* @return Variable value or null if not found

92

*/

93

public native String GetStringVariable(String name);

94

95

/**

96

* Set page segmentation mode

97

* @param mode Page segmentation mode

98

*/

99

public native void SetPageSegMode(int mode);

100

101

/**

102

* Get current page segmentation mode

103

* @return Current PSM

104

*/

105

public native int GetPageSegMode();

106

107

/**

108

* Set rectangle to restrict recognition area

109

* @param left Left boundary

110

* @param top Top boundary

111

* @param width Rectangle width

112

* @param height Rectangle height

113

*/

114

public native void SetRectangle(int left, int top, int width, int height);

115

116

/**

117

* Clear recognition results and free memory

118

*/

119

public native void Clear();

120

121

/**

122

* End recognition and free resources

123

*/

124

public native void End();

125

}

126

127

/**

128

* Result iterator for detailed OCR results

129

*/

130

public class ResultIterator extends Pointer {

131

/**

132

* Get text at current position

133

* @param level Text level (word, line, paragraph, block)

134

* @return Text string

135

*/

136

public native String GetUTF8Text(int level);

137

138

/**

139

* Get confidence at current position

140

* @param level Text level

141

* @return Confidence value (0-100)

142

*/

143

public native float Confidence(int level);

144

145

/**

146

* Get bounding box at current position

147

* @param level Text level

148

* @param left Output left coordinate

149

* @param top Output top coordinate

150

* @param right Output right coordinate

151

* @param bottom Output bottom coordinate

152

* @return true if bounding box available

153

*/

154

public native boolean BoundingBox(int level, IntPointer left, IntPointer top,

155

IntPointer right, IntPointer bottom);

156

157

/**

158

* Move to next element at specified level

159

* @param level Text level

160

* @return true if moved successfully

161

*/

162

public native boolean Next(int level);

163

164

/**

165

* Check if iterator is at beginning of element

166

* @param level Text level

167

* @return true if at beginning

168

*/

169

public native boolean IsAtBeginningOf(int level);

170

171

/**

172

* Check if iterator is at final element

173

* @param level Text level

174

* @param element Element type

175

* @return true if at final element

176

*/

177

public native boolean IsAtFinalElement(int level, int element);

178

}

179

180

/**

181

* Page segmentation modes

182

*/

183

public static final int PSM_OSD_ONLY = 0; // Orientation and script detection only

184

public static final int PSM_AUTO_OSD = 1; // Automatic page segmentation with OSD

185

public static final int PSM_AUTO_ONLY = 2; // Automatic page segmentation without OSD

186

public static final int PSM_AUTO = 3; // Fully automatic page segmentation, but no OSD (default of the tesseract command line; the API itself defaults to PSM_SINGLE_BLOCK)

187

public static final int PSM_SINGLE_COLUMN = 4; // Single uniform column of text

188

public static final int PSM_SINGLE_BLOCK_VERT_TEXT = 5; // Single uniform block of vertically aligned text

189

public static final int PSM_SINGLE_BLOCK = 6; // Single uniform block of text

190

public static final int PSM_SINGLE_LINE = 7; // Single text line

191

public static final int PSM_SINGLE_WORD = 8; // Single word

192

public static final int PSM_CIRCLE_WORD = 9; // Single word in a circle

193

public static final int PSM_SINGLE_CHAR = 10; // Single character

194

public static final int PSM_SPARSE_TEXT = 11; // Sparse text (find as much text as possible)

195

public static final int PSM_SPARSE_TEXT_OSD = 12; // Sparse text with orientation and script detection

196

public static final int PSM_RAW_LINE = 13; // Treat the image as a single text line, bypassing Tesseract-specific layout hacks

197

198

/**

199

* OCR Engine modes

200

*/

201

public static final int OEM_TESSERACT_ONLY = 0; // Legacy Tesseract engine only

202

public static final int OEM_LSTM_ONLY = 1; // Neural nets LSTM engine only

203

public static final int OEM_TESSERACT_LSTM_COMBINED = 2; // Both engines combined

204

public static final int OEM_DEFAULT = 3; // Default (whatever is available)

205

```

206

207

### Image Processing for OCR

208

209

Leptonica library providing image processing operations optimized for document analysis and OCR preprocessing.

210

211

```java { .api }

212

/**

213

* PIX - Leptonica image structure

214

*/

215

public class PIX extends Pointer {

216

/**

217

* Get image width

218

* @return Image width in pixels

219

*/

220

public native int getWidth();

221

222

/**

223

* Get image height

224

* @return Image height in pixels

225

*/

226

public native int getHeight();

227

228

/**

229

* Get image depth (bits per pixel)

230

* @return Image depth

231

*/

232

public native int getDepth();

233

234

/**

235

* Get image data pointer

236

* @return Pointer to image data

237

*/

238

public native IntPointer getData();

239

240

/**

241

* Get words per line

242

* @return Words per line

243

*/

244

public native int getWpl();

245

246

/**

247

* Get input format

248

* @return Input file format

249

*/

250

public native int getInputFormat();

251

252

/**

253

* Get X resolution (DPI)

254

* @return X resolution

255

*/

256

public native int getXRes();

257

258

/**

259

* Get Y resolution (DPI)

260

* @return Y resolution

261

*/

262

public native int getYRes();

263

264

/**

265

* Clone PIX image

266

* @return Cloned image

267

*/

268

public native PIX pixClone();

269

270

/**

271

* Copy PIX image

272

* @return Copied image

273

*/

274

public native PIX pixCopy();

275

}

276

277

/**

278

* Image I/O operations

279

*/

280

public static class LeptonicaIO {

281

/**

282

* Read image from file

283

* @param filename Image file path

284

* @return PIX image or null on error

285

*/

286

public static native PIX pixRead(String filename);

287

288

/**

289

* Write image to file

290

* @param filename Output file path

291

* @param pix Image to write

292

* @param format Output format (IFF_PNG, IFF_JPEG, etc.)

293

* @return 0 on success, 1 on error

294

*/

295

public static native int pixWrite(String filename, PIX pix, int format);

296

297

/**

298

* Read image from memory

299

* @param data Image data buffer

300

* @param size Buffer size

301

* @return PIX image or null on error

302

*/

303

public static native PIX pixReadMem(BytePointer data, long size);

304

305

/**

306

* Write image to memory

307

* @param pdata Output data buffer pointer

308

* @param psize Output buffer size

309

* @param pix Image to write

310

* @param format Output format

311

* @return 0 on success, 1 on error

312

*/

313

public static native int pixWriteMem(PointerPointer pdata, SizeTPointer psize,

314

PIX pix, int format);

315

316

/**

317

* Display image (X11 or other display)

318

* @param pix Image to display

319

* @param x X position

320

* @param y Y position

321

* @return 0 on success, 1 on error

322

*/

323

public static native int pixDisplay(PIX pix, int x, int y);

324

}

325

326

/**

327

* Image enhancement and preprocessing

328

*/

329

public static class LeptonicaEnhancement {

330

/**

331

* Convert to grayscale

332

* @param pixs Source image

333

* @return Grayscale image

334

*/

335

public static native PIX pixConvertTo8(PIX pixs);

336

337

/**

338

* Scale image

339

* @param pixs Source image

340

* @param scalex X scale factor

341

* @param scaley Y scale factor

342

* @return Scaled image

343

*/

344

public static native PIX pixScale(PIX pixs, float scalex, float scaley);

345

346

/**

347

* Rotate image

348

* @param pixs Source image

349

* @param angle Rotation angle in radians

350

* @param type Rotation type (L_ROTATE_AREA_MAP, etc.)

351

* @param incolor Fill color for background

352

* @param width Output width (0 for auto)

353

* @param height Output height (0 for auto)

354

* @return Rotated image

355

*/

356

public static native PIX pixRotate(PIX pixs, float angle, int type, int incolor,

357

int width, int height);

358

359

/**

360

* Deskew image (correct skew angle)

361

* @param pixs Source image

362

* @param redsearch Reduction factor for the binary search (1, 2 or 4; 0 selects the default)

363

* @return Deskewed image

364

*/

365

public static native PIX pixDeskew(PIX pixs, int redsearch);

366

367

/**

368

* Unsharp mask filter for sharpening

369

* @param pixs Source image

370

* @param halfwidth Half-width of convolution kernel

371

* @param fract Fraction for mixing

372

* @return Sharpened image

373

*/

374

public static native PIX pixUnsharpMasking(PIX pixs, int halfwidth, float fract);

375

376

/**

377

* Otsu thresholding for binarization

378

* @param pixs Source grayscale image

379

* @param sx Tile width for adaptive threshold

380

* @param sy Tile height for adaptive threshold

381

* @param smoothx Smoothing width

382

* @param smoothy Smoothing height

383

* @param scorefract Fraction of max score

384

* @param pthresh Output threshold value

385

* @return Binary image

386

*/

387

public static native PIX pixOtsuAdaptiveThreshold(PIX pixs, int sx, int sy,

388

int smoothx, int smoothy, float scorefract, IntPointer pthresh);

389

390

/**

391

* Remove noise using morphological operations

392

* @param pixs Source binary image

393

* @param removal Type of removal (L_REMOVE_SMALL_CC, etc.)

394

* @param minsize Minimum component size to keep

395

* @param connectivity Connectivity (4 or 8)

396

* @return Denoised image

397

*/

398

public static native PIX pixRemoveNoise(PIX pixs, int removal, int minsize, int connectivity);

399

}

400

401

/**

402

* Morphological operations

403

*/

404

public static class LeptonicaMorphology {

405

/**

406

* Morphological erosion

407

* @param pixs Source image

408

* @param sel Structuring element

409

* @return Eroded image

410

*/

411

public static native PIX pixErode(PIX pixs, SEL sel);

412

413

/**

414

* Morphological dilation

415

* @param pixs Source image

416

* @param sel Structuring element

417

* @return Dilated image

418

*/

419

public static native PIX pixDilate(PIX pixs, SEL sel);

420

421

/**

422

* Morphological opening (erosion followed by dilation)

423

* @param pixs Source image

424

* @param sel Structuring element

425

* @return Opened image

426

*/

427

public static native PIX pixOpen(PIX pixs, SEL sel);

428

429

/**

430

* Morphological closing (dilation followed by erosion)

431

* @param pixs Source image

432

* @param sel Structuring element

433

* @return Closed image

434

*/

435

public static native PIX pixClose(PIX pixs, SEL sel);

436

}

437

```

438

439

### Text Tokenization

440

441

SentencePiece library for neural text processing and tokenization.

442

443

```java { .api }

444

/**

445

* SentencePiece processor for text tokenization

446

*/

447

public class SentencePieceProcessor extends Pointer {

448

/**

449

* Create SentencePiece processor

450

*/

451

public SentencePieceProcessor();

452

453

/**

454

* Load model from file

455

* @param filename Path to SentencePiece model file

456

* @return Status object indicating success/failure

457

*/

458

public native Status Load(String filename);

459

460

/**

461

* Load model from serialized data

462

* @param serialized_model_proto Serialized model data

463

* @return Status object

464

*/

465

public native Status LoadFromSerializedProto(String serialized_model_proto);

466

467

/**

468

* Encode text to pieces

469

* @param input Input text

470

* @param pieces Output token pieces

471

* @return Status object

472

*/

473

public native Status Encode(String input, StringVector pieces);

474

475

/**

476

* Encode text to IDs

477

* @param input Input text

478

* @param ids Output token IDs

479

* @return Status object

480

*/

481

public native Status Encode(String input, IntVector ids);

482

483

/**

484

* Decode pieces to text

485

* @param pieces Input token pieces

486

* @param output Output text

487

* @return Status object

488

*/

489

public native Status Decode(StringVector pieces, StringPointer output);

490

491

/**

492

* Decode IDs to text

493

* @param ids Input token IDs

494

* @param output Output text

495

* @return Status object

496

*/

497

public native Status Decode(IntVector ids, StringPointer output);

498

499

/**

500

* Sample encode with multiple possible segmentations

501

* @param input Input text

502

* @param nbest_size Number of best segmentations to sample from (0 or 1 disables sampling; a negative value samples from all hypotheses)

503

* @param alpha Smoothing parameter for unigram sampling (dropout probability of merge operations for BPE models)

504

* @param pieces Output token pieces

505

* @return Status object

506

*/

507

public native Status SampleEncode(String input, int nbest_size, float alpha,

508

StringVector pieces);

509

510

/**

511

* Get vocabulary size

512

* @return Vocabulary size

513

*/

514

public native int GetPieceSize();

515

516

/**

517

* Get piece from ID

518

* @param id Token ID

519

* @return Token piece string

520

*/

521

public native String IdToPiece(int id);

522

523

/**

524

* Get ID from piece

525

* @param piece Token piece string

526

* @return Token ID

527

*/

528

public native int PieceToId(String piece);

529

530

/**

531

* Check if token is unknown

532

* @param id Token ID

533

* @return true if unknown token

534

*/

535

public native boolean IsUnknown(int id);

536

537

/**

538

* Check if token is control symbol

539

* @param id Token ID

540

* @return true if control symbol

541

*/

542

public native boolean IsControl(int id);

543

544

/**

545

* Set encoding extra options

546

* @param extra_option Extra options string

547

* @return Status object

548

*/

549

public native Status SetEncodeExtraOptions(String extra_option);

550

551

/**

552

* Set decoding extra options

553

* @param extra_option Extra options string

554

* @return Status object

555

*/

556

public native Status SetDecodeExtraOptions(String extra_option);

557

}

558

559

/**

560

* Status object for operation results

561

*/

562

public class Status extends Pointer {

563

/**

564

* Check if operation was successful

565

* @return true if successful

566

*/

567

public native boolean ok();

568

569

/**

570

* Get error code

571

* @return Error code

572

*/

573

public native int code();

574

575

/**

576

* Get error message

577

* @return Error message string

578

*/

579

public native String error_message();

580

581

/**

582

* Convert to string representation

583

* @return Status string

584

*/

585

public native String ToString();

586

}

587

588

/**

589

* SentencePiece trainer for creating custom models

590

*/

591

public static class SentencePieceTrainer {

592

/**

593

* Train SentencePiece model

594

* @param args Training arguments

595

* @return Status object

596

*/

597

public static native Status Train(String args);

598

599

/**

600

* Train from arguments map

601

* @param kwargs Training arguments as key-value pairs

602

* @return Status object

603

*/

604

public static native Status Train(StringStringMap kwargs);

605

}

606

```

607

608

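## Usage Examples

> **Note:** The snippets below also need `import org.bytedeco.javacpp.*;` (for `Loader`, `PointerScope`, and `IntPointer`). A static on-demand import such as `import static org.bytedeco.tesseract.global.tesseract.*;` does not import the class name itself, so `Loader.load(tesseract.class)` additionally requires `import org.bytedeco.tesseract.global.tesseract;` (likewise for `leptonica` and `sentencepiece`). The "Complete OCR Pipeline" snippet needs the same static `global` imports as the first example.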

609

610

### Basic OCR with Tesseract

611

612

```java

613

import org.bytedeco.tesseract.*;

614

import org.bytedeco.leptonica.*;

615

import static org.bytedeco.tesseract.global.tesseract.*;

616

import static org.bytedeco.leptonica.global.leptonica.*;

617

618

public class TesseractOCR {

619

static {

620

Loader.load(tesseract.class);

621

Loader.load(leptonica.class);

622

}

623

624

public static void basicOCR(String imagePath) {

625

try (PointerScope scope = new PointerScope()) {

626

// Initialize Tesseract API

627

TessBaseAPI api = new TessBaseAPI();

628

629

// Initialize with English language

630

// Note: tessdata directory must be available

631

if (!api.Init(null, "eng")) {

632

System.err.println("Could not initialize Tesseract API");

633

return;

634

}

635

636

// Load image using Leptonica

637

PIX image = pixRead(imagePath);

638

if (image == null) {

639

System.err.println("Could not load image: " + imagePath);

640

api.End();

641

return;

642

}

643

644

// Set image for OCR

645

api.SetImage(image);

646

647

// Get OCR result

648

String ocrResult = api.GetUTF8Text();

649

int confidence = api.MeanTextConf();

650

651

System.out.println("OCR Result:");

652

System.out.println(ocrResult);

653

System.out.println("Mean confidence: " + confidence + "%");

654

655

// Cleanup

656

pixDestroy(image);

657

api.End();

658

}

659

}

660

661

public static void configuredOCR(String imagePath) {

662

try (PointerScope scope = new PointerScope()) {

663

TessBaseAPI api = new TessBaseAPI();

664

665

// Initialize with specific configurations

666

if (!api.Init(null, "eng")) {

667

System.err.println("Could not initialize Tesseract");

668

return;

669

}

670

671

// Configure OCR parameters

672

api.SetVariable("tessedit_char_whitelist", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ");

673

api.SetPageSegMode(PSM_SINGLE_BLOCK);

674

675

PIX image = pixRead(imagePath);

676

api.SetImage(image);

677

678

// Set recognition area (optional)

679

api.SetRectangle(50, 50, 400, 200);

680

681

String text = api.GetUTF8Text();

682

System.out.println("Configured OCR Result: " + text);

683

684

pixDestroy(image);

685

api.End();

686

}

687

}

688

689

public static void detailedOCR(String imagePath) {

690

try (PointerScope scope = new PointerScope()) {

691

TessBaseAPI api = new TessBaseAPI();

692

api.Init(null, "eng");

693

694

PIX image = pixRead(imagePath);

695

api.SetImage(image);

696

697

// Get detailed results with iterator

698

ResultIterator ri = api.GetIterator();

699

if (ri != null) {

700

int level = RIL_WORD; // Word level

701

702

do {

703

String word = ri.GetUTF8Text(level);

704

float conf = ri.Confidence(level);

705

706

// Get bounding box

707

IntPointer left = new IntPointer(1);

708

IntPointer top = new IntPointer(1);

709

IntPointer right = new IntPointer(1);

710

IntPointer bottom = new IntPointer(1);

711

712

if (ri.BoundingBox(level, left, top, right, bottom)) {

713

System.out.printf("Word: '%s' (conf: %.2f) at (%d,%d)-(%d,%d)\n",

714

word, conf, left.get(), top.get(), right.get(), bottom.get());

715

}

716

717

} while (ri.Next(level));

718

}

719

720

pixDestroy(image);

721

api.End();

722

}

723

}

724

}

725

```

726

727

### Image Preprocessing with Leptonica

728

729

```java

730

import org.bytedeco.leptonica.*;

731

import static org.bytedeco.leptonica.global.leptonica.*;

732

733

public class ImagePreprocessing {

734

static {

735

Loader.load(leptonica.class);

736

}

737

738

public static void preprocessForOCR(String inputPath, String outputPath) {

739

try (PointerScope scope = new PointerScope()) {

740

// Load image

741

PIX original = pixRead(inputPath);

742

if (original == null) {

743

System.err.println("Could not load image");

744

return;

745

}

746

747

System.out.printf("Original image: %dx%d, depth: %d\n",

748

original.getWidth(), original.getHeight(), original.getDepth());

749

750

// Convert to 8-bit grayscale

751

PIX gray = pixConvertTo8(original);

752

753

// Scale up if image is small (improves OCR accuracy)

754

PIX scaled = gray;

755

if (gray.getWidth() < 300 || gray.getHeight() < 300) {

756

float scale = Math.max(300.0f / gray.getWidth(), 300.0f / gray.getHeight());

757

scaled = pixScale(gray, scale, scale);

758

pixDestroy(gray);

759

}

760

761

// Deskew the image

762

PIX deskewed = pixDeskew(scaled, 2);

763

if (deskewed != null) {

764

pixDestroy(scaled);

765

scaled = deskewed;

766

}

767

768

// Unsharp masking for better text definition

769

PIX sharpened = pixUnsharpMasking(scaled, 5, 0.3f);

770

771

// Adaptive binarization using Otsu

772

IntPointer threshold = new IntPointer(1);

773

PIX binary = pixOtsuAdaptiveThreshold(sharpened, 32, 32, 0, 0, 0.1f, threshold);

774

775

System.out.println("Adaptive threshold: " + threshold.get());

776

777

// Remove small noise components

778

PIX denoised = pixRemoveNoise(binary, L_REMOVE_SMALL_CC, 3, 8);

779

780

// Save preprocessed image

781

pixWrite(outputPath, denoised, IFF_PNG);

782

783

System.out.printf("Preprocessed image saved: %dx%d\n",

784

denoised.getWidth(), denoised.getHeight());

785

786

// Cleanup

787

pixDestroy(original);

788

pixDestroy(sharpened);

789

pixDestroy(binary);

790

pixDestroy(denoised);

791

}

792

}

793

794

public static void morphologicalOperations(String imagePath) {

795

try (PointerScope scope = new PointerScope()) {

796

PIX original = pixRead(imagePath);

797

PIX binary = pixConvertTo1(original, 128); // Convert to binary

798

799

// Create structuring elements

800

SEL sel3x3 = selCreateBrick(3, 3, 1, 1, SEL_HIT);

801

SEL sel5x1 = selCreateBrick(5, 1, 2, 0, SEL_HIT);

802

803

// Morphological operations

804

PIX eroded = pixErode(binary, sel3x3);

805

PIX dilated = pixDilate(binary, sel3x3);

806

PIX opened = pixOpen(binary, sel3x3);

807

PIX closed = pixClose(binary, sel3x3);

808

809

// Horizontal line detection

810

PIX horizontal = pixOpen(binary, sel5x1);

811

812

// Save results

813

pixWrite("eroded.png", eroded, IFF_PNG);

814

pixWrite("dilated.png", dilated, IFF_PNG);

815

pixWrite("opened.png", opened, IFF_PNG);

816

pixWrite("closed.png", closed, IFF_PNG);

817

pixWrite("horizontal.png", horizontal, IFF_PNG);

818

819

// Cleanup

820

pixDestroy(original);

821

pixDestroy(binary);

822

pixDestroy(eroded);

823

pixDestroy(dilated);

824

pixDestroy(opened);

825

pixDestroy(closed);

826

pixDestroy(horizontal);

827

selDestroy(sel3x3);

828

selDestroy(sel5x1);

829

}

830

}

831

}

832

```

833

834

### Text Tokenization with SentencePiece

835

836

```java

837

import org.bytedeco.sentencepiece.*;

838

import static org.bytedeco.sentencepiece.global.sentencepiece.*;

839

840

public class TextTokenization {

841

static {

842

Loader.load(sentencepiece.class);

843

}

844

845

public static void basicTokenization(String modelPath) {

846

try (PointerScope scope = new PointerScope()) {

847

// Create processor

848

SentencePieceProcessor processor = new SentencePieceProcessor();

849

850

// Load pre-trained model

851

Status status = processor.Load(modelPath);

852

if (!status.ok()) {

853

System.err.println("Failed to load model: " + status.error_message());

854

return;

855

}

856

857

String text = "This is a sample text for tokenization.";

858

859

// Encode to pieces

860

StringVector pieces = new StringVector();

861

status = processor.Encode(text, pieces);

862

863

if (status.ok()) {

864

System.out.println("Input text: " + text);

865

System.out.print("Pieces: ");

866

for (int i = 0; i < pieces.size(); i++) {

867

System.out.print("'" + pieces.get(i).getString() + "' ");

868

}

869

System.out.println();

870

}

871

872

// Encode to IDs

873

IntVector ids = new IntVector();

874

status = processor.Encode(text, ids);

875

876

if (status.ok()) {

877

System.out.print("IDs: ");

878

for (int i = 0; i < ids.size(); i++) {

879

System.out.print(ids.get(i) + " ");

880

}

881

System.out.println();

882

}

883

884

// Decode back to text

885

StringPointer decoded = new StringPointer();

886

status = processor.Decode(pieces, decoded);

887

888

if (status.ok()) {

889

System.out.println("Decoded: " + decoded.getString());

890

}

891

892

// Vocabulary info

893

System.out.println("Vocabulary size: " + processor.GetPieceSize());

894

System.out.println("First 10 pieces:");

895

for (int i = 0; i < Math.min(10, processor.GetPieceSize()); i++) {

896

System.out.println(" " + i + ": '" + processor.IdToPiece(i) + "'");

897

}

898

}

899

}

900

901

public static void samplingTokenization(String modelPath) {

902

try (PointerScope scope = new PointerScope()) {

903

SentencePieceProcessor processor = new SentencePieceProcessor();

904

processor.Load(modelPath);

905

906

String text = "Neural machine translation with attention mechanism.";

907

908

// Sample multiple segmentations

909

System.out.println("Input: " + text);

910

System.out.println("Sample segmentations:");

911

912

for (int i = 0; i < 5; i++) {

913

StringVector pieces = new StringVector();

914

Status status = processor.SampleEncode(text, -1, 0.1f, pieces);

915

916

if (status.ok()) {

917

System.out.print("Sample " + (i+1) + ": ");

918

for (int j = 0; j < pieces.size(); j++) {

919

System.out.print("'" + pieces.get(j).getString() + "' ");

920

}

921

System.out.println();

922

}

923

}

924

}

925

}

926

927

public static void trainCustomModel() {

928

try (PointerScope scope = new PointerScope()) {

929

// Training arguments

930

String args = "--input=training_data.txt " +

931

"--model_prefix=custom_model " +

932

"--vocab_size=8000 " +

933

"--character_coverage=0.9995 " +

934

"--model_type=bpe";

935

936

Status status = SentencePieceTrainer.Train(args);

937

938

if (status.ok()) {

939

System.out.println("Model training completed successfully!");

940

System.out.println("Model files: custom_model.model, custom_model.vocab");

941

} else {

942

System.err.println("Training failed: " + status.error_message());

943

}

944

}

945

}

946

}

947

```

948

949

### Complete OCR Pipeline

950

951

```java

952

import org.bytedeco.tesseract.*;

953

import org.bytedeco.leptonica.*;

954

import org.bytedeco.sentencepiece.*;

955

956

public class OCRPipeline {

957

public static void processDocument(String imagePath, String modelPath) {

958

try (PointerScope scope = new PointerScope()) {

959

// Step 1: Preprocess image

960

PIX original = pixRead(imagePath);

961

PIX gray = pixConvertTo8(original);

962

PIX deskewed = pixDeskew(gray, 2);

963

PIX sharpened = pixUnsharpMasking(deskewed != null ? deskewed : gray, 5, 0.3f);

964

965

IntPointer threshold = new IntPointer(1);

966

PIX binary = pixOtsuAdaptiveThreshold(sharpened, 32, 32, 0, 0, 0.1f, threshold);

967

PIX denoised = pixRemoveNoise(binary, L_REMOVE_SMALL_CC, 3, 8);

968

969

// Step 2: OCR with Tesseract

970

TessBaseAPI api = new TessBaseAPI();

971

api.Init(null, "eng");

972

api.SetImage(denoised);

973

974

String rawText = api.GetUTF8Text();

975

int confidence = api.MeanTextConf();

976

977

System.out.println("OCR Confidence: " + confidence + "%");

978

System.out.println("Raw OCR Text:\n" + rawText);

979

980

// Step 3: Post-process with SentencePiece (if model available)

981

if (modelPath != null) {

982

SentencePieceProcessor processor = new SentencePieceProcessor();

983

Status status = processor.Load(modelPath);

984

985

if (status.ok()) {

986

StringVector pieces = new StringVector();

987

processor.Encode(rawText, pieces);

988

989

System.out.println("\nTokenized into " + pieces.size() + " pieces:");

990

for (int i = 0; i < Math.min(pieces.size(), 20); i++) {

991

System.out.print("'" + pieces.get(i).getString() + "' ");

992

}

993

System.out.println();

994

}

995

}

996

997

// Cleanup

998

pixDestroy(original);

999

pixDestroy(gray);

1000

if (deskewed != null) pixDestroy(deskewed);

1001

pixDestroy(sharpened);

1002

pixDestroy(binary);

1003

pixDestroy(denoised);

1004

api.End();

1005

}

1006

}

1007

}

1008

```