or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

attachments.md, cli-tools.md, document-management.md, image-bitmap.md, index.md, page-manipulation.md, page-objects.md, text-processing.md, transformation.md, version-info.md

text-processing.mddocs/

0

# Text Processing

1

2

Comprehensive text extraction and search capabilities with support for bounded text extraction, character-level positioning, full-text search, and detailed text analysis. The PdfTextPage class provides access to all text-related operations.

3

4

## Capabilities

5

6

### Text Extraction

7

8

Extract text content from PDF pages with various extraction modes and error handling options.

9

10

```python { .api }

11

class PdfTextPage:

12

def get_text_range(self, index=0, count=-1, errors="ignore", force_this=False) -> str:

13

"""

14

Extract text from a character range.

15

16

Parameters:

17

- index: int, starting character index (0-based)

18

- count: int, number of characters to extract (-1 for all remaining)

19

- errors: str, error handling mode ("ignore", "strict", "replace")

20

- force_this: bool, if True, always use the range-based extractor; otherwise a call with the default index/count may be delegated to get_text_bounded()

21

22

Returns:

23

str: Extracted text content

24

"""

25

26

def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore") -> str:

27

"""

28

Extract text within specified bounding rectangle.

29

30

Parameters:

31

- left: float, left boundary in PDF units (None = page left)

32

- bottom: float, bottom boundary in PDF units (None = page bottom)

33

- right: float, right boundary in PDF units (None = page right)

34

- top: float, top boundary in PDF units (None = page top)

35

- errors: str, error handling mode ("ignore", "strict", "replace")

36

37

Returns:

38

str: Text within the specified bounds

39

"""

40

```

41

42

Basic text extraction examples:

43

44

```python

45

import pypdfium2 as pdfium

46

47

pdf = pdfium.PdfDocument("document.pdf")

48

page = pdf[0]

49

textpage = page.get_textpage()

50

51

# Extract all text from page

52

full_text = textpage.get_text_range()

53

print(f"Full page text:\n{full_text}")

54

55

# Extract text from specific character range

56

partial_text = textpage.get_text_range(index=100, count=200)

57

print(f"Characters 100-299: {partial_text}")

58

59

# Extract text from bounded area (top-left quadrant)

60

width, height = page.get_size()

61

bounded_text = textpage.get_text_bounded(

62

left=0,

63

bottom=height/2,

64

right=width/2,

65

top=height

66

)

67

print(f"Top-left text: {bounded_text}")

68

69

# Extract text from middle column

70

column_text = textpage.get_text_bounded(

71

left=width/3,

72

right=2*width/3

73

)

74

print(f"Middle column: {column_text}")

75

```

76

77

### Character Information

78

79

Access detailed information about individual characters including position, bounding boxes, and character counts.

80

81

```python { .api }

82

def count_chars(self) -> int:

83

"""

84

Get total number of characters on the page.

85

86

Returns:

87

int: Character count including spaces and special characters

88

"""

89

90

def get_index(self, x: float, y: float, x_tol: float, y_tol: float) -> int:

91

"""

92

Get character index at specified coordinates.

93

94

Parameters:

95

- x: float, x-coordinate in PDF units

96

- y: float, y-coordinate in PDF units

97

- x_tol: float, x-axis tolerance

98

- y_tol: float, y-axis tolerance

99

100

Returns:

101

int | None: Character index at or near the position, or None if no character was found

102

"""

103

104

def get_charbox(self, index: int, loose=False) -> tuple:

105

"""

106

Get bounding box for character at index.

107

108

Parameters:

109

- index: int, character index

110

- loose: bool, use loose bounding box calculation

111

112

Returns:

113

tuple: (left, bottom, right, top) character bounds

114

"""

115

```

116

117

Character analysis examples:

118

119

```python

120

textpage = page.get_textpage()

121

122

# Get character count

123

char_count = textpage.count_chars()

124

print(f"Page has {char_count} characters")

125

126

# Find character at mouse click position

127

click_x, click_y = 300, 400 # Example coordinates

128

char_index = textpage.get_index(click_x, click_y, 5, 5)

129

if char_index is not None:

130

char_box = textpage.get_charbox(char_index)

131

print(f"Character at ({click_x}, {click_y}): index {char_index}")

132

print(f"Character bounds: {char_box}")

133

134

# Get the actual character

135

character = textpage.get_text_range(char_index, 1)

136

print(f"Character: '{character}'")

137

138

# Analyze character positions for first 100 characters

139

for i in range(min(100, char_count)):

140

char_box = textpage.get_charbox(i)

141

character = textpage.get_text_range(i, 1)

142

if character not in [' ', '\n', '\t']: # Skip whitespace

143

print(f"'{character}' at {char_box}")

144

```

145

146

### Text Rectangles

147

148

Access text rectangle information for layout analysis and text positioning.

149

150

```python { .api }

151

def count_rects(self, index=0, count=-1) -> int:

152

"""

153

Get number of text rectangles for character range.

154

155

Parameters:

156

- index: int, starting character index

157

- count: int, character count (-1 for all remaining)

158

159

Returns:

160

int: Number of rectangles covering the text range

161

"""

162

163

def get_rect(self, index: int) -> tuple:

164

"""

165

Get text rectangle coordinates by index.

166

167

Parameters:

168

- index: int, rectangle index

169

170

Returns:

171

tuple: (left, bottom, right, top) rectangle coordinates

172

"""

173

```

174

175

Rectangle analysis:

176

177

```python

178

textpage = page.get_textpage()

179

180

# Get rectangles for first 500 characters

181

rect_count = textpage.count_rects(0, 500)

182

print(f"First 500 characters span {rect_count} rectangles")

183

184

# Analyze text layout by examining rectangles

185

for i in range(rect_count):

186

rect = textpage.get_rect(i)

187

print(f"Rectangle {i}: {rect}")

188

189

# Calculate rectangle dimensions

190

left, bottom, right, top = rect

191

width = right - left

192

height = top - bottom

193

print(f" Size: {width:.1f} x {height:.1f}")

194

```

195

196

### Text Search

197

198

Perform text search operations with various matching options and result iteration.

199

200

```python { .api }

201

def search(self, text: str, index=0, match_case=False, match_whole_word=False, consecutive=False) -> PdfTextSearcher:

202

"""

203

Create text searcher for finding text matches.

204

205

Parameters:

206

- text: str, text to search for

207

- index: int, starting character index for search

208

- match_case: bool, perform case-sensitive search

209

- match_whole_word: bool, match complete words only

210

- consecutive: bool, if True, continue searching from within the current match instead of skipping past it (allows overlapping matches)

211

212

Returns:

213

PdfTextSearcher: Search object for iterating through matches

214

"""

215

```

216

217

#### PdfTextSearcher Class

218

219

Text search helper class for iterating through search matches on a text page.

220

221

```python { .api }

222

class PdfTextSearcher:

223

"""

224

Text searcher helper class for finding and iterating through text matches.

225

226

Created by PdfTextPage.search() to manage search state and provide

227

efficient iteration through search results. Supports both forward

228

and backward searching through matches.

229

230

Attributes:

231

- raw: FPDF_SCHHANDLE, underlying PDFium searcher handle

232

- textpage: PdfTextPage, reference to the textpage this searcher belongs to

233

"""

234

235

def __init__(self, raw, textpage):

236

"""

237

Initialize text searcher.

238

239

Parameters:

240

- raw: FPDF_SCHHANDLE, PDFium searcher handle

241

- textpage: PdfTextPage, parent textpage

242

243

Note: Typically created via PdfTextPage.search() rather than direct instantiation.

244

"""

245

246

def get_next(self) -> tuple[int, int] | None:

247

"""

248

Find next search match.

249

250

Returns:

251

tuple: (start_index, char_count) for the next match occurrence,

252

or None if no more matches are found

253

254

Advances the search position to the next occurrence of the search text.

255

The returned indices can be used with PdfTextPage.get_text_range() to

256

extract the matched text.

257

"""

258

259

def get_prev(self) -> tuple[int, int] | None:

260

"""

261

Find previous search match.

262

263

Returns:

264

tuple: (start_index, char_count) for the previous match occurrence,

265

or None if no previous matches exist

266

267

Moves the search position backward to the previous occurrence.

268

Useful for bidirectional search navigation.

269

"""

270

271

def close(self):

272

"""Close and clean up search resources."""

273

```

274

275

Text search examples:

276

277

```python

278

textpage = page.get_textpage()

279

280

# Search for specific text

281

searcher = textpage.search("important", match_case=False)

282

283

# Find all matches

284

matches = []

285

while True:

286

match = searcher.get_next()

287

if match is None:

288

break

289

matches.append(match)

290

291

print(f"Found {len(matches)} matches for 'important'")

292

293

# Process each match

294

for start_idx, char_count in matches:

295

# Get the matched text (for verification)

296

matched_text = textpage.get_text_range(start_idx, char_count)

297

298

# Get bounding boxes for highlight

299

match_boxes = []

300

for i in range(start_idx, start_idx + char_count):

301

char_box = textpage.get_charbox(i)

302

match_boxes.append(char_box)

303

304

print(f"Match: '{matched_text}' at chars {start_idx}-{start_idx+char_count}")

305

print(f"First char box: {match_boxes[0]}")

306

307

# Close the searcher when done

308

searcher.close()

309

310

# Case-sensitive search for exact matches

311

exact_searcher = textpage.search("PDF", match_case=True, match_whole_word=True)

312

exact_match = exact_searcher.get_next()

313

if exact_match:

314

start_idx, char_count = exact_match

315

print(f"Found exact 'PDF' match at position {start_idx}")

316

317

# Bidirectional search example

318

bidirectional_searcher = textpage.search("chapter")

319

320

# Find matches and navigate back and forth

321

forward_matches = []

322

match = bidirectional_searcher.get_next()

323

while match:

324

forward_matches.append(match)

325

match = bidirectional_searcher.get_next()

326

327

print(f"Found {len(forward_matches)} forward matches")

328

329

# Go backward through matches

330

backward_matches = []

331

match = bidirectional_searcher.get_prev()

332

while match:

333

backward_matches.append(match)

334

match = bidirectional_searcher.get_prev()

335

336

print(f"Found {len(backward_matches)} backward matches")

337

bidirectional_searcher.close()

338

```

339

340

Advanced search patterns:

341

342

```python

343

def search_and_highlight_text(textpage, search_terms):

344

"""Search for multiple terms and collect highlighting information."""

345

346

all_highlights = []

347

348

for term in search_terms:

349

print(f"\nSearching for '{term}':")

350

351

# Create searcher with appropriate options

352

searcher = textpage.search(

353

term,

354

match_case=False,

355

match_whole_word=True # Match complete words only

356

)

357

358

# Collect all matches for this term

359

term_matches = []

360

while True:

361

match = searcher.get_next()

362

if match is None:

363

break

364

365

start_idx, char_count = match

366

367

# Extract the matched text

368

matched_text = textpage.get_text_range(start_idx, char_count)

369

370

# Calculate bounding box for the entire match

371

char_boxes = []

372

for i in range(start_idx, start_idx + char_count):

373

char_box = textpage.get_charbox(i)

374

char_boxes.append(char_box)

375

376

# Create overall bounding box

377

if char_boxes:

378

all_lefts = [box[0] for box in char_boxes]

379

all_bottoms = [box[1] for box in char_boxes]

380

all_rights = [box[2] for box in char_boxes]

381

all_tops = [box[3] for box in char_boxes]

382

383

overall_box = (

384

min(all_lefts), min(all_bottoms),

385

max(all_rights), max(all_tops)

386

)

387

388

match_info = {

389

'term': term,

390

'text': matched_text,

391

'start_index': start_idx,

392

'char_count': char_count,

393

'bbox': overall_box

394

}

395

396

term_matches.append(match_info)

397

all_highlights.append(match_info)

398

399

print(f" Found {len(term_matches)} matches")

400

searcher.close()

401

402

return all_highlights

403

404

# Usage

405

search_terms = ["introduction", "conclusion", "figure", "table", "reference"]

406

textpage = page.get_textpage()

407

highlights = search_and_highlight_text(textpage, search_terms)

408

409

# Print highlight summary

410

print(f"\nTotal highlights: {len(highlights)}")

411

for highlight in highlights:

412

print(f"'{highlight['term']}' -> '{highlight['text']}' at {highlight['bbox']}")

413

```

414

415

### Advanced Text Analysis

416

417

Combine multiple text processing features for comprehensive text analysis.

418

419

```python

420

def analyze_page_text(page):

421

"""Comprehensive text analysis example."""

422

textpage = page.get_textpage()

423

424

# Basic statistics

425

char_count = textpage.count_chars()

426

full_text = textpage.get_text_range()

427

word_count = len(full_text.split())

428

line_count = full_text.count('\n') + 1

429

430

print(f"Text Statistics:")

431

print(f" Characters: {char_count}")

432

print(f" Words: {word_count}")

433

print(f" Lines: {line_count}")

434

435

# Find common words

436

words = full_text.lower().split()

437

word_freq = {}

438

for word in words:

439

word_freq[word] = word_freq.get(word, 0) + 1

440

441

# Most common words (excluding short words)

442

common_words = [(word, count) for word, count in word_freq.items()

443

if len(word) > 3]

444

common_words.sort(key=lambda x: x[1], reverse=True)

445

446

print(f"\nMost common words:")

447

for word, count in common_words[:10]:

448

print(f" '{word}': {count}")

449

450

# Search for specific patterns

451

patterns = ["http", "www", "@", "phone", "email"]

452

for pattern in patterns:

453

searcher = textpage.search(pattern, match_case=False)

454

match_count = 0

455

while searcher.get_next():

456

match_count += 1

457

if match_count > 0:

458

print(f"Found {match_count} matches for '{pattern}'")

459

460

return {

461

'char_count': char_count,

462

'word_count': word_count,

463

'line_count': line_count,

464

'common_words': common_words[:10]

465

}

466

467

# Usage

468

pdf = pdfium.PdfDocument("document.pdf")

469

for i, page in enumerate(pdf):

470

print(f"\n--- Page {i+1} Analysis ---")

471

stats = analyze_page_text(page)

472

```

473

474

### Text Extraction with Coordinates

475

476

Extract text while preserving positional information for layout reconstruction.

477

478

```python

479

def extract_text_with_positions(textpage):

480

"""Extract text with character positions."""

481

char_count = textpage.count_chars()

482

text_elements = []

483

484

current_line = []

485

current_y = None

486

487

for i in range(char_count):

488

char = textpage.get_text_range(i, 1)

489

char_box = textpage.get_charbox(i)

490

left, bottom, right, top = char_box

491

492

# Group characters by line (similar y-coordinates)

493

if current_y is None or abs(bottom - current_y) > 5:

494

if current_line:

495

text_elements.append(current_line)

496

current_line = []

497

current_y = bottom

498

499

current_line.append({

500

'char': char,

501

'box': char_box,

502

'x': left,

503

'y': bottom

504

})

505

506

if current_line:

507

text_elements.append(current_line)

508

509

return text_elements

510

511

# Usage

512

textpage = page.get_textpage()

513

text_lines = extract_text_with_positions(textpage)

514

515

print(f"Found {len(text_lines)} text lines")

516

for i, line in enumerate(text_lines):

517

line_text = ''.join(elem['char'] for elem in line)

518

if line_text.strip(): # Skip empty lines

519

first_char_y = line[0]['y']

520

print(f"Line {i+1} (y={first_char_y:.1f}): {line_text.strip()}")

521

```

522

523

## Properties

524

525

```python { .api }

526

@property

527

def raw(self) -> FPDF_TEXTPAGE:

528

"""Raw PDFium textpage handle for low-level operations."""

529

530

@property

531

def page(self) -> PdfPage:

532

"""Parent page containing this text."""

533

```

534

535

## Text Processing Best Practices

536

537

1. **Create a textpage with `page.get_textpage()` before any text operation** - extraction, character lookup, and search all go through `PdfTextPage`

538

2. **Handle encoding errors appropriately** - use `errors="ignore"` for robustness

539

3. **Use bounded extraction for targeted text** - more efficient than full extraction

540

4. **Consider character-level analysis for precise positioning**

541

5. **Close textpage and searcher objects** (`.close()`) when done to free the underlying PDFium resources

542

6. **Use search functionality for finding specific content** rather than manual parsing