or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cli.md index.md page-manipulation.md pdf-operations.md table-extraction.md text-extraction.md utilities.md visual-debugging.md

utilities.mddocs/

0

# Utilities

1

2

Utility functions for geometry operations, text processing, clustering, PDF internal structure manipulation, and data conversion.

3

4

## Capabilities

5

6

### Geometry Operations

7

8

Comprehensive geometric operations for bounding boxes, object positioning, and spatial analysis.

9

10

```python { .api }

11

def bbox_to_rect(bbox):

12

"""

13

Convert bounding box to rectangle dictionary.

14

15

Parameters:

16

- bbox: Tuple[T_num, T_num, T_num, T_num] - (x0, top, x1, bottom)

17

18

Returns:

19

Dict[str, T_num]: Rectangle with keys x0, top, x1, bottom (width and height are not included; compute them as x1 - x0 and bottom - top)

20

"""

21

22

def calculate_area(bbox):

23

"""

24

Calculate bounding box area.

25

26

Parameters:

27

- bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box coordinates

28

29

Returns:

30

T_num: Area of bounding box

31

"""

32

33

def merge_bboxes(bboxes):

34

"""

35

Merge multiple bounding boxes into single encompassing box.

36

37

Parameters:

38

- bboxes: List[T_bbox] - List of bounding boxes

39

40

Returns:

41

T_bbox: Single bounding box containing all input boxes

42

"""

43

44

def get_bbox_overlap(a, b):

45

"""

46

Get overlap between two bounding boxes.

47

48

Parameters:

49

- a, b: T_bbox - Two bounding boxes

50

51

Returns:

52

T_bbox or None: Overlapping region or None if no overlap

53

"""

54

55

def objects_to_bbox(objects):

56

"""

57

Get bounding box containing all objects.

58

59

Parameters:

60

- objects: List[T_obj] - List of objects with bbox information

61

62

Returns:

63

T_bbox: Bounding box encompassing all objects

64

"""

65

66

def objects_to_rect(objects):

67

"""

68

Get rectangle containing all objects.

69

70

Parameters:

71

- objects: List[T_obj] - List of objects

72

73

Returns:

74

Dict[str, T_num]: Rectangle dictionary

75

"""

76

```

77

78

**Usage Examples:**

79

80

```python

81

import pdfplumber
from pdfplumber.utils import bbox_to_rect, merge_bboxes, calculate_area, objects_to_bbox

82

83

with pdfplumber.open("document.pdf") as pdf:

84

page = pdf.pages[0]

85

86

# Convert bbox to rect format

87

char = page.chars[0]

88

rect = bbox_to_rect((char['x0'], char['top'], char['x1'], char['bottom']))

89

print(f"Character width: {rect['x1'] - rect['x0']}, height: {rect['bottom'] - rect['top']}")

90

91

# Find bounding box of all characters

92

all_chars_bbox = objects_to_bbox(page.chars)

93

print(f"Text area: {all_chars_bbox}")

94

95

# Calculate text coverage

96

page_area = calculate_area((0, 0, page.width, page.height))

97

text_area = calculate_area(all_chars_bbox)

98

coverage = text_area / page_area

99

print(f"Text covers {coverage:.1%} of page")

100

```

101

102

### Object Spatial Filtering

103

104

Filter objects based on spatial relationships and positioning.

105

106

```python { .api }

107

def within_bbox(objs, bbox):

108

"""

109

Filter objects within bounding box.

110

111

Parameters:

112

- objs: List[T_obj] - Objects to filter

113

- bbox: T_bbox - Bounding box for filtering

114

115

Returns:

116

List[T_obj]: Objects within bounding box

117

"""

118

119

def outside_bbox(objs, bbox):

120

"""

121

Filter objects outside bounding box.

122

123

Parameters:

124

- objs: List[T_obj] - Objects to filter

125

- bbox: T_bbox - Bounding box for filtering

126

127

Returns:

128

List[T_obj]: Objects outside bounding box

129

"""

130

131

def intersects_bbox(objs, bbox):

132

"""

133

Filter objects intersecting bounding box.

134

135

Parameters:

136

- objs: List[T_obj] - Objects to filter

137

- bbox: T_bbox - Bounding box for intersection test

138

139

Returns:

140

List[T_obj]: Objects intersecting bounding box

141

"""

142

143

def crop_to_bbox(objs, bbox):

144

"""

145

Filter objects to those intersecting bbox and clip each one to the bbox (unlike intersects_bbox, which returns the objects unmodified).

146

147

Parameters:

148

- objs: List[T_obj] - Objects to filter

149

- bbox: T_bbox - Bounding box

150

151

Returns:

152

List[T_obj]: Objects intersecting the bounding box, each clipped to its extent

153

"""

154

```

155

156

### Object Manipulation

157

158

Transform and modify object properties and positioning.

159

160

```python { .api }

161

def move_object(obj, axis, value):

162

"""

163

Move object along specified axis.

164

165

Parameters:

166

- obj: T_obj - Object to move

167

- axis: str - Axis to move along ('h' for horizontal, 'v' for vertical)

168

- value: T_num - Distance to move

169

170

Returns:

171

T_obj: New object with updated coordinates

172

"""

173

174

def resize_object(obj, key, value):

175

"""

176

Resize object property.

177

178

Parameters:

179

- obj: T_obj - Object to resize

180

- key: str - Property to modify

181

- value: T_num - New value

182

183

Returns:

184

T_obj: New object with updated property

185

"""

186

187

def clip_obj(obj, bbox):

188

"""

189

Clip object to bounding box.

190

191

Parameters:

192

- obj: T_obj - Object to clip

193

- bbox: T_bbox - Clipping boundary

194

195

Returns:

196

T_obj or None: Clipped object or None if completely outside

197

"""

198

```

199

200

### Edge and Line Processing

201

202

Convert objects to edges and process line elements.

203

204

```python { .api }

205

def obj_to_edges(obj):

206

"""

207

Convert object to edges.

208

209

Parameters:

210

- obj: T_obj - Object (rectangle, curve, etc.)

211

212

Returns:

213

List[T_obj]: List of edge objects

214

"""

215

216

def line_to_edge(line):

217

"""

218

Convert line object to edge.

219

220

Parameters:

221

- line: T_obj - Line object

222

223

Returns:

224

T_obj: Edge object

225

"""

226

227

def curve_to_edges(curve):

228

"""

229

Convert curve to edges.

230

231

Parameters:

232

- curve: T_obj - Curve object

233

234

Returns:

235

List[T_obj]: List of edge objects from curve

236

"""

237

238

def rect_to_edges(rect):

239

"""

240

Convert rectangle to edges.

241

242

Parameters:

243

- rect: T_obj - Rectangle object

244

245

Returns:

246

List[T_obj]: Four edge objects (top, bottom, left, right)

247

"""

248

249

def filter_edges(edges, orientation=None, edge_type=None, min_length=1):

250

"""

251

Filter edges by orientation, type, and minimum length.

252

253

Parameters:

254

- edges: List[T_obj] - Edge objects to filter

255

- orientation: str, optional - 'h' for horizontal, 'v' for vertical

256

- edge_type: str, optional - Type of edge to include

257

- min_length: T_num - Minimum edge length

258

259

Returns:

260

List[T_obj]: Filtered edge objects

261

"""

262

```

263

264

### Object Snapping and Alignment

265

266

Align objects to common positions and snap coordinates.

267

268

```python { .api }

269

def snap_objects(objs, attr, tolerance):

270

"""

271

Snap objects to common values.

272

273

Parameters:

274

- objs: List[T_obj] - Objects to snap

275

- attr: str - Attribute to snap (e.g., 'x0', 'top')

276

- tolerance: T_num - Snapping tolerance

277

278

Returns:

279

List[T_obj]: Objects with snapped coordinates

280

"""

281

```

282

283

### Clustering Operations

284

285

Group objects and values using clustering algorithms.

286

287

```python { .api }

288

def cluster_list(xs, tolerance=0):

289

"""

290

Cluster list of numbers.

291

292

Parameters:

293

- xs: List[T_num] - Numbers to cluster

294

- tolerance: T_num - Clustering tolerance

295

296

Returns:

297

List[List[T_num]]: Clusters of numbers

298

"""

299

300

def cluster_objects(objs, key_fn, tolerance):

301

"""

302

Cluster objects by key function.

303

304

Parameters:

305

- objs: List[T_obj] - Objects to cluster

306

- key_fn: Callable[[T_obj], T_num] - Function to extract clustering key

307

- tolerance: T_num - Clustering tolerance

308

309

Returns:

310

List[List[T_obj]]: Clusters of objects

311

"""

312

313

def make_cluster_dict(values, tolerance):

314

"""

315

Create value-to-cluster mapping.

316

317

Parameters:

318

- values: List[T_num] - Values to cluster

319

- tolerance: T_num - Clustering tolerance

320

321

Returns:

322

Dict[T_num, int]: Mapping from each value to the index of the cluster it belongs to

323

"""

324

```

325

326

**Usage Examples:**

327

328

```python

329

import pdfplumber
from pdfplumber.utils import cluster_objects, cluster_list

330

331

with pdfplumber.open("document.pdf") as pdf:

332

page = pdf.pages[0]

333

334

# Cluster characters by font size

335

size_clusters = cluster_objects(

336

page.chars,

337

lambda c: c.get('size', 0),

338

tolerance=1

339

)

340

print(f"Found {len(size_clusters)} font size groups")

341

342

# Cluster horizontal positions

343

x_positions = [c['x0'] for c in page.chars]

344

x_clusters = cluster_list(x_positions, tolerance=5)

345

print(f"Text aligns to {len(x_clusters)} column positions")

346

347

# Find common Y positions (likely text lines)

348

y_positions = [c['top'] for c in page.chars]

349

y_clusters = cluster_list(y_positions, tolerance=2)

350

print(f"Text appears on {len(y_clusters)} distinct lines")

351

```

352

353

### Text Processing

354

355

Advanced text processing and character manipulation utilities.

356

357

```python { .api }

358

def extract_text(chars, **kwargs):

359

"""

360

Extract text from character objects.

361

362

Parameters:

363

- chars: List[T_obj] - Character objects

364

- **kwargs: Text extraction options

365

366

Returns:

367

str: Extracted text

368

"""

369

370

def extract_text_simple(chars, **kwargs):

371

"""

372

Simple text extraction from characters.

373

374

Parameters:

375

- chars: List[T_obj] - Character objects

376

- **kwargs: Extraction options

377

378

Returns:

379

str: Extracted text without layout preservation

380

"""

381

382

def extract_words(chars, **kwargs):

383

"""

384

Extract words from character objects.

385

386

Parameters:

387

- chars: List[T_obj] - Character objects

388

- **kwargs: Word extraction options

389

390

Returns:

391

List[T_obj]: Word objects with position data

392

"""

393

394

def dedupe_chars(chars, tolerance=1, **kwargs):

395

"""

396

Remove duplicate characters from list.

397

398

Parameters:

399

- chars: List[T_obj] - Character objects

400

- tolerance: T_num - Distance tolerance for duplicate detection

401

- **kwargs: Deduplication options

402

403

Returns:

404

List[T_obj]: Deduplicated character objects

405

"""

406

407

def chars_to_textmap(chars, **kwargs):

408

"""

409

Convert characters to TextMap object.

410

411

Parameters:

412

- chars: List[T_obj] - Character objects

413

- **kwargs: TextMap options

414

415

Returns:

416

TextMap: Character mapping object

417

"""

418

419

def collate_line(chars, **kwargs):

420

"""

421

Collate characters into text line.

422

423

Parameters:

424

- chars: List[T_obj] - Character objects for single line

425

- **kwargs: Line collation options

426

427

Returns:

428

str: Text content of line

429

"""

430

```

431

432

### PDF Internals

433

434

Low-level PDF object processing and decoding utilities.

435

436

```python { .api }

437

def resolve(x):

438

"""

439

Resolve PDF object references.

440

441

Parameters:

442

- x: Any - PDF object that may contain references

443

444

Returns:

445

Any: Resolved object with references dereferenced

446

"""

447

448

def resolve_all(x):

449

"""

450

Recursively resolve PDF objects.

451

452

Parameters:

453

- x: Any - PDF object structure

454

455

Returns:

456

Any: Completely resolved object structure

457

"""

458

459

def resolve_and_decode(obj):

460

"""

461

Resolve and decode PDF object.

462

463

Parameters:

464

- obj: Any - PDF object

465

466

Returns:

467

Any: Resolved and decoded object

468

"""

469

470

def decode_text(s):

471

"""

472

Decode text from bytes/string.

473

474

Parameters:

475

- s: bytes or str - Text to decode

476

477

Returns:

478

str: Decoded text string

479

"""

480

481

def decode_psl_list(psl_list):

482

"""

483

Decode PSLiteral list.

484

485

Parameters:

486

- psl_list: List - List of PSLiteral objects

487

488

Returns:

489

List: Decoded list

490

"""

491

```

492

493

### Generic Utilities

494

495

General-purpose utility functions.

496

497

```python { .api }

498

def to_list(collection):

499

"""

500

Convert collection to list.

501

502

Parameters:

503

- collection: Any - Collection to convert (list, tuple, generator, etc.)

504

505

Returns:

506

List: List representation of collection

507

"""

508

```

509

510

### Constants

511

512

Commonly used default values and tolerances.

513

514

```python { .api }

515

# Text processing constants

516

DEFAULT_X_TOLERANCE = 3

517

DEFAULT_Y_TOLERANCE = 3

518

DEFAULT_X_DENSITY = 7.25

519

DEFAULT_Y_DENSITY = 13

520

```

521

522

**Usage Examples:**

523

524

```python

525

import pdfplumber
from pdfplumber.utils import (

526

DEFAULT_X_TOLERANCE, DEFAULT_Y_TOLERANCE,

527

extract_text, resolve_all

528

)

529

530

with pdfplumber.open("document.pdf") as pdf:

531

page = pdf.pages[0]

532

533

# Use default tolerances

534

text = extract_text(page.chars,

535

x_tolerance=DEFAULT_X_TOLERANCE,

536

y_tolerance=DEFAULT_Y_TOLERANCE)

537

538

# Process PDF internals

539

raw_chars = page._objs.get('char', []) # Access raw PDF objects

540

resolved_chars = [resolve_all(char) for char in raw_chars]

541

```

542

543

## Advanced Utility Workflows

544

545

**Spatial Analysis:**

546

547

```python

548

import pdfplumber
from pdfplumber.utils import cluster_objects, extract_text, objects_to_bbox

549

550

with pdfplumber.open("document.pdf") as pdf:

551

page = pdf.pages[0]

552

553

# Find text columns

554

char_clusters = cluster_objects(

555

page.chars,

556

lambda c: c['x0'], # Group by left edge

557

tolerance=10

558

)

559

560

columns = []

561

for cluster in char_clusters:

562

column_bbox = objects_to_bbox(cluster)

563

column_text = extract_text(cluster)

564

columns.append({

565

'bbox': column_bbox,

566

'text': column_text,

567

'char_count': len(cluster)

568

})

569

570

print(f"Document has {len(columns)} columns")

571

```

572

573

**Font Analysis:**

574

575

```python

576

import pdfplumber
from pdfplumber.utils import cluster_objects

577

578

with pdfplumber.open("document.pdf") as pdf:

579

page = pdf.pages[0]

580

581

# Group by font properties

582

font_groups = cluster_objects(

583

page.chars,

584

lambda c: (c.get('fontname', ''), c.get('size', 0)),

585

tolerance=0 # Exact matching for fonts

586

)

587

588

for group in font_groups:

589

sample = group[0]

590

font_name = sample.get('fontname', 'Unknown')

591

font_size = sample.get('size', 0)

592

char_count = len(group)

593

594

print(f"Font: {font_name}, Size: {font_size}, Characters: {char_count}")

595

```