0
# Utilities
1
2
Utility functions for geometry operations, text processing, clustering, low-level PDF object resolution and decoding, and data conversion.
3
4
## Capabilities
5
6
### Geometry Operations
7
8
Comprehensive geometric operations for bounding boxes, object positioning, and spatial analysis.
9
10
```python { .api }
11
def bbox_to_rect(bbox):
12
"""
13
Convert bounding box to rectangle dictionary.
14
15
Parameters:
16
- bbox: Tuple[T_num, T_num, T_num, T_num] - (x0, top, x1, bottom)
17
18
Returns:
19
Dict[str, T_num]: Rectangle with keys x0, top, x1, bottom (compute width as x1 - x0 and height as bottom - top)
20
"""
21
22
def calculate_area(bbox):
23
"""
24
Calculate bounding box area.
25
26
Parameters:
27
- bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box coordinates
28
29
Returns:
30
T_num: Area of bounding box
31
"""
32
33
def merge_bboxes(bboxes):
34
"""
35
Merge multiple bounding boxes into single encompassing box.
36
37
Parameters:
38
- bboxes: List[T_bbox] - List of bounding boxes
39
40
Returns:
41
T_bbox: Single bounding box containing all input boxes
42
"""
43
44
def get_bbox_overlap(a, b):
45
"""
46
Get overlap between two bounding boxes.
47
48
Parameters:
49
- a, b: T_bbox - Two bounding boxes
50
51
Returns:
52
T_bbox or None: Overlapping region or None if no overlap
53
"""
54
55
def objects_to_bbox(objects):
56
"""
57
Get bounding box containing all objects.
58
59
Parameters:
60
- objects: List[T_obj] - List of objects with bbox information
61
62
Returns:
63
T_bbox: Bounding box encompassing all objects
64
"""
65
66
def objects_to_rect(objects):
67
"""
68
Get rectangle containing all objects.
69
70
Parameters:
71
- objects: List[T_obj] - List of objects
72
73
Returns:
74
Dict[str, T_num]: Rectangle dictionary
75
"""
76
```
77
78
**Usage Examples:**
79
80
```python
import pdfplumber
from pdfplumber.utils import (
    bbox_to_rect,
    calculate_area,
    merge_bboxes,
    objects_to_bbox,
)

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Convert bbox to rect format
    char = page.chars[0]
    rect = bbox_to_rect((char['x0'], char['top'], char['x1'], char['bottom']))
    width = rect['x1'] - rect['x0']
    height = rect['bottom'] - rect['top']
    print(f"Character width: {width}, height: {height}")

    # Find bounding box of all characters
    all_chars_bbox = objects_to_bbox(page.chars)
    print(f"Text area: {all_chars_bbox}")

    # Calculate text coverage
    page_area = calculate_area((0, 0, page.width, page.height))
    text_area = calculate_area(all_chars_bbox)
    coverage = text_area / page_area
    print(f"Text covers {coverage:.1%} of page")
```
101
102
### Object Spatial Filtering
103
104
Filter objects based on spatial relationships and positioning.
105
106
```python { .api }
107
def within_bbox(objs, bbox):
108
"""
109
Filter objects within bounding box.
110
111
Parameters:
112
- objs: List[T_obj] - Objects to filter
113
- bbox: T_bbox - Bounding box for filtering
114
115
Returns:
116
List[T_obj]: Objects within bounding box
117
"""
118
119
def outside_bbox(objs, bbox):
120
"""
121
Filter objects outside bounding box.
122
123
Parameters:
124
- objs: List[T_obj] - Objects to filter
125
- bbox: T_bbox - Bounding box for filtering
126
127
Returns:
128
List[T_obj]: Objects outside bounding box
129
"""
130
131
def intersects_bbox(objs, bbox):
132
"""
133
Filter objects intersecting bounding box.
134
135
Parameters:
136
- objs: List[T_obj] - Objects to filter
137
- bbox: T_bbox - Bounding box for intersection test
138
139
Returns:
140
List[T_obj]: Objects intersecting bounding box
141
"""
142
143
def crop_to_bbox(objs, bbox):
144
"""
145
Filter objects to those intersecting bbox and clip each one to the bbox (unlike intersects_bbox, which returns the objects unmodified).
146
147
Parameters:
148
- objs: List[T_obj] - Objects to filter
149
- bbox: T_bbox - Bounding box
150
151
Returns:
152
List[T_obj]: Intersecting objects, each clipped to the bounding box
153
"""
154
```
155
156
### Object Manipulation
157
158
Transform and modify object properties and positioning.
159
160
```python { .api }
161
def move_object(obj, axis, value):
162
"""
163
Move object along specified axis.
164
165
Parameters:
166
- obj: T_obj - Object to move
167
- axis: str - Axis to move along ('h' for horizontal or 'v' for vertical)
168
- value: T_num - Distance to move
169
170
Returns:
171
T_obj: New object with updated coordinates
172
"""
173
174
def resize_object(obj, key, value):
175
"""
176
Resize object property.
177
178
Parameters:
179
- obj: T_obj - Object to resize
180
- key: str - Edge coordinate to set ('x0', 'x1', 'top', or 'bottom')
181
- value: T_num - New value
182
183
Returns:
184
T_obj: New object with updated property
185
"""
186
187
def clip_obj(obj, bbox):
188
"""
189
Clip object to bounding box.
190
191
Parameters:
192
- obj: T_obj - Object to clip
193
- bbox: T_bbox - Clipping boundary
194
195
Returns:
196
T_obj or None: Clipped object or None if completely outside
197
"""
198
```
199
200
### Edge and Line Processing
201
202
Convert objects to edges and process line elements.
203
204
```python { .api }
205
def obj_to_edges(obj):
206
"""
207
Convert object to edges.
208
209
Parameters:
210
- obj: T_obj - Object (rectangle, curve, etc.)
211
212
Returns:
213
List[T_obj]: List of edge objects
214
"""
215
216
def line_to_edge(line):
217
"""
218
Convert line object to edge.
219
220
Parameters:
221
- line: T_obj - Line object
222
223
Returns:
224
T_obj: Edge object
225
"""
226
227
def curve_to_edges(curve):
228
"""
229
Convert curve to edges.
230
231
Parameters:
232
- curve: T_obj - Curve object
233
234
Returns:
235
List[T_obj]: List of edge objects from curve
236
"""
237
238
def rect_to_edges(rect):
239
"""
240
Convert rectangle to edges.
241
242
Parameters:
243
- rect: T_obj - Rectangle object
244
245
Returns:
246
List[T_obj]: Four edge objects (top, bottom, left, right)
247
"""
248
249
def filter_edges(edges, orientation=None, edge_type=None, min_length=1):
250
"""
251
Filter edges by orientation, type, and minimum length.
252
253
Parameters:
254
- edges: List[T_obj] - Edge objects to filter
255
- orientation: str, optional - 'h' for horizontal, 'v' for vertical
256
- edge_type: str, optional - Type of edge to include
257
- min_length: T_num - Minimum edge length
258
259
Returns:
260
List[T_obj]: Filtered edge objects
261
"""
262
```
263
264
### Object Snapping and Alignment
265
266
Align objects to common positions and snap coordinates.
267
268
```python { .api }
269
def snap_objects(objs, attr, tolerance):
270
"""
271
Snap objects to common values.
272
273
Parameters:
274
- objs: List[T_obj] - Objects to snap
275
- attr: str - Attribute to snap (e.g., 'x0', 'top')
276
- tolerance: T_num - Snapping tolerance
277
278
Returns:
279
List[T_obj]: Objects with snapped coordinates
280
"""
281
```
282
283
### Clustering Operations
284
285
Group objects and values using clustering algorithms.
286
287
```python { .api }
288
def cluster_list(xs, tolerance=0):
289
"""
290
Cluster list of numbers.
291
292
Parameters:
293
- xs: List[T_num] - Numbers to cluster
294
- tolerance: T_num - Clustering tolerance
295
296
Returns:
297
List[List[T_num]]: Clusters of numbers
298
"""
299
300
def cluster_objects(objs, key_fn, tolerance):
301
"""
302
Cluster objects by key function.
303
304
Parameters:
305
- objs: List[T_obj] - Objects to cluster
306
- key_fn: Callable[[T_obj], T_num] - Function to extract clustering key
307
- tolerance: T_num - Clustering tolerance
308
309
Returns:
310
List[List[T_obj]]: Clusters of objects
311
"""
312
313
def make_cluster_dict(values, tolerance):
314
"""
315
Create value-to-cluster mapping.
316
317
Parameters:
318
- values: List[T_num] - Values to cluster
319
- tolerance: T_num - Clustering tolerance
320
321
Returns:
322
Dict[T_num, int]: Mapping from each value to the index of the cluster it belongs to
323
"""
324
```
325
326
**Usage Examples:**
327
328
```python
import pdfplumber
from pdfplumber.utils import cluster_objects, cluster_list

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Cluster characters by font size
    size_clusters = cluster_objects(
        page.chars,
        lambda c: c.get('size', 0),
        tolerance=1
    )
    print(f"Found {len(size_clusters)} font size groups")

    # Cluster horizontal positions
    x_positions = [c['x0'] for c in page.chars]
    x_clusters = cluster_list(x_positions, tolerance=5)
    print(f"Text aligns to {len(x_clusters)} column positions")

    # Find common Y positions (likely text lines)
    y_positions = [c['top'] for c in page.chars]
    y_clusters = cluster_list(y_positions, tolerance=2)
    print(f"Text appears on {len(y_clusters)} distinct lines")
```
352
353
### Text Processing
354
355
Advanced text processing and character manipulation utilities.
356
357
```python { .api }
358
def extract_text(chars, **kwargs):
359
"""
360
Extract text from character objects.
361
362
Parameters:
363
- chars: List[T_obj] - Character objects
364
- **kwargs: Text extraction options
365
366
Returns:
367
str: Extracted text
368
"""
369
370
def extract_text_simple(chars, **kwargs):
371
"""
372
Simple text extraction from characters.
373
374
Parameters:
375
- chars: List[T_obj] - Character objects
376
- **kwargs: Extraction options
377
378
Returns:
379
str: Extracted text without layout preservation
380
"""
381
382
def extract_words(chars, **kwargs):
383
"""
384
Extract words from character objects.
385
386
Parameters:
387
- chars: List[T_obj] - Character objects
388
- **kwargs: Word extraction options
389
390
Returns:
391
List[T_obj]: Word objects with position data
392
"""
393
394
def dedupe_chars(chars, tolerance=1, **kwargs):
395
"""
396
Remove duplicate characters from list.
397
398
Parameters:
399
- chars: List[T_obj] - Character objects
400
- tolerance: T_num - Distance tolerance for duplicate detection
401
- **kwargs: Deduplication options
402
403
Returns:
404
List[T_obj]: Deduplicated character objects
405
"""
406
407
def chars_to_textmap(chars, **kwargs):
408
"""
409
Convert characters to TextMap object.
410
411
Parameters:
412
- chars: List[T_obj] - Character objects
413
- **kwargs: TextMap options
414
415
Returns:
416
TextMap: Character mapping object
417
"""
418
419
def collate_line(chars, **kwargs):
420
"""
421
Collate characters into text line.
422
423
Parameters:
424
- chars: List[T_obj] - Character objects for single line
425
- **kwargs: Line collation options
426
427
Returns:
428
str: Text content of line
429
"""
430
```
431
432
### PDF Internals
433
434
Low-level PDF object processing and decoding utilities.
435
436
```python { .api }
437
def resolve(x):
438
"""
439
Resolve PDF object references.
440
441
Parameters:
442
- x: Any - PDF object that may contain references
443
444
Returns:
445
Any: Resolved object with references dereferenced
446
"""
447
448
def resolve_all(x):
449
"""
450
Recursively resolve PDF objects.
451
452
Parameters:
453
- x: Any - PDF object structure
454
455
Returns:
456
Any: Completely resolved object structure
457
"""
458
459
def resolve_and_decode(obj):
460
"""
461
Resolve and decode PDF object.
462
463
Parameters:
464
- obj: Any - PDF object
465
466
Returns:
467
Any: Resolved and decoded object
468
"""
469
470
def decode_text(s):
471
"""
472
Decode text from bytes/string.
473
474
Parameters:
475
- s: bytes or str - Text to decode
476
477
Returns:
478
str: Decoded text string
479
"""
480
481
def decode_psl_list(psl_list):
482
"""
483
Decode PSLiteral list.
484
485
Parameters:
486
- psl_list: List - List of PSLiteral objects
487
488
Returns:
489
List: Decoded list
490
"""
491
```
492
493
### Generic Utilities
494
495
General-purpose utility functions.
496
497
```python { .api }
498
def to_list(collection):
499
"""
500
Convert collection to list.
501
502
Parameters:
503
- collection: Any - Collection to convert (list, tuple, generator, etc.)
504
505
Returns:
506
List: List representation of collection
507
"""
508
```
509
510
### Constants
511
512
Commonly used default values and tolerances.
513
514
```python { .api }
515
# Text processing constants
516
DEFAULT_X_TOLERANCE = 3
517
DEFAULT_Y_TOLERANCE = 3
518
DEFAULT_X_DENSITY = 7.25
519
DEFAULT_Y_DENSITY = 13
520
```
521
522
**Usage Examples:**
523
524
```python
import pdfplumber
from pdfplumber.utils import (
    DEFAULT_X_TOLERANCE, DEFAULT_Y_TOLERANCE,
    extract_text, resolve_all
)

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Use default tolerances
    text = extract_text(page.chars,
                        x_tolerance=DEFAULT_X_TOLERANCE,
                        y_tolerance=DEFAULT_Y_TOLERANCE)

    # Process PDF internals
    raw_chars = page.objects.get('char', [])  # Access parsed page objects
    resolved_chars = [resolve_all(char) for char in raw_chars]
```
542
543
## Advanced Utility Workflows
544
545
**Spatial Analysis:**
546
547
```python
import pdfplumber
from pdfplumber.utils import cluster_objects, extract_text, objects_to_bbox

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Find text columns
    char_clusters = cluster_objects(
        page.chars,
        lambda c: c['x0'],  # Group by left edge
        tolerance=10
    )

    columns = []
    for cluster in char_clusters:
        column_bbox = objects_to_bbox(cluster)
        column_text = extract_text(cluster)
        columns.append({
            'bbox': column_bbox,
            'text': column_text,
            'char_count': len(cluster)
        })

    print(f"Document has {len(columns)} columns")
```
572
573
**Font Analysis:**
574
575
```python
import pdfplumber
from pdfplumber.utils import cluster_objects

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Group by font properties
    font_groups = cluster_objects(
        page.chars,
        lambda c: (c.get('fontname', ''), c.get('size', 0)),
        tolerance=0  # Exact matching for fonts
    )

    for group in font_groups:
        sample = group[0]
        font_name = sample.get('fontname', 'Unknown')
        font_size = sample.get('size', 0)
        char_count = len(group)

        print(f"Font: {font_name}, Size: {font_size}, Characters: {char_count}")
```