Tessl Tile for pypi/pdfplumber@0.11.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

cli.md index.md page-manipulation.md pdf-operations.md table-extraction.md text-extraction.md utilities.md visual-debugging.md

utilities.mddocs/

0
# Utilities
1

2
Utility functions for geometry operations, text processing, clustering, resolving and decoding PDF internal structures, and data conversion.
3

4
## Capabilities
5

6
### Geometry Operations
7

8
Comprehensive geometric operations for bounding boxes, object positioning, and spatial analysis.
9

10
```python { .api }
11
def bbox_to_rect(bbox):
12
    """
13
    Convert bounding box to rectangle dictionary.
14
    
15
    Parameters:
16
    - bbox: Tuple[T_num, T_num, T_num, T_num] - (x0, top, x1, bottom)
17
    
18
    Returns:
19
    Dict[str, T_num]: Rectangle dictionary with keys x0, top, x1, bottom
20
    """
21

22
def calculate_area(bbox):
23
    """
24
    Calculate bounding box area.
25
    
26
    Parameters:
27
    - bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box coordinates
28
    
29
    Returns:
30
    T_num: Area of bounding box
31
    """
32

33
def merge_bboxes(bboxes):
34
    """
35
    Merge multiple bounding boxes into single encompassing box.
36
    
37
    Parameters:
38
    - bboxes: List[T_bbox] - List of bounding boxes
39
    
40
    Returns:
41
    T_bbox: Single bounding box containing all input boxes
42
    """
43

44
def get_bbox_overlap(a, b):
45
    """
46
    Get overlap between two bounding boxes.
47
    
48
    Parameters:
49
    - a, b: T_bbox - Two bounding boxes
50
    
51
    Returns:
52
    T_bbox or None: Overlapping region or None if no overlap
53
    """
54

55
def objects_to_bbox(objects):
56
    """
57
    Get bounding box containing all objects.
58
    
59
    Parameters:
60
    - objects: List[T_obj] - List of objects with bbox information
61
    
62
    Returns:
63
    T_bbox: Bounding box encompassing all objects
64
    """
65

66
def objects_to_rect(objects):
67
    """
68
    Get rectangle containing all objects.
69
    
70
    Parameters:
71
    - objects: List[T_obj] - List of objects
72
    
73
    Returns:
74
    Dict[str, T_num]: Rectangle dictionary
75
    """
76
```
77

78
**Usage Examples:**
79

80
```python
81
import pdfplumber
from pdfplumber.utils import bbox_to_rect, merge_bboxes, calculate_area, objects_to_bbox
82

83
with pdfplumber.open("document.pdf") as pdf:
84
    page = pdf.pages[0]
85
    
86
    # Convert bbox to rect format
87
    char = page.chars[0]
88
    rect = bbox_to_rect((char['x0'], char['top'], char['x1'], char['bottom']))
89
    print(f"Character width: {rect['x1'] - rect['x0']}, height: {rect['bottom'] - rect['top']}")
90
    
91
    # Find bounding box of all characters
92
    all_chars_bbox = objects_to_bbox(page.chars)
93
    print(f"Text area: {all_chars_bbox}")
94
    
95
    # Calculate text coverage
96
    page_area = calculate_area((0, 0, page.width, page.height))
97
    text_area = calculate_area(all_chars_bbox)
98
    coverage = text_area / page_area
99
    print(f"Text covers {coverage:.1%} of page")
100
```
101

102
### Object Spatial Filtering
103

104
Filter objects based on spatial relationships and positioning.
105

106
```python { .api }
107
def within_bbox(objs, bbox):
108
    """
109
    Filter objects to only those that lie entirely within the bounding box.
110
    
111
    Parameters:
112
    - objs: List[T_obj] - Objects to filter
113
    - bbox: T_bbox - Bounding box for filtering
114
    
115
    Returns:
116
    List[T_obj]: Objects within bounding box
117
    """
118

119
def outside_bbox(objs, bbox):
120
    """
121
    Filter objects to only those that lie entirely outside the bounding box.
122
    
123
    Parameters:
124
    - objs: List[T_obj] - Objects to filter
125
    - bbox: T_bbox - Bounding box for filtering
126
    
127
    Returns:
128
    List[T_obj]: Objects outside bounding box
129
    """
130

131
def intersects_bbox(objs, bbox):
132
    """
133
    Filter objects intersecting bounding box.
134
    
135
    Parameters:
136
    - objs: List[T_obj] - Objects to filter
137
    - bbox: T_bbox - Bounding box for intersection test
138
    
139
    Returns:
140
    List[T_obj]: Objects intersecting bounding box
141
    """
142

143
def crop_to_bbox(objs, bbox):
144
    """
145
    Filter objects to those intersecting bbox and clip each one to the bbox (unlike intersects_bbox, which returns the objects unmodified).
146
    
147
    Parameters:
148
    - objs: List[T_obj] - Objects to filter
149
    - bbox: T_bbox - Bounding box
150
    
151
    Returns:
152
    List[T_obj]: Intersecting objects, each cropped to the bounding box
153
    """
154
```
155

156
### Object Manipulation
157

158
Transform and modify object properties and positioning.
159

160
```python { .api }
161
def move_object(obj, axis, value):
162
    """
163
    Move object along specified axis.
164
    
165
    Parameters:
166
    - obj: T_obj - Object to move
167
    - axis: str - Axis to move along ('h' for horizontal or 'v' for vertical)
168
    - value: T_num - Distance to move
169
    
170
    Returns:
171
    T_obj: New object with updated coordinates
172
    """
173

174
def resize_object(obj, key, value):
175
    """
176
    Resize object property.
177
    
178
    Parameters:
179
    - obj: T_obj - Object to resize
180
    - key: str - Edge to modify: one of 'x0', 'x1', 'top', 'bottom'
181
    - value: T_num - New value
182
    
183
    Returns:
184
    T_obj: New object with updated property
185
    """
186

187
def clip_obj(obj, bbox):
188
    """
189
    Clip object to bounding box.
190
    
191
    Parameters:
192
    - obj: T_obj - Object to clip
193
    - bbox: T_bbox - Clipping boundary
194
    
195
    Returns:
196
    T_obj or None: Clipped object or None if completely outside
197
    """
198
```
199

200
### Edge and Line Processing
201

202
Convert objects to edges and process line elements.
203

204
```python { .api }
205
def obj_to_edges(obj):
206
    """
207
    Convert object to edges.
208
    
209
    Parameters:
210
    - obj: T_obj - Object (rectangle, curve, etc.)
211
    
212
    Returns:
213
    List[T_obj]: List of edge objects
214
    """
215

216
def line_to_edge(line):
217
    """
218
    Convert line object to edge.
219
    
220
    Parameters:
221
    - line: T_obj - Line object
222
    
223
    Returns:
224
    T_obj: Edge object
225
    """
226

227
def curve_to_edges(curve):
228
    """
229
    Convert curve to edges.
230
    
231
    Parameters:
232
    - curve: T_obj - Curve object
233
    
234
    Returns:
235
    List[T_obj]: List of edge objects from curve
236
    """
237

238
def rect_to_edges(rect):
239
    """
240
    Convert rectangle to edges.
241
    
242
    Parameters:
243
    - rect: T_obj - Rectangle object
244
    
245
    Returns:
246
    List[T_obj]: Four edge objects (top, bottom, left, right)
247
    """
248

249
def filter_edges(edges, orientation=None, edge_type=None, min_length=1):
250
    """
251
    Filter edges by orientation, type, and minimum length.
252
    
253
    Parameters:
254
    - edges: List[T_obj] - Edge objects to filter
255
    - orientation: str, optional - 'h' for horizontal, 'v' for vertical
256
    - edge_type: str, optional - Type of edge to include
257
    - min_length: T_num - Minimum edge length
258
    
259
    Returns:
260
    List[T_obj]: Filtered edge objects
261
    """
262
```
263

264
### Object Snapping and Alignment
265

266
Align objects to common positions and snap coordinates.
267

268
```python { .api }
269
def snap_objects(objs, attr, tolerance):
270
    """
271
    Snap objects to common values.
272
    
273
    Parameters:
274
    - objs: List[T_obj] - Objects to snap
275
    - attr: str - Attribute to snap (e.g., 'x0', 'top')
276
    - tolerance: T_num - Snapping tolerance
277
    
278
    Returns:
279
    List[T_obj]: Objects with snapped coordinates
280
    """
281
```
282

283
### Clustering Operations
284

285
Group objects and values using clustering algorithms.
286

287
```python { .api }
288
def cluster_list(xs, tolerance=0):
289
    """
290
    Cluster list of numbers.
291
    
292
    Parameters:
293
    - xs: List[T_num] - Numbers to cluster
294
    - tolerance: T_num - Clustering tolerance
295
    
296
    Returns:
297
    List[List[T_num]]: Clusters of numbers
298
    """
299

300
def cluster_objects(objs, key_fn, tolerance):
301
    """
302
    Cluster objects by key function.
303
    
304
    Parameters:
305
    - objs: List[T_obj] - Objects to cluster
306
    - key_fn: Callable[[T_obj], T_num] - Function to extract clustering key
307
    - tolerance: T_num - Clustering tolerance
308
    
309
    Returns:
310
    List[List[T_obj]]: Clusters of objects
311
    """
312

313
def make_cluster_dict(values, tolerance):
314
    """
315
    Create value-to-cluster mapping.
316
    
317
    Parameters:
318
    - values: List[T_num] - Values to cluster
319
    - tolerance: T_num - Clustering tolerance
320
    
321
    Returns:
322
    Dict[T_num, int]: Mapping from each value to the index of the cluster it belongs to
323
    """
324
```
325

326
**Usage Examples:**
327

328
```python
329
import pdfplumber
from pdfplumber.utils import cluster_objects, cluster_list
330

331
with pdfplumber.open("document.pdf") as pdf:
332
    page = pdf.pages[0]
333
    
334
    # Cluster characters by font size
335
    size_clusters = cluster_objects(
336
        page.chars, 
337
        lambda c: c.get('size', 0), 
338
        tolerance=1
339
    )
340
    print(f"Found {len(size_clusters)} font size groups")
341
    
342
    # Cluster horizontal positions
343
    x_positions = [c['x0'] for c in page.chars]
344
    x_clusters = cluster_list(x_positions, tolerance=5)
345
    print(f"Text aligns to {len(x_clusters)} column positions")
346
    
347
    # Find common Y positions (likely text lines)
348
    y_positions = [c['top'] for c in page.chars]  
349
    y_clusters = cluster_list(y_positions, tolerance=2)
350
    print(f"Text appears on {len(y_clusters)} distinct lines")
351
```
352

353
### Text Processing
354

355
Advanced text processing and character manipulation utilities.
356

357
```python { .api }
358
def extract_text(chars, **kwargs):
359
    """
360
    Extract text from character objects.
361
    
362
    Parameters:
363
    - chars: List[T_obj] - Character objects
364
    - **kwargs: Text extraction options
365
    
366
    Returns:
367
    str: Extracted text
368
    """
369

370
def extract_text_simple(chars, **kwargs):
371
    """
372
    Simple text extraction from characters.
373
    
374
    Parameters:
375
    - chars: List[T_obj] - Character objects
376
    - **kwargs: Extraction options
377
    
378
    Returns:
379
    str: Extracted text without layout preservation
380
    """
381

382
def extract_words(chars, **kwargs):
383
    """
384
    Extract words from character objects.
385
    
386
    Parameters:
387
    - chars: List[T_obj] - Character objects
388
    - **kwargs: Word extraction options
389
    
390
    Returns:
391
    List[T_obj]: Word objects with position data
392
    """
393

394
def dedupe_chars(chars, tolerance=1, **kwargs):
395
    """
396
    Remove duplicate characters from list.
397
    
398
    Parameters:
399
    - chars: List[T_obj] - Character objects
400
    - tolerance: T_num - Distance tolerance for duplicate detection
401
    - **kwargs: Deduplication options
402
    
403
    Returns:
404
    List[T_obj]: Deduplicated character objects
405
    """
406

407
def chars_to_textmap(chars, **kwargs):
408
    """
409
    Convert characters to TextMap object.
410
    
411
    Parameters:
412
    - chars: List[T_obj] - Character objects
413
    - **kwargs: TextMap options
414
    
415
    Returns:
416
    TextMap: Character mapping object
417
    """
418

419
def collate_line(chars, **kwargs):
420
    """
421
    Collate characters into text line.
422
    
423
    Parameters:
424
    - chars: List[T_obj] - Character objects for single line
425
    - **kwargs: Line collation options
426
    
427
    Returns:
428
    str: Text content of line
429
    """
430
```
431

432
### PDF Internals
433

434
Low-level PDF object processing and decoding utilities.
435

436
```python { .api }
437
def resolve(x):
438
    """
439
    Resolve PDF object references.
440
    
441
    Parameters:
442
    - x: Any - PDF object that may contain references
443
    
444
    Returns:
445
    Any: Resolved object with references dereferenced
446
    """
447

448
def resolve_all(x):
449
    """
450
    Recursively resolve PDF objects.
451
    
452
    Parameters:
453
    - x: Any - PDF object structure
454
    
455
    Returns:
456
    Any: Completely resolved object structure
457
    """
458

459
def resolve_and_decode(obj):
460
    """
461
    Resolve and decode PDF object.
462
    
463
    Parameters:
464
    - obj: Any - PDF object
465
    
466
    Returns:
467
    Any: Resolved and decoded object
468
    """
469

470
def decode_text(s):
471
    """
472
    Decode text from bytes/string.
473
    
474
    Parameters:
475
    - s: bytes or str - Text to decode
476
    
477
    Returns:
478
    str: Decoded text string
479
    """
480

481
def decode_psl_list(psl_list):
482
    """
483
    Decode PSLiteral list.
484
    
485
    Parameters:
486
    - psl_list: List - List of PSLiteral objects
487
    
488
    Returns:
489
    List: Decoded list
490
    """
491
```
492

493
### Generic Utilities
494

495
General-purpose utility functions.
496

497
```python { .api }
498
def to_list(collection):
499
    """
500
    Convert collection to list.
501
    
502
    Parameters:
503
    - collection: Any - Collection to convert (list, tuple, generator, etc.)
504
    
505
    Returns:
506
    List: List representation of collection
507
    """
508
```
509

510
### Constants
511

512
Commonly used default values and tolerances.
513

514
```python { .api }
515
# Text processing constants
516
DEFAULT_X_TOLERANCE = 3
517
DEFAULT_Y_TOLERANCE = 3
518
DEFAULT_X_DENSITY = 7.25
519
DEFAULT_Y_DENSITY = 13
520
```
521

522
**Usage Examples:**
523

524
```python
525
import pdfplumber
from pdfplumber.utils import (
526
    DEFAULT_X_TOLERANCE, DEFAULT_Y_TOLERANCE,
527
    extract_text, resolve_all
528
)
529

530
with pdfplumber.open("document.pdf") as pdf:
531
    page = pdf.pages[0]
532
    
533
    # Use default tolerances
534
    text = extract_text(page.chars, 
535
                       x_tolerance=DEFAULT_X_TOLERANCE,
536
                       y_tolerance=DEFAULT_Y_TOLERANCE)
537
    
538
    # Process PDF internals
539
    raw_chars = page.objects.get('char', [])  # Access the page's parsed character objects
540
    resolved_chars = [resolve_all(char) for char in raw_chars]
541
```
542

543
## Advanced Utility Workflows
544

545
**Spatial Analysis:**
546

547
```python
548
import pdfplumber
from pdfplumber.utils import cluster_objects, extract_text, objects_to_bbox
549

550
with pdfplumber.open("document.pdf") as pdf:
551
    page = pdf.pages[0]
552
    
553
    # Find text columns
554
    char_clusters = cluster_objects(
555
        page.chars,
556
        lambda c: c['x0'],  # Group by left edge
557
        tolerance=10
558
    )
559
    
560
    columns = []
561
    for cluster in char_clusters:
562
        column_bbox = objects_to_bbox(cluster)
563
        column_text = extract_text(cluster)
564
        columns.append({
565
            'bbox': column_bbox,
566
            'text': column_text,
567
            'char_count': len(cluster)
568
        })
569
    
570
    print(f"Document has {len(columns)} columns")
571
```
572

573
**Font Analysis:**
574

575
```python
576
import pdfplumber
from pdfplumber.utils import cluster_objects
577

578
with pdfplumber.open("document.pdf") as pdf:
579
    page = pdf.pages[0]
580
    
581
    # Group by font properties
582
    font_groups = cluster_objects(
583
        page.chars,
584
        lambda c: (c.get('fontname', ''), c.get('size', 0)),
585
        tolerance=0  # Exact matching for fonts
586
    )
587
    
588
    for group in font_groups:
589
        sample = group[0]
590
        font_name = sample.get('fontname', 'Unknown')
591
        font_size = sample.get('size', 0)
592
        char_count = len(group)
593
        
594
        print(f"Font: {font_name}, Size: {font_size}, Characters: {char_count}")
595
```

Version

Tile

Files

utilities.md docs/

utilities.mddocs/