0
# Page Manipulation
1
2
Page cropping, object filtering, bounding box operations, coordinate transformations, and derived page creation for precise PDF element analysis.
3
4
## Capabilities
5
6
### Page Cropping
7
8
Create cropped views of pages with filtered objects based on bounding box regions.
9
10
```python { .api }
11
def crop(bbox, relative=False, strict=True):
12
"""
13
Crop page to bounding box.
14
15
Parameters:
16
- bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box (x0, top, x1, bottom)
17
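- relative: bool - Interpret bbox as an offset from the top-left corner of the page's bounding box instead of as absolute coordinates
18
- strict: bool - Require bbox to lie entirely within the page's bounding box (a ValueError is raised otherwise)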
19
20
Returns:
21
CroppedPage: New page object with cropped view
22
"""
23
```
24
25
**Usage Examples:**
26
27
```python
28
with pdfplumber.open("document.pdf") as pdf:
29
page = pdf.pages[0]
30
31
# Crop to specific region (absolute coordinates)
32
cropped = page.crop((100, 100, 400, 300))
33
text = cropped.extract_text()
34
print(f"Cropped region text: {text}")
35
36
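# Crop to a fraction of the page (relative=True offsets from the page's top-left corner; it does not accept 0-1 fractions)
37
# Top-left quarter of page
38
quarter = page.crop((0, 0, page.width / 2, page.height / 2), relative=True)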
39
40
# Non-strict crop: the bbox is allowed to extend beyond the page's bounding box
41
loose_crop = page.crop((100, 100, 400, 300), strict=False)
42
43
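# Chain cropping operations (relative=True offsets from each page's own top-left corner)
44
top_half = page.crop((0, 0, page.width, page.height / 2), relative=True)
45
top_left = top_half.crop((0, 0, top_half.width / 2, top_half.height), relative=True)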
46
```
47
48
### Bounding Box Filtering
49
50
Filter page objects based on spatial relationships to bounding boxes.
51
52
```python { .api }
53
def within_bbox(bbox, relative=False, strict=True):
54
"""
55
Filter objects within bounding box.
56
57
Parameters:
58
- bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box coordinates
59
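- relative: bool - Interpret bbox as an offset from the top-left corner of the page's bounding box
60
- strict: bool - Require bbox to lie entirely within the page's bounding box; retained objects must always fall entirely within bbox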
61
62
Returns:
63
CroppedPage: New page retaining only objects that fall entirely within bbox
64
"""
65
66
def outside_bbox(bbox, relative=False, strict=True):
67
"""
68
Filter objects outside bounding box.
69
70
Parameters:
71
- bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box coordinates
72
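- relative: bool - Interpret bbox as an offset from the top-left corner of the page's bounding box
73
- strict: bool - Require bbox to lie entirely within the page's bounding box; retained objects must always fall entirely outside bbox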
74
75
Returns:
76
CroppedPage: New page retaining only objects that fall entirely outside bbox
77
"""
78
```
79
80
**Usage Examples:**
81
82
```python
83
with pdfplumber.open("document.pdf") as pdf:
84
page = pdf.pages[0]
85
86
# Get objects in specific region
87
header_region = (0, 0, page.width, 100)
88
header_page = page.within_bbox(header_region)
89
header_text = header_page.extract_text()
90
91
# Keep only the main content area (drops the 100pt header and footer bands)
92
content_region = (0, 100, page.width, page.height - 100)
93
content_page = page.within_bbox(content_region)
94
95
# Use coordinates expressed as offsets from the page's top-left corner
96
middle_third = page.within_bbox((0, page.height / 3, page.width, page.height * 2 / 3), relative=True)
97
98
# Non-strict: the bbox may extend past the page bounds (objects must still lie entirely inside it)
99
overlapping = page.within_bbox((100, 100, 200, 200), strict=False)
100
101
# Exclude specific region
102
no_header = page.outside_bbox((0, 0, page.width, 50))
103
```
104
105
### Custom Object Filtering
106
107
Filter objects using custom test functions for complex selection criteria.
108
109
```python { .api }
110
def filter(test_function):
111
"""
112
Filter objects using custom function.
113
114
Parameters:
115
- test_function: Callable[[T_obj], bool] - Function that returns True for objects to keep
116
117
Returns:
118
FilteredPage: New page with filtered objects based on test function
119
"""
120
```
121
122
**Usage Examples:**
123
124
```python
125
with pdfplumber.open("document.pdf") as pdf:
126
page = pdf.pages[0]
127
128
# Filter by font size
129
large_text = page.filter(lambda obj: obj.get('size', 0) > 12)
130
131
# Filter by font name
132
arial_text = page.filter(lambda obj: 'Arial' in obj.get('fontname', ''))
133
134
# Filter by color
135
red_objects = page.filter(lambda obj: obj.get('non_stroking_color') == (1, 0, 0))
136
137
# Filter characters by content
138
digits_only = page.filter(lambda obj: obj.get('text', '').isdigit())
139
140
# Complex filtering - large bold text
141
def is_large_bold(obj):
142
return (obj.get('size', 0) > 14 and
143
'Bold' in obj.get('fontname', ''))
144
145
headers = page.filter(is_large_bold)
146
header_text = headers.extract_text()
147
```
148
149
### Derived Page Classes
150
151
Specialized page classes for manipulated views.
152
153
```python { .api }
154
class CroppedPage(DerivedPage):
155
"""Page cropped to specific bounding box."""
156
157
def __init__(self, parent_page, bbox, relative=False, strict=True):
158
"""Initialize cropped page view."""
159
160
@property
161
def parent_page(self) -> Page:
162
"""Original page object."""
163
164
@property
165
def bbox(self) -> T_bbox:
166
"""Cropping bounding box."""
167
168
class FilteredPage(DerivedPage):
169
"""Page with filtered objects."""
170
171
def __init__(self, parent_page, test_function):
172
"""Initialize filtered page view."""
173
174
@property
175
def parent_page(self) -> Page:
176
"""Original page object."""
177
178
@property
179
def test_function(self) -> Callable:
180
"""Filtering test function."""
181
182
class DerivedPage:
183
"""Base class for page views derived from other pages."""
184
185
@property
186
def width(self) -> T_num:
187
"""Page width."""
188
189
@property
190
def height(self) -> T_num:
191
"""Page height."""
192
193
@property
194
def bbox(self) -> T_bbox:
195
"""Page bounding box."""
196
197
# All Container and Page methods available
198
def extract_text(self, **kwargs): ...
199
def extract_tables(self, **kwargs): ...
200
def crop(self, bbox, **kwargs): ...
201
def filter(self, test_function): ...
202
```
203
204
### Character Deduplication
205
206
Remove duplicate character objects that may occur from PDF processing.
207
208
```python { .api }
209
def dedupe_chars(tolerance=1, use_text_flow=False, **kwargs):
210
"""
211
Remove duplicate characters.
212
213
Parameters:
214
- tolerance: T_num - Distance tolerance for duplicate detection
215
- use_text_flow: bool - Consider text flow direction in deduplication
216
- **kwargs: Additional deduplication options
217
218
Returns:
219
Page: New page object with deduplicated characters
220
"""
221
```
222
223
**Usage Examples:**
224
225
```python
226
with pdfplumber.open("document.pdf") as pdf:
227
page = pdf.pages[0]
228
229
# Remove duplicate characters with default tolerance
230
clean_page = page.dedupe_chars()
231
232
# Strict deduplication with tight tolerance
233
very_clean = page.dedupe_chars(tolerance=0.5)
234
235
# Consider text flow for better deduplication
236
flow_aware = page.dedupe_chars(use_text_flow=True)
237
238
# Compare character counts
239
original_chars = len(page.chars)
240
clean_chars = len(clean_page.chars)
241
print(f"Removed {original_chars - clean_chars} duplicate characters")
242
```
243
244
## Coordinate Systems and Transformations
245
246
### Understanding PDF Coordinates
247
248
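pdfplumber expresses bounding boxes as `(x0, top, x1, bottom)` in a top-left-origin coordinate system (unlike native PDF space, whose origin is bottom-left):
249
- Origin (0,0) is at top-left of page
250
- X increases rightward
251
- Y (`top` / `bottom`) increases downward, measured from the top edge of the page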
252
- Page dimensions available as `page.width` and `page.height`
253
254
### Relative Coordinates
255
256
```python
257
with pdfplumber.open("document.pdf") as pdf:
258
page = pdf.pages[0]
259
260
# Convert relative to absolute coordinates
261
rel_bbox = (0.1, 0.2, 0.9, 0.8) # 10% horizontal margins, 20% vertical margins
262
abs_bbox = (
263
rel_bbox[0] * page.width,
264
rel_bbox[1] * page.height,
265
rel_bbox[2] * page.width,
266
rel_bbox[3] * page.height
267
)
268
269
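# relative=True offsets the bbox from the page's top-left corner; scale fractions by the page size first
270
center_region = page.crop((0.25 * page.width, 0.25 * page.height, 0.75 * page.width, 0.75 * page.height), relative=True)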
271
```
272
273
### Chaining Operations
274
275
```python
276
with pdfplumber.open("document.pdf") as pdf:
277
page = pdf.pages[0]
278
279
# Chain multiple operations
280
processed_page = (page
281
.dedupe_chars()
282
.crop((50, 50, page.width-50, page.height-50))
283
.filter(lambda obj: obj.get('size', 0) > 10))
284
285
# Each operation returns a new page-like object
286
text = processed_page.extract_text()
287
tables = processed_page.extract_tables()
288
```
289
290
### Performance Considerations
291
292
```python
293
with pdfplumber.open("document.pdf") as pdf:
294
page = pdf.pages[0]
295
296
# Efficient: filter before expensive operations
297
large_text = page.filter(lambda obj: obj.get('size', 0) > 12)
298
tables = large_text.extract_tables() # Operates on fewer objects
299
300
# Less efficient: extract from full page then filter results
301
all_tables = page.extract_tables()
302
# Manual filtering of results
303
```
304
305
## Object Access in Derived Pages
306
307
All derived pages maintain access to the full Container API:
308
309
```python
310
with pdfplumber.open("document.pdf") as pdf:
311
page = pdf.pages[0]
312
cropped = page.crop((100, 100, 400, 300))
313
314
# Access filtered object collections
315
chars = cropped.chars # Only characters in cropped region
316
lines = cropped.lines # Only lines in cropped region
317
rects = cropped.rects # Only rectangles in cropped region
318
images = cropped.images # Only images in cropped region
319
320
# Derived properties work with filtered objects
321
edges = cropped.edges # All edges from filtered objects
322
h_edges = cropped.horizontal_edges
323
v_edges = cropped.vertical_edges
324
325
# Export filtered objects
326
cropped.to_json("cropped_objects.json")
327
cropped.to_csv("cropped_data.csv")
328
```