or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cli.md · index.md · page-manipulation.md · pdf-operations.md · table-extraction.md · text-extraction.md · utilities.md · visual-debugging.md

page-manipulation.md — docs/

0

# Page Manipulation

1

2

Page cropping, object filtering, bounding box operations, coordinate transformations, and derived page creation for precise PDF element analysis.

3

4

## Capabilities

5

6

### Page Cropping

7

8

Create cropped views of pages with filtered objects based on bounding box regions.

9

10

```python { .api }

11

def crop(bbox, relative=False, strict=True):

12

"""

13

Crop page to bounding box.

14

15

Parameters:

16

- bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box (x0, top, x1, bottom)

17

- relative: bool - If True, bbox is interpreted as an offset from the top-left corner of the page's bounding box instead of as absolute coordinates

18

- strict: bool - If True (default), bbox must fall entirely within the page's bounding box

19

20

Returns:

21

CroppedPage: New page object with cropped view

22

"""

23

```

24

25

**Usage Examples:**

26

27

```python

28

with pdfplumber.open("document.pdf") as pdf:

29

page = pdf.pages[0]

30

31

# Crop to specific region (absolute coordinates)

32

cropped = page.crop((100, 100, 400, 300))

33

text = cropped.extract_text()

34

print(f"Cropped region text: {text}")

35

36

# Crop to relative coordinates (percentages)

37

# Top-left quarter of page

38

quarter = page.crop((0, 0, 0.5, 0.5), relative=True)

39

40

# Crop with non-strict filtering (partial overlap allowed)

41

loose_crop = page.crop((100, 100, 400, 300), strict=False)

42

43

# Chain cropping operations

44

top_half = page.crop((0, 0, 1, 0.5), relative=True)

45

top_left = top_half.crop((0, 0, 0.5, 1), relative=True)

46

```

47

48

### Bounding Box Filtering

49

50

Filter page objects based on spatial relationships to bounding boxes.

51

52

```python { .api }

53

def within_bbox(bbox, relative=False, strict=True):

54

"""

55

Filter objects within bounding box.

56

57

Parameters:

58

- bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box coordinates

59

- relative: bool - Treat coordinates as relative to page

60

- strict: bool - Objects must be entirely within bbox

61

62

Returns:

63

FilteredPage: New page with filtered objects

64

"""

65

66

def outside_bbox(bbox, relative=False, strict=True):

67

"""

68

Filter objects outside bounding box.

69

70

Parameters:

71

- bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box coordinates

72

- relative: bool - Treat coordinates as relative to page

73

- strict: bool - Objects must be entirely outside bbox

74

75

Returns:

76

FilteredPage: New page with filtered objects

77

"""

78

```

79

80

**Usage Examples:**

81

82

```python

83

with pdfplumber.open("document.pdf") as pdf:

84

page = pdf.pages[0]

85

86

# Get objects in specific region

87

header_region = (0, 0, page.width, 100)

88

header_page = page.within_bbox(header_region)

89

header_text = header_page.extract_text()

90

91

# Get objects within the content region (excludes header/footer areas)

92

content_region = (0, 100, page.width, page.height - 100)

93

content_page = page.within_bbox(content_region)

94

95

# Use relative coordinates

96

middle_third = page.within_bbox((0, 0.33, 1, 0.67), relative=True)

97

98

# Non-strict filtering (partial overlap)

99

overlapping = page.within_bbox((100, 100, 200, 200), strict=False)

100

101

# Exclude specific region

102

no_header = page.outside_bbox((0, 0, page.width, 50))

103

```

104

105

### Custom Object Filtering

106

107

Filter objects using custom test functions for complex selection criteria.

108

109

```python { .api }

110

def filter(test_function):

111

"""

112

Filter objects using custom function.

113

114

Parameters:

115

- test_function: Callable[[T_obj], bool] - Function that returns True for objects to keep

116

117

Returns:

118

FilteredPage: New page with filtered objects based on test function

119

"""

120

```

121

122

**Usage Examples:**

123

124

```python

125

with pdfplumber.open("document.pdf") as pdf:

126

page = pdf.pages[0]

127

128

# Filter by font size

129

large_text = page.filter(lambda obj: obj.get('size', 0) > 12)

130

131

# Filter by font name

132

arial_text = page.filter(lambda obj: 'Arial' in obj.get('fontname', ''))

133

134

# Filter by color

135

red_objects = page.filter(lambda obj: obj.get('non_stroking_color') == (1, 0, 0))

136

137

# Filter characters by content

138

digits_only = page.filter(lambda obj: obj.get('text', '').isdigit())

139

140

# Complex filtering - large bold text

141

def is_large_bold(obj):

142

return (obj.get('size', 0) > 14 and

143

'Bold' in obj.get('fontname', ''))

144

145

headers = page.filter(is_large_bold)

146

header_text = headers.extract_text()

147

```

148

149

### Derived Page Classes

150

151

Specialized page classes for manipulated views.

152

153

```python { .api }

154

class CroppedPage(DerivedPage):

155

"""Page cropped to specific bounding box."""

156

157

def __init__(self, parent_page, bbox, relative=False, strict=True):

158

"""Initialize cropped page view."""

159

160

@property

161

def parent_page(self) -> Page:

162

"""Original page object."""

163

164

@property

165

def bbox(self) -> T_bbox:

166

"""Cropping bounding box."""

167

168

class FilteredPage(DerivedPage):

169

"""Page with filtered objects."""

170

171

def __init__(self, parent_page, test_function):

172

"""Initialize filtered page view."""

173

174

@property

175

def parent_page(self) -> Page:

176

"""Original page object."""

177

178

@property

179

def test_function(self) -> Callable:

180

"""Filtering test function."""

181

182

class DerivedPage:

183

"""Base class for page views derived from other pages."""

184

185

@property

186

def width(self) -> T_num:

187

"""Page width."""

188

189

@property

190

def height(self) -> T_num:

191

"""Page height."""

192

193

@property

194

def bbox(self) -> T_bbox:

195

"""Page bounding box."""

196

197

# All Container and Page methods available

198

def extract_text(self, **kwargs): ...

199

def extract_tables(self, **kwargs): ...

200

def crop(self, bbox, **kwargs): ...

201

def filter(self, test_function): ...

202

```

203

204

### Character Deduplication

205

206

Remove duplicate character objects that may occur from PDF processing.

207

208

```python { .api }

209

def dedupe_chars(tolerance=1, use_text_flow=False, **kwargs):

210

"""

211

Remove duplicate characters.

212

213

Parameters:

214

- tolerance: T_num - Distance tolerance for duplicate detection

215

- use_text_flow: bool - Consider text flow direction in deduplication

216

- **kwargs: Additional deduplication options

217

218

Returns:

219

Page: New page object with deduplicated characters

220

"""

221

```

222

223

**Usage Examples:**

224

225

```python

226

with pdfplumber.open("document.pdf") as pdf:

227

page = pdf.pages[0]

228

229

# Remove duplicate characters with default tolerance

230

clean_page = page.dedupe_chars()

231

232

# Strict deduplication with tight tolerance

233

very_clean = page.dedupe_chars(tolerance=0.5)

234

235

# Consider text flow for better deduplication

236

flow_aware = page.dedupe_chars(use_text_flow=True)

237

238

# Compare character counts

239

original_chars = len(page.chars)

240

clean_chars = len(clean_page.chars)

241

print(f"Removed {original_chars - clean_chars} duplicate characters")

242

```

243

244

## Coordinate Systems and Transformations

245

246

### Understanding PDF Coordinates

247

248

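pdfplumber uses a top-left coordinate system (unlike the native PDF convention, whose origin is at the bottom-left), where:

249

- Origin (0,0) is at top-left of page

250

- X increases rightward

251

- Y increases downward (`top` and `bottom` are distances from the top of the page)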

252

- Page dimensions available as `page.width` and `page.height`

253

254

### Relative Coordinates

255

256

```python

257

with pdfplumber.open("document.pdf") as pdf:

258

page = pdf.pages[0]

259

260

# Convert relative to absolute coordinates

261

rel_bbox = (0.1, 0.2, 0.9, 0.8) # 10% horizontal margins, 20% vertical margins

262

abs_bbox = (

263

rel_bbox[0] * page.width,

264

rel_bbox[1] * page.height,

265

rel_bbox[2] * page.width,

266

rel_bbox[3] * page.height

267

)

268

269

# Use relative coordinates directly

270

center_region = page.crop((0.25, 0.25, 0.75, 0.75), relative=True)

271

```

272

273

### Chaining Operations

274

275

```python

276

with pdfplumber.open("document.pdf") as pdf:

277

page = pdf.pages[0]

278

279

# Chain multiple operations

280

processed_page = (page

281

.dedupe_chars()

282

.crop((50, 50, page.width-50, page.height-50))

283

.filter(lambda obj: obj.get('size', 0) > 10))

284

285

# Each operation returns a new page-like object

286

text = processed_page.extract_text()

287

tables = processed_page.extract_tables()

288

```

289

290

### Performance Considerations

291

292

```python

293

with pdfplumber.open("document.pdf") as pdf:

294

page = pdf.pages[0]

295

296

# Efficient: filter before expensive operations

297

large_text = page.filter(lambda obj: obj.get('size', 0) > 12)

298

tables = large_text.extract_tables() # Operates on fewer objects

299

300

# Less efficient: extract from full page then filter results

301

all_tables = page.extract_tables()

302

# Manual filtering of results

303

```

304

305

## Object Access in Derived Pages

306

307

All derived pages maintain access to the full Container API:

308

309

```python

310

with pdfplumber.open("document.pdf") as pdf:

311

page = pdf.pages[0]

312

cropped = page.crop((100, 100, 400, 300))

313

314

# Access filtered object collections

315

chars = cropped.chars # Only characters in cropped region

316

lines = cropped.lines # Only lines in cropped region

317

rects = cropped.rects # Only rectangles in cropped region

318

images = cropped.images # Only images in cropped region

319

320

# Derived properties work with filtered objects

321

edges = cropped.edges # All edges from filtered objects

322

h_edges = cropped.horizontal_edges

323

v_edges = cropped.vertical_edges

324

325

# Export filtered objects

326

cropped.to_json("cropped_objects.json")

327

cropped.to_csv("cropped_data.csv")

328

```