0
# Table Extraction
1
2
pdfplumber provides table detection and extraction with customizable strategies, edge detection algorithms, comprehensive configuration options, and visual debugging support.
3
4
## Capabilities
5
6
### Table Finding
7
8
Detect every table on a page, or only the largest one, using configurable detection strategies.
9
10
```python { .api }
11
def find_tables(table_settings=None):
12
"""
13
Find all tables using detection algorithms.
14
15
Parameters:
16
- table_settings: TableSettings or dict, optional - Configuration for detection
17
18
Returns:
19
List[Table]: List of detected table objects
20
"""
21
22
def find_table(table_settings=None):
23
"""
24
Find the largest table on the page.
25
26
Parameters:
27
- table_settings: TableSettings or dict, optional - Configuration for detection
28
29
Returns:
30
Table or None: Largest detected table or None if no tables found
31
"""
32
```
33
34
**Usage Examples:**
35
36
```python
37
with pdfplumber.open("document.pdf") as pdf:
38
page = pdf.pages[0]
39
40
# Find all tables with default settings
41
tables = page.find_tables()
42
print(f"Found {len(tables)} tables")
43
44
# Find largest table only
45
main_table = page.find_table()
46
if main_table:
47
print(f"Main table area: {main_table.bbox}")
48
49
# Find tables with custom settings
50
custom_settings = {
51
"vertical_strategy": "text",
52
"horizontal_strategy": "lines"
53
}
54
tables = page.find_tables(table_settings=custom_settings)
55
```
56
57
### Table Extraction
58
59
Extract table data as 2D arrays (a list of rows, each a list of cell text values), using the same `table_settings` options as table finding.
60
61
```python { .api }
62
def extract_tables(table_settings=None):
63
"""
64
Extract all tables as 2D arrays.
65
66
Parameters:
67
- table_settings: TableSettings or dict, optional - Configuration for detection
68
69
Returns:
70
List[List[List[Optional[str]]]]: List of tables, each a 2D array of cell text (None where a row has no cell at that position)
71
"""
72
73
def extract_table(table_settings=None):
74
"""
75
Extract the largest table on the page as a 2D array.
76
77
Parameters:
78
- table_settings: TableSettings or dict, optional - Configuration for detection
79
80
Returns:
81
List[List[Optional[str]]] or None: 2D array of cell text, or None if no table is found
82
"""
83
```
84
85
**Usage Examples:**
86
87
```python
88
with pdfplumber.open("document.pdf") as pdf:
89
page = pdf.pages[0]
90
91
# Extract all tables
92
tables = page.extract_tables()
93
for i, table in enumerate(tables):
94
print(f"Table {i+1}:")
95
for row in table:
96
print(" ", row)
97
98
# Extract main table only
99
main_table = page.extract_table()
100
if main_table:
101
# Process header row
102
headers = main_table[0]
103
data_rows = main_table[1:]
104
105
for row in data_rows:
106
row_dict = dict(zip(headers, row))
107
print(row_dict)
108
```
109
110
### Table Class
111
112
Represents a detected table with extraction and analysis capabilities.
113
114
```python { .api }
115
class Table:
116
"""Detected table with extraction capabilities."""
117
118
def __init__(self, page, cells):
119
"""Initialize table from page and cell data."""
120
121
@property
122
def bbox(self) -> T_bbox:
123
"""Table bounding box coordinates."""
124
125
@property
126
def cells(self) -> List[T_bbox]:
127
"""List of cell bounding boxes."""
128
129
@property
130
def rows(self) -> List[CellGroup]:
131
"""Table rows as CellGroup objects."""
132
133
@property
134
def columns(self) -> List[CellGroup]:
135
"""Table columns as CellGroup objects."""
136
137
def extract(self, **kwargs):
138
"""
139
Extract table data as 2D array.
140
141
Parameters:
142
- **kwargs: Text extraction options for cell content
143
144
Returns:
145
List[List[Optional[str]]]: 2D array of cell text content (None where a row has no cell at that position)
146
"""
147
```
148
149
**Usage Examples:**
150
151
```python
152
with pdfplumber.open("document.pdf") as pdf:
153
page = pdf.pages[0]
154
155
tables = page.find_tables()
156
for table in tables:
157
print(f"Table at {table.bbox}")
158
print(f"Dimensions: {len(table.rows)} rows × {len(table.columns)} columns")
159
160
# Extract with custom text options
161
data = table.extract(layout=True, x_tolerance=1)
162
163
# Analyze cell structure
164
for i, row in enumerate(table.rows):
165
print(f"Row {i}: {len(row.cells)} cells")
166
```
167
168
### TableFinder Class
169
170
Implements the table detection algorithm and exposes its intermediate results (edges, intersections, cells, and tables) for debugging.
171
172
```python { .api }
173
class TableFinder:
174
"""Table detection algorithm implementation."""
175
176
def __init__(self, page, settings=None):
177
"""Initialize TableFinder with page and settings."""
178
179
@property
180
def page(self) -> Page:
181
"""Source page object."""
182
183
@property
184
def settings(self) -> TableSettings:
185
"""Table detection settings."""
186
187
@property
188
def edges(self) -> T_obj_list:
189
"""Detected edges for table detection."""
190
191
@property
192
def intersections(self) -> T_intersections:
193
"""Edge intersection points."""
194
195
@property
196
def cells(self) -> List[T_bbox]:
197
"""Detected table cells."""
198
199
@property
200
def tables(self) -> List[Table]:
201
"""Detected table objects."""
202
203
def get_edges(self):
204
"""Get edges based on detection strategy."""
205
```
206
207
### TableSettings Class
208
209
Comprehensive configuration class for table detection parameters and strategies.
210
211
```python { .api }
212
class TableSettings:
213
"""Configuration for table detection parameters."""
214
215
def __init__(self, vertical_strategy="lines", horizontal_strategy="lines",
216
explicit_vertical_lines=None, explicit_horizontal_lines=None,
217
snap_tolerance=3, snap_x_tolerance=None, snap_y_tolerance=None,
218
join_tolerance=3, join_x_tolerance=None, join_y_tolerance=None,
219
edge_min_length=3, min_words_vertical=3, min_words_horizontal=1,
220
intersection_tolerance=3, intersection_x_tolerance=None,
221
intersection_y_tolerance=None, text_settings=None):
222
"""Initialize table detection settings."""
223
224
@classmethod
225
def resolve(cls, settings):
226
"""
227
Create TableSettings from dict or existing instance.
228
229
Parameters:
230
- settings: dict, TableSettings, or None
231
232
Returns:
233
TableSettings: Resolved settings object
234
"""
235
236
# Detection strategy options
237
vertical_strategy: str # "lines", "lines_strict", "text", "explicit"
238
horizontal_strategy: str # "lines", "lines_strict", "text", "explicit"
239
240
# Explicit line positions
241
explicit_vertical_lines: Optional[List[T_num]]
242
explicit_horizontal_lines: Optional[List[T_num]]
243
244
# Edge processing tolerances
245
snap_tolerance: T_num
246
snap_x_tolerance: Optional[T_num]
247
snap_y_tolerance: Optional[T_num]
248
join_tolerance: T_num
249
join_x_tolerance: Optional[T_num]
250
join_y_tolerance: Optional[T_num]
251
edge_min_length: T_num
252
253
# Text-based detection parameters
254
min_words_vertical: int
255
min_words_horizontal: int
256
257
# Intersection detection
258
intersection_tolerance: T_num
259
intersection_x_tolerance: Optional[T_num]
260
intersection_y_tolerance: Optional[T_num]
261
262
# Text extraction settings for cells
263
text_settings: Optional[Dict[str, Any]]
264
```
265
266
**Usage Examples:**
267
268
```python
269
from pdfplumber.table import TableSettings
270
271
with pdfplumber.open("document.pdf") as pdf:
272
page = pdf.pages[0]
273
274
# Custom settings for line-based detection
275
line_settings = TableSettings(
276
vertical_strategy="lines_strict",
277
horizontal_strategy="lines_strict",
278
snap_tolerance=2,
279
edge_min_length=10
280
)
281
282
# Custom settings for text-based detection
283
text_settings = TableSettings(
284
vertical_strategy="text",
285
horizontal_strategy="text",
286
min_words_vertical=2,
287
min_words_horizontal=1
288
)
289
290
# Explicit line positions
291
explicit_settings = TableSettings(
292
vertical_strategy="explicit",
293
horizontal_strategy="explicit",
294
explicit_vertical_lines=[100, 200, 300, 400],
295
explicit_horizontal_lines=[50, 100, 150, 200]
296
)
297
298
# Use settings
299
tables = page.find_tables(table_settings=line_settings)
300
```
301
302
### Table Debugging
303
304
Inspect the intermediate results of table detection (edges, intersections, cells, and tables) and render them onto a page image.
305
306
```python { .api }
307
def debug_tablefinder(table_settings=None):
308
"""
309
Get TableFinder for debugging table detection.
310
311
Parameters:
312
- table_settings: TableSettings or dict, optional
313
314
Returns:
315
TableFinder: TableFinder object for algorithm inspection
316
"""
317
```
318
319
**Usage Examples:**
320
321
```python
322
with pdfplumber.open("document.pdf") as pdf:
323
page = pdf.pages[0]
324
325
# Debug table detection process
326
finder = page.debug_tablefinder()
327
328
print(f"Detected {len(finder.edges)} edges")
329
print(f"Found {len(finder.intersections)} intersections")
330
print(f"Identified {len(finder.cells)} cells")
331
print(f"Grouped into {len(finder.tables)} tables")
332
333
# Visualize detection process
334
im = page.to_image()
335
im.debug_tablefinder(table_settings=finder.settings)
336
im.save("table_debug.png")
337
```
338
339
### Cell Group Classes
340
341
Helper classes that represent the rows and columns of a detected table.
342
343
```python { .api }
344
class CellGroup:
345
"""Base class for table rows and columns."""
346
347
@property
348
def cells(self) -> List[T_bbox]:
349
"""Cell bounding boxes in this group."""
350
351
@property
352
def bbox(self) -> T_bbox:
353
"""Bounding box of entire group."""
354
355
class Row(CellGroup):
356
"""Table row representation."""
357
358
class Column(CellGroup):
359
"""Table column representation."""
360
```
361
362
## Advanced Table Detection Strategies
363
364
### Line-Based Detection
365
366
```python
367
# Strict line detection - only uses actual PDF line objects
368
settings = TableSettings(
369
vertical_strategy="lines_strict",
370
horizontal_strategy="lines_strict"
371
)
372
373
# Flexible line detection - includes rectangle edges
374
settings = TableSettings(
375
vertical_strategy="lines",
376
horizontal_strategy="lines"
377
)
378
```
379
380
### Text-Based Detection
381
382
```python
383
# Use text alignment to infer table structure
384
settings = TableSettings(
385
vertical_strategy="text",
386
horizontal_strategy="text",
387
min_words_vertical=3, # Minimum words to establish column
388
min_words_horizontal=2 # Minimum words to establish row
389
)
390
```
391
392
### Explicit Line Detection
393
394
```python
395
# Manually specify table grid lines
396
settings = TableSettings(
397
vertical_strategy="explicit",
398
horizontal_strategy="explicit",
399
explicit_vertical_lines=[72, 144, 216, 288], # X coordinates
400
explicit_horizontal_lines=[100, 130, 160, 190] # Y coordinates
401
)
402
```
403
404
### Hybrid Detection
405
406
```python
407
# Combine different strategies for horizontal and vertical
408
settings = TableSettings(
409
vertical_strategy="text", # Use text alignment for columns
410
horizontal_strategy="lines", # Use lines for rows
411
snap_tolerance=5, # Snap nearby elements together
412
join_tolerance=2 # Join connected elements
413
)
414
```