0
# Table Extraction
1
2
Advanced table detection and extraction capabilities with support for table structure analysis, cell content extraction, and export to various formats including pandas DataFrames. PyMuPDF provides sophisticated algorithms for identifying and parsing tabular data from PDF documents.
3
4
## Capabilities
5
6
### Table Finding and Detection
7
8
Locate tables within document pages with configurable detection settings.
9
10
```python { .api }
11
class TableFinder:
    """Locate tables within a document page with configurable detection settings."""

    def __init__(self, page: Page):
        """
        Create table finder for a page.

        Parameters:
        - page: Page object to search for tables
        """

    def find_tables(self, clip: Rect = None, strategy: str = "lines_strict",
                    vertical_strategy: str = "lines", horizontal_strategy: str = "lines",
                    explicit_vertical_lines: list = None, explicit_horizontal_lines: list = None,
                    snap_tolerance: float = 3, snap_x_tolerance: float = None,
                    snap_y_tolerance: float = None, join_tolerance: float = 3,
                    join_x_tolerance: float = None, join_y_tolerance: float = None,
                    edge_min_length: float = 3, min_words_vertical: float = 3,
                    min_words_horizontal: float = 1, intersection_tolerance: float = 3,
                    intersection_x_tolerance: float = None, intersection_y_tolerance: float = None,
                    text_tolerance: float = 3, text_x_tolerance: float = None,
                    text_y_tolerance: float = None) -> list:
        """
        Find tables on the page.

        Parameters:
        - clip: rectangle to limit search area
        - strategy: table detection strategy ("lines_strict", "lines", "text", "explicit")
        - vertical_strategy: strategy for detecting vertical lines
        - horizontal_strategy: strategy for detecting horizontal lines
        - explicit_vertical_lines: explicit vertical line positions
        - explicit_horizontal_lines: explicit horizontal line positions
        - snap_tolerance: tolerance for snapping nearly aligned parallel lines together
        - snap_x_tolerance: x-direction snap tolerance
        - snap_y_tolerance: y-direction snap tolerance
        - join_tolerance: tolerance for joining line segments
        - join_x_tolerance: x-direction join tolerance
        - join_y_tolerance: y-direction join tolerance
        - edge_min_length: minimum line length to consider
        - min_words_vertical: minimum words to form vertical line
        - min_words_horizontal: minimum words to form horizontal line
        - intersection_tolerance: tolerance for line intersections
        - intersection_x_tolerance: x-direction intersection tolerance
        - intersection_y_tolerance: y-direction intersection tolerance
        - text_tolerance: tolerance for text-based table detection
        - text_x_tolerance: x-direction text tolerance
        - text_y_tolerance: y-direction text tolerance

        Returns:
        List of Table objects found on the page
        """
60
```
61
62
### Table Class
63
64
Individual table representation with extraction and manipulation capabilities.
65
66
```python { .api }
67
class Table:
    """Individual table with extraction and export capabilities."""

    def __init__(self, page: Page, bbox: Rect):
        """
        Create table object.

        Parameters:
        - page: parent Page object
        - bbox: table bounding rectangle
        """

    def extract(self, x_tolerance: float = 3, y_tolerance: float = 3) -> list:
        """
        Extract table data as list of rows.

        Parameters:
        - x_tolerance: horizontal tolerance for cell alignment
        - y_tolerance: vertical tolerance for cell alignment

        Returns:
        List of lists representing table rows and cells
        """

    def to_pandas(self, **kwargs) -> 'pandas.DataFrame':
        """
        Convert table to pandas DataFrame.

        Parameters:
        - kwargs: additional pandas DataFrame parameters

        Returns:
        pandas DataFrame with table data
        """

    def to_csv(self, file_path: str = None, **kwargs) -> typing.Optional[str]:
        """
        Export table to CSV format.

        Parameters:
        - file_path: output file path (None for string return)
        - kwargs: additional CSV export parameters

        Returns:
        CSV string if file_path is None, otherwise None
        """

    def to_dict(self, orient: str = "records") -> typing.Union[list, dict]:
        """
        Convert table to dictionary format.

        Parameters:
        - orient: dictionary orientation ("records", "list", "dict", etc.)

        Returns:
        Table data as a list or dictionary, depending on orient
        """

    @property
    def bbox(self) -> Rect:
        """Table bounding rectangle."""

    @property
    def cells(self) -> list:
        """List of table cells with positions and content."""

    @property
    def rows(self) -> list:
        """List of table rows."""

    @property
    def cols(self) -> list:
        """List of table columns."""
138
```
139
140
### Table Settings and Configuration
141
142
Fine-tune table detection parameters for different document types.
143
144
```python { .api }
145
class TableSettings:
    """Read-only view of the parameters that drive table detection."""

    def __init__(self):
        """Create default table settings."""

    @property
    def vertical_strategy(self) -> str:
        """Strategy for vertical line detection."""

    @property
    def horizontal_strategy(self) -> str:
        """Strategy for horizontal line detection."""

    @property
    def snap_tolerance(self) -> float:
        """Tolerance for snapping nearly aligned parallel lines together."""

    @property
    def join_tolerance(self) -> float:
        """Tolerance for joining line segments."""

    @property
    def edge_min_length(self) -> float:
        """Minimum line length to consider."""

    @property
    def min_words_vertical(self) -> float:
        """Minimum words to form vertical line."""

    @property
    def min_words_horizontal(self) -> float:
        """Minimum words to form horizontal line."""

    @property
    def intersection_tolerance(self) -> float:
        """Tolerance for line intersections."""

    @property
    def text_tolerance(self) -> float:
        """Tolerance for text-based detection."""
184
```
185
186
### Advanced Table Analysis
187
188
Analyze table structure and content for complex data extraction.
189
190
```python { .api }
191
class TableRow:
    """A single row of a detected table."""

    @property
    def cells(self) -> list:
        """Cells in this row."""

    @property
    def bbox(self) -> Rect:
        """Row bounding rectangle."""

    @property
    def height(self) -> float:
        """Row height."""
203
204
class TableHeader:
    """The header of a detected table."""

    @property
    def cells(self) -> list:
        """Header cells."""

    @property
    def bbox(self) -> Rect:
        """Header bounding rectangle."""
212
213
# Cell content analysis
214
class TextMap:
    """Text lookup by region, used for cell content analysis."""

    def __init__(self, page: Page):
        """Create text map for table analysis."""

    def get_text_in_bbox(self, bbox: Rect) -> str:
        """Get text within bounding box."""
220
221
class WordMap:
    """Word lookup by region, used for cell content analysis."""

    def __init__(self, page: Page):
        """Create word map for table analysis."""

    def get_words_in_bbox(self, bbox: Rect) -> list:
        """Get words within bounding box."""
227
```
228
229
### Simple Table Extraction Function
230
231
Convenient high-level function for basic table extraction.
232
233
```python { .api }
234
def find_tables(page: Page, **kwargs) -> list:
    """
    Find tables on page (convenience function).

    Parameters:
    - page: Page object to search
    - kwargs: table detection parameters

    Returns:
    List of Table objects
    """
245
```
246
247
## Usage Examples
248
249
### Basic Table Extraction
250
251
```python
252
import pymupdf
253
254
# Opening the document in a `with` block closes it even if extraction raises
with pymupdf.open("document_with_tables.pdf") as doc:
    page = doc.load_page(0)

    # Find tables on the page; list() gives a plain sequence that supports len()
    tables = list(page.find_tables())

    print(f"Found {len(tables)} tables")

    for i, table in enumerate(tables):
        print(f"\nTable {i + 1}:")
        print(f" Bounding box: {table.bbox}")

        # Extract table data
        table_data = table.extract()

        # Print table content
        for row_num, row in enumerate(table_data):
            print(f" Row {row_num}: {row}")
274
```
275
276
### Advanced Table Detection
277
278
```python
279
import pymupdf
280
281
# Opening the document in a `with` block closes it even if detection raises
with pymupdf.open("complex_document.pdf") as doc:
    page = doc.load_page(0)

    # Find tables with custom parameters; list() gives a sequence that supports len()
    tables = list(page.find_tables(
        strategy="lines",        # Use line-based detection
        snap_tolerance=5,        # More lenient line snapping
        join_tolerance=5,        # More aggressive line joining
        edge_min_length=10,      # Longer minimum lines
        min_words_vertical=2,    # Fewer words needed for vertical lines
        text_tolerance=5,        # Text-based detection tolerance
    ))

    print(f"Found {len(tables)} tables with custom settings")

    for table in tables:
        # Extract with custom tolerances
        data = table.extract(x_tolerance=5, y_tolerance=3)
        print(f"Table with {len(data)} rows")
305
```
306
307
### Converting Tables to Different Formats
308
309
```python
310
import pymupdf
311
import pandas as pd
312
313
# Opening the document in a `with` block closes it even if a conversion raises
with pymupdf.open("data_report.pdf") as doc:
    page = doc.load_page(0)

    tables = page.find_tables()

    for i, table in enumerate(tables):
        # Convert to pandas DataFrame
        try:
            df = table.to_pandas()
            print(f"Table {i + 1}: {df.shape} DataFrame")
            print(df.head())

            # Save as CSV
            df.to_csv(f"table_{i + 1}.csv", index=False)

            # Save as Excel
            df.to_excel(f"table_{i + 1}.xlsx", index=False)

        except Exception as e:
            # Best effort: report the failure and continue with the other formats
            print(f"Error converting table {i + 1}: {e}")

        # Convert to dictionary
        table_dict = table.to_dict(orient="records")
        print(f"Table as dict: {len(table_dict)} records")

        # Convert to CSV string
        csv_string = table.to_csv()
        print(f"CSV length: {len(csv_string)} characters")
343
```
344
345
### Searching for Specific Tables
346
347
```python
348
import pymupdf
349
350
def find_tables_containing_text(page: "pymupdf.Page", search_text: str) -> list:
    """
    Find tables that contain specific text.

    The match is a case-insensitive substring test against every non-empty
    cell. Each table is reported at most once, in detection order.

    Parameters:
    - page: Page object whose tables are searched
    - search_text: text to look for inside table cells

    Returns:
    List of Table objects that have at least one matching cell
    """
    needle = search_text.lower()  # lower-case once instead of once per cell
    matching_tables = []

    for table in page.find_tables():
        # any() stops at the first matching cell, so the table is added exactly once
        if any(
            cell and needle in str(cell).lower()
            for row in table.extract()
            for cell in row
        ):
            matching_tables.append(table)

    return matching_tables
368
369
# Opening the document in a `with` block closes it even if the search raises
with pymupdf.open("financial_report.pdf") as doc:
    # Search all pages for tables containing "Revenue"
    revenue_tables = []
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        tables = find_tables_containing_text(page, "Revenue")
        revenue_tables.extend((page_num, table) for table in tables)

    print(f"Found {len(revenue_tables)} tables containing 'Revenue'")

    for page_num, table in revenue_tables:
        print(f"Page {page_num + 1}: Table at {table.bbox}")
        data = table.extract()
        # Process revenue table data...
386
```
387
388
### Table Structure Analysis
389
390
```python
391
import pymupdf
392
393
def analyze_table_structure(table: "pymupdf.Table") -> dict:
    """
    Analyze table structure and provide statistics.

    Parameters:
    - table: Table object providing extract() and bbox

    Returns:
    {"error": "Empty table"} when extraction yields no rows; otherwise a
    dictionary with the keys "dimensions", "consistent_columns",
    "empty_cells", "total_cells", "fill_rate", "likely_header_row",
    "numeric_columns" and "bbox"
    """
    data = table.extract()

    if not data:
        return {"error": "Empty table"}

    num_rows = len(data)
    num_cols = len(data[0])  # the first row defines the expected width

    # Check for consistent column count
    consistent_cols = all(len(row) == num_cols for row in data)

    # Find empty cells (None, other falsy values, or whitespace-only text)
    total_cells = sum(len(row) for row in data)
    empty_cells = sum(
        1
        for row in data
        for cell in row
        if not cell or str(cell).strip() == ""
    )

    # Detect header row (often has different formatting)
    likely_header = 0  # First row is most likely header

    # Check for numeric columns, ignoring the header row
    body_rows = data[1:]
    numeric_cols = []
    for col_idx in range(num_cols):
        numeric_count = 0
        for row in body_rows:
            if col_idx >= len(row):
                continue  # ragged row: this column is missing
            try:
                # Tolerate thousands separators and a dollar sign
                float(str(row[col_idx]).replace(',', '').replace('$', ''))
            except ValueError:
                continue
            numeric_count += 1

        if numeric_count > len(body_rows) * 0.7:  # 70% numeric
            numeric_cols.append(col_idx)

    return {
        "dimensions": (num_rows, num_cols),
        "consistent_columns": consistent_cols,
        "empty_cells": empty_cells,
        "total_cells": total_cells,
        "fill_rate": (total_cells - empty_cells) / total_cells if total_cells > 0 else 0,
        "likely_header_row": likely_header,
        "numeric_columns": numeric_cols,
        "bbox": table.bbox,
    }
445
446
# Opening the document in a `with` block closes it even if the analysis raises
with pymupdf.open("data_tables.pdf") as doc:
    page = doc.load_page(0)
    tables = page.find_tables()

    for i, table in enumerate(tables):
        analysis = analyze_table_structure(table)
        print(f"\nTable {i + 1} Analysis:")
        for key, value in analysis.items():
            print(f" {key}: {value}")
457
```
458
459
### Merging Tables Across Pages
460
461
```python
462
import pymupdf
463
import pandas as pd
464
465
def extract_all_tables(doc: "pymupdf.Document") -> list:
    """
    Extract all tables from all pages.

    Parameters:
    - doc: Document object to scan

    Returns:
    List of dictionaries, one per table, with the keys "page" (0-based page
    number), "bbox", "data" (extracted rows) and "dataframe" (None when the
    table has no rows)
    """
    all_tables = []

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)

        for table in page.find_tables():
            rows = table.extract()  # extract once and reuse the result
            all_tables.append({
                "page": page_num,
                "bbox": table.bbox,
                "data": rows,
                "dataframe": table.to_pandas() if rows else None,
            })

    return all_tables
483
484
def merge_similar_tables(tables: list, similarity_threshold: float = 0.8) -> list:
    """
    Group tables with similar column structures.

    A table joins the first existing group whose reference table (the group's
    first member) has the same number of columns and shares at least
    similarity_threshold of its column labels. Tables whose "dataframe" entry
    is None are skipped.

    Parameters:
    - tables: list of dictionaries, each with a "dataframe" entry
    - similarity_threshold: minimum fraction of shared column labels (0 to 1)

    Returns:
    List of groups, each a list of the input table dictionaries
    """
    merged_groups = []

    for table in tables:
        current_df = table["dataframe"]
        if current_df is None:
            continue

        width = len(current_df.columns)
        current_labels = set(current_df.columns)

        for group in merged_groups:
            reference_columns = group[0]["dataframe"].columns
            if len(reference_columns) != width:
                continue

            # Two zero-column frames have identical structure; avoid dividing by zero
            if width:
                overlap = len(set(reference_columns) & current_labels) / width
            else:
                overlap = 1.0

            if overlap >= similarity_threshold:
                group.append(table)
                break
        else:
            # No existing group matched, so this table starts a new one
            merged_groups.append([table])

    return merged_groups
512
513
# Usage
# Opening the document in a `with` block closes it even if extraction raises
with pymupdf.open("multi_page_report.pdf") as doc:
    all_tables = extract_all_tables(doc)
print(f"Found {len(all_tables)} total tables")

# Group similar tables
table_groups = merge_similar_tables(all_tables)
print(f"Grouped into {len(table_groups)} similar table groups")

# Merge each group
for i, group in enumerate(table_groups):
    if len(group) > 1:
        # Merge DataFrames
        dfs = [table["dataframe"] for table in group if table["dataframe"] is not None]
        merged_df = pd.concat(dfs, ignore_index=True)

        print(f"Group {i + 1}: Merged {len(group)} tables into {merged_df.shape} DataFrame")
        merged_df.to_csv(f"merged_tables_group_{i + 1}.csv", index=False)
    else:
        # Single table
        table = group[0]
        if table["dataframe"] is not None:
            table["dataframe"].to_csv(f"single_table_page_{table['page'] + 1}.csv", index=False)
538
```
539
540
### Custom Table Detection Strategies
541
542
```python
543
import pymupdf
544
545
def detect_tables_by_whitespace(page: "pymupdf.Page", min_gap: float = 20) -> list:
    """
    Detect tables by analyzing whitespace patterns.

    Words are bucketed into rows by their top y-coordinate. A row with at
    least three words, all separated by more than min_gap, is treated as a
    candidate table row, and the regular table finder is run on a clip
    rectangle around each candidate row.

    Parameters:
    - page: Page object to analyze
    - min_gap: minimum horizontal gap between neighbouring words in a row

    Returns:
    List of tables found in the candidate areas (empty if the page has no words)
    """
    # Get all words with positions
    words = page.get_text("words")

    if not words:
        return []

    # Group words by approximate rows based on y-coordinates
    rows = {}
    for x0, y0, x1, _y1, text, *_ in words:
        y_key = round(y0 / 5) * 5  # Group by 5-point intervals
        rows.setdefault(y_key, []).append((x0, x1, text))

    # Analyze column alignment
    candidate_rows = []
    for y_pos, spans in sorted(rows.items()):
        if len(spans) < 3:  # At least 3 columns
            continue
        spans.sort()  # Sort by x position

        # Distance from the end of each word to the start of the next one
        smallest_gap = min(
            right[0] - left[1] for left, right in zip(spans, spans[1:])
        )
        if smallest_gap > min_gap:  # Significant gaps between words
            candidate_rows.append((y_pos, spans))

    # Convert to Table-like objects (simplified)
    tables = []
    for y_pos, spans in candidate_rows:
        # Create bounding box
        min_x = min(span[0] for span in spans)
        max_x = max(span[1] for span in spans)
        bbox = pymupdf.Rect(min_x, y_pos - 5, max_x, y_pos + 15)

        # This would need more sophisticated conversion to actual Table objects
        # For demonstration, we'll use the regular table finder on this area
        tables.extend(page.find_tables(clip=bbox))

    return tables
594
595
# Usage
# Opening the document in a `with` block closes it even if detection raises
with pymupdf.open("whitespace_tables.pdf") as doc:
    page = doc.load_page(0)

    # Try different detection methods; list() gives a sequence that supports len()
    regular_tables = list(page.find_tables())
    whitespace_tables = detect_tables_by_whitespace(page)

    print(f"Regular detection: {len(regular_tables)} tables")
    print(f"Whitespace detection: {len(whitespace_tables)} tables")
607
```