High performance Python library for data extraction, analysis, conversion & manipulation of PDF and other documents.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
PyMuPDF offers advanced table detection and extraction capabilities, with support for table structure analysis, cell content extraction, and export to various formats, including pandas DataFrames. It provides sophisticated algorithms for identifying and parsing tabular data from PDF documents.
Locate tables within document pages with configurable detection settings.
class TableFinder:
    """Locate tables within document pages with configurable detection settings."""

    def __init__(self, page: Page):
        """
        Create table finder for a page.

        Parameters:
        - page: Page object to search for tables
        """

    def find_tables(self, clip: Rect = None, strategy: str = "lines_strict",
                    vertical_strategy: str = "lines", horizontal_strategy: str = "lines",
                    explicit_vertical_lines: list = None, explicit_horizontal_lines: list = None,
                    snap_tolerance: float = 3, snap_x_tolerance: float = None,
                    snap_y_tolerance: float = None, join_tolerance: float = 3,
                    join_x_tolerance: float = None, join_y_tolerance: float = None,
                    edge_min_length: float = 3, min_words_vertical: float = 3,
                    min_words_horizontal: float = 1, intersection_tolerance: float = 3,
                    intersection_x_tolerance: float = None, intersection_y_tolerance: float = None,
                    text_tolerance: float = 3, text_x_tolerance: float = None,
                    text_y_tolerance: float = None) -> list:
        """
        Find tables on the page.

        Parameters:
        - clip: rectangle to limit search area
        - strategy: table detection strategy ("lines_strict", "lines", "text", "explicit")
        - vertical_strategy: strategy for detecting vertical lines
        - horizontal_strategy: strategy for detecting horizontal lines
        - explicit_vertical_lines: explicit vertical line positions
        - explicit_horizontal_lines: explicit horizontal line positions
        - snap_tolerance: tolerance for snapping lines to text
        - snap_x_tolerance: x-direction snap tolerance
        - snap_y_tolerance: y-direction snap tolerance
        - join_tolerance: tolerance for joining line segments
        - join_x_tolerance: x-direction join tolerance
        - join_y_tolerance: y-direction join tolerance
        - edge_min_length: minimum line length to consider
        - min_words_vertical: minimum words to form vertical line
        - min_words_horizontal: minimum words to form horizontal line
        - intersection_tolerance: tolerance for line intersections
        - intersection_x_tolerance: x-direction intersection tolerance
        - intersection_y_tolerance: y-direction intersection tolerance
        - text_tolerance: tolerance for text-based table detection
        - text_x_tolerance: x-direction text tolerance
        - text_y_tolerance: y-direction text tolerance

        Returns:
        List of Table objects found on the page
        """


# Individual table representation with extraction and manipulation capabilities.
class Table:
    """Individual table representation with extraction and manipulation capabilities."""

    def __init__(self, page: Page, bbox: Rect):
        """
        Create table object.

        Parameters:
        - page: parent Page object
        - bbox: table bounding rectangle
        """

    def extract(self, x_tolerance: float = 3, y_tolerance: float = 3) -> list:
        """
        Extract table data as list of rows.

        Parameters:
        - x_tolerance: horizontal tolerance for cell alignment
        - y_tolerance: vertical tolerance for cell alignment

        Returns:
        List of lists representing table rows and cells
        """

    def to_pandas(self, **kwargs) -> 'pandas.DataFrame':
        """
        Convert table to pandas DataFrame.

        Parameters:
        - kwargs: additional pandas DataFrame parameters

        Returns:
        pandas DataFrame with table data
        """

    # The return annotation is Optional because, per the docstring below,
    # nothing is returned when the CSV is written to file_path.
    def to_csv(self, file_path: str = None, **kwargs) -> typing.Optional[str]:
        """
        Export table to CSV format.

        Parameters:
        - file_path: output file path (None for string return)
        - kwargs: additional CSV export parameters

        Returns:
        CSV string if file_path is None, otherwise None
        """

    def to_dict(self, orient: str = "records") -> typing.Union[list, dict]:
        """
        Convert table to dictionary format.

        Parameters:
        - orient: dictionary orientation ("records", "list", "dict", etc.)

        Returns:
        Table data as dictionary
        """

    @property
    def bbox(self) -> Rect:
        """Table bounding rectangle."""

    @property
    def cells(self) -> list:
        """List of table cells with positions and content."""

    @property
    def rows(self) -> list:
        """List of table rows."""

    @property
    def cols(self) -> list:
        """List of table columns."""


# Fine-tune table detection parameters for different document types.
class TableSettings:
    """Fine-tune table detection parameters for different document types."""

    def __init__(self):
        """Create default table settings."""

    @property
    def vertical_strategy(self) -> str:
        """Strategy for vertical line detection."""

    @property
    def horizontal_strategy(self) -> str:
        """Strategy for horizontal line detection."""

    @property
    def snap_tolerance(self) -> float:
        """Tolerance for snapping lines to text."""

    @property
    def join_tolerance(self) -> float:
        """Tolerance for joining line segments."""

    @property
    def edge_min_length(self) -> float:
        """Minimum line length to consider."""

    @property
    def min_words_vertical(self) -> float:
        """Minimum words to form vertical line."""

    @property
    def min_words_horizontal(self) -> float:
        """Minimum words to form horizontal line."""

    @property
    def intersection_tolerance(self) -> float:
        """Tolerance for line intersections."""

    @property
    def text_tolerance(self) -> float:
        """Tolerance for text-based detection."""


# Analyze table structure and content for complex data extraction.
class TableRow:
    """Read-only view of one table row: its cells, bounding box and height."""

    @property
    def cells(self) -> list:
        """Cells in this row."""

    @property
    def bbox(self) -> Rect:
        """Row bounding rectangle."""

    @property
    def height(self) -> float:
        """Row height."""
class TableHeader:
    """Read-only view of a table header: its cells and bounding box."""

    @property
    def cells(self) -> list:
        """Header cells."""

    @property
    def bbox(self) -> Rect:
        """Header bounding rectangle."""
# Cell content analysis
class TextMap:
    """Text lookup helper for table analysis, built from a page."""

    def __init__(self, page: Page):
        """Create text map for table analysis."""

    def get_text_in_bbox(self, bbox: Rect) -> str:
        """Get text within bounding box."""
class WordMap:
    """Word lookup helper for table analysis, built from a page."""

    def __init__(self, page: Page):
        """Create word map for table analysis."""

    def get_words_in_bbox(self, bbox: Rect) -> list:
        """Get words within bounding box."""


# Convenient high-level function for basic table extraction.
def find_tables(page: Page, **kwargs) -> list:
    """
    Find tables on page (convenience function).

    Parameters:
    - page: Page object to search
    - kwargs: table detection parameters

    Returns:
    List of Table objects
    """


import pymupdf
# Basic example: list every table on the first page and print its rows.
# The context manager closes the document even if extraction raises.
with pymupdf.open("document_with_tables.pdf") as doc:
    page = doc.load_page(0)

    # Find tables on the page
    tables = page.find_tables()
    print(f"Found {len(tables)} tables")

    for i, table in enumerate(tables):
        print(f"\nTable {i + 1}:")
        print(f" Bounding box: {table.bbox}")

        # Extract table data
        table_data = table.extract()

        # Print table content
        for row_num, row in enumerate(table_data):
            print(f" Row {row_num}: {row}")

import pymupdf
# Example: table detection with tuned tolerances.
# The context manager closes the document even if detection raises.
with pymupdf.open("complex_document.pdf") as doc:
    page = doc.load_page(0)

    # Create table finder with custom settings
    table_finder = pymupdf.TableFinder(page)

    # Find tables with custom parameters
    tables = table_finder.find_tables(
        strategy="lines",  # Use line-based detection
        snap_tolerance=5,  # More lenient line snapping
        join_tolerance=5,  # More aggressive line joining
        edge_min_length=10,  # Longer minimum lines
        min_words_vertical=2,  # Fewer words needed for vertical lines
        text_tolerance=5,  # Text-based detection tolerance
    )
    print(f"Found {len(tables)} tables with custom settings")

    for table in tables:
        # Extract with custom tolerances
        data = table.extract(x_tolerance=5, y_tolerance=3)
        print(f"Table with {len(data)} rows")

import pymupdf
import pandas as pd

# Example: export each table on the first page to DataFrame, CSV, Excel and dict.
# The context manager closes the document even if a conversion raises.
with pymupdf.open("data_report.pdf") as doc:
    page = doc.load_page(0)
    tables = page.find_tables()

    for i, table in enumerate(tables):
        # Convert to pandas DataFrame
        try:
            df = table.to_pandas()
            print(f"Table {i + 1}: {df.shape} DataFrame")
            print(df.head())

            # Save as CSV
            df.to_csv(f"table_{i + 1}.csv", index=False)

            # Save as Excel
            df.to_excel(f"table_{i + 1}.xlsx", index=False)
        except Exception as e:
            # Deliberate best-effort: report the failure and carry on with
            # the remaining conversions for this table.
            print(f"Error converting table {i + 1}: {e}")

        # Convert to dictionary
        table_dict = table.to_dict(orient="records")
        print(f"Table as dict: {len(table_dict)} records")

        # Convert to CSV string
        csv_string = table.to_csv()
        print(f"CSV length: {len(csv_string)} characters")

import pymupdf
def find_tables_containing_text(page: "pymupdf.Page", search_text: str) -> list:
    """Find tables that contain specific text.

    The match is a case-insensitive substring test against ``str(cell)``.
    Falsy cells (``None``, empty strings) never match.

    Parameters:
    - page: object whose ``find_tables()`` yields tables exposing ``extract()``
    - search_text: text to look for

    Returns:
    List of matching tables in the order ``find_tables()`` produced them
    """
    needle = search_text.lower()  # lower-case once, not once per cell

    def contains_needle(table) -> bool:
        # any() stops at the first matching cell, so no manual break is needed.
        return any(
            cell and needle in str(cell).lower()
            for row in table.extract()
            for cell in row
        )

    return [table for table in page.find_tables() if contains_needle(table)]
# Example: search every page for tables mentioning "Revenue".
# The context manager closes the document even if a page fails to load.
with pymupdf.open("financial_report.pdf") as doc:
    # Search all pages for tables containing "Revenue"
    revenue_tables = []
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        tables = find_tables_containing_text(page, "Revenue")
        revenue_tables.extend((page_num, table) for table in tables)

    print(f"Found {len(revenue_tables)} tables containing 'Revenue'")

    for page_num, table in revenue_tables:
        print(f"Page {page_num + 1}: Table at {table.bbox}")
        data = table.extract()
        # Process revenue table data...

import pymupdf
def _looks_numeric(cell) -> bool:
    """Return True if ``str(cell)`` parses as a float once ',' and '$' are removed."""
    try:
        float(str(cell).replace(',', '').replace('$', ''))
    except ValueError:
        return False
    return True


def analyze_table_structure(table: "pymupdf.Table") -> dict:
    """Analyze table structure and provide statistics.

    Parameters:
    - table: object exposing ``extract()`` (list of rows) and ``bbox``

    Returns:
    ``{"error": "Empty table"}`` when ``extract()`` yields nothing, otherwise
    a dict with the keys:
    - "dimensions": (row count, column count of the first row)
    - "consistent_columns": True if every row has the first row's length
    - "empty_cells" / "total_cells" / "fill_rate": counts and filled ratio
    - "likely_header_row": always 0 (the first row is assumed to be the header)
    - "numeric_columns": column indexes where more than 70% of the
      non-header rows hold a numeric-looking value
    - "bbox": ``table.bbox`` passed through unchanged
    """
    data = table.extract()
    if not data:
        return {"error": "Empty table"}

    num_rows = len(data)
    num_cols = len(data[0])

    # Check for consistent column count
    consistent_cols = all(len(row) == num_cols for row in data)

    # Find empty cells (falsy or whitespace-only)
    total_cells = sum(len(row) for row in data)
    empty_cells = sum(
        1
        for row in data
        for cell in row
        if not cell or str(cell).strip() == ""
    )

    # Detect header row (often has different formatting)
    likely_header = 0  # First row is most likely header

    # Check for numeric columns, skipping the header row
    body_rows = data[1:]
    threshold = (num_rows - 1) * 0.7  # 70% numeric
    numeric_cols = []
    for col_idx in range(num_cols):
        numeric_count = sum(
            1
            for row in body_rows
            # Rows may be shorter than the header row; ignore missing cells.
            if col_idx < len(row) and _looks_numeric(row[col_idx])
        )
        if numeric_count > threshold:
            numeric_cols.append(col_idx)

    return {
        "dimensions": (num_rows, num_cols),
        "consistent_columns": consistent_cols,
        "empty_cells": empty_cells,
        "total_cells": total_cells,
        "fill_rate": (total_cells - empty_cells) / total_cells if total_cells > 0 else 0,
        "likely_header_row": likely_header,
        "numeric_columns": numeric_cols,
        "bbox": table.bbox
    }
# Example: print structure statistics for every table on the first page.
# The context manager closes the document even if analysis raises.
with pymupdf.open("data_tables.pdf") as doc:
    page = doc.load_page(0)
    tables = page.find_tables()

    for i, table in enumerate(tables):
        analysis = analyze_table_structure(table)
        print(f"\nTable {i + 1} Analysis:")
        for key, value in analysis.items():
            print(f" {key}: {value}")

import pymupdf
import pandas as pd
def extract_all_tables(doc: "pymupdf.Document") -> list:
    """Extract all tables from all pages.

    Parameters:
    - doc: object exposing ``page_count`` and ``load_page(n)``

    Returns:
    List of dicts, one per table, with the keys:
    - "page": zero-based page number
    - "bbox": ``table.bbox``
    - "data": result of ``table.extract()``
    - "dataframe": ``table.to_pandas()``, or None when "data" is empty
    """
    all_tables = []
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        for table in page.find_tables():
            # Extract once and reuse the rows for the emptiness check,
            # instead of running extraction a second time.
            rows = table.extract()
            all_tables.append({
                "page": page_num,
                "bbox": table.bbox,
                "data": rows,
                "dataframe": table.to_pandas() if rows else None,
            })
    return all_tables
def merge_similar_tables(tables: list, similarity_threshold: float = 0.8) -> list:
    """Group tables with similar column structures.

    Each table is compared with the first member of every existing group.
    It joins the first group whose reference DataFrame has the same number
    of columns and shares at least ``similarity_threshold`` of its column
    names; otherwise it starts a new group. Entries whose "dataframe" is
    None are skipped.

    Parameters:
    - tables: list of dicts that each carry a "dataframe" key
    - similarity_threshold: minimum shared-column ratio, 0.0 to 1.0

    Returns:
    List of groups, each a list of the original table dicts
    """
    merged_groups = []

    for table in tables:
        current_df = table["dataframe"]
        if current_df is None:
            continue

        current_cols = set(current_df.columns)
        column_count = len(current_df.columns)

        for group in merged_groups:
            reference_cols = group[0]["dataframe"].columns
            if len(reference_cols) != column_count:
                continue
            # Two frames without any columns have identical (empty) column
            # sets; treat them as a full match rather than dividing by zero.
            if column_count:
                overlap = len(set(reference_cols) & current_cols) / column_count
            else:
                overlap = 1.0
            if overlap >= similarity_threshold:
                group.append(table)
                break
        else:
            # No existing group matched: start a new one.
            merged_groups.append([table])

    return merged_groups
# Usage
# The context manager closes the document even if extraction or export raises.
with pymupdf.open("multi_page_report.pdf") as doc:
    all_tables = extract_all_tables(doc)
    print(f"Found {len(all_tables)} total tables")

    # Group similar tables
    table_groups = merge_similar_tables(all_tables)
    print(f"Grouped into {len(table_groups)} similar table groups")

    # Merge each group
    for i, group in enumerate(table_groups):
        if len(group) > 1:
            # Merge DataFrames
            dfs = [table["dataframe"] for table in group if table["dataframe"] is not None]
            merged_df = pd.concat(dfs, ignore_index=True)
            print(f"Group {i + 1}: Merged {len(group)} tables into {merged_df.shape} DataFrame")
            merged_df.to_csv(f"merged_tables_group_{i + 1}.csv", index=False)
        else:
            # Single table
            table = group[0]
            if table["dataframe"] is not None:
                table["dataframe"].to_csv(f"single_table_page_{table['page'] + 1}.csv", index=False)

import pymupdf
def detect_tables_by_whitespace(page: "pymupdf.Page", min_gap: float = 20) -> list:
    """Detect tables by analyzing whitespace patterns.

    Words are bucketed into rows by rounding their top y-coordinate to the
    nearest multiple of 5. A row is a table candidate when it has at least
    three words and every horizontal gap between neighbouring words exceeds
    ``min_gap``. For each candidate row the regular table finder is run on a
    clip rectangle around that row.

    Parameters:
    - page: object exposing ``get_text("words")`` and ``find_tables(clip=...)``
    - min_gap: minimum horizontal gap between neighbouring words

    Returns:
    List of everything ``page.find_tables(clip=...)`` yielded for the
    candidate rows; empty when the page has no words or no candidate rows
    """
    # Get all words with positions
    page_words = page.get_text("words")
    if not page_words:
        return []

    # Group words by approximate rows based on y-coordinates
    rows = {}
    for x0, y0, x1, y1, text, block_no, line_no, word_no in page_words:
        y_key = round(y0 / 5) * 5  # Group by 5-point intervals
        rows.setdefault(y_key, []).append((x0, x1, text))

    tables = []
    for y_pos, row_words in sorted(rows.items()):
        if len(row_words) < 3:  # At least 3 columns
            continue

        row_words.sort()  # Sort by x position

        # Gap between each word's left edge and the previous word's right edge
        gaps = [
            right[0] - left[1]
            for left, right in zip(row_words, row_words[1:])
        ]
        if min(gaps) <= min_gap:  # Need significant gaps between all words
            continue

        # Create bounding box around the candidate row
        min_x = min(word[0] for word in row_words)
        max_x = max(word[1] for word in row_words)
        bbox = pymupdf.Rect(min_x, y_pos - 5, max_x, y_pos + 15)

        # This would need more sophisticated conversion to actual Table objects
        # For demonstration, we'll use the regular table finder on this area
        tables.extend(page.find_tables(clip=bbox))

    return tables
# Usage
# The context manager closes the document even if detection raises.
with pymupdf.open("whitespace_tables.pdf") as doc:
    page = doc.load_page(0)

    # Try different detection methods
    regular_tables = page.find_tables()
    whitespace_tables = detect_tables_by_whitespace(page)

    print(f"Regular detection: {len(regular_tables)} tables")
    print(f"Whitespace detection: {len(whitespace_tables)} tables")

# Install with Tessl CLI
npx tessl i tessl/pypi-pymupdf