tessl/pypi-pymupdf

High performance Python library for data extraction, analysis, conversion & manipulation of PDF and other documents.

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Table Extraction

Name: tessl/pypi-pymupdf
Author: tessl

PyMuPDF offers advanced table detection and extraction, with support for table structure analysis, cell content extraction, and export to various formats, including pandas DataFrames. It provides sophisticated algorithms for identifying and parsing tabular data in PDF documents.

Capabilities

Table Finding and Detection

Locate tables within document pages with configurable detection settings.

class TableFinder:
    """Table locator bound to one page, with configurable detection settings."""

    def __init__(self, page: Page):
        """
        Create table finder for a page.
        
        Parameters:
        - page: Page object to search for tables
        """
    
    def find_tables(self, clip: Rect = None, strategy: str = "lines_strict",
                   vertical_strategy: str = "lines", horizontal_strategy: str = "lines",
                   explicit_vertical_lines: list = None, explicit_horizontal_lines: list = None,
                   snap_tolerance: float = 3, snap_x_tolerance: float = None,
                   snap_y_tolerance: float = None, join_tolerance: float = 3,
                   join_x_tolerance: float = None, join_y_tolerance: float = None,
                   edge_min_length: float = 3, min_words_vertical: float = 3,
                   min_words_horizontal: float = 1, intersection_tolerance: float = 3,
                   intersection_x_tolerance: float = None, intersection_y_tolerance: float = None,
                   text_tolerance: float = 3, text_x_tolerance: float = None,
                   text_y_tolerance: float = None) -> list:
        """
        Find tables on the page.

        Every argument is optional; the default shown in the signature is
        used when an argument is omitted.

        NOTE(review): the per-axis tolerances (``*_x_tolerance`` /
        ``*_y_tolerance``) default to None; presumably None means "fall back
        to the matching general tolerance" -- confirm against the
        implementation. How ``strategy`` interacts with ``vertical_strategy``
        and ``horizontal_strategy`` is not shown here either.

        Parameters:
        - clip: rectangle to limit search area (default None)
        - strategy: table detection strategy ("lines_strict", "lines", "text",
          "explicit"); default "lines_strict"
        - vertical_strategy: strategy for detecting vertical lines (default "lines")
        - horizontal_strategy: strategy for detecting horizontal lines (default "lines")
        - explicit_vertical_lines: explicit vertical line positions (default None)
        - explicit_horizontal_lines: explicit horizontal line positions (default None)
        - snap_tolerance: tolerance for snapping lines to text (default 3)
        - snap_x_tolerance: x-direction snap tolerance (default None)
        - snap_y_tolerance: y-direction snap tolerance (default None)
        - join_tolerance: tolerance for joining line segments (default 3)
        - join_x_tolerance: x-direction join tolerance (default None)
        - join_y_tolerance: y-direction join tolerance (default None)
        - edge_min_length: minimum line length to consider (default 3)
        - min_words_vertical: minimum words to form vertical line (default 3)
        - min_words_horizontal: minimum words to form horizontal line (default 1)
        - intersection_tolerance: tolerance for line intersections (default 3)
        - intersection_x_tolerance: x-direction intersection tolerance (default None)
        - intersection_y_tolerance: y-direction intersection tolerance (default None)
        - text_tolerance: tolerance for text-based table detection (default 3)
        - text_x_tolerance: x-direction text tolerance (default None)
        - text_y_tolerance: y-direction text tolerance (default None)
        
        Returns:
        List of Table objects found on the page
        """

Table Class

Individual table representation with extraction and manipulation capabilities.

class Table:
    """One table on a page, with extraction and export helpers."""

    def __init__(self, page: Page, bbox: Rect):
        """
        Create table object.
        
        Parameters:
        - page: parent Page object
        - bbox: table bounding rectangle
        """
    
    def extract(self, x_tolerance: float = 3, y_tolerance: float = 3) -> list:
        """
        Extract table data as list of rows.
        
        Parameters:
        - x_tolerance: horizontal tolerance for cell alignment (default 3)
        - y_tolerance: vertical tolerance for cell alignment (default 3)
        
        Returns:
        List of lists representing table rows and cells
        """
    
    def to_pandas(self, **kwargs) -> 'pandas.DataFrame':
        """
        Convert table to pandas DataFrame.
        
        Parameters:
        - kwargs: additional pandas DataFrame parameters
        
        Returns:
        pandas DataFrame with table data
        """
    
    def to_csv(self, file_path: typing.Optional[str] = None, **kwargs) -> typing.Optional[str]:
        """
        Export table to CSV format.
        
        Parameters:
        - file_path: output file path (None for string return)
        - kwargs: additional CSV export parameters
        
        Returns:
        CSV string if file_path is None, otherwise None
        """
    
    def to_dict(self, orient: str = "records") -> typing.Union[list, dict]:
        """
        Convert table to dictionary format.
        
        Parameters:
        - orient: dictionary orientation ("records", "list", "dict", etc.);
          default "records"
        
        Returns:
        Table data as a list or a dict. NOTE(review): presumably the shape
        depends on ``orient`` (e.g. a list of records for "records") --
        confirm against the implementation.
        """
    
    @property
    def bbox(self) -> Rect:
        """Table bounding rectangle."""
    
    @property
    def cells(self) -> list:
        """List of table cells with positions and content."""
    
    @property
    def rows(self) -> list:
        """List of table rows."""
    
    @property
    def cols(self) -> list:
        """List of table columns."""

Table Settings and Configuration

Fine-tune table detection parameters for different document types.

class TableSettings:
    """Container for the tunable table-detection parameters.

    Each setting is exposed as a property. The property names mirror the
    keyword arguments of ``TableFinder.find_tables``.
    """

    def __init__(self):
        """Create default table settings."""
    
    @property
    def vertical_strategy(self) -> str:
        """Strategy for vertical line detection."""
    
    @property
    def horizontal_strategy(self) -> str:
        """Strategy for horizontal line detection."""
    
    @property
    def snap_tolerance(self) -> float:
        """Tolerance for snapping lines to text."""
    
    @property
    def join_tolerance(self) -> float:
        """Tolerance for joining line segments."""
    
    @property
    def edge_min_length(self) -> float:
        """Minimum line length to consider."""
    
    @property
    def min_words_vertical(self) -> float:
        """Minimum words to form vertical line."""
    
    @property
    def min_words_horizontal(self) -> float:
        """Minimum words to form horizontal line."""
    
    @property
    def intersection_tolerance(self) -> float:
        """Tolerance for line intersections."""
    
    @property
    def text_tolerance(self) -> float:
        """Tolerance for text-based detection."""

Advanced Table Analysis

Analyze table structure and content for complex data extraction.

class TableRow:
    """A single row of a table: its cells, bounding rectangle and height."""

    @property
    def cells(self) -> list:
        """Cells in this row."""
    
    @property
    def bbox(self) -> Rect:
        """Row bounding rectangle."""
    
    @property
    def height(self) -> float:
        """Row height."""

class TableHeader:
    """The header of a table: its cells and bounding rectangle."""

    @property
    def cells(self) -> list:
        """Header cells."""
    
    @property
    def bbox(self) -> Rect:
        """Header bounding rectangle."""

# Cell content analysis
class TextMap:
    """Text lookup for a page, used to read cell content during table analysis."""

    def __init__(self, page: Page):
        """
        Create text map for table analysis.

        Parameters:
        - page: Page object whose text is mapped
        """
    
    def get_text_in_bbox(self, bbox: Rect) -> str:
        """
        Get text within bounding box.

        Parameters:
        - bbox: rectangle to read text from

        Returns:
        The text found inside bbox, as a string
        """

class WordMap:
    """Word lookup for a page, used to read cell content during table analysis."""

    def __init__(self, page: Page):
        """
        Create word map for table analysis.

        Parameters:
        - page: Page object whose words are mapped
        """
    
    def get_words_in_bbox(self, bbox: Rect) -> list:
        """
        Get words within bounding box.

        Parameters:
        - bbox: rectangle to collect words from

        Returns:
        List of the words found inside bbox
        """

Simple Table Extraction Function

Convenient high-level function for basic table extraction.

def find_tables(page: Page, **kwargs) -> list:
    """
    Find tables on page (convenience function).
    
    Parameters:
    - page: Page object to search
    - kwargs: table detection parameters. NOTE(review): presumably the same
      keyword arguments accepted by ``TableFinder.find_tables`` -- confirm.
    
    Returns:
    List of Table objects
    """

Usage Examples

Basic Table Extraction

import pymupdf

# Open the document in a `with` block so it is closed even if extraction raises.
with pymupdf.open("document_with_tables.pdf") as doc:
    page = doc.load_page(0)

    # Find tables on the page.
    # NOTE(review): upstream PyMuPDF documents find_tables() as returning a
    # TableFinder whose `.tables` attribute holds the list -- confirm that
    # len() is supported on the return value in the targeted version.
    tables = page.find_tables()

    print(f"Found {len(tables)} tables")

    for i, table in enumerate(tables):
        print(f"\nTable {i + 1}:")
        print(f"  Bounding box: {table.bbox}")

        # Extract table data as a list of rows
        table_data = table.extract()

        # Print table content, one row per line
        for row_num, row in enumerate(table_data):
            print(f"  Row {row_num}: {row}")

Advanced Table Detection

import pymupdf

# Open the document in a `with` block so it is closed even if detection raises.
with pymupdf.open("complex_document.pdf") as doc:
    page = doc.load_page(0)

    # Create table finder for the page
    table_finder = pymupdf.TableFinder(page)

    # Find tables with custom parameters
    tables = table_finder.find_tables(
        strategy="lines",  # Use line-based detection
        snap_tolerance=5,  # More lenient line snapping
        join_tolerance=5,  # More aggressive line joining
        edge_min_length=10,  # Longer minimum lines
        min_words_vertical=2,  # Fewer words needed for vertical lines
        text_tolerance=5,  # Text-based detection tolerance
    )

    print(f"Found {len(tables)} tables with custom settings")

    for table in tables:
        # Extract with custom tolerances
        data = table.extract(x_tolerance=5, y_tolerance=3)
        print(f"Table with {len(data)} rows")

Converting Tables to Different Formats

import pymupdf
import pandas as pd

# Open the document in a `with` block so it is closed even if a conversion
# outside the try/except below raises.
with pymupdf.open("data_report.pdf") as doc:
    page = doc.load_page(0)

    tables = page.find_tables()

    for i, table in enumerate(tables):
        # Convert to pandas DataFrame. This step is best-effort: a failure is
        # reported and the remaining conversions for this table still run.
        try:
            df = table.to_pandas()
            print(f"Table {i + 1}: {df.shape} DataFrame")
            print(df.head())

            # Save as CSV
            df.to_csv(f"table_{i + 1}.csv", index=False)

            # Save as Excel
            df.to_excel(f"table_{i + 1}.xlsx", index=False)

        except Exception as e:
            print(f"Error converting table {i + 1}: {e}")

        # Convert to dictionary
        table_dict = table.to_dict(orient="records")
        print(f"Table as dict: {len(table_dict)} records")

        # Convert to CSV string (no file_path given, so a string is returned)
        csv_string = table.to_csv()
        print(f"CSV length: {len(csv_string)} characters")

Searching for Specific Tables

import pymupdf

def find_tables_containing_text(page: pymupdf.Page, search_text: str) -> list:
    """Return the tables on *page* that have a cell containing *search_text*.

    The match is a case-insensitive substring test against ``str(cell)``.
    Falsy cells (``None``, empty string) never match.

    Parameters:
    - page: page whose ``find_tables()`` result is searched
    - search_text: text to look for inside the table cells

    Returns:
    List of matching tables, in the order ``find_tables()`` produced them;
    each table appears at most once.
    """
    # Lower-case the needle once instead of once per cell.
    needle = search_text.lower()

    # any() stops at the first matching cell, so no membership test against
    # the result list is needed to know that a table has already matched.
    return [
        table
        for table in page.find_tables()
        if any(
            cell and needle in str(cell).lower()
            for row in table.extract()
            for cell in row
        )
    ]

# Open the document in a `with` block so it is closed even if a page fails.
with pymupdf.open("financial_report.pdf") as doc:
    # Search all pages for tables containing "Revenue"
    revenue_tables = []
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        tables = find_tables_containing_text(page, "Revenue")
        revenue_tables.extend((page_num, table) for table in tables)

    print(f"Found {len(revenue_tables)} tables containing 'Revenue'")

    # Tables are used while the document is still open.
    for page_num, table in revenue_tables:
        print(f"Page {page_num + 1}: Table at {table.bbox}")
        data = table.extract()
        # Process revenue table data...

Table Structure Analysis

import pymupdf

def analyze_table_structure(table: pymupdf.Table) -> dict:
    """Summarise the shape and content of an extracted table.

    Parameters:
    - table: object providing ``extract()`` (list of row lists) and ``bbox``

    Returns:
    ``{"error": "Empty table"}`` when ``extract()`` yields nothing. Otherwise
    a dict with the keys ``dimensions``, ``consistent_columns``,
    ``empty_cells``, ``total_cells``, ``fill_rate``, ``likely_header_row``,
    ``numeric_columns`` and ``bbox``.
    """
    grid = table.extract()
    if not grid:
        return {"error": "Empty table"}

    row_count = len(grid)
    # The first row defines the reference width for the table.
    col_count = len(grid[0])
    uniform_width = all(len(line) == col_count for line in grid)

    # Count all cells, and those that are falsy or whitespace-only.
    flat_cells = [value for line in grid for value in line]
    cell_total = len(flat_cells)
    blank_total = sum(
        1 for value in flat_cells if not value or str(value).strip() == ""
    )

    # The first row is assumed to be the header.
    header_index = 0

    def _parses_as_number(value) -> bool:
        # Thousands separators and dollar signs are stripped before parsing.
        try:
            float(str(value).replace(',', '').replace('$', ''))
        except (ValueError, AttributeError):
            return False
        return True

    # A column is numeric when more than 70% of the non-header rows parse.
    body_rows = grid[1:]
    cutoff = (row_count - 1) * 0.7
    numeric_indices = []
    for idx in range(col_count):
        hits = sum(
            1
            for line in body_rows
            # Short (ragged) rows simply contribute nothing for this column.
            if idx < len(line) and _parses_as_number(line[idx])
        )
        if hits > cutoff:
            numeric_indices.append(idx)

    if cell_total > 0:
        fill = (cell_total - blank_total) / cell_total
    else:
        fill = 0

    return {
        "dimensions": (row_count, col_count),
        "consistent_columns": uniform_width,
        "empty_cells": blank_total,
        "total_cells": cell_total,
        "fill_rate": fill,
        "likely_header_row": header_index,
        "numeric_columns": numeric_indices,
        "bbox": table.bbox
    }

# Open the document in a `with` block so it is closed even if analysis raises.
with pymupdf.open("data_tables.pdf") as doc:
    page = doc.load_page(0)
    tables = page.find_tables()

    for i, table in enumerate(tables):
        analysis = analyze_table_structure(table)
        print(f"\nTable {i + 1} Analysis:")
        for key, value in analysis.items():
            print(f"  {key}: {value}")

Merging Tables Across Pages

import pymupdf
import pandas as pd

def extract_all_tables(doc: pymupdf.Document) -> list:
    """Extract all tables from all pages.

    Parameters:
    - doc: document providing ``page_count`` and ``load_page(index)``

    Returns:
    List with one dict per table, in page order, with the keys:
    - "page": zero-based page index
    - "bbox": the table's ``bbox``
    - "data": result of ``table.extract()``
    - "dataframe": ``table.to_pandas()``, or None when "data" is empty
    """
    all_tables = []

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)

        for table in page.find_tables():
            # Extract once and reuse the rows for both the "data" entry and
            # the emptiness check, instead of running extraction twice.
            rows = table.extract()
            all_tables.append({
                "page": page_num,
                "bbox": table.bbox,
                "data": rows,
                "dataframe": table.to_pandas() if rows else None,
            })

    return all_tables

def merge_similar_tables(tables: list, similarity_threshold: float = 0.8) -> list:
    """Group tables whose DataFrames have similar column structures.

    Despite the name, nothing is concatenated here: the function only
    partitions *tables* into groups, and the caller merges each group.

    A table joins the first existing group whose first member has the same
    number of columns and shares at least *similarity_threshold* of its
    column labels. Otherwise it starts a new group.

    Parameters:
    - tables: list of dicts with a "dataframe" entry (a pandas DataFrame or
      None); entries whose "dataframe" is None are skipped
    - similarity_threshold: minimum fraction of shared column labels (0..1)

    Returns:
    List of groups; each group is a non-empty list of the input dicts.
    """
    merged_groups = []

    for table in tables:
        current_df = table["dataframe"]
        if current_df is None:
            continue

        column_count = len(current_df.columns)
        current_labels = set(current_df.columns)

        similar_group = None
        for group in merged_groups:
            reference_df = group[0]["dataframe"]
            if len(reference_df.columns) != column_count:
                continue

            if column_count == 0:
                # Two frames without columns have identical (empty) structure;
                # computing the ratio would divide by zero.
                similar_group = group
                break

            shared = len(set(reference_df.columns) & current_labels)
            if shared / column_count >= similarity_threshold:
                similar_group = group
                break

        if similar_group is not None:
            similar_group.append(table)
        else:
            merged_groups.append([table])

    return merged_groups

# Usage
# Open the document in a `with` block so it is closed even if extraction or
# one of the CSV writes raises.
with pymupdf.open("multi_page_report.pdf") as doc:
    all_tables = extract_all_tables(doc)
    print(f"Found {len(all_tables)} total tables")

    # Group similar tables
    table_groups = merge_similar_tables(all_tables)
    print(f"Grouped into {len(table_groups)} similar table groups")

    # Merge each group
    for i, group in enumerate(table_groups):
        if len(group) > 1:
            # Merge DataFrames
            dfs = [table["dataframe"] for table in group if table["dataframe"] is not None]
            merged_df = pd.concat(dfs, ignore_index=True)

            print(f"Group {i + 1}: Merged {len(group)} tables into {merged_df.shape} DataFrame")
            merged_df.to_csv(f"merged_tables_group_{i + 1}.csv", index=False)
        else:
            # Single table
            table = group[0]
            if table["dataframe"] is not None:
                # The group index is part of the file name: several dissimilar
                # tables can sit on the same page, and a name built from the
                # page number alone would make them overwrite each other.
                table["dataframe"].to_csv(
                    f"single_table_group_{i + 1}_page_{table['page'] + 1}.csv",
                    index=False,
                )

Custom Table Detection Strategies

import pymupdf

def detect_tables_by_whitespace(page: pymupdf.Page, min_gap: float = 20) -> list:
    """Detect tables by looking for rows of widely spaced words.

    Words are bucketed into rows by their top y-coordinate. A row with at
    least three words, all separated by more than *min_gap*, is treated as a
    candidate, and the regular table finder is run on a clip around it.

    Parameters:
    - page: page providing ``get_text("words")`` and ``find_tables(clip=...)``
    - min_gap: every horizontal gap in a row must exceed this value

    Returns:
    All tables reported by ``page.find_tables`` over the candidate row
    regions, concatenated in top-to-bottom row order.
    """
    page_words = page.get_text("words")
    if not page_words:
        return []

    # Bucket words into rows: top coordinates are rounded to multiples of 5.
    bands = {}
    for entry in page_words:
        left, top, right, _bottom, token, _block_no, _line_no, _word_no = entry
        band_key = round(top / 5) * 5
        bands.setdefault(band_key, []).append((left, right, token))

    # Keep the rows that look like table rows.
    candidates = []
    for band_key, members in sorted(bands.items()):
        if len(members) < 3:  # fewer than 3 columns
            continue

        members.sort()  # left-to-right

        # Gap between the right edge of one word and the left edge of the next.
        spacing = [
            following[0] - preceding[1]
            for preceding, following in zip(members, members[1:])
        ]
        if spacing and min(spacing) > min_gap:
            candidates.append((band_key, members))

    # Simplified conversion: run the regular table finder on a clip around
    # each candidate row rather than building Table objects by hand.
    found = []
    for band_key, members in candidates:
        left_edge = min(member[0] for member in members)
        right_edge = max(member[1] for member in members)
        region = pymupdf.Rect(left_edge, band_key - 5, right_edge, band_key + 15)
        found.extend(page.find_tables(clip=region))

    return found

# Usage
# Open the document in a `with` block so it is closed even if detection raises.
with pymupdf.open("whitespace_tables.pdf") as doc:
    page = doc.load_page(0)

    # Try different detection methods
    regular_tables = page.find_tables()
    whitespace_tables = detect_tables_by_whitespace(page)

    print(f"Regular detection: {len(regular_tables)} tables")
    print(f"Whitespace detection: {len(whitespace_tables)} tables")

Install with Tessl CLI