tessl/pypi-pdfplumber

Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.

—

Pending

Overview

Eval results

Files

Table Extraction

Name: tessl/pypi-pdfplumber
Author: tessl

Sophisticated table detection and extraction capabilities with customizable strategies, edge detection algorithms, comprehensive configuration options, and visual debugging support.

Capabilities

Table Finding

Detect all tables on a page using various detection strategies and algorithms.

def find_tables(table_settings=None):
    """
    Detect every table on the page (called as ``page.find_tables(...)``).

    Parameters:
    - table_settings: TableSettings or dict, optional - Detection configuration

    Returns:
    List[Table]: The table objects that were detected
    """

def find_table(table_settings=None):
    """
    Locate the largest table on the page (called as ``page.find_table(...)``).

    Parameters:
    - table_settings: TableSettings or dict, optional - Detection configuration

    Returns:
    Table or None: The largest table detected, or None when no table is found
    """

Usage Examples:

import pdfplumber

# Example: locate tables on the first page of a PDF.
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Detect every table using the default settings
    tables = page.find_tables()
    print(f"Found {len(tables)} tables")

    # find_table() gives back the largest table, or None when there is none
    main_table = page.find_table()
    if main_table:
        print(f"Main table area: {main_table.bbox}")

    # Settings may be passed as a plain dict: here text alignment drives the
    # vertical strategy and drawn lines drive the horizontal strategy
    custom_settings = {
        "vertical_strategy": "text",
        "horizontal_strategy": "lines",
    }
    tables = page.find_tables(table_settings=custom_settings)

Table Extraction

Extract table data as structured 2D arrays with various formatting options.

def extract_tables(table_settings=None):
    """
    Return the contents of every table on the page as 2D arrays.

    Parameters:
    - table_settings: TableSettings or dict, optional - Detection configuration

    Returns:
    List[List[List[str]]]: One entry per table, each a 2D array of strings

    NOTE(review): upstream pdfplumber appears to type cells as Optional[str]
    (None for missing cells) - confirm before relying on plain str values.
    """

def extract_table(table_settings=None):
    """
    Return the contents of the largest table on the page as a 2D array.

    Parameters:
    - table_settings: TableSettings or dict, optional - Detection configuration

    Returns:
    List[List[str]] or None: A 2D array of strings, or None when no table is found

    NOTE(review): upstream pdfplumber appears to type cells as Optional[str]
    (None for missing cells) - confirm before relying on plain str values.
    """

Usage Examples:

import pdfplumber

# Example: pull table contents out of the first page of a PDF.
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Extract all tables; number them from 1 for display
    tables = page.extract_tables()
    for number, table in enumerate(tables, start=1):
        print(f"Table {number}:")
        for row in table:
            print("  ", row)

    # Extract main table only (None when the page has no table)
    main_table = page.extract_table()
    if main_table:
        # Treat the first row as the header; the remaining rows are data
        headers, *data_rows = main_table

        for row in data_rows:
            row_dict = dict(zip(headers, row))
            print(row_dict)

Table Class

Represents a detected table with extraction and analysis capabilities.

class Table:
    """A table detected on a page, exposing its geometry and text extraction."""

    def __init__(self, page, cells):
        """Set up the table from its page and its cell data."""

    @property
    def bbox(self) -> T_bbox:
        """Bounding-box coordinates of the whole table."""

    @property
    def cells(self) -> List[T_bbox]:
        """Bounding boxes of the table's cells, as a list."""

    @property
    def rows(self) -> List[CellGroup]:
        """The table's rows, each one a CellGroup object."""

    @property
    def columns(self) -> List[CellGroup]:
        """The table's columns, each one a CellGroup object."""

    def extract(self, **kwargs):
        """
        Return the table's contents as a 2D array.

        Parameters:
        - **kwargs: Text extraction options applied to cell content

        Returns:
        List[List[str]]: Cell text arranged as a 2D array
        """

Usage Examples:

import pdfplumber

# Example: inspect each detected Table object on the first page.
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    tables = page.find_tables()
    for table in tables:
        print(f"Table at {table.bbox}")
        print(f"Dimensions: {len(table.rows)} rows × {len(table.columns)} columns")

        # Keyword arguments are text extraction options for the cell content
        data = table.extract(layout=True, x_tolerance=1)

        # Analyze cell structure row by row
        for i, row in enumerate(table.rows):
            print(f"Row {i}: {len(row.cells)} cells")

TableFinder Class

Handles the table detection algorithm implementation and provides debugging capabilities.

class TableFinder:
    """Implements the table detection algorithm for a single page."""

    def __init__(self, page, settings=None):
        """Set up the finder from a page and optional settings."""

    @property
    def page(self) -> Page:
        """The page object the finder works on."""

    @property
    def settings(self) -> TableSettings:
        """The settings used for table detection."""

    @property
    def edges(self) -> T_obj_list:
        """Edges detected as input to table detection."""

    @property
    def intersections(self) -> T_intersections:
        """Points at which edges intersect."""

    @property
    def cells(self) -> List[T_bbox]:
        """Table cells that were detected."""

    @property
    def tables(self) -> List[Table]:
        """Table objects that were detected."""

    def get_edges(self):
        """Return edges according to the detection strategy."""

TableSettings Class

Comprehensive configuration class for table detection parameters and strategies.

class TableSettings:
    """Holds the parameters and strategies that configure table detection."""

    def __init__(
        self,
        vertical_strategy="lines",
        horizontal_strategy="lines",
        explicit_vertical_lines=None,
        explicit_horizontal_lines=None,
        snap_tolerance=3,
        snap_x_tolerance=None,
        snap_y_tolerance=None,
        join_tolerance=3,
        join_x_tolerance=None,
        join_y_tolerance=None,
        edge_min_length=3,
        min_words_vertical=3,
        min_words_horizontal=1,
        intersection_tolerance=3,
        intersection_x_tolerance=None,
        intersection_y_tolerance=None,
        text_settings=None,
    ):
        """Set up the table detection settings."""

    @classmethod
    def resolve(cls, settings):
        """
        Build a TableSettings from a dict or an existing instance.

        Parameters:
        - settings: dict, TableSettings, or None

        Returns:
        TableSettings: The resolved settings object
        """

    # Strategy used along each axis
    vertical_strategy: str  # "lines", "lines_strict", "text", "explicit"
    horizontal_strategy: str  # "lines", "lines_strict", "text", "explicit"

    # Line positions supplied explicitly
    explicit_vertical_lines: Optional[List[T_num]]
    explicit_horizontal_lines: Optional[List[T_num]]

    # Tolerances applied while processing edges
    snap_tolerance: T_num
    snap_x_tolerance: Optional[T_num]
    snap_y_tolerance: Optional[T_num]
    join_tolerance: T_num
    join_x_tolerance: Optional[T_num]
    join_y_tolerance: Optional[T_num]
    edge_min_length: T_num

    # Parameters for text-based detection
    min_words_vertical: int
    min_words_horizontal: int

    # Tolerances for detecting intersections
    intersection_tolerance: T_num
    intersection_x_tolerance: Optional[T_num]
    intersection_y_tolerance: Optional[T_num]

    # Text extraction settings applied to cells
    text_settings: Optional[Dict[str, Any]]

Usage Examples:

import pdfplumber
from pdfplumber.table import TableSettings

# Example: build TableSettings objects for different detection strategies.
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Custom settings for line-based detection
    line_settings = TableSettings(
        vertical_strategy="lines_strict",
        horizontal_strategy="lines_strict",
        snap_tolerance=2,
        edge_min_length=10,
    )

    # Custom settings for text-based detection
    text_settings = TableSettings(
        vertical_strategy="text",
        horizontal_strategy="text",
        min_words_vertical=2,
        min_words_horizontal=1,
    )

    # Explicit line positions
    explicit_settings = TableSettings(
        vertical_strategy="explicit",
        horizontal_strategy="explicit",
        explicit_vertical_lines=[100, 200, 300, 400],
        explicit_horizontal_lines=[50, 100, 150, 200],
    )

    # Use settings: any of the objects above can be passed the same way
    tables = page.find_tables(table_settings=line_settings)

Table Debugging

Visual debugging capabilities for understanding table detection algorithms.

def debug_tablefinder(table_settings=None):
    """
    Return a TableFinder for debugging table detection
    (called as ``page.debug_tablefinder(...)``).

    Parameters:
    - table_settings: TableSettings or dict, optional

    Returns:
    TableFinder: Finder object used to inspect the algorithm
    """

Usage Examples:

import pdfplumber

# Example: inspect and visualize the table detection process.
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Debug table detection process
    finder = page.debug_tablefinder()

    print(f"Detected {len(finder.edges)} edges")
    print(f"Found {len(finder.intersections)} intersections")
    print(f"Identified {len(finder.cells)} cells")
    print(f"Grouped into {len(finder.tables)} tables")

    # Visualize detection process, reusing the finder's settings
    im = page.to_image()
    im.debug_tablefinder(table_settings=finder.settings)
    im.save("table_debug.png")

Cell Group Classes

Helper classes for table structure analysis.

class CellGroup:
    """Common base for the row and column classes of a table."""

    @property
    def cells(self) -> List[T_bbox]:
        """Bounding boxes of the cells belonging to this group."""

    @property
    def bbox(self) -> T_bbox:
        """Bounding box that covers the whole group."""

class Row(CellGroup):
    """A CellGroup that represents one row of a table."""

class Column(CellGroup):
    """A CellGroup that represents one column of a table."""

Advanced Table Detection Strategies

Line-Based Detection

# Strict mode: rely solely on actual line objects in the PDF
settings = TableSettings(
    vertical_strategy="lines_strict",
    horizontal_strategy="lines_strict",
)

# Flexible mode: the edges of rectangles are included as well
settings = TableSettings(
    vertical_strategy="lines",
    horizontal_strategy="lines",
)

Text-Based Detection

# Use text alignment to infer table structure
settings = TableSettings(
    vertical_strategy="text",
    horizontal_strategy="text",
    min_words_vertical=3,    # words needed before a column is established
    min_words_horizontal=2,  # words needed before a row is established
)

Explicit Line Detection

# Manually specify table grid lines
settings = TableSettings(
    vertical_strategy="explicit",
    horizontal_strategy="explicit",
    explicit_vertical_lines=[72, 144, 216, 288],     # x positions
    explicit_horizontal_lines=[100, 130, 160, 190],  # y positions
)

Hybrid Detection

# Combine different strategies for horizontal and vertical
settings = TableSettings(
    vertical_strategy="text",     # columns come from text alignment
    horizontal_strategy="lines",  # rows come from lines
    snap_tolerance=5,             # snap elements that lie close together
    join_tolerance=2,             # join elements that are connected
)

Install with Tessl CLI