Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.
—
Sophisticated table detection and extraction capabilities with customizable strategies, edge detection algorithms, comprehensive configuration options, and visual debugging support.
Detect all tables on a page using various detection strategies and algorithms.
def find_tables(table_settings=None):
"""
Find all tables using detection algorithms.
Parameters:
- table_settings: TableSettings or dict, optional - Configuration for detection
Returns:
List[Table]: List of detected table objects
"""
def find_table(table_settings=None):
"""
Find largest table on page.
Parameters:
- table_settings: TableSettings or dict, optional - Configuration for detection
Returns:
Table or None: Largest detected table or None if no tables found
"""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Find all tables with default settings
tables = page.find_tables()
print(f"Found {len(tables)} tables")
# Find largest table only
main_table = page.find_table()
if main_table:
print(f"Main table area: {main_table.bbox}")
# Find tables with custom settings
custom_settings = {
"vertical_strategy": "text",
"horizontal_strategy": "lines"
}
tables = page.find_tables(table_settings=custom_settings)

Extract table data as structured 2D arrays with various formatting options.
def extract_tables(table_settings=None):
"""
Extract all tables as 2D arrays.
Parameters:
- table_settings: TableSettings or dict, optional - Configuration for detection
Returns:
List[List[List[str]]]: List of tables, each as 2D array of strings
"""
def extract_table(table_settings=None):
"""
Extract largest table as 2D array.
Parameters:
- table_settings: TableSettings or dict, optional - Configuration for detection
Returns:
List[List[str]] or None: 2D array of strings or None if no table found
"""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Extract all tables
tables = page.extract_tables()
for i, table in enumerate(tables):
print(f"Table {i+1}:")
for row in table:
print(" ", row)
# Extract main table only
main_table = page.extract_table()
if main_table:
# Process header row
headers = main_table[0]
data_rows = main_table[1:]
for row in data_rows:
row_dict = dict(zip(headers, row))
print(row_dict)

Represents a detected table with extraction and analysis capabilities.
class Table:
"""Detected table with extraction capabilities."""
def __init__(self, page, cells):
"""Initialize table from page and cell data."""
@property
def bbox(self) -> T_bbox:
"""Table bounding box coordinates."""
@property
def cells(self) -> List[T_bbox]:
"""List of cell bounding boxes."""
@property
def rows(self) -> List[CellGroup]:
"""Table rows as CellGroup objects."""
@property
def columns(self) -> List[CellGroup]:
"""Table columns as CellGroup objects."""
def extract(self, **kwargs):
"""
Extract table data as 2D array.
Parameters:
- **kwargs: Text extraction options for cell content
Returns:
List[List[str]]: 2D array of cell text content
"""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
tables = page.find_tables()
for table in tables:
print(f"Table at {table.bbox}")
print(f"Dimensions: {len(table.rows)} rows × {len(table.columns)} columns")
# Extract with custom text options
data = table.extract(layout=True, x_tolerance=1)
# Analyze cell structure
for i, row in enumerate(table.rows):
print(f"Row {i}: {len(row.cells)} cells")

Handles the table detection algorithm implementation and provides debugging capabilities.
class TableFinder:
"""Table detection algorithm implementation."""
def __init__(self, page, settings=None):
"""Initialize TableFinder with page and settings."""
@property
def page(self) -> Page:
"""Source page object."""
@property
def settings(self) -> TableSettings:
"""Table detection settings."""
@property
def edges(self) -> T_obj_list:
"""Detected edges for table detection."""
@property
def intersections(self) -> T_intersections:
"""Edge intersection points."""
@property
def cells(self) -> List[T_bbox]:
"""Detected table cells."""
@property
def tables(self) -> List[Table]:
"""Detected table objects."""
def get_edges(self):
"""Get edges based on detection strategy."""

Comprehensive configuration class for table detection parameters and strategies.
class TableSettings:
"""Configuration for table detection parameters."""
def __init__(self, vertical_strategy="lines", horizontal_strategy="lines",
explicit_vertical_lines=None, explicit_horizontal_lines=None,
snap_tolerance=3, snap_x_tolerance=None, snap_y_tolerance=None,
join_tolerance=3, join_x_tolerance=None, join_y_tolerance=None,
edge_min_length=3, min_words_vertical=3, min_words_horizontal=1,
intersection_tolerance=3, intersection_x_tolerance=None,
intersection_y_tolerance=None, text_settings=None):
"""Initialize table detection settings."""
@classmethod
def resolve(cls, settings):
"""
Create TableSettings from dict or existing instance.
Parameters:
- settings: dict, TableSettings, or None
Returns:
TableSettings: Resolved settings object
"""
# Detection strategy options
vertical_strategy: str # "lines", "lines_strict", "text", "explicit"
horizontal_strategy: str # "lines", "lines_strict", "text", "explicit"
# Explicit line positions
explicit_vertical_lines: Optional[List[T_num]]
explicit_horizontal_lines: Optional[List[T_num]]
# Edge processing tolerances
snap_tolerance: T_num
snap_x_tolerance: Optional[T_num]
snap_y_tolerance: Optional[T_num]
join_tolerance: T_num
join_x_tolerance: Optional[T_num]
join_y_tolerance: Optional[T_num]
edge_min_length: T_num
# Text-based detection parameters
min_words_vertical: int
min_words_horizontal: int
# Intersection detection
intersection_tolerance: T_num
intersection_x_tolerance: Optional[T_num]
intersection_y_tolerance: Optional[T_num]
# Text extraction settings for cells
text_settings: Optional[Dict[str, Any]]

Usage Examples:
from pdfplumber.table import TableSettings
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Custom settings for line-based detection
line_settings = TableSettings(
vertical_strategy="lines_strict",
horizontal_strategy="lines_strict",
snap_tolerance=2,
edge_min_length=10
)
# Custom settings for text-based detection
text_settings = TableSettings(
vertical_strategy="text",
horizontal_strategy="text",
min_words_vertical=2,
min_words_horizontal=1
)
# Explicit line positions
explicit_settings = TableSettings(
vertical_strategy="explicit",
horizontal_strategy="explicit",
explicit_vertical_lines=[100, 200, 300, 400],
explicit_horizontal_lines=[50, 100, 150, 200]
)
# Use settings
tables = page.find_tables(table_settings=line_settings)

Visual debugging capabilities for understanding table detection algorithms.
def debug_tablefinder(table_settings=None):
"""
Get TableFinder for debugging table detection.
Parameters:
- table_settings: TableSettings or dict, optional
Returns:
TableFinder: TableFinder object for algorithm inspection
"""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Debug table detection process
finder = page.debug_tablefinder()
print(f"Detected {len(finder.edges)} edges")
print(f"Found {len(finder.intersections)} intersections")
print(f"Identified {len(finder.cells)} cells")
print(f"Grouped into {len(finder.tables)} tables")
# Visualize detection process
im = page.to_image()
im.debug_tablefinder(table_settings=finder.settings)
im.save("table_debug.png")

Helper classes for table structure analysis.
class CellGroup:
"""Base class for table rows and columns."""
@property
def cells(self) -> List[T_bbox]:
"""Cell bounding boxes in this group."""
@property
def bbox(self) -> T_bbox:
"""Bounding box of entire group."""
class Row(CellGroup):
"""Table row representation."""
class Column(CellGroup):
"""Table column representation."""

# Strict line detection - only uses actual PDF line objects
settings = TableSettings(
vertical_strategy="lines_strict",
horizontal_strategy="lines_strict"
)
# Flexible line detection - includes rectangle edges
settings = TableSettings(
vertical_strategy="lines",
horizontal_strategy="lines"
)

# Use text alignment to infer table structure
settings = TableSettings(
vertical_strategy="text",
horizontal_strategy="text",
min_words_vertical=3, # Minimum words to establish column
min_words_horizontal=2 # Minimum words to establish row
)

# Manually specify table grid lines
settings = TableSettings(
vertical_strategy="explicit",
horizontal_strategy="explicit",
explicit_vertical_lines=[72, 144, 216, 288], # X coordinates
explicit_horizontal_lines=[100, 130, 160, 190] # Y coordinates
)

# Combine different strategies for horizontal and vertical
settings = TableSettings(
vertical_strategy="text", # Use text alignment for columns
horizontal_strategy="lines", # Use lines for rows
snap_tolerance=5, # Snap nearby elements together
join_tolerance=2 # Join connected elements
)

Install with Tessl CLI
npx tessl i tessl/pypi-pdfplumber