Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.
npx @tessl/cli install tessl/pypi-pdfplumber@0.11.0

A comprehensive Python library for detailed PDF analysis and extraction. PDFplumber provides granular access to PDF structure including text characters, rectangles, lines, curves, images, and annotations. It offers advanced table extraction capabilities with customizable detection strategies, visual debugging tools for understanding PDF structure, and comprehensive text extraction with layout preservation options.
pip install pdfplumber

import pdfplumber

Common usage patterns:
from pdfplumber import open
from pdfplumber.utils import extract_text, bbox_to_rect

import pdfplumber
# Open a PDF file
with pdfplumber.open("document.pdf") as pdf:
# Access the first page
first_page = pdf.pages[0]
# Extract text from the page
text = first_page.extract_text()
print(text)
# Extract tables
tables = first_page.extract_tables()
for table in tables:
for row in table:
print(row)
# Visual debugging - save page as image with overlays
im = first_page.to_image()
im.draw_rects(first_page.chars, fill=(255, 0, 0, 30))
im.save("debug.png")
# Alternative - open without context manager
pdf = pdfplumber.open("document.pdf")
page = pdf.pages[0]
text = page.extract_text()
pdf.close()

PDFplumber's architecture centers around:
This design provides maximum flexibility for PDF analysis tasks, from simple text extraction to complex document structure analysis and table detection.
Core functionality for opening, accessing, and managing PDF documents including metadata extraction, page access, and document-level operations.
def open(path_or_fp, pages=None, laparams=None, password=None,
strict_metadata=False, unicode_norm=None, repair=False,
gs_path=None, repair_setting="default", raise_unicode_errors=True):
"""Open PDF document from file path or stream."""
...
def repair(path_or_fp, outfile=None, password=None, gs_path=None,
setting="default"):
"""Repair PDF using Ghostscript."""
...

Advanced text extraction with layout-aware algorithms, word detection, text search, and character-level analysis with position information.
def extract_text(**kwargs):
"""Extract text using layout-aware algorithm."""
...
def extract_words(**kwargs):
"""Extract words as objects with position data."""
...
def search(pattern, regex=True, case=True, **kwargs):
"""Search for text patterns with regex support."""
...

Sophisticated table detection and extraction with customizable strategies, edge detection algorithms, and comprehensive configuration options.
def find_tables(table_settings=None):
"""Find all tables using detection algorithms."""
...
def extract_tables(table_settings=None):
"""Extract tables as 2D arrays."""
...
class TableSettings:
"""Configuration for table detection parameters."""
...

Page cropping, object filtering, bounding box operations, and coordinate transformations for precise PDF element analysis.
def crop(bbox, relative=False, strict=True):
"""Crop page to bounding box."""
...
def within_bbox(bbox, relative=False, strict=True):
"""Filter objects within bounding box."""
...
def filter(test_function):
"""Filter objects using custom function."""
...

Comprehensive visualization tools for overlaying debug information on PDF pages, including object highlighting, table structure visualization, and custom drawing operations.
def to_image(resolution=None, width=None, height=None, antialias=False):
"""Convert page to image for debugging."""
...
class PageImage:
"""Image representation with drawing capabilities."""
def draw_rects(self, list_of_rects, **kwargs): ...
def debug_table(self, table, **kwargs): ...

Extensive utility functions for geometry operations, text processing, clustering algorithms, and PDF internal structure manipulation.
def bbox_to_rect(bbox):
"""Convert bounding box to rectangle dictionary."""
...
def merge_bboxes(bboxes):
"""Merge multiple bounding boxes."""
...
def cluster_objects(objs, key_fn, tolerance):
"""Cluster objects by key function."""
...

Complete command-line interface for PDF processing with support for text extraction, object export, and structure analysis.
def main(args_raw=None):
"""CLI entry point with full argument parsing."""
...

Note: The set_debug function is listed in the package's __all__ export list but is not actually implemented in version 0.11.7. Attempting to use pdfplumber.set_debug() will result in an AttributeError.
# Core type aliases
T_num = Union[int, float]
T_bbox = Tuple[T_num, T_num, T_num, T_num] # (x0, top, x1, bottom)
T_obj = Dict[str, Any] # PDF object representation
T_obj_list = List[T_obj]
# Custom exceptions
class MalformedPDFException(Exception):
"""Raised for malformed PDF files."""
...
class PdfminerException(Exception):
"""Wrapper for pdfminer exceptions."""
...