tessl/pypi-pdfplumber

Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.

—

Pending

Overview

Eval results

Files

Page Manipulation

Name: tessl/pypi-pdfplumber
Author: tessl

Page cropping, object filtering, bounding box operations, coordinate transformations, and derived page creation for precise PDF element analysis.

Capabilities

Page Cropping

Create cropped views of pages with filtered objects based on bounding box regions.

def crop(bbox, relative=False, strict=True):
    """
    Return a version of the page cropped to a bounding box.

    Objects that fall at least partly inside the box are kept; an object that
    straddles the edge of the box is clipped to fit.

    Parameters:
    - bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box (x0, top, x1, bottom),
      in page units measured from the top-left corner of the page
    - relative: bool - Treat bbox as an offset from the top-left corner of the
      page's own bounding box instead of as absolute coordinates (the values
      are still page units, not 0-1 fractions)
    - strict: bool - Require bbox to lie entirely within the page's bounding
      box; a ValueError is raised if it does not

    Returns:
    CroppedPage: New page object with cropped view
    """

Usage Examples:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Crop to a specific region (absolute coordinates, measured from the top-left)
    cropped = page.crop((100, 100, 400, 300))
    text = cropped.extract_text()
    print(f"Cropped region text: {text}")

    # Top-left quarter of the page. Bounding boxes are always expressed in page
    # units, so fractions of the page must be scaled by its width and height.
    # relative=True measures the box from the page's own top-left corner.
    quarter = page.crop((0, 0, 0.5 * page.width, 0.5 * page.height), relative=True)

    # strict=False skips the check that the box lies entirely within the page,
    # so a box that overshoots the page edge does not raise ValueError
    loose_crop = page.crop((100, 100, 400, 300), strict=False)

    # Chain cropping operations: with relative=True each box is measured from
    # the top-left corner of the page being cropped (top_half in the second call)
    top_half = page.crop((0, 0, page.width, 0.5 * page.height), relative=True)
    top_left = top_half.crop(
        (0, 0, 0.5 * top_half.width, top_half.height), relative=True
    )

Bounding Box Filtering

Filter page objects based on spatial relationships to bounding boxes.

def within_bbox(bbox, relative=False, strict=True):
    """
    Crop to a bounding box, keeping only objects that lie entirely inside it.

    Objects that only partly overlap the box are dropped, not clipped.

    Parameters:
    - bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box (x0, top, x1, bottom)
    - relative: bool - Treat bbox as an offset from the top-left corner of the
      page's own bounding box instead of as absolute coordinates
    - strict: bool - Require bbox to lie entirely within the page's bounding box

    Returns:
    CroppedPage: New page containing only the objects fully inside bbox
    """

def outside_bbox(bbox, relative=False, strict=True):
    """
    Keep only objects that lie entirely outside a bounding box.

    Objects that overlap the box, even partly, are dropped.

    Parameters:
    - bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box (x0, top, x1, bottom)
    - relative: bool - Treat bbox as an offset from the top-left corner of the
      page's own bounding box instead of as absolute coordinates
    - strict: bool - Require bbox to lie entirely within the page's bounding box

    Returns:
    CroppedPage: New page containing only the objects fully outside bbox
    """

Usage Examples:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Get objects in a specific region (top 100 units of the page)
    header_region = (0, 0, page.width, 100)
    header_page = page.within_bbox(header_region)
    header_text = header_page.extract_text()

    # Keep only the body of the page: everything fully inside the band that
    # starts 100 units below the top and ends 100 units above the bottom
    content_region = (0, 100, page.width, page.height - 100)
    content_page = page.within_bbox(content_region)

    # Middle third of the page. Boxes are in page units, so the thirds are
    # computed from the page height; relative=True measures them from the
    # page's own top-left corner.
    middle_third = page.within_bbox(
        (0, page.height / 3, page.width, 2 * page.height / 3), relative=True
    )

    # strict=False skips the check that the box lies entirely within the page;
    # objects must still be fully inside the box to be kept
    overlapping = page.within_bbox((100, 100, 200, 200), strict=False)

    # Exclude a specific region (drop everything touching the top 50 units)
    no_header = page.outside_bbox((0, 0, page.width, 50))

Custom Object Filtering

Filter objects using custom test functions for complex selection criteria.

def filter(test_function):
    """
    Return a version of the page containing only the objects that pass a test.

    The test is applied to every object on the page regardless of type
    (characters, lines, rectangles, ...), so a test written for one type must
    decide explicitly what to do with the others.

    Parameters:
    - test_function: Callable[[T_obj], bool] - Called with one object dict at a
      time; return True to keep the object

    Returns:
    FilteredPage: New page with filtered objects based on test function
    """

Usage Examples:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Each predicate receives one object dict and returns True to keep it.

    # Font size above 12
    def has_large_font(item):
        return item.get('size', 0) > 12

    large_text = page.filter(has_large_font)

    # Font name mentions Arial
    def uses_arial(item):
        return 'Arial' in item.get('fontname', '')

    arial_text = page.filter(uses_arial)

    # Fill color is pure red
    def is_red(item):
        return item.get('non_stroking_color') == (1, 0, 0)

    red_objects = page.filter(is_red)

    # Character text consists of digits
    def is_digit_char(item):
        return item.get('text', '').isdigit()

    digits_only = page.filter(is_digit_char)

    # Combined criteria - large bold text
    def is_large_bold(obj):
        if not obj.get('size', 0) > 14:
            return False
        return 'Bold' in obj.get('fontname', '')

    headers = page.filter(is_large_bold)
    header_text = headers.extract_text()

Derived Page Classes

Specialized page classes for manipulated views.

class CroppedPage(DerivedPage):
    """
    Page view restricted to a bounding box of its parent page.

    NOTE(review): upstream pdfplumber names the second constructor argument
    crop_bbox and also accepts a crop_fn argument; this listing is a
    simplified outline - confirm before constructing instances directly.
    """

    def __init__(self, parent_page, bbox, relative=False, strict=True):
        """
        Initialize cropped page view.

        Parameters:
        - parent_page: page the view is derived from
        - bbox: bounding box (x0, top, x1, bottom) to crop to
        - relative: treat bbox as an offset from the parent page's top-left corner
        - strict: require bbox to lie entirely within the parent page's bounding box
        """

    @property
    def parent_page(self) -> Page:
        """Page this view was derived from."""

    @property
    def bbox(self) -> T_bbox:
        """Bounding box (x0, top, x1, bottom) the view is cropped to."""

class FilteredPage(DerivedPage):
    """
    Page view that exposes only the parent page's objects passing a test.

    NOTE(review): upstream pdfplumber stores the test as filter_fn rather than
    test_function - confirm the attribute name before relying on it.
    """

    def __init__(self, parent_page, test_function):
        """
        Initialize filtered page view.

        Parameters:
        - parent_page: page the view is derived from
        - test_function: callable that returns True for objects to keep
        """

    @property  
    def parent_page(self) -> Page:
        """Page this view was derived from."""

    @property
    def test_function(self) -> Callable:
        """Callable used to decide which objects are kept."""

class DerivedPage:
    """
    Base class for page views derived from other pages.

    NOTE(review): upstream pdfplumber declares DerivedPage as a subclass of
    Page, which is where the methods outlined below come from - confirm.
    """

    @property
    def width(self) -> T_num:
        """Width of the view's bounding box."""

    @property
    def height(self) -> T_num:
        """Height of the view's bounding box."""

    @property
    def bbox(self) -> T_bbox:
        """Bounding box (x0, top, x1, bottom) of the view."""

    # All Container and Page methods available; the four below are a sample.
    # Each one operates on the view's objects only.
    def extract_text(self, **kwargs): ...
    def extract_tables(self, **kwargs): ...
    # Views can be cropped or filtered again, yielding a further derived view.
    def crop(self, bbox, **kwargs): ...
    def filter(self, test_function): ...

Character Deduplication

Remove duplicate character objects that may occur from PDF processing.

def dedupe_chars(tolerance=1, use_text_flow=False, **kwargs):
    """
    Return a version of the page with duplicate characters removed.

    Two characters count as duplicates when they share the same text and
    attributes and their positions differ by no more than tolerance.

    NOTE(review): upstream pdfplumber documents this method as
    dedupe_chars(tolerance=1, extra_attrs=("fontname", "size")); a
    use_text_flow option does not appear there - confirm before passing it.

    Parameters:
    - tolerance: T_num - Maximum x/y distance between two characters for them
      to be treated as occupying the same position
    - use_text_flow: bool - See NOTE above; presumably not supported upstream
    - **kwargs: Additional deduplication options

    Returns:
    FilteredPage: New page-like object with deduplicated characters
    """

Usage Examples:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Remove duplicate characters with default tolerance
    clean_page = page.dedupe_chars()

    # Tighter tolerance: characters must sit within 0.5 units of each other
    # to be treated as duplicates
    very_clean = page.dedupe_chars(tolerance=0.5)

    # Choose which extra attributes must match for two characters to count as
    # duplicates (default: font name and size). Here only the font name is
    # compared, so copies that differ in size are still removed.
    same_font = page.dedupe_chars(extra_attrs=("fontname",))

    # Compare character counts
    original_chars = len(page.chars)
    clean_chars = len(clean_page.chars)
    print(f"Removed {original_chars - clean_chars} duplicate characters")

Coordinate Systems and Transformations

Understanding PDF Coordinates

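pdfplumber expresses bounding boxes as (x0, top, x1, bottom), measured from the top-left corner of the page:

Origin (0,0) is at top-left of page
X increases rightward
Y (top and bottom) increases downward
Objects additionally carry y0 and y1, which are measured from the bottom of the page as in the raw PDF
Page dimensions available as page.width and page.height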

Relative Coordinates

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Convert fractional (0-1) coordinates to page units
    rel_bbox = (0.1, 0.2, 0.9, 0.8)  # 10% side margins, 20% top/bottom margins
    abs_bbox = (
        rel_bbox[0] * page.width,
        rel_bbox[1] * page.height,
        rel_bbox[2] * page.width,
        rel_bbox[3] * page.height
    )

    # relative=True does NOT mean fractions: it measures the box, still in page
    # units, from the top-left corner of the page's own bounding box. Scale
    # fractions by the page size first.
    center_region = page.crop(
        (0.25 * page.width, 0.25 * page.height,
         0.75 * page.width, 0.75 * page.height),
        relative=True,
    )

Chaining Operations

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Predicate for the final step of the chain
    def larger_than_10(item):
        return item.get('size', 0) > 10

    # Box that trims a 50-unit border off every side of the page
    inner_box = (50, 50, page.width-50, page.height-50)

    # Each call returns a new page-like object, so the calls can be chained
    processed_page = page.dedupe_chars().crop(inner_box).filter(larger_than_10)

    # The result supports the same extraction methods as a regular page
    text = processed_page.extract_text()
    tables = processed_page.extract_tables()

Performance Considerations

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Efficient: filter before expensive operations. The test runs on every
    # object type, so non-character objects are passed through untouched;
    # otherwise the lines and rectangles that table detection relies on (they
    # have no 'size') would be filtered out along with the small text.
    large_text = page.filter(
        lambda obj: obj.get('object_type') != 'char' or obj.get('size', 0) > 12
    )
    tables = large_text.extract_tables()  # Operates on fewer characters

    # Less efficient: extract from full page then filter results
    all_tables = page.extract_tables()
    # Manual filtering of results

Object Access in Derived Pages

All derived pages maintain access to the full Container API:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]
    cropped = page.crop((100, 100, 400, 300))

    # Access filtered object collections
    chars = cropped.chars          # Only characters in cropped region
    lines = cropped.lines          # Only lines in cropped region
    rects = cropped.rects          # Only rectangles in cropped region
    images = cropped.images        # Only images in cropped region

    # Derived properties work with filtered objects
    edges = cropped.edges          # All edges from filtered objects
    h_edges = cropped.horizontal_edges
    v_edges = cropped.vertical_edges

    # Export filtered objects. to_json / to_csv write to an open text stream
    # (or return a string when called without one); they do not take a path.
    with open("cropped_objects.json", "w", encoding="utf-8") as json_file:
        cropped.to_json(json_file)
    # newline="" lets the csv writer control line endings itself
    with open("cropped_data.csv", "w", encoding="utf-8", newline="") as csv_file:
        cropped.to_csv(csv_file)

Install with Tessl CLI