tessl/pypi-pdfplumber

Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.

—

Pending

Overview

Eval results

Files

Utilities

Name: tessl/pypi-pdfplumber
Author: tessl

Utility functions for geometry operations, text processing, clustering, low-level PDF object resolution and decoding, and generic data conversion.

Capabilities

Geometry Operations

Comprehensive geometric operations for bounding boxes, object positioning, and spatial analysis.

def bbox_to_rect(bbox):
    """
    Convert a bounding-box tuple into a rectangle dictionary.

    Parameters:
    - bbox: Tuple[T_num, T_num, T_num, T_num] - Coordinates in the order
      (x0, top, x1, bottom)

    Returns:
    Dict[str, T_num]: Rectangle with x0, top, x1, bottom, width, height

    NOTE(review): upstream pdfplumber appears to return only the four
    coordinate keys (x0, top, x1, bottom); confirm that width and height
    are present before relying on them.
    """

def calculate_area(bbox):
    """
    Calculate the area enclosed by a bounding box.

    Parameters:
    - bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box coordinates,
      presumably ordered (x0, top, x1, bottom) like the other helpers in
      this module

    Returns:
    T_num: Area of the bounding box
    """

def merge_bboxes(bboxes):
    """
    Merge several bounding boxes into the single box that encloses them all.

    Parameters:
    - bboxes: List[T_bbox] - Bounding boxes to merge

    Returns:
    T_bbox: Smallest bounding box containing every input box

    NOTE(review): behaviour for an empty list is not documented here;
    confirm before passing a possibly-empty collection.
    """

def get_bbox_overlap(a, b):
    """
    Compute the region shared by two bounding boxes.

    Parameters:
    - a, b: T_bbox - The two bounding boxes to compare

    Returns:
    T_bbox or None: The overlapping region, or None when the boxes do not
    overlap
    """

def objects_to_bbox(objects):
    """
    Compute the bounding box that encloses a collection of objects.

    Parameters:
    - objects: List[T_obj] - Objects carrying bbox information (for
      example the character dictionaries in page.chars)

    Returns:
    T_bbox: Bounding box encompassing all objects
    """

def objects_to_rect(objects):
    """
    Compute the rectangle dictionary that encloses a collection of objects.

    Parameters:
    - objects: List[T_obj] - Objects to enclose

    Returns:
    Dict[str, T_num]: Rectangle dictionary (presumably the same shape that
    bbox_to_rect returns; confirm which keys are present)
    """

Usage Examples:

import pdfplumber
from pdfplumber.utils import bbox_to_rect, calculate_area, objects_to_bbox

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # A page without text has no first character and no text bounding box,
    # so only run the character-based steps when characters exist.
    if page.chars:
        # Convert bbox to rect format
        char = page.chars[0]
        rect = bbox_to_rect((char['x0'], char['top'], char['x1'], char['bottom']))
        # Derive the size from the coordinates rather than assuming the
        # rect dictionary carries 'width'/'height' keys.
        char_width = rect['x1'] - rect['x0']
        char_height = rect['bottom'] - rect['top']
        print(f"Character width: {char_width}, height: {char_height}")

        # Find bounding box of all characters
        all_chars_bbox = objects_to_bbox(page.chars)
        print(f"Text area: {all_chars_bbox}")

        # Calculate text coverage
        page_area = calculate_area((0, 0, page.width, page.height))
        text_area = calculate_area(all_chars_bbox)
        coverage = text_area / page_area
        print(f"Text covers {coverage:.1%} of page")

Object Spatial Filtering

Filter objects based on spatial relationships and positioning.

def within_bbox(objs, bbox):
    """
    Keep only the objects that lie within a bounding box.

    Parameters:
    - objs: List[T_obj] - Objects to filter
    - bbox: T_bbox - Bounding box used as the filter region

    Returns:
    List[T_obj]: Objects within the bounding box

    NOTE(review): presumably an object must lie entirely inside bbox to be
    kept (partial overlaps are the job of intersects_bbox); confirm.
    """

def outside_bbox(objs, bbox):
    """
    Keep only the objects that lie outside a bounding box.

    Parameters:
    - objs: List[T_obj] - Objects to filter
    - bbox: T_bbox - Bounding box used as the exclusion region

    Returns:
    List[T_obj]: Objects outside the bounding box
    """

def intersects_bbox(objs, bbox):
    """
    Keep only the objects that intersect a bounding box.

    Parameters:
    - objs: List[T_obj] - Objects to filter
    - bbox: T_bbox - Bounding box to test each object against

    Returns:
    List[T_obj]: Objects intersecting the bounding box
    """

def crop_to_bbox(objs, bbox):
    """
    Filter objects to those intersecting bbox.

    Parameters:
    - objs: List[T_obj] - Objects to filter
    - bbox: T_bbox - Bounding box

    Returns:
    List[T_obj]: Objects intersecting the bounding box

    NOTE(review): this page previously described the function as a plain
    alias for intersects_bbox. Upstream pdfplumber appears to also clip
    each surviving object's extent to bbox (see clip_obj), so the returned
    objects may have different coordinates than the inputs; confirm.
    """

Object Manipulation

Transform and modify object properties and positioning.

def move_object(obj, axis, value):
    """
    Shift an object along one axis.

    Parameters:
    - obj: T_obj - Object to move
    - axis: str - Axis to move along ('x' or 'y')
    - value: T_num - Distance to move

    Returns:
    T_obj: New object with updated coordinates

    NOTE(review): upstream pdfplumber appears to accept 'h' (horizontal)
    and 'v' (vertical) for axis rather than 'x'/'y'; confirm against the
    installed version before use.
    """

def resize_object(obj, key, value):
    """
    Resize an object by setting one of its properties to a new value.

    Parameters:
    - obj: T_obj - Object to resize
    - key: str - Property to modify (presumably one of the coordinate
      keys such as 'x0', 'x1', 'top', 'bottom'; confirm)
    - value: T_num - New value for that property

    Returns:
    T_obj: New object with the updated property
    """

def clip_obj(obj, bbox):
    """
    Clip an object so that it fits inside a bounding box.

    Parameters:
    - obj: T_obj - Object to clip
    - bbox: T_bbox - Clipping boundary

    Returns:
    T_obj or None: The clipped object, or None when the object lies
    completely outside bbox
    """

Edge and Line Processing

Convert rectangles, curves, and lines into edge objects, and filter edges by orientation, type, and length.

def obj_to_edges(obj):
    """
    Convert an object into its constituent edges.

    Parameters:
    - obj: T_obj - Object to convert (rectangle, curve, etc.)

    Returns:
    List[T_obj]: Edge objects derived from obj
    """

def line_to_edge(line):
    """
    Convert a line object into an edge object.

    Parameters:
    - line: T_obj - Line object to convert

    Returns:
    T_obj: The corresponding edge object
    """

def curve_to_edges(curve):
    """
    Convert a curve object into edge objects.

    Parameters:
    - curve: T_obj - Curve object to convert

    Returns:
    List[T_obj]: Edge objects derived from the curve
    """

def rect_to_edges(rect):
    """
    Convert a rectangle object into its four edges.

    Parameters:
    - rect: T_obj - Rectangle object to convert

    Returns:
    List[T_obj]: Four edge objects (top, bottom, left, right)
    """

def filter_edges(edges, orientation=None, edge_type=None, min_length=1):
    """
    Filter edges by orientation, type, and minimum length.

    Parameters:
    - edges: List[T_obj] - Edge objects to filter
    - orientation: str, optional - 'h' for horizontal, 'v' for vertical;
      defaults to None (presumably no orientation filtering; confirm)
    - edge_type: str, optional - Type of edge to include; defaults to None
      (presumably all types; confirm)
    - min_length: T_num - Minimum edge length; defaults to 1

    Returns:
    List[T_obj]: Edges that pass every filter
    """

Object Snapping and Alignment

Align objects to common positions and snap coordinates.

def snap_objects(objs, attr, tolerance):
    """
    Snap an attribute of nearby objects to a common value.

    Parameters:
    - objs: List[T_obj] - Objects to snap
    - attr: str - Attribute to snap (e.g., 'x0', 'top')
    - tolerance: T_num - Snapping tolerance

    Returns:
    List[T_obj]: Objects with snapped coordinates
    """

Clustering Operations

Group numbers, or objects keyed by a value extracted from each object, into clusters within a given tolerance.

def cluster_list(xs, tolerance=0):
    """
    Group a list of numbers into clusters.

    Parameters:
    - xs: List[T_num] - Numbers to cluster
    - tolerance: T_num - Clustering tolerance; defaults to 0

    Returns:
    List[List[T_num]]: Clusters of numbers
    """

def cluster_objects(objs, key_fn, tolerance):
    """
    Group objects into clusters based on a key extracted from each object.

    Parameters:
    - objs: List[T_obj] - Objects to cluster
    - key_fn: Callable[[T_obj], T_num] - Function returning the value to
      cluster on, e.g. lambda c: c['x0']
    - tolerance: T_num - Clustering tolerance

    Returns:
    List[List[T_obj]]: Clusters of objects
    """

def make_cluster_dict(values, tolerance):
    """
    Build a mapping from each value to the cluster it belongs to.

    Parameters:
    - values: List[T_num] - Values to cluster
    - tolerance: T_num - Clustering tolerance

    Returns:
    Dict[T_num, T_num]: Mapping from value to cluster representative
    """

Usage Examples:

import pdfplumber
from pdfplumber.utils import cluster_list, cluster_objects

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Cluster characters by font size
    size_clusters = cluster_objects(
        page.chars,
        lambda c: c.get('size', 0),
        tolerance=1,
    )
    print(f"Found {len(size_clusters)} font size groups")

    # Cluster horizontal positions
    x_positions = [c['x0'] for c in page.chars]
    x_clusters = cluster_list(x_positions, tolerance=5)
    print(f"Text aligns to {len(x_clusters)} column positions")

    # Find common Y positions (likely text lines)
    y_positions = [c['top'] for c in page.chars]
    y_clusters = cluster_list(y_positions, tolerance=2)
    print(f"Text appears on {len(y_clusters)} distinct lines")

Text Processing

Advanced text processing and character manipulation utilities.

def extract_text(chars, **kwargs):
    """
    Extract text from a list of character objects.

    Parameters:
    - chars: List[T_obj] - Character objects (e.g. page.chars)
    - **kwargs: Text extraction options, such as x_tolerance and
      y_tolerance

    Returns:
    str: Extracted text
    """

def extract_text_simple(chars, **kwargs):
    """
    Extract text from character objects without preserving layout.

    Parameters:
    - chars: List[T_obj] - Character objects
    - **kwargs: Extraction options

    Returns:
    str: Extracted text without layout preservation
    """

def extract_words(chars, **kwargs):
    """
    Group character objects into word objects.

    Parameters:
    - chars: List[T_obj] - Character objects
    - **kwargs: Word extraction options

    Returns:
    List[T_obj]: Word objects with position data
    """

def dedupe_chars(chars, tolerance=1, **kwargs):
    """
    Remove duplicate characters from a list of character objects.

    Parameters:
    - chars: List[T_obj] - Character objects
    - tolerance: T_num - Distance tolerance for duplicate detection;
      defaults to 1
    - **kwargs: Deduplication options

    Returns:
    List[T_obj]: Deduplicated character objects
    """

def chars_to_textmap(chars, **kwargs):
    """
    Build a TextMap object from character objects.

    Parameters:
    - chars: List[T_obj] - Character objects
    - **kwargs: TextMap options

    Returns:
    TextMap: Character mapping object
    """

def collate_line(chars, **kwargs):
    """
    Join the characters of a single line into a text string.

    Parameters:
    - chars: List[T_obj] - Character objects belonging to one line
    - **kwargs: Line collation options

    Returns:
    str: Text content of the line
    """

PDF Internals

Low-level PDF object processing and decoding utilities.

def resolve(x):
    """
    Resolve a PDF object reference.

    Parameters:
    - x: Any - PDF object that may contain references

    Returns:
    Any: Resolved object with references dereferenced

    NOTE(review): presumably resolves only the top-level reference, with
    resolve_all handling nested structures; confirm.
    """

def resolve_all(x):
    """
    Recursively resolve every reference inside a PDF object structure.

    Parameters:
    - x: Any - PDF object structure

    Returns:
    Any: Completely resolved object structure
    """

def resolve_and_decode(obj):
    """
    Resolve a PDF object and decode its contents.

    Parameters:
    - obj: Any - PDF object

    Returns:
    Any: Resolved and decoded object
    """

def decode_text(s):
    """
    Decode text supplied as bytes or as a string.

    Parameters:
    - s: bytes or str - Text to decode

    Returns:
    str: Decoded text string
    """

def decode_psl_list(psl_list):
    """
    Decode a list of PSLiteral objects.

    Parameters:
    - psl_list: List - List of PSLiteral objects

    Returns:
    List: Decoded list
    """

Generic Utilities

General-purpose utility functions.

def to_list(collection):
    """
    Convert a collection into a list.

    Parameters:
    - collection: Any - Collection to convert (list, tuple, generator, etc.)

    Returns:
    List: List representation of the collection
    """

Constants

Commonly used default values and tolerances.

# Text processing constants
# The tolerances are the values passed as x_tolerance / y_tolerance to
# extract_text when the defaults are wanted.
DEFAULT_X_TOLERANCE = 3
DEFAULT_Y_TOLERANCE = 3
# NOTE(review): the densities are presumably used by layout-preserving text
# extraction (units per character / per line) - confirm against upstream.
DEFAULT_X_DENSITY = 7.25
DEFAULT_Y_DENSITY = 13

Usage Examples:

import pdfplumber
from pdfplumber.utils import (
    DEFAULT_X_TOLERANCE,
    DEFAULT_Y_TOLERANCE,
    extract_text,
    resolve_all,
)

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Use default tolerances
    text = extract_text(
        page.chars,
        x_tolerance=DEFAULT_X_TOLERANCE,
        y_tolerance=DEFAULT_Y_TOLERANCE,
    )

    # Process PDF internals
    # Use the public page.objects mapping (object type -> list of objects)
    # instead of reaching into a private attribute of the page.
    raw_chars = page.objects.get("char", [])
    resolved_chars = [resolve_all(char) for char in raw_chars]

Advanced Utility Workflows

Spatial Analysis:

import pdfplumber
from pdfplumber.utils import cluster_objects, extract_text, objects_to_bbox

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Find text columns
    char_clusters = cluster_objects(
        page.chars,
        lambda c: c['x0'],  # Group by left edge
        tolerance=10,
    )

    # Summarise each cluster of characters as one column record.
    columns = [
        {
            'bbox': objects_to_bbox(cluster),
            'text': extract_text(cluster),
            'char_count': len(cluster),
        }
        for cluster in char_clusters
    ]

    print(f"Document has {len(columns)} columns")

Font Analysis:

import pdfplumber
from pdfplumber.utils import cluster_objects

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Group by font properties
    # NOTE(review): the key here is a (fontname, size) tuple rather than a
    # number; this presumably only works because tolerance is 0 (exact
    # matching, no arithmetic on keys) - confirm against the installed
    # pdfplumber version before changing the tolerance.
    font_groups = cluster_objects(
        page.chars,
        lambda c: (c.get('fontname', ''), c.get('size', 0)),
        tolerance=0,  # Exact matching for fonts
    )

    for group in font_groups:
        # Every character in a group shares the same font key, so the
        # first one is representative.
        sample = group[0]
        font_name = sample.get('fontname', 'Unknown')
        font_size = sample.get('size', 0)
        char_count = len(group)

        print(f"Font: {font_name}, Size: {font_size}, Characters: {char_count}")

Install with Tessl CLI