tessl/pypi-pdfplumber

Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.

—

Pending

Overview

Eval results

Files

Visual Debugging

Name: tessl/pypi-pdfplumber
Author: tessl

Comprehensive visualization tools for overlaying debug information on PDF pages, including object highlighting, table structure visualization, custom drawing operations, and image export capabilities.

Capabilities

Page to Image Conversion

Convert PDF pages to images for visualization and debugging purposes.

def to_image(resolution=None, width=None, height=None, antialias=False,
             force_mediabox=False):
    """
    Convert page to image for debugging.

    Specify at most one of resolution, width and height: they are
    alternative ways of choosing the output size, and pdfplumber raises
    ValueError when more than one of them is provided.

    Parameters:
    - resolution: int or float, optional - Image resolution in DPI (default: 72)
    - width: int, optional - Target image width in pixels
    - height: int, optional - Target image height in pixels
    - antialias: bool - Enable antialiasing for smoother rendering
    - force_mediabox: bool - Use MediaBox instead of CropBox for dimensions

    Returns:
    PageImage: Image object with drawing capabilities

    Raises:
    ValueError: If more than one of resolution, width, height is given
    """

Usage Examples:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Basic image conversion (default resolution)
    im = page.to_image()
    im.save("page.png")

    # High resolution image
    hires = page.to_image(resolution=300)
    hires.save("page_hires.png")

    # Specific width. resolution, width and height are mutually exclusive:
    # passing more than one of them raises ValueError, so give only one
    # and let the other dimension follow from the page's proportions.
    thumb = page.to_image(width=400)
    thumb.save("thumbnail.png")

    # Antialiased rendering
    smooth = page.to_image(antialias=True)
    smooth.save("smooth.png")

PageImage Class

Image representation with comprehensive drawing and debugging capabilities.

class PageImage:
    """Rendered page image that can be drawn on for debugging."""

    def __init__(self, page, original=None, resolution=72, antialias=False,
                 force_mediabox=False):
        """Build a PageImage for the given page."""

    @property
    def page(self) -> Page:
        """The page this image was created from."""

    @property
    def original(self) -> PIL.Image.Image:
        """The page image as rendered, with no annotations applied."""

    @property
    def annotated(self) -> PIL.Image.Image:
        """The image in its current state, annotations included."""

    @property
    def resolution(self) -> Union[int, float]:
        """Resolution of the image, in DPI."""

    @property
    def scale(self) -> float:
        """Factor that converts PDF coordinates into image pixels."""

    def reset(self):
        """Discard annotations, returning to the original image."""

    def copy(self):
        """Return a copy of this PageImage."""

    def save(self, dest, format="PNG", quantize=True, colors=256, bits=8, **kwargs):
        """Write the image to a file."""

    def show(self):
        """Show the image (for interactive environments)."""

Drawing Lines

Draw lines and line collections on the image.

def draw_line(points_or_obj, stroke=(255, 0, 0, 200), stroke_width=1):
    """
    Draw one line on the image.

    Parameters:
    - points_or_obj: Either a list of points or a line object carrying coordinates
    - stroke: Tuple[int, int, int, int] - Line color as RGBA
    - stroke_width: int - Width of the line, in pixels

    Returns:
    PageImage: The same image, so calls can be chained
    """

def draw_lines(list_of_lines, stroke=(255, 0, 0, 200), stroke_width=1):
    """
    Draw several lines on the image.

    Parameters:
    - list_of_lines: Line objects or point lists, one entry per line
    - stroke: Line color as an RGBA tuple
    - stroke_width: int - Width of each line

    Returns:
    PageImage: The same image, so calls can be chained
    """

def draw_vline(location, stroke=(255, 0, 0, 200), stroke_width=1):
    """Draw a vertical line; location is its X coordinate."""

def draw_vlines(locations, stroke=(255, 0, 0, 200), stroke_width=1):
    """Draw several vertical lines, one per entry in locations."""

def draw_hline(location, stroke=(255, 0, 0, 200), stroke_width=1):
    """Draw a horizontal line; location is its Y coordinate."""

def draw_hlines(locations, stroke=(255, 0, 0, 200), stroke_width=1):
    """Draw several horizontal lines, one per entry in locations."""

Usage Examples:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]
    im = page.to_image()

    # Overlay every line object found on the page
    im.draw_lines(page.lines)

    # One hand-placed diagonal in opaque green
    diagonal = [(100, 100), (200, 200)]
    im.draw_line(diagonal, stroke=(0, 255, 0, 255), stroke_width=3)

    # Faint blue reference grid
    grid_stroke = (0, 0, 255, 100)
    im.draw_vlines([100, 200, 300, 400], stroke=grid_stroke)
    im.draw_hlines([100, 200, 300], stroke=grid_stroke)

    im.save("lines_debug.png")

Drawing Rectangles

Draw rectangles and rectangle collections with fill and stroke options.

def draw_rect(bbox_or_obj, fill=(0, 0, 255, 50), stroke=(255, 0, 0, 200),
              stroke_width=1):
    """
    Draw one rectangle on the image.

    Parameters:
    - bbox_or_obj: A bounding box tuple, or an object carrying bbox coordinates
    - fill: Interior color as an RGBA tuple
    - stroke: Outline color as an RGBA tuple
    - stroke_width: int - Width of the outline

    Returns:
    PageImage: The same image, so calls can be chained
    """

def draw_rects(list_of_rects, fill=(0, 0, 255, 50), stroke=(255, 0, 0, 200),
               stroke_width=1):
    """Draw several rectangles, one per entry in list_of_rects."""

Usage Examples:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]
    im = page.to_image()

    # Every rect object on the page, default styling
    im.draw_rects(page.rects)

    # Faint red box around each character
    char_fill = (255, 0, 0, 30)
    char_stroke = (255, 0, 0, 100)
    im.draw_rects(page.chars, fill=char_fill, stroke=char_stroke)

    # A single hand-specified box, filled green
    custom_bbox = (100, 100, 300, 200)
    im.draw_rect(custom_bbox, fill=(0, 255, 0, 100))

    im.save("rects_debug.png")

Drawing Circles

Draw circles and circular markers.

def draw_circle(center_or_obj, radius=5, fill=(0, 0, 255, 50),
                stroke=(255, 0, 0, 200)):
    """
    Draw one circle on the image.

    Parameters:
    - center_or_obj: A center point tuple, or an object carrying center coordinates
    - radius: int - Radius of the circle, in pixels
    - fill: Interior color as an RGBA tuple
    - stroke: Outline color as an RGBA tuple

    Returns:
    PageImage: The same image, so calls can be chained
    """

def draw_circles(list_of_circles, radius=5, fill=(0, 0, 255, 50),
                 stroke=(255, 0, 0, 200)):
    """Draw several circles, one per entry in list_of_circles."""

Usage Examples:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]
    im = page.to_image()

    # Mark character centers: one (x, y) tuple per character, taken as the
    # midpoint of its bounding box. Each element must be wrapped in its own
    # parentheses to form a tuple inside the list comprehension.
    char_centers = [
        ((c['x0'] + c['x1']) / 2, (c['top'] + c['bottom']) / 2)
        for c in page.chars
    ]
    im.draw_circles(char_centers, radius=2, fill=(255, 0, 0, 100))

    # Mark specific points (here: the middle of the page)
    im.draw_circle((page.width/2, page.height/2), radius=10,
                   fill=(0, 255, 0, 200))

    im.save("circles_debug.png")

Text Visualization

Specialized methods for visualizing text elements and word boundaries.

def outline_words(stroke=(255, 0, 0, 200), fill=(255, 0, 0, 50),
                  stroke_width=1, x_tolerance=3, y_tolerance=3):
    """
    Draw an outline around each detected word.

    Parameters:
    - stroke: Outline color for words, as RGBA
    - fill: Fill color for words, as RGBA
    - stroke_width: int - Width of the outline
    - x_tolerance: float - Horizontal tolerance used when detecting words
    - y_tolerance: float - Vertical tolerance used when detecting words

    Returns:
    PageImage: The same image, so calls can be chained
    """

def outline_chars(stroke=(255, 0, 0, 255), fill=(255, 0, 0, 63),
                  stroke_width=1):
    """
    Draw an outline around each individual character.

    Parameters:
    - stroke: Outline color for characters, as RGBA
    - fill: Fill color for characters, as RGBA
    - stroke_width: int - Width of the outline

    Returns:
    PageImage: The same image, so calls can be chained
    """

Usage Examples:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]
    im = page.to_image()

    # Words, default styling and tolerances
    im.outline_words()

    # Characters in green instead of the default red
    green_outline = (0, 255, 0, 255)
    green_fill = (0, 255, 0, 30)
    im.outline_chars(stroke=green_outline, fill=green_fill)

    # Words again, with tighter tolerances and a blue outline
    blue_outline = (0, 0, 255, 200)
    im.outline_words(x_tolerance=1, y_tolerance=1, stroke=blue_outline)

    im.save("text_debug.png")

Table Debugging

Specialized visualization for table detection and structure analysis.

def debug_table(table, fill=(0, 0, 255, 50), stroke=(255, 0, 0, 200),
                stroke_width=1):
    """
    Draw the structure of a table onto the image.

    Parameters:
    - table: The Table object to visualize
    - fill: Fill color for cells, as RGBA
    - stroke: Outline color for cells, as RGBA
    - stroke_width: int - Width of the outline

    Returns:
    PageImage: The same image, so calls can be chained
    """

def debug_tablefinder(table_settings=None):
    """
    Draw a visualization of the table detection process.

    Parameters:
    - table_settings: Detection configuration, as a TableSettings or a dict

    Returns:
    PageImage: The same image, so calls can be chained
    """

Usage Examples:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]
    im = page.to_image()

    # Overlay each detected table, cycling through red, green and blue fills
    palette = [(255, 0, 0, 50), (0, 255, 0, 50), (0, 0, 255, 50)]
    tables = page.find_tables()
    for i, table in enumerate(tables):
        im.debug_table(table, fill=palette[i % 3])

    # Overlay the table finder's view using default settings
    im.debug_tablefinder()

    # Overlay it again using custom strategies
    custom_settings = {
        "vertical_strategy": "text",
        "horizontal_strategy": "lines",
    }
    im.debug_tablefinder(table_settings=custom_settings)

    im.save("table_debug.png")

Drawing Constants

Default colors and styling options for drawing operations.

# Defaults shared by the drawing operations.
# Colors are RGBA tuples: an RGB triple followed by an alpha value.
DEFAULT_RESOLUTION = 72
DEFAULT_FILL = (0, 0, 255) + (50,)     # blue, semi-transparent
DEFAULT_STROKE = (255, 0, 0) + (200,)  # red, semi-transparent
DEFAULT_STROKE_WIDTH = 1

Advanced Visualization Workflows

Multi-layer Debugging:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]
    im = page.to_image(resolution=150)

    # Layer 1: page structure (rects and lines in muted greys)
    im.draw_rects(page.rects, fill=(200, 200, 200, 30))
    im.draw_lines(page.lines, stroke=(100, 100, 100, 150))

    # Layer 2: text elements (characters in faint red)
    im.outline_chars(stroke=(255, 0, 0, 100), fill=(255, 0, 0, 20))

    # Layer 3: tables (green)
    table_fill = (0, 255, 0, 40)
    table_stroke = (0, 255, 0, 200)
    for table in page.find_tables():
        im.debug_table(table, fill=table_fill, stroke=table_stroke)

    # Layer 4: custom annotations
    # Yellow highlight on characters whose size exceeds 12
    large_chars = []
    for c in page.chars:
        if c.get('size', 0) > 12:
            large_chars.append(c)
    im.draw_rects(large_chars, fill=(255, 255, 0, 80))

    im.save("comprehensive_debug.png")

Comparative Analysis:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Two table detection strategies to compare side by side
    lines_only = {"vertical_strategy": "lines", "horizontal_strategy": "lines"}
    text_only = {"vertical_strategy": "text", "horizontal_strategy": "text"}
    strategies = [lines_only, text_only]

    # A fresh image per strategy keeps each overlay separate
    for i, settings in enumerate(strategies):
        im = page.to_image()
        im.debug_tablefinder(table_settings=settings)
        im.save(f"table_strategy_{i+1}.png")

Region-Specific Debugging:

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Debug specific page regions. Each bbox is (x0, top, x1, bottom).
    # The regions share their boundaries (header/content at 100,
    # content/footer at footer_top) so that together they tile the whole
    # page with no band left out.
    # NOTE: assumes the page is taller than 150 units; adjust for other sizes.
    header_bottom = 100
    footer_top = page.height - 50
    regions = [
        ("header", (0, 0, page.width, header_bottom)),
        ("content", (0, header_bottom, page.width, footer_top)),
        ("footer", (0, footer_top, page.width, page.height)),
    ]

    for name, bbox in regions:
        cropped = page.crop(bbox)
        im = cropped.to_image()
        im.outline_words()
        im.save(f"{name}_debug.png")

Install with Tessl CLI