Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.
—
Comprehensive visualization tools for overlaying debug information on PDF pages, including object highlighting, table structure visualization, custom drawing operations, and image export capabilities.
Convert PDF pages to images for visualization and debugging purposes.
def to_image(resolution=None, width=None, height=None, antialias=False,
force_mediabox=False):
"""
Convert page to image for debugging.
Parameters:
- resolution: int or float, optional - Image resolution in DPI (default: 72)
- width: int, optional - Target image width in pixels
- height: int, optional - Target image height in pixels
- antialias: bool - Enable antialiasing for smoother rendering
- force_mediabox: bool - Use MediaBox instead of CropBox for dimensions
Returns:
PageImage: Image object with drawing capabilities
"""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Basic image conversion
im = page.to_image()
im.save("page.png")
# High resolution image
hires = page.to_image(resolution=300)
hires.save("page_hires.png")
# Specific dimensions
thumb = page.to_image(width=400, height=600)
thumb.save("thumbnail.png")
# Antialiased rendering
smooth = page.to_image(antialias=True)
smooth.save("smooth.png")

Image representation with comprehensive drawing and debugging capabilities.
class PageImage:
"""Image representation with drawing capabilities."""
def __init__(self, page, original=None, resolution=72, antialias=False,
force_mediabox=False):
"""Initialize PageImage from page."""
@property
def page(self) -> Page:
"""Source page object."""
@property
def original(self) -> PIL.Image.Image:
"""Original page image without annotations."""
@property
def annotated(self) -> PIL.Image.Image:
"""Current image with annotations."""
@property
def resolution(self) -> Union[int, float]:
"""Image resolution in DPI."""
@property
def scale(self) -> float:
"""Scale factor from PDF coordinates to image pixels."""
def reset(self):
"""Reset annotations to original image."""
def copy(self):
"""Create copy of PageImage."""
def save(self, dest, format="PNG", quantize=True, colors=256, bits=8, **kwargs):
"""Save image to file."""
def show(self):
"""Display image (in interactive environments)."""

Draw lines and line collections on the image.
def draw_line(points_or_obj, stroke=(255, 0, 0, 200), stroke_width=1):
"""
Draw single line.
Parameters:
- points_or_obj: List of points or line object with coordinates
- stroke: Tuple[int, int, int, int] - RGBA color for line
- stroke_width: int - Line width in pixels
Returns:
PageImage: Self for method chaining
"""
def draw_lines(list_of_lines, stroke=(255, 0, 0, 200), stroke_width=1):
"""
Draw multiple lines.
Parameters:
- list_of_lines: List of line objects or point lists
- stroke: RGBA color tuple
- stroke_width: int - Line width
Returns:
PageImage: Self for method chaining
"""
def draw_vline(location, stroke=(255, 0, 0, 200), stroke_width=1):
"""Draw vertical line at X coordinate."""
def draw_vlines(locations, stroke=(255, 0, 0, 200), stroke_width=1):
"""Draw multiple vertical lines."""
def draw_hline(location, stroke=(255, 0, 0, 200), stroke_width=1):
"""Draw horizontal line at Y coordinate."""
def draw_hlines(locations, stroke=(255, 0, 0, 200), stroke_width=1):
"""Draw multiple horizontal lines."""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
im = page.to_image()
# Draw all lines on page
im.draw_lines(page.lines)
# Draw custom line
im.draw_line([(100, 100), (200, 200)], stroke=(0, 255, 0, 255), stroke_width=3)
# Draw grid lines
im.draw_vlines([100, 200, 300, 400], stroke=(0, 0, 255, 100))
im.draw_hlines([100, 200, 300], stroke=(0, 0, 255, 100))
im.save("lines_debug.png")

Draw rectangles and rectangle collections with fill and stroke options.
def draw_rect(bbox_or_obj, fill=(0, 0, 255, 50), stroke=(255, 0, 0, 200),
stroke_width=1):
"""
Draw rectangle.
Parameters:
- bbox_or_obj: Bounding box tuple or object with bbox coordinates
- fill: RGBA color tuple for rectangle fill
- stroke: RGBA color tuple for rectangle outline
- stroke_width: int - Outline width
Returns:
PageImage: Self for method chaining
"""
def draw_rects(list_of_rects, fill=(0, 0, 255, 50), stroke=(255, 0, 0, 200),
stroke_width=1):
"""Draw multiple rectangles."""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
im = page.to_image()
# Highlight all rectangles
im.draw_rects(page.rects)
# Highlight character bounding boxes
im.draw_rects(page.chars, fill=(255, 0, 0, 30), stroke=(255, 0, 0, 100))
# Custom rectangle
im.draw_rect((100, 100, 300, 200), fill=(0, 255, 0, 100))
im.save("rects_debug.png")

Draw circles and circular markers.
def draw_circle(center_or_obj, radius=5, fill=(0, 0, 255, 50),
stroke=(255, 0, 0, 200)):
"""
Draw circle.
Parameters:
- center_or_obj: Center point tuple or object with center coordinates
- radius: int - Circle radius in pixels
- fill: RGBA color tuple for circle fill
- stroke: RGBA color tuple for circle outline
Returns:
PageImage: Self for method chaining
"""
def draw_circles(list_of_circles, radius=5, fill=(0, 0, 255, 50),
stroke=(255, 0, 0, 200)):
"""Draw multiple circles."""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
im = page.to_image()
# Mark character centers
char_centers = [((c['x0'] + c['x1'])/2, (c['top'] + c['bottom'])/2)
for c in page.chars]
im.draw_circles(char_centers, radius=2, fill=(255, 0, 0, 100))
# Mark specific points
im.draw_circle((page.width/2, page.height/2), radius=10,
fill=(0, 255, 0, 200))
im.save("circles_debug.png")

Specialized methods for visualizing text elements and word boundaries.
def outline_words(stroke=(255, 0, 0, 200), fill=(255, 0, 0, 50),
stroke_width=1, x_tolerance=3, y_tolerance=3):
"""
Outline detected words.
Parameters:
- stroke: RGBA color for word outlines
- fill: RGBA color for word fill
- stroke_width: int - Outline width
- x_tolerance: float - Horizontal tolerance for word detection
- y_tolerance: float - Vertical tolerance for word detection
Returns:
PageImage: Self for method chaining
"""
def outline_chars(stroke=(255, 0, 0, 255), fill=(255, 0, 0, 63),
stroke_width=1):
"""
Outline individual characters.
Parameters:
- stroke: RGBA color for character outlines
- fill: RGBA color for character fill
- stroke_width: int - Outline width
Returns:
PageImage: Self for method chaining
"""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
im = page.to_image()
# Outline all words
im.outline_words()
# Outline characters with custom colors
im.outline_chars(stroke=(0, 255, 0, 255), fill=(0, 255, 0, 30))
# Fine-tuned word detection
im.outline_words(x_tolerance=1, y_tolerance=1,
stroke=(0, 0, 255, 200))
im.save("text_debug.png")

Specialized visualization for table detection and structure analysis.
def debug_table(table, fill=(0, 0, 255, 50), stroke=(255, 0, 0, 200),
stroke_width=1):
"""
Visualize table structure.
Parameters:
- table: Table object to visualize
- fill: RGBA color for cell fill
- stroke: RGBA color for cell outlines
- stroke_width: int - Outline width
Returns:
PageImage: Self for method chaining
"""
def debug_tablefinder(table_settings=None):
"""
Visualize table detection process.
Parameters:
- table_settings: TableSettings or dict for detection configuration
Returns:
PageImage: Self for method chaining
"""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
im = page.to_image()
# Debug all detected tables
tables = page.find_tables()
for i, table in enumerate(tables):
color = [(255, 0, 0, 50), (0, 255, 0, 50), (0, 0, 255, 50)][i % 3]
im.debug_table(table, fill=color)
# Debug table detection algorithm
im.debug_tablefinder()
# Debug with custom settings
custom_settings = {"vertical_strategy": "text", "horizontal_strategy": "lines"}
im.debug_tablefinder(table_settings=custom_settings)
im.save("table_debug.png")

Default colors and styling options for drawing operations.
# Default drawing constants
DEFAULT_RESOLUTION = 72
DEFAULT_FILL = (0, 0, 255, 50) # Semi-transparent blue
DEFAULT_STROKE = (255, 0, 0, 200) # Semi-transparent red
DEFAULT_STROKE_WIDTH = 1

Multi-layer Debugging:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
im = page.to_image(resolution=150)
# Layer 1: Page structure
im.draw_rects(page.rects, fill=(200, 200, 200, 30))
im.draw_lines(page.lines, stroke=(100, 100, 100, 150))
# Layer 2: Text elements
im.outline_chars(stroke=(255, 0, 0, 100), fill=(255, 0, 0, 20))
# Layer 3: Tables
tables = page.find_tables()
for table in tables:
im.debug_table(table, fill=(0, 255, 0, 40), stroke=(0, 255, 0, 200))
# Layer 4: Custom annotations
# Highlight large text
large_chars = [c for c in page.chars if c.get('size', 0) > 12]
im.draw_rects(large_chars, fill=(255, 255, 0, 80))
im.save("comprehensive_debug.png")

Comparative Analysis:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Compare different table detection strategies
strategies = [
{"vertical_strategy": "lines", "horizontal_strategy": "lines"},
{"vertical_strategy": "text", "horizontal_strategy": "text"}
]
for i, settings in enumerate(strategies):
im = page.to_image()
im.debug_tablefinder(table_settings=settings)
im.save(f"table_strategy_{i+1}.png")

Region-Specific Debugging:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Debug specific page regions
regions = [
("header", (0, 0, page.width, 100)),
("content", (0, 100, page.width, page.height-100)),
("footer", (0, page.height-50, page.width, page.height))
]
for name, bbox in regions:
cropped = page.crop(bbox)
im = cropped.to_image()
im.outline_words()
im.save(f"{name}_debug.png")

Install with Tessl CLI:
npx tessl i tessl/pypi-pdfplumber