Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.
—
Page cropping, object filtering, bounding box operations, coordinate transformations, and derived page creation for precise PDF element analysis.
Create cropped views of pages with filtered objects based on bounding box regions.
def crop(bbox, relative=False, strict=True):
"""
Crop page to bounding box.
Parameters:
- bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box (x0, top, x1, bottom)
- relative: bool - Interpret bbox as an offset from the top-left corner of the page's bounding box rather than as absolute coordinates
- strict: bool - Require the crop bbox to fall entirely within the page's bounding box
Returns:
CroppedPage: New page object with cropped view
"""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Crop to specific region (absolute coordinates)
cropped = page.crop((100, 100, 400, 300))
text = cropped.extract_text()
print(f"Cropped region text: {text}")
# Crop to relative coordinates (percentages)
# Top-left quarter of page
quarter = page.crop((0, 0, 0.5, 0.5), relative=True)
# Crop with non-strict filtering (partial overlap allowed)
loose_crop = page.crop((100, 100, 400, 300), strict=False)
# Chain cropping operations
top_half = page.crop((0, 0, 1, 0.5), relative=True)
top_left = top_half.crop((0, 0, 0.5, 1), relative=True)

Filter page objects based on spatial relationships to bounding boxes.
def within_bbox(bbox, relative=False, strict=True):
"""
Filter objects within bounding box.
Parameters:
- bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box coordinates
- relative: bool - Treat coordinates as relative to page
- strict: bool - Objects must be entirely within bbox
Returns:
FilteredPage: New page with filtered objects
"""
def outside_bbox(bbox, relative=False, strict=True):
"""
Filter objects outside bounding box.
Parameters:
- bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box coordinates
- relative: bool - Treat coordinates as relative to page
- strict: bool - Objects must be entirely outside bbox
Returns:
FilteredPage: New page with filtered objects
"""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Get objects in specific region
header_region = (0, 0, page.width, 100)
header_page = page.within_bbox(header_region)
header_text = header_page.extract_text()
# Get objects within the content region (this excludes the header and footer bands)
content_region = (0, 100, page.width, page.height - 100)
content_page = page.within_bbox(content_region)
# Use relative coordinates
middle_third = page.within_bbox((0, 0.33, 1, 0.67), relative=True)
# Non-strict filtering (partial overlap)
overlapping = page.within_bbox((100, 100, 200, 200), strict=False)
# Exclude specific region
no_header = page.outside_bbox((0, 0, page.width, 50))

Filter objects using custom test functions for complex selection criteria.
def filter(test_function):
"""
Filter objects using custom function.
Parameters:
- test_function: Callable[[T_obj], bool] - Function that returns True for objects to keep
Returns:
FilteredPage: New page with filtered objects based on test function
"""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Filter by font size
large_text = page.filter(lambda obj: obj.get('size', 0) > 12)
# Filter by font name
arial_text = page.filter(lambda obj: 'Arial' in obj.get('fontname', ''))
# Filter by color
red_objects = page.filter(lambda obj: obj.get('non_stroking_color') == (1, 0, 0))
# Filter characters by content
digits_only = page.filter(lambda obj: obj.get('text', '').isdigit())
# Complex filtering - large bold text
def is_large_bold(obj):
return (obj.get('size', 0) > 14 and
'Bold' in obj.get('fontname', ''))
headers = page.filter(is_large_bold)
header_text = headers.extract_text()

Specialized page classes for manipulated views.
class CroppedPage(DerivedPage):
"""Page cropped to specific bounding box."""
def __init__(self, parent_page, bbox, relative=False, strict=True):
"""Initialize cropped page view."""
@property
def parent_page(self) -> Page:
"""Original page object."""
@property
def bbox(self) -> T_bbox:
"""Cropping bounding box."""
class FilteredPage(DerivedPage):
"""Page with filtered objects."""
def __init__(self, parent_page, test_function):
"""Initialize filtered page view."""
@property
def parent_page(self) -> Page:
"""Original page object."""
@property
def test_function(self) -> Callable:
"""Filtering test function."""
class DerivedPage:
"""Base class for page views derived from other pages."""
@property
def width(self) -> T_num:
"""Page width."""
@property
def height(self) -> T_num:
"""Page height."""
@property
def bbox(self) -> T_bbox:
"""Page bounding box."""
# All Container and Page methods available
def extract_text(self, **kwargs): ...
def extract_tables(self, **kwargs): ...
def crop(self, bbox, **kwargs): ...
def filter(self, test_function): ...

Remove duplicate character objects that may occur from PDF processing.
def dedupe_chars(tolerance=1, use_text_flow=False, **kwargs):
"""
Remove duplicate characters.
Parameters:
- tolerance: T_num - Distance tolerance for duplicate detection
- use_text_flow: bool - Consider text flow direction in deduplication
- **kwargs: Additional deduplication options
Returns:
Page: New page object with deduplicated characters
"""

Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Remove duplicate characters with default tolerance
clean_page = page.dedupe_chars()
# Strict deduplication with tight tolerance
very_clean = page.dedupe_chars(tolerance=0.5)
# Consider text flow for better deduplication
flow_aware = page.dedupe_chars(use_text_flow=True)
# Compare character counts
original_chars = len(page.chars)
clean_chars = len(clean_page.chars)
print(f"Removed {original_chars - clean_chars} duplicate characters")

pdfplumber uses the PDF coordinate system, where:
- Page dimensions are available as page.width and page.height

with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Convert relative to absolute coordinates
rel_bbox = (0.1, 0.2, 0.9, 0.8) # 10% margin on all sides
abs_bbox = (
rel_bbox[0] * page.width,
rel_bbox[1] * page.height,
rel_bbox[2] * page.width,
rel_bbox[3] * page.height
)
# Use relative coordinates directly
center_region = page.crop((0.25, 0.25, 0.75, 0.75), relative=True)

with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Chain multiple operations
processed_page = (page
.dedupe_chars()
.crop((50, 50, page.width-50, page.height-50))
.filter(lambda obj: obj.get('size', 0) > 10))
# Each operation returns a new page-like object
text = processed_page.extract_text()
tables = processed_page.extract_tables()

with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Efficient: filter before expensive operations
large_text = page.filter(lambda obj: obj.get('size', 0) > 12)
tables = large_text.extract_tables() # Operates on fewer objects
# Less efficient: extract from full page then filter results
all_tables = page.extract_tables()
# Manual filtering of results

All derived pages maintain access to the full Container API:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
cropped = page.crop((100, 100, 400, 300))
# Access filtered object collections
chars = cropped.chars # Only characters in cropped region
lines = cropped.lines # Only lines in cropped region
rects = cropped.rects # Only rectangles in cropped region
images = cropped.images # Only images in cropped region
# Derived properties work with filtered objects
edges = cropped.edges # All edges from filtered objects
h_edges = cropped.horizontal_edges
v_edges = cropped.vertical_edges
# Export filtered objects
cropped.to_json("cropped_objects.json")
cropped.to_csv("cropped_data.csv")

Install with Tessl CLI
npx tessl i tessl/pypi-pdfplumber