Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.
—
Extensive utility functions for geometry operations, text processing, clustering algorithms, PDF internal structure manipulation, and data conversion utilities.
Comprehensive geometric operations for bounding boxes, object positioning, and spatial analysis.
def bbox_to_rect(bbox):
"""
Convert bounding box to rectangle dictionary.
Parameters:
- bbox: Tuple[T_num, T_num, T_num, T_num] - (x0, top, x1, bottom)
Returns:
Dict[str, T_num]: Rectangle with x0, top, x1, bottom
"""
def calculate_area(bbox):
"""
Calculate bounding box area.
Parameters:
- bbox: Tuple[T_num, T_num, T_num, T_num] - Bounding box coordinates
Returns:
T_num: Area of bounding box
"""
def merge_bboxes(bboxes):
"""
Merge multiple bounding boxes into single encompassing box.
Parameters:
- bboxes: List[T_bbox] - List of bounding boxes
Returns:
T_bbox: Single bounding box containing all input boxes
"""
def get_bbox_overlap(a, b):
"""
Get overlap between two bounding boxes.
Parameters:
- a, b: T_bbox - Two bounding boxes
Returns:
T_bbox or None: Overlapping region or None if no overlap
"""
def objects_to_bbox(objects):
"""
Get bounding box containing all objects.
Parameters:
- objects: List[T_obj] - List of objects with bbox information
Returns:
T_bbox: Bounding box encompassing all objects
"""
def objects_to_rect(objects):
"""
Get rectangle containing all objects.
Parameters:
- objects: List[T_obj] - List of objects
Returns:
Dict[str, T_num]: Rectangle dictionary
"""

Usage Examples:
import pdfplumber
from pdfplumber.utils import bbox_to_rect, merge_bboxes, calculate_area, objects_to_bbox
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Convert bbox to rect format
char = page.chars[0]
rect = bbox_to_rect((char['x0'], char['top'], char['x1'], char['bottom']))
print(f"Character width: {rect['x1'] - rect['x0']}, height: {rect['bottom'] - rect['top']}")
# Find bounding box of all characters
all_chars_bbox = objects_to_bbox(page.chars)
print(f"Text area: {all_chars_bbox}")
# Calculate text coverage
page_area = calculate_area((0, 0, page.width, page.height))
text_area = calculate_area(all_chars_bbox)
coverage = text_area / page_area
print(f"Text covers {coverage:.1%} of page")

Filter objects based on spatial relationships and positioning.
def within_bbox(objs, bbox):
"""
Filter objects within bounding box.
Parameters:
- objs: List[T_obj] - Objects to filter
- bbox: T_bbox - Bounding box for filtering
Returns:
List[T_obj]: Objects within bounding box
"""
def outside_bbox(objs, bbox):
"""
Filter objects outside bounding box.
Parameters:
- objs: List[T_obj] - Objects to filter
- bbox: T_bbox - Bounding box for filtering
Returns:
List[T_obj]: Objects outside bounding box
"""
def intersects_bbox(objs, bbox):
"""
Filter objects intersecting bounding box.
Parameters:
- objs: List[T_obj] - Objects to filter
- bbox: T_bbox - Bounding box for intersection test
Returns:
List[T_obj]: Objects intersecting bounding box
"""
def crop_to_bbox(objs, bbox):
"""
Filter objects intersecting bbox and crop each object's extent to the bbox.
Parameters:
- objs: List[T_obj] - Objects to filter
- bbox: T_bbox - Bounding box
Returns:
List[T_obj]: Objects intersecting bounding box
"""

Transform and modify object properties and positioning.
def move_object(obj, axis, value):
"""
Move object along specified axis.
Parameters:
- obj: T_obj - Object to move
- axis: str - Axis to move along ('h' for horizontal or 'v' for vertical)
- value: T_num - Distance to move
Returns:
T_obj: New object with updated coordinates
"""
def resize_object(obj, key, value):
"""
Resize object property.
Parameters:
- obj: T_obj - Object to resize
- key: str - Property to modify
- value: T_num - New value
Returns:
T_obj: New object with updated property
"""
def clip_obj(obj, bbox):
"""
Clip object to bounding box.
Parameters:
- obj: T_obj - Object to clip
- bbox: T_bbox - Clipping boundary
Returns:
T_obj or None: Clipped object or None if completely outside
"""

Convert objects to edges and process line elements.
def obj_to_edges(obj):
"""
Convert object to edges.
Parameters:
- obj: T_obj - Object (rectangle, curve, etc.)
Returns:
List[T_obj]: List of edge objects
"""
def line_to_edge(line):
"""
Convert line object to edge.
Parameters:
- line: T_obj - Line object
Returns:
T_obj: Edge object
"""
def curve_to_edges(curve):
"""
Convert curve to edges.
Parameters:
- curve: T_obj - Curve object
Returns:
List[T_obj]: List of edge objects from curve
"""
def rect_to_edges(rect):
"""
Convert rectangle to edges.
Parameters:
- rect: T_obj - Rectangle object
Returns:
List[T_obj]: Four edge objects (top, bottom, left, right)
"""
def filter_edges(edges, orientation=None, edge_type=None, min_length=1):
"""
Filter edges by orientation, type, and minimum length.
Parameters:
- edges: List[T_obj] - Edge objects to filter
- orientation: str, optional - 'h' for horizontal, 'v' for vertical
- edge_type: str, optional - Type of edge to include
- min_length: T_num - Minimum edge length
Returns:
List[T_obj]: Filtered edge objects
"""

Align objects to common positions and snap coordinates.
def snap_objects(objs, attr, tolerance):
"""
Snap objects to common values.
Parameters:
- objs: List[T_obj] - Objects to snap
- attr: str - Attribute to snap (e.g., 'x0', 'top')
- tolerance: T_num - Snapping tolerance
Returns:
List[T_obj]: Objects with snapped coordinates
"""

Group objects and values using clustering algorithms.
def cluster_list(xs, tolerance=0):
"""
Cluster list of numbers.
Parameters:
- xs: List[T_num] - Numbers to cluster
- tolerance: T_num - Clustering tolerance
Returns:
List[List[T_num]]: Clusters of numbers
"""
def cluster_objects(objs, key_fn, tolerance):
"""
Cluster objects by key function.
Parameters:
- objs: List[T_obj] - Objects to cluster
- key_fn: Callable[[T_obj], T_num] - Function to extract clustering key
- tolerance: T_num - Clustering tolerance
Returns:
List[List[T_obj]]: Clusters of objects
"""
def make_cluster_dict(values, tolerance):
"""
Create value-to-cluster mapping.
Parameters:
- values: List[T_num] - Values to cluster
- tolerance: T_num - Clustering tolerance
Returns:
Dict[T_num, T_num]: Mapping from value to cluster representative
"""

Usage Examples:
import pdfplumber
from pdfplumber.utils import cluster_objects, cluster_list
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Cluster characters by font size
size_clusters = cluster_objects(
page.chars,
lambda c: c.get('size', 0),
tolerance=1
)
print(f"Found {len(size_clusters)} font size groups")
# Cluster horizontal positions
x_positions = [c['x0'] for c in page.chars]
x_clusters = cluster_list(x_positions, tolerance=5)
print(f"Text aligns to {len(x_clusters)} column positions")
# Find common Y positions (likely text lines)
y_positions = [c['top'] for c in page.chars]
y_clusters = cluster_list(y_positions, tolerance=2)
print(f"Text appears on {len(y_clusters)} distinct lines")

Advanced text processing and character manipulation utilities.
def extract_text(chars, **kwargs):
"""
Extract text from character objects.
Parameters:
- chars: List[T_obj] - Character objects
- **kwargs: Text extraction options
Returns:
str: Extracted text
"""
def extract_text_simple(chars, **kwargs):
"""
Simple text extraction from characters.
Parameters:
- chars: List[T_obj] - Character objects
- **kwargs: Extraction options
Returns:
str: Extracted text without layout preservation
"""
def extract_words(chars, **kwargs):
"""
Extract words from character objects.
Parameters:
- chars: List[T_obj] - Character objects
- **kwargs: Word extraction options
Returns:
List[T_obj]: Word objects with position data
"""
def dedupe_chars(chars, tolerance=1, **kwargs):
"""
Remove duplicate characters from list.
Parameters:
- chars: List[T_obj] - Character objects
- tolerance: T_num - Distance tolerance for duplicate detection
- **kwargs: Deduplication options
Returns:
List[T_obj]: Deduplicated character objects
"""
def chars_to_textmap(chars, **kwargs):
"""
Convert characters to TextMap object.
Parameters:
- chars: List[T_obj] - Character objects
- **kwargs: TextMap options
Returns:
TextMap: Character mapping object
"""
def collate_line(chars, **kwargs):
"""
Collate characters into text line.
Parameters:
- chars: List[T_obj] - Character objects for single line
- **kwargs: Line collation options
Returns:
str: Text content of line
"""

Low-level PDF object processing and decoding utilities.
def resolve(x):
"""
Resolve PDF object references.
Parameters:
- x: Any - PDF object that may contain references
Returns:
Any: Resolved object with references dereferenced
"""
def resolve_all(x):
"""
Recursively resolve PDF objects.
Parameters:
- x: Any - PDF object structure
Returns:
Any: Completely resolved object structure
"""
def resolve_and_decode(obj):
"""
Resolve and decode PDF object.
Parameters:
- obj: Any - PDF object
Returns:
Any: Resolved and decoded object
"""
def decode_text(s):
"""
Decode text from bytes/string.
Parameters:
- s: bytes or str - Text to decode
Returns:
str: Decoded text string
"""
def decode_psl_list(psl_list):
"""
Decode PSLiteral list.
Parameters:
- psl_list: List - List of PSLiteral objects
Returns:
List: Decoded list
"""

General-purpose utility functions.
def to_list(collection):
"""
Convert collection to list.
Parameters:
- collection: Any - Collection to convert (list, tuple, generator, etc.)
Returns:
List: List representation of collection
"""

Commonly used default values and tolerances.
# Text processing constants
DEFAULT_X_TOLERANCE = 3
DEFAULT_Y_TOLERANCE = 3
DEFAULT_X_DENSITY = 7.25
DEFAULT_Y_DENSITY = 13

Usage Examples:
import pdfplumber
from pdfplumber.utils import (
DEFAULT_X_TOLERANCE, DEFAULT_Y_TOLERANCE,
extract_text, resolve_all
)
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Use default tolerances
text = extract_text(page.chars,
x_tolerance=DEFAULT_X_TOLERANCE,
y_tolerance=DEFAULT_Y_TOLERANCE)
# Process PDF internals
raw_chars = page.objects.get('char', []) # Access the page's parsed character objects
resolved_chars = [resolve_all(char) for char in raw_chars]

Spatial Analysis:
import pdfplumber
from pdfplumber.utils import cluster_objects, extract_text, objects_to_bbox
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Find text columns
char_clusters = cluster_objects(
page.chars,
lambda c: c['x0'], # Group by left edge
tolerance=10
)
columns = []
for cluster in char_clusters:
column_bbox = objects_to_bbox(cluster)
column_text = extract_text(cluster)
columns.append({
'bbox': column_bbox,
'text': column_text,
'char_count': len(cluster)
})
print(f"Document has {len(columns)} columns")

Font Analysis:
import pdfplumber
from pdfplumber.utils import cluster_objects
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Group by font properties
font_groups = cluster_objects(
page.chars,
lambda c: (c.get('fontname', ''), c.get('size', 0)),
tolerance=0 # Exact matching for fonts
)
for group in font_groups:
sample = group[0]
font_name = sample.get('fontname', 'Unknown')
font_size = sample.get('size', 0)
char_count = len(group)
print(f"Font: {font_name}, Size: {font_size}, Characters: {char_count}")

Install with Tessl CLI
npx tessl i tessl/pypi-pdfplumber