Python bindings to PDFium for comprehensive PDF manipulation, rendering, and processing
—
Comprehensive text extraction and search capabilities with support for bounded text extraction, character-level positioning, full-text search, and detailed text analysis. The PdfTextPage class provides access to all text-related operations.
Extract text content from PDF pages with various extraction modes and error handling options.
class PdfTextPage:
def get_text_range(self, index=0, count=-1, errors="ignore", force_this=False) -> str:
"""
Extract text from a character range.
Parameters:
- index: int, starting character index (0-based)
- count: int, number of characters to extract (-1 for all remaining)
- errors: str, error handling mode ("ignore", "strict", "replace")
- force_this: bool, force extraction from this specific text page
Returns:
str: Extracted text content
"""
def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore") -> str:
"""
Extract text within specified bounding rectangle.
Parameters:
- left: float, left boundary in PDF units (None = page left)
- bottom: float, bottom boundary in PDF units (None = page bottom)
- right: float, right boundary in PDF units (None = page right)
- top: float, top boundary in PDF units (None = page top)
- errors: str, error handling mode ("ignore", "strict", "replace")
Returns:
str: Text within the specified bounds
"""

Basic text extraction examples:
import pypdfium2 as pdfium
# Open the document and build a text page for its first page
pdf = pdfium.PdfDocument("document.pdf")
page = pdf[0]
textpage = page.get_textpage()
# Extract all text from page (defaults index=0, count=-1 cover every character)
full_text = textpage.get_text_range()
print(f"Full page text:\n{full_text}")
# Extract text from specific character range:
# 200 characters starting at index 100, i.e. indices 100 through 299
partial_text = textpage.get_text_range(index=100, count=200)
print(f"Characters 100-299: {partial_text}")
# Extract text from bounded area (top-left quadrant).
# The quadrant spans bottom=height/2 up to top=height, so y grows upward here.
width, height = page.get_size()
bounded_text = textpage.get_text_bounded(
left=0,
bottom=height/2,
right=width/2,
top=height
)
print(f"Top-left text: {bounded_text}")
# Extract text from middle column (the middle third of the page width).
# bottom and top are omitted, so they keep their None defaults, which the
# parameter list of get_text_bounded documents as the page bottom/top.
column_text = textpage.get_text_bounded(
left=width/3,
right=2*width/3
)
print(f"Middle column: {column_text}")

Access detailed information about individual characters including position, bounding boxes, and character counts.
def count_chars(self) -> int:
"""
Get total number of characters on the page.
Returns:
int: Character count including spaces and special characters
"""
def get_index(self, x: float, y: float, x_tol: float, y_tol: float) -> int:
"""
Get character index at specified coordinates.
Parameters:
- x: float, x-coordinate in PDF units
- y: float, y-coordinate in PDF units
- x_tol: float, x-axis tolerance
- y_tol: float, y-axis tolerance
Returns:
int: Character index at position, or -1 if no character found
"""
def get_charbox(self, index: int, loose=False) -> tuple:
"""
Get bounding box for character at index.
Parameters:
- index: int, character index
- loose: bool, use loose bounding box calculation
Returns:
tuple: (left, bottom, right, top) character bounds
"""

Character analysis examples:
textpage = page.get_textpage()
# Get character count
char_count = textpage.count_chars()
print(f"Page has {char_count} characters")
# Find character at mouse click position
click_x, click_y = 300, 400 # Example coordinates
char_index = textpage.get_index(click_x, click_y, 5, 5)
if char_index != -1:
char_box = textpage.get_charbox(char_index)
print(f"Character at ({click_x}, {click_y}): index {char_index}")
print(f"Character bounds: {char_box}")
# Get the actual character
character = textpage.get_text_range(char_index, 1)
print(f"Character: '{character}'")
# Analyze character positions for first 100 characters
for i in range(min(100, char_count)):
char_box = textpage.get_charbox(i)
character = textpage.get_text_range(i, 1)
if character not in [' ', '\n', '\t']: # Skip whitespace
print(f"'{character}' at {char_box}")

Access text rectangle information for layout analysis and text positioning.
def count_rects(self, index=0, count=-1) -> int:
"""
Get number of text rectangles for character range.
Parameters:
- index: int, starting character index
- count: int, character count (-1 for all remaining)
Returns:
int: Number of rectangles covering the text range
"""
def get_rect(self, index: int) -> tuple:
"""
Get text rectangle coordinates by index.
Parameters:
- index: int, rectangle index
Returns:
tuple: (left, bottom, right, top) rectangle coordinates
"""

Rectangle analysis:
textpage = page.get_textpage()
# Get rectangles for first 500 characters
rect_count = textpage.count_rects(0, 500)
print(f"First 500 characters span {rect_count} rectangles")
# Analyze text layout by examining rectangles
for i in range(rect_count):
rect = textpage.get_rect(i)
print(f"Rectangle {i}: {rect}")
# Calculate rectangle dimensions
left, bottom, right, top = rect
width = right - left
height = top - bottom
print(f" Size: {width:.1f} x {height:.1f}")

Perform text search operations with various matching options and result iteration.
def search(self, text: str, index=0, match_case=False, match_whole_word=False, consecutive=False) -> PdfTextSearcher:
"""
Create text searcher for finding text matches.
Parameters:
- text: str, text to search for
- index: int, starting character index for search
- match_case: bool, perform case-sensitive search
- match_whole_word: bool, match complete words only
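- consecutive: bool, if False (default) the search skips past the current match before looking for the next one; if True, the next match may overlap the previous one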
Returns:
PdfTextSearcher: Search object for iterating through matches
"""

Text search helper class for iterating through search matches on a text page.
class PdfTextSearcher:
"""
Text searcher helper class for finding and iterating through text matches.
Created by PdfTextPage.search() to manage search state and provide
efficient iteration through search results. Supports both forward
and backward searching through matches.
Attributes:
- raw: FPDF_SCHHANDLE, underlying PDFium searcher handle
- textpage: PdfTextPage, reference to the textpage this searcher belongs to
"""
def __init__(self, raw, textpage):
"""
Initialize text searcher.
Parameters:
- raw: FPDF_SCHHANDLE, PDFium searcher handle
- textpage: PdfTextPage, parent textpage
Note: Typically created via PdfTextPage.search() rather than direct instantiation.
"""
def get_next(self) -> tuple[int, int] | None:
"""
Find next search match.
Returns:
tuple: (start_index, char_count) for the next match occurrence,
or None if no more matches are found
Advances the search position to the next occurrence of the search text.
The returned indices can be used with PdfTextPage.get_text_range() to
extract the matched text.
"""
def get_prev(self) -> tuple[int, int] | None:
"""
Find previous search match.
Returns:
tuple: (start_index, char_count) for the previous match occurrence,
or None if no previous matches exist
Moves the search position backward to the previous occurrence.
Useful for bidirectional search navigation.
"""
def close(self):
"""Close and clean up search resources."""

Text search examples:
textpage = page.get_textpage()

# Search for specific text
searcher = textpage.search("important", match_case=False)

# Find all matches; get_next() returns None once there are no more matches
matches = []
while True:
    match = searcher.get_next()
    if match is None:
        break
    matches.append(match)
print(f"Found {len(matches)} matches for 'important'")

# Process each match
for start_idx, char_count in matches:
    # Get the matched text (for verification)
    matched_text = textpage.get_text_range(start_idx, char_count)

    # Get bounding boxes for highlight
    match_boxes = [
        textpage.get_charbox(i)
        for i in range(start_idx, start_idx + char_count)
    ]

    # Report an inclusive index range: the match covers char_count characters
    last_idx = start_idx + char_count - 1
    print(f"Match: '{matched_text}' at chars {start_idx}-{last_idx}")
    if match_boxes:  # a match without characters has no box to show
        print(f"First char box: {match_boxes[0]}")

# Close the searcher when done
searcher.close()

# Case-sensitive search for exact matches
exact_searcher = textpage.search("PDF", match_case=True, match_whole_word=True)
exact_match = exact_searcher.get_next()
if exact_match:
    start_idx, char_count = exact_match
    print(f"Found exact 'PDF' match at position {start_idx}")
# Every searcher owns search resources, so close this one as well
exact_searcher.close()

# Bidirectional search example
bidirectional_searcher = textpage.search("chapter")

# Find matches and navigate back and forth
forward_matches = []
match = bidirectional_searcher.get_next()
while match:
    forward_matches.append(match)
    match = bidirectional_searcher.get_next()
print(f"Found {len(forward_matches)} forward matches")

# Go backward through matches
backward_matches = []
match = bidirectional_searcher.get_prev()
while match:
    backward_matches.append(match)
    match = bidirectional_searcher.get_prev()
print(f"Found {len(backward_matches)} backward matches")
bidirectional_searcher.close()

Advanced search patterns:
def search_and_highlight_text(textpage, search_terms):
    """Search for multiple terms and collect highlighting information.

    Parameters:
    - textpage: text page object providing search(), get_text_range()
      and get_charbox()
    - search_terms: iterable of str, terms to look up (case-insensitive,
      whole words only)

    Returns:
    list of dict: one entry per match, in search order, with the keys
    'term', 'text', 'start_index', 'char_count' and 'bbox'. 'bbox' is the
    (left, bottom, right, top) union of the boxes of the matched characters.
    Matches that cover no characters are skipped.
    """
    all_highlights = []

    for term in search_terms:
        print(f"\nSearching for '{term}':")

        # Create searcher with appropriate options
        searcher = textpage.search(
            term,
            match_case=False,
            match_whole_word=True,  # Match complete words only
        )

        # try/finally guarantees the searcher is closed even when extracting
        # text or boxes for a match raises
        try:
            # Collect all matches for this term
            term_matches = []
            while (match := searcher.get_next()) is not None:
                start_idx, char_count = match

                # Extract the matched text
                matched_text = textpage.get_text_range(start_idx, char_count)

                # Per-character boxes for the entire match
                char_boxes = [
                    textpage.get_charbox(i)
                    for i in range(start_idx, start_idx + char_count)
                ]
                if not char_boxes:
                    continue  # nothing to highlight for an empty match

                # Create overall bounding box: transpose the boxes into
                # per-edge tuples, then take the outermost value of each edge
                lefts, bottoms, rights, tops = zip(*char_boxes)
                overall_box = (
                    min(lefts), min(bottoms),
                    max(rights), max(tops),
                )

                match_info = {
                    'term': term,
                    'text': matched_text,
                    'start_index': start_idx,
                    'char_count': char_count,
                    'bbox': overall_box,
                }
                term_matches.append(match_info)
                all_highlights.append(match_info)

            print(f" Found {len(term_matches)} matches")
        finally:
            searcher.close()

    return all_highlights
# Usage
search_terms = ["introduction", "conclusion", "figure", "table", "reference"]
textpage = page.get_textpage()
highlights = search_and_highlight_text(textpage, search_terms)
# Print highlight summary
print(f"\nTotal highlights: {len(highlights)}")
for highlight in highlights:
print(f"'{highlight['term']}' -> '{highlight['text']}' at {highlight['bbox']}")

Combine multiple text processing features for comprehensive text analysis.
def analyze_page_text(page):
    """Comprehensive text analysis example.

    Prints basic statistics, the ten most frequent words longer than three
    characters, and match counts for a few contact/link patterns.

    Parameters:
    - page: page object providing get_textpage()

    Returns:
    dict: with the keys 'char_count', 'word_count', 'line_count' and
    'common_words' (up to ten (word, count) tuples, most frequent first)
    """
    # Local import keeps this example self-contained
    from collections import Counter

    textpage = page.get_textpage()

    # Basic statistics
    char_count = textpage.count_chars()
    full_text = textpage.get_text_range()
    word_count = len(full_text.split())
    line_count = full_text.count('\n') + 1

    print("Text Statistics:")
    print(f" Characters: {char_count}")
    print(f" Words: {word_count}")
    print(f" Lines: {line_count}")

    # Most common words (excluding short words); most_common() orders by
    # count, keeping first-seen order among equally frequent words
    long_words = (word for word in full_text.lower().split() if len(word) > 3)
    common_words = Counter(long_words).most_common()
    top_words = common_words[:10]

    print("\nMost common words:")
    for word, count in top_words:
        print(f" '{word}': {count}")

    # Search for specific patterns
    for pattern in ("http", "www", "@", "phone", "email"):
        searcher = textpage.search(pattern, match_case=False)
        # Each pattern gets its own searcher; close it even if counting fails
        try:
            match_count = 0
            while searcher.get_next() is not None:
                match_count += 1
        finally:
            searcher.close()

        if match_count > 0:
            print(f"Found {match_count} matches for '{pattern}'")

    return {
        'char_count': char_count,
        'word_count': word_count,
        'line_count': line_count,
        'common_words': top_words,
    }
# Usage
pdf = pdfium.PdfDocument("document.pdf")
for i, page in enumerate(pdf):
print(f"\n--- Page {i+1} Analysis ---")
stats = analyze_page_text(page)

Extract text while preserving positional information for layout reconstruction.
def extract_text_with_positions(textpage, line_tolerance=5):
    """Extract text with character positions.

    Characters are visited in index order and grouped into lines: a new line
    starts whenever a character's bottom edge differs from the bottom edge of
    the current line's first character by more than line_tolerance.

    Parameters:
    - textpage: text page object providing count_chars(), get_text_range()
      and get_charbox()
    - line_tolerance: float, maximum bottom-edge difference for characters
      to stay on the same line (default 5)

    Returns:
    list of list of dict: one inner list per line; each dict has the keys
    'char', 'box' (the character box), 'x' (left edge) and 'y' (bottom edge)
    """
    text_elements = []
    current_line = []
    current_y = None  # bottom edge of the first character of the current line

    for i in range(textpage.count_chars()):
        char = textpage.get_text_range(i, 1)
        char_box = textpage.get_charbox(i)
        left, bottom, _right, _top = char_box

        # Group characters by line (similar y-coordinates)
        if current_y is None or abs(bottom - current_y) > line_tolerance:
            if current_line:
                text_elements.append(current_line)
            current_line = []
            current_y = bottom

        current_line.append({
            'char': char,
            'box': char_box,
            'x': left,
            'y': bottom,
        })

    # Flush the last line, which no later character closed
    if current_line:
        text_elements.append(current_line)

    return text_elements
# Usage
textpage = page.get_textpage()
text_lines = extract_text_with_positions(textpage)
print(f"Found {len(text_lines)} text lines")
for i, line in enumerate(text_lines):
line_text = ''.join(elem['char'] for elem in line)
if line_text.strip(): # Skip empty lines
first_char_y = line[0]['y']
print(f"Line {i+1} (y={first_char_y:.1f}): {line_text.strip()}")

@property
def raw(self) -> FPDF_TEXTPAGE:
"""Raw PDFium textpage handle for low-level operations."""
@property
def page(self) -> PdfPage:
"""Parent page containing this text."""

Use errors="ignore" for robustness.

Install with Tessl CLI
npx tessl i tessl/pypi-pypdfium2