CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-pypdfium2

Python bindings to PDFium for comprehensive PDF manipulation, rendering, and processing

Pending
Overview
Eval results
Files

docs/text-processing.md

Text Processing

Comprehensive text extraction and search capabilities with support for bounded text extraction, character-level positioning, full-text search, and detailed text analysis. The PdfTextPage class provides access to all text-related operations.

Capabilities

Text Extraction

Extract text content from PDF pages with various extraction modes and error handling options.

class PdfTextPage:
    """
    Text-related operations (extraction, character lookup, search) for one page.

    NOTE(review): this listing is an API summary. The method bodies contain
    only docstrings, so everything below is the documented contract rather
    than behaviour visible in this file.
    """
    def get_text_range(self, index=0, count=-1, errors="ignore", force_this=False) -> str:
        """
        Extract text from a character range.

        With the default arguments the whole page text is requested.

        Parameters:
        - index: int, starting character index (0-based)
        - count: int, number of characters to extract (-1 for all remaining)
        - errors: str, error handling mode ("ignore", "strict", "replace");
          presumably forwarded to the text decoder - TODO confirm
        - force_this: bool, force extraction through this range-based method.
          NOTE(review): upstream pypdfium2 documents that a call with default
          parameters is implicitly routed to get_text_bounded() unless
          force_this=True - confirm against the installed version.

        Returns:
        str: Extracted text content
        """
    
    def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore") -> str:
        """
        Extract text within the specified bounding rectangle.

        Any boundary left as None falls back to the corresponding page edge,
        so calling with no arguments covers the whole page.

        Parameters:
        - left: float, left boundary in PDF units (None = page left)
        - bottom: float, bottom boundary in PDF units (None = page bottom)
        - right: float, right boundary in PDF units (None = page right)
        - top: float, top boundary in PDF units (None = page top)
        - errors: str, error handling mode ("ignore", "strict", "replace")

        Returns:
        str: Text within the specified bounds
        """

Basic text extraction examples:

import pypdfium2 as pdfium

# Open the document and build the text page for its first page.
# `page` and `textpage` are reused by the later snippets in this document.
pdf = pdfium.PdfDocument("document.pdf")
page = pdf[0]
textpage = page.get_textpage()

# Extract all text from the page (default index=0, count=-1)
full_text = textpage.get_text_range()
print(f"Full page text:\n{full_text}")

# Extract 200 characters starting at character index 100
partial_text = textpage.get_text_range(index=100, count=200)
print(f"Characters 100-299: {partial_text}")

# Extract text from a bounded area (top-left quadrant).
# The y axis grows upward here: the upper half of the page runs from
# bottom=height/2 to top=height.
width, height = page.get_size()
bounded_text = textpage.get_text_bounded(
    left=0, 
    bottom=height/2, 
    right=width/2, 
    top=height
)
print(f"Top-left text: {bounded_text}")

# Extract text from the middle third of the page width.
# bottom/top are omitted, so the column spans the full page height.
column_text = textpage.get_text_bounded(
    left=width/3,
    right=2*width/3
)
print(f"Middle column: {column_text}")

Character Information

Access detailed information about individual characters including position, bounding boxes, and character counts.

def count_chars(self) -> int:
    """
    Get the total number of characters on the page.

    The result is the upper bound for character indices accepted by the
    other character-level methods (valid indices are 0 .. count - 1).

    Returns:
    int: Character count including spaces and special characters
    """

def get_index(self, x: float, y: float, x_tol: float, y_tol: float) -> int:
    """
    Get the index of the character at (or near) the specified coordinates.

    Parameters:
    - x: float, x-coordinate in PDF units
    - y: float, y-coordinate in PDF units
    - x_tol: float, x-axis tolerance (how far from x a character may be)
    - y_tol: float, y-axis tolerance (how far from y a character may be)

    Returns:
    int: Character index at position, or -1 if no character found

    NOTE(review): upstream pypdfium2 documents the "not found" result as
    None rather than -1 - confirm against the installed version and have
    callers handle both.
    """

def get_charbox(self, index: int, loose=False) -> tuple:
    """
    Get the bounding box of the character at the given index.

    Parameters:
    - index: int, character index (0-based)
    - loose: bool, use loose bounding box calculation.
      NOTE(review): presumably a box covering the full font bounds instead
      of the tight glyph box - confirm against the pypdfium2 docs.

    Returns:
    tuple: (left, bottom, right, top) character bounds
    """

Character analysis examples:

textpage = page.get_textpage()

# Get character count
char_count = textpage.count_chars()
print(f"Page has {char_count} characters")

# Find character at mouse click position
click_x, click_y = 300, 400  # Example coordinates
char_index = textpage.get_index(click_x, click_y, 5, 5)
# A miss may be reported as None (pypdfium2 helper) or as a negative index
# (raw PDFium convention); treat both as "no character here" so that
# get_charbox() is never called with an invalid index.
if char_index is not None and char_index >= 0:
    char_box = textpage.get_charbox(char_index)
    print(f"Character at ({click_x}, {click_y}): index {char_index}")
    print(f"Character bounds: {char_box}")

    # Get the actual character
    character = textpage.get_text_range(char_index, 1)
    print(f"Character: '{character}'")

# Analyze character positions for first 100 characters
for i in range(min(100, char_count)):
    character = textpage.get_text_range(i, 1)
    # Skip every kind of whitespace (including '\r' from "\r\n" line breaks)
    # and empty results before paying for the bounding-box lookup.
    if not character.strip():
        continue
    char_box = textpage.get_charbox(i)
    print(f"'{character}' at {char_box}")

Text Rectangles

Access text rectangle information for layout analysis and text positioning.

def count_rects(self, index=0, count=-1) -> int:
    """
    Get the number of text rectangles for a character range.

    With the default arguments the whole page is covered.

    Parameters:
    - index: int, starting character index
    - count: int, character count (-1 for all remaining)

    Returns:
    int: Number of rectangles covering the text range; valid rectangle
    indices for get_rect() are 0 .. result - 1
    """

def get_rect(self, index: int) -> tuple:
    """
    Get text rectangle coordinates by rectangle index.

    Parameters:
    - index: int, rectangle index (not a character index)

    Returns:
    tuple: (left, bottom, right, top) rectangle coordinates

    NOTE(review): upstream documentation states that count_rects() must be
    called before get_rect() for the rectangles to be available - confirm.
    """

Rectangle analysis:

textpage = page.get_textpage()

# Get rectangles for first 500 characters.
# count_rects() is called before any get_rect() call below.
rect_count = textpage.count_rects(0, 500)
print(f"First 500 characters span {rect_count} rectangles")

# Analyze text layout by examining rectangles
for i in range(rect_count):
    rect = textpage.get_rect(i)
    print(f"Rectangle {i}: {rect}")
    
    # Calculate rectangle dimensions from the (left, bottom, right, top) tuple.
    # Note: this rebinds the module-level names `width` and `height`.
    left, bottom, right, top = rect
    width = right - left
    height = top - bottom
    print(f"  Size: {width:.1f} x {height:.1f}")

Text Search

Perform text search operations with various matching options and result iteration.

def search(self, text: str, index=0, match_case=False, match_whole_word=False, consecutive=False) -> PdfTextSearcher:
    """
    Create a text searcher for finding matches of `text` on this page.

    The search itself is driven by the returned object's get_next() and
    get_prev() methods; close() it when finished.

    Parameters:
    - text: str, text to search for
    - index: int, starting character index for search
    - match_case: bool, perform case-sensitive search
    - match_whole_word: bool, match complete words only
    - consecutive: bool, search for consecutive occurrences.
      NOTE(review): in PDFium this flag controls whether the search skips
      past the current match before looking for the next one, i.e. whether
      overlapping matches can be reported - confirm exact semantics.

    Returns:
    PdfTextSearcher: Search object for iterating through matches
    """

PdfTextSearcher Class

Text search helper class for iterating through search matches on a text page.

class PdfTextSearcher:
    """
    Helper for finding and iterating through text matches on a text page.

    Created by PdfTextPage.search() to hold the search state. Matches can be
    stepped through forward with get_next() and backward with get_prev().
    Call close() when finished to release the search resources.

    Attributes:
    - raw: FPDF_SCHHANDLE, underlying PDFium searcher handle
    - textpage: PdfTextPage, reference to the textpage this searcher belongs to
    """
    
    def __init__(self, raw, textpage):
        """
        Initialize the text searcher.

        Parameters:
        - raw: FPDF_SCHHANDLE, PDFium searcher handle
        - textpage: PdfTextPage, parent textpage

        Note: Typically created via PdfTextPage.search() rather than by
        direct instantiation.
        """
    
    def get_next(self) -> tuple[int, int] | None:
        """
        Find the next search match.

        Returns:
        tuple: (start_index, char_count) for the next match occurrence,
               or None if no more matches are found

        Advances the search position to the next occurrence of the search
        text. The returned pair can be passed straight to
        PdfTextPage.get_text_range(start_index, char_count) to extract the
        matched text.
        """
    
    def get_prev(self) -> tuple[int, int] | None:
        """
        Find the previous search match.

        Returns:
        tuple: (start_index, char_count) for the previous match occurrence,
               or None if no previous matches exist

        Moves the search position backward to the previous occurrence.
        Useful for bidirectional search navigation.
        """
    
    def close(self):
        """
        Close the searcher and clean up its search resources.

        NOTE(review): presumably the searcher must not be used after this
        call - confirm against the pypdfium2 docs.
        """

Text search examples:

textpage = page.get_textpage()

# Search for specific text (case-insensitive)
searcher = textpage.search("important", match_case=False)

# Find all matches. The finally clause releases the search handle even if
# iteration fails part-way through; the matches themselves are plain
# (start_index, char_count) tuples and stay valid after closing.
matches = []
try:
    while (match := searcher.get_next()) is not None:
        matches.append(match)
finally:
    searcher.close()

print(f"Found {len(matches)} matches for 'important'")

# Process each match
for start_idx, char_count in matches:
    # Get the matched text (for verification)
    matched_text = textpage.get_text_range(start_idx, char_count)

    # Get bounding boxes for highlight, one per matched character
    match_boxes = [
        textpage.get_charbox(i)
        for i in range(start_idx, start_idx + char_count)
    ]

    # Report an inclusive character range, e.g. a 3-character match at
    # index 5 covers chars 5-7.
    end_idx = start_idx + char_count - 1
    print(f"Match: '{matched_text}' at chars {start_idx}-{end_idx}")
    if match_boxes:  # guard against a zero-length match
        print(f"First char box: {match_boxes[0]}")

# Case-sensitive search for exact matches
exact_searcher = textpage.search("PDF", match_case=True, match_whole_word=True)
try:
    exact_match = exact_searcher.get_next()
finally:
    # This searcher must be closed too, not only the first one.
    exact_searcher.close()
if exact_match:
    start_idx, char_count = exact_match
    print(f"Found exact 'PDF' match at position {start_idx}")

# Bidirectional search example
bidirectional_searcher = textpage.search("chapter")
try:
    # Walk forward until the searcher reports no further match
    forward_matches = []
    while (match := bidirectional_searcher.get_next()) is not None:
        forward_matches.append(match)

    print(f"Found {len(forward_matches)} forward matches")

    # Go backward through matches from the current search position
    backward_matches = []
    while (match := bidirectional_searcher.get_prev()) is not None:
        backward_matches.append(match)

    print(f"Found {len(backward_matches)} backward matches")
finally:
    bidirectional_searcher.close()

Advanced search patterns:

def search_and_highlight_text(textpage, search_terms):
    """Search for multiple terms and collect highlighting information.

    Each term is searched case-insensitively as a whole word. Progress is
    printed per term.

    Parameters:
    - textpage: object providing search(), get_text_range() and get_charbox()
      (a PdfTextPage)
    - search_terms: iterable of str, the terms to look for

    Returns:
    list of dict, one per match in search order, with keys 'term', 'text',
    'start_index', 'char_count' and 'bbox'. 'bbox' is the
    (left, bottom, right, top) union of the matched characters' boxes.
    """
    all_highlights = []

    for term in search_terms:
        print(f"\nSearching for '{term}':")

        searcher = textpage.search(
            term,
            match_case=False,
            match_whole_word=True,  # Match complete words only
        )

        term_matches = []
        try:
            while (match := searcher.get_next()) is not None:
                start_idx, char_count = match

                char_boxes = [
                    textpage.get_charbox(i)
                    for i in range(start_idx, start_idx + char_count)
                ]
                if not char_boxes:
                    # A zero-length match has nothing to highlight.
                    continue

                # Transpose the per-character boxes into four coordinate
                # columns and take the extremes to get the enclosing box.
                lefts, bottoms, rights, tops = zip(*char_boxes)
                term_matches.append({
                    'term': term,
                    'text': textpage.get_text_range(start_idx, char_count),
                    'start_index': start_idx,
                    'char_count': char_count,
                    'bbox': (min(lefts), min(bottoms), max(rights), max(tops)),
                })
        finally:
            # Release the search handle even if a lookup above raised.
            searcher.close()

        print(f"  Found {len(term_matches)} matches")
        all_highlights.extend(term_matches)

    return all_highlights

# Usage
# Terms to look for; each is matched case-insensitively as a whole word
search_terms = ["introduction", "conclusion", "figure", "table", "reference"]
textpage = page.get_textpage()
highlights = search_and_highlight_text(textpage, search_terms)

# Print highlight summary: one line per match with the searched term,
# the text actually matched, and the enclosing bounding box
print(f"\nTotal highlights: {len(highlights)}")
for highlight in highlights:
    print(f"'{highlight['term']}' -> '{highlight['text']}' at {highlight['bbox']}")

Advanced Text Analysis

Combine multiple text processing features for comprehensive text analysis.

def analyze_page_text(page):
    """Print and return basic text statistics for one page.

    Reports character, word and line counts, the ten most frequent words
    longer than three characters, and how often a few contact/URL-like
    patterns occur.

    Parameters:
    - page: object providing get_textpage() (a PdfPage)

    Returns:
    dict with keys 'char_count', 'word_count', 'line_count' and
    'common_words' (list of up to ten (word, count) tuples, most frequent
    first; ties keep first-seen order).
    """
    # Local import keeps this snippet self-contained.
    from collections import Counter

    textpage = page.get_textpage()

    # Basic statistics
    char_count = textpage.count_chars()
    full_text = textpage.get_text_range()
    word_count = len(full_text.split())
    # An empty page has zero lines; otherwise lines = newline count + 1.
    line_count = full_text.count('\n') + 1 if full_text else 0

    print("Text Statistics:")
    print(f"  Characters: {char_count}")
    print(f"  Words: {word_count}")
    print(f"  Lines: {line_count}")

    # Case-insensitive word frequencies. most_common() sorts by count with a
    # stable sort, so equally frequent words stay in first-seen order.
    word_freq = Counter(full_text.lower().split())
    common_words = [
        (word, count)
        for word, count in word_freq.most_common()
        if len(word) > 3  # ignore short words
    ]

    print("\nMost common words:")
    for word, count in common_words[:10]:
        print(f"  '{word}': {count}")

    # Search for specific patterns
    for pattern in ("http", "www", "@", "phone", "email"):
        searcher = textpage.search(pattern, match_case=False)
        try:
            match_count = 0
            while searcher.get_next() is not None:
                match_count += 1
        finally:
            # One searcher is created per pattern; release each one.
            searcher.close()
        if match_count > 0:
            print(f"Found {match_count} matches for '{pattern}'")

    return {
        'char_count': char_count,
        'word_count': word_count,
        'line_count': line_count,
        'common_words': common_words[:10],
    }

# Usage
# Run the analysis on every page of the document.
# `i` is 0-based, hence the +1 for the human-readable page number.
pdf = pdfium.PdfDocument("document.pdf")
for i, page in enumerate(pdf):
    print(f"\n--- Page {i+1} Analysis ---")
    # The returned statistics dict is kept for further processing by the caller
    stats = analyze_page_text(page)

Text Extraction with Coordinates

Extract text while preserving positional information for layout reconstruction.

def extract_text_with_positions(textpage):
    """Group the page's characters into lines, keeping each one's position.

    Characters are visited in index order. A new line starts whenever a
    character's bottom edge is more than 5 units away from the bottom edge
    of the character that opened the current line.

    Parameters:
    - textpage: object providing count_chars(), get_text_range() and
      get_charbox() (a PdfTextPage)

    Returns:
    list of lines; each line is a list of dicts with keys 'char', 'box'
    (the full (left, bottom, right, top) tuple), 'x' (left) and 'y' (bottom).
    """
    lines = []
    line = None       # list currently receiving characters
    baseline = None   # bottom edge of the character that opened `line`

    for idx in range(textpage.count_chars()):
        glyph = textpage.get_text_range(idx, 1)
        box = textpage.get_charbox(idx)
        x0, y0, _x1, _y1 = box

        # Open a new line for the first character or on a vertical jump.
        # The fresh list is registered right away; it always receives the
        # current character just below, so no empty line is ever kept.
        if baseline is None or abs(y0 - baseline) > 5:
            line = []
            lines.append(line)
            baseline = y0

        line.append({'char': glyph, 'box': box, 'x': x0, 'y': y0})

    return lines

# Usage
# Group the page's characters into lines
textpage = page.get_textpage()
text_lines = extract_text_with_positions(textpage)

print(f"Found {len(text_lines)} text lines")
for i, line in enumerate(text_lines):
    # Rebuild the line's text from its per-character entries
    line_text = ''.join(elem['char'] for elem in line)
    if line_text.strip():  # Skip empty lines
        # Every line holds at least one character, so line[0] is safe;
        # its 'y' is the bottom edge of the character that opened the line
        first_char_y = line[0]['y']
        print(f"Line {i+1} (y={first_char_y:.1f}): {line_text.strip()}")

Properties

@property
def raw(self) -> FPDF_TEXTPAGE:
    """
    Raw PDFium textpage handle for low-level operations.

    Read-only property of PdfTextPage.
    """

@property
def page(self) -> PdfPage:
    """
    Parent page containing this text.

    Read-only property of PdfTextPage.
    """

Text Processing Best Practices

  1. Create a textpage object with page.get_textpage() before performing any text operation
  2. Handle encoding errors appropriately - use errors="ignore" for robustness
  3. Use bounded extraction for targeted text - more efficient than full extraction
  4. Consider character-level analysis for precise positioning
  5. Close searchers (searcher.close()) and clean up textpage objects when done to free memory
  6. Use search functionality for finding specific content rather than manual parsing

Install with Tessl CLI

npx tessl i tessl/pypi-pypdfium2

docs

attachments.md

cli-tools.md

document-management.md

image-bitmap.md

index.md

page-manipulation.md

page-objects.md

text-processing.md

transformation.md

version-info.md

tile.json