CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-pypdf

A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

text-extraction.mddocs/

Text Extraction

Advanced text extraction capabilities with multiple extraction modes, layout preservation, and customizable text processing options. pypdf provides sophisticated text extraction that can handle complex PDF layouts while maintaining readability.

Capabilities

Text Extraction Methods

Extract text from PDF pages with various modes and customization options to handle different document types and layout requirements.

def extract_text(
    self,
    orientations: tuple | int = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before=None,
    visitor_operand_after=None,  
    visitor_text=None,
    extraction_mode: str = "plain"
) -> str:
    """
    Extract text from the page with advanced options.
    
    Args:
        orientations: Text orientations to consider in degrees (default: (0, 90, 180, 270))
        space_width: Minimum width threshold for inserting spaces (default: 200.0)
        visitor_operand_before: Callback function called before processing operands
        visitor_operand_after: Callback function called after processing operands  
        visitor_text: Custom text visitor function for advanced processing
        extraction_mode: Text extraction mode ("plain" or "layout", default: "plain")
            - "plain": Simple text extraction without layout preservation (default)
            - "layout": Preserves spatial layout and formatting
        
    Returns:
        Extracted text as string
    """

Text Visitor Functions

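Custom text processing through visitor functions for advanced text extraction scenarios. The `mult` helper below is a matrix-multiplication utility for working with the transformation matrices that visitor functions receive.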

def mult(m: list[float], n: list[float]) -> list[float]:
    """
    Multiply two matrices; used for text transformation calculations.

    NOTE(review): the matrices are presumably the flat 6-element PDF
    affine form ``[a, b, c, d, e, f]`` (as passed to text visitors as
    ``cm`` and ``tm``) -- confirm against the pypdf implementation.

    Args:
        m: First (left-hand) matrix as list of floats
        n: Second (right-hand) matrix as list of floats

    Returns:
        Result of matrix multiplication as a list of floats
    """

Usage Examples

Basic Text Extraction

from pypdf import PdfReader

reader = PdfReader("document.pdf")

# Text of the first page only
first_page = reader.pages[0]
text = first_page.extract_text()
print(text)

# Text of every page; each page's text is followed by a blank-line separator
full_text = "".join(page.extract_text() + "\n\n" for page in reader.pages)

print(full_text)

Layout-Preserving Extraction

from pypdf import PdfReader

reader = PdfReader("formatted_document.pdf")

for page_num, page in enumerate(reader.pages):
    # Extract with layout preservation. Layout mode is opt-in: the default
    # extraction_mode is "plain", so it must be requested explicitly.
    # The layout_mode_* keywords tune layout mode only.
    layout_text = page.extract_text(
        extraction_mode="layout",
        layout_mode_space_vertically=True,
        layout_mode_scale_weight=1.25
    )
    
    # page_num is zero-based; show a one-based page number to the reader
    print(f"Page {page_num + 1}:")
    print(layout_text)
    print("-" * 50)

Plain Text Extraction

from pypdf import PdfReader

reader = PdfReader("document.pdf")

# "plain" mode extracts text without layout preservation; naming it
# explicitly only documents the intent.
for page in reader.pages:
    print(page.extract_text(extraction_mode="plain"))

Handling Rotated Text

from pypdf import PdfReader

reader = PdfReader("rotated_content.pdf")

# NOTE: `orientations` is the knob for rotated text in the default "plain"
# mode. `layout_mode_strip_rotated` is only consulted when
# extraction_mode="layout", so it is deliberately not passed here --
# in plain mode it would be silently ignored.
for page in reader.pages:
    # Include all text orientations (0, 90, 180 and 270 degrees)
    text_all_orientations = page.extract_text(
        orientations=(0, 90, 180, 270)
    )

    # Only horizontal (0 degree) text
    text_horizontal_only = page.extract_text(
        orientations=(0,)
    )

    print("All orientations:")
    print(text_all_orientations)
    print("\nHorizontal only:")
    print(text_horizontal_only)

Custom Space Width Handling

from pypdf import PdfReader

reader = PdfReader("document.pdf")

for page in reader.pages:
    # Lower space-width threshold: narrower gaps already count as a space,
    # so more spaces tend to be inserted
    tight_spacing = page.extract_text(space_width=100.0)
    
    # Higher space-width threshold: only wider gaps count as a space,
    # so fewer spaces tend to be inserted
    loose_spacing = page.extract_text(space_width=300.0)
    
    # Show only the first 200 characters of each result for comparison
    print("Tight spacing:")
    print(tight_spacing[:200], "...")
    print("\nLoose spacing:")
    print(loose_spacing[:200], "...")

Advanced Text Processing with Visitor

from pypdf import PdfReader

# Fragments accepted by custom_text_visitor for the page being processed.
large_text_parts = []


def custom_text_visitor(text, cm, tm, font_dict, font_size):
    """
    Custom text visitor that collects text set in a font size of 12 or more.

    pypdf calls the visitor for its side effects only and ignores whatever
    it returns, so accepted fragments are appended to ``large_text_parts``
    instead of being returned.

    Args:
        text: Extracted text
        cm: Current transformation matrix
        tm: Text matrix
        font_dict: Font dictionary
        font_size: Font size
    """
    # Example: Only keep text of size 12 or larger
    if font_size >= 12:
        large_text_parts.append(text)


reader = PdfReader("document.pdf")

for page in reader.pages:
    # Start each page with an empty collector
    large_text_parts.clear()

    # The return value of extract_text() is the full, unfiltered page text;
    # the filtered result is what the visitor collected.
    page.extract_text(visitor_text=custom_text_visitor)
    large_text_only = "".join(large_text_parts)
    print(large_text_only)

Extracting Text from Specific Regions

from pypdf import PdfReader, PageObject

def extract_text_from_region(page: PageObject, x1: float, y1: float, x2: float, y2: float) -> str:
    """
    Extract text from a specific rectangular region of a page.

    The region is applied with a text visitor that keeps only fragments
    whose starting position lies inside the rectangle. Changing the crop box
    does not work for this purpose: cropping only changes the visible area
    and leaves the page content (and therefore the extracted text) intact.
    The page itself is not modified.

    Args:
        page: PageObject to extract from
        x1, y1: Bottom-left coordinates
        x2, y2: Top-right coordinates

    Returns:
        Extracted text from the region. Fragments are tested by the position
        at which they start, so a fragment that begins outside the rectangle
        and runs into it is not included.
    """
    # Tolerate corners given in either order
    left, right = sorted((x1, x2))
    bottom, top = sorted((y1, y2))

    parts: list[str] = []

    def keep_if_inside(text, cm, tm, font_dict, font_size):
        # Map the text-space origin (tm[4], tm[5]) through the current
        # transformation matrix to get the position in page coordinates.
        x = tm[4] * cm[0] + tm[5] * cm[2] + cm[4]
        y = tm[4] * cm[1] + tm[5] * cm[3] + cm[5]
        if left <= x <= right and bottom <= y <= top:
            parts.append(text)

    # The visitor is called for its side effect; extract_text()'s own return
    # value would be the unfiltered page text.
    page.extract_text(visitor_text=keep_if_inside)

    return "".join(parts)

reader = PdfReader("document.pdf")
page = reader.pages[0]

# Page dimensions as plain floats
page_width = float(page.mediabox.width)
page_height = float(page.mediabox.height)

# Top-left quarter: left half horizontally, upper half vertically
# (the region is given as bottom-left corner, then top-right corner)
top_left_text = extract_text_from_region(
    page,
    0,
    page_height / 2,
    page_width / 2,
    page_height,
)
print("Top-left quarter text:")
print(top_left_text)

Text Extraction with Error Handling

from pypdf import PdfReader
from pypdf.errors import PdfReadError, PdfStreamError

def safe_extract_text(pdf_path: str) -> list[str]:
    """
    Safely extract text from all pages with error handling.

    Errors are reported with ``print`` and never propagate to the caller.

    Args:
        pdf_path: Path to PDF file

    Returns:
        List of extracted text strings (one per page). A page whose
        extraction failed contributes an empty string. If the file cannot be
        opened or its pages cannot be read, the list holds whatever was
        collected up to that point (empty if nothing was).
    """
    texts = []

    try:
        reader = PdfReader(pdf_path)

        for page_num, page in enumerate(reader.pages, start=1):
            try:
                texts.append(page.extract_text())
            except Exception as e:
                # Malformed pages can raise more than the pypdf error types
                # (e.g. KeyError/ValueError). Catch broadly here so one bad
                # page is reported as a page failure and does not abort the
                # remaining pages.
                print(f"Error extracting text from page {page_num}: {e}")
                texts.append("")  # Empty string for failed pages

    except Exception as e:
        # Reached only for failures outside per-page extraction: opening the
        # file or reading its page list.
        print(f"Error opening PDF {pdf_path}: {e}")

    return texts

# Run the safe extraction and summarise the outcome page by page
page_texts = safe_extract_text("problematic.pdf")
for page_number, text in enumerate(page_texts, start=1):
    if not text:
        # An empty string is what safe_extract_text stores for failed pages
        print(f"Page {page_number}: Text extraction failed")
        continue
    print(f"Page {page_number}: {len(text)} characters extracted")

Batch Text Extraction

from pypdf import PdfReader
import os
from pathlib import Path

def extract_text_from_directory(directory_path: str, output_dir: str = None) -> dict[str, str]:
    """
    Extract text from all PDF files in a directory.
    
    Args:
        directory_path: Directory containing PDF files
        output_dir: Optional directory to save text files
        
    Returns:
        Dictionary mapping PDF filenames to extracted text
    """
    pdf_texts = {}
    
    for file_path in Path(directory_path).glob("*.pdf"):
        try:
            reader = PdfReader(str(file_path))
            
            # Extract all text
            full_text = ""
            for page in reader.pages:
                full_text += page.extract_text()
                full_text += "\n\n"
            
            pdf_texts[file_path.name] = full_text
            
            # Optionally save to text file
            if output_dir:
                output_path = Path(output_dir) / f"{file_path.stem}.txt"
                output_path.parent.mkdir(parents=True, exist_ok=True)
                output_path.write_text(full_text, encoding='utf-8')
                
        except Exception as e:
            print(f"Error processing {file_path.name}: {e}")
            pdf_texts[file_path.name] = ""
    
    return pdf_texts

# Process every PDF in "pdf_documents/" and save the text under "extracted_text/"
texts = extract_text_from_directory(
    directory_path="pdf_documents/",
    output_dir="extracted_text/",
)
print(f"Processed {len(texts)} PDF files")

Install with Tessl CLI

npx tessl i tessl/pypi-pypdf

docs

annotations.md

form-fields.md

index.md

metadata.md

page-operations.md

reading-writing.md

text-extraction.md

utilities.md

tile.json