A pure-Python PDF library capable of splitting, merging, cropping, and transforming PDF files
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
pypdf offers advanced text extraction with multiple extraction modes, layout preservation, and customizable text processing options. It can handle complex PDF layouts while keeping the extracted text readable.
Extract text from PDF pages with various modes and customization options to handle different document types and layout requirements.
def extract_text(
self,
orientations: tuple | int = (0, 90, 180, 270),
space_width: float = 200.0,
visitor_operand_before=None,
visitor_operand_after=None,
visitor_text=None,
extraction_mode: str = "plain"
) -> str:
"""
Extract text from the page with advanced options.
Args:
orientations: Text orientations to consider in degrees (default: (0, 90, 180, 270))
space_width: Minimum width threshold for inserting spaces (default: 200.0)
visitor_operand_before: Callback function called before processing operands
visitor_operand_after: Callback function called after processing operands
visitor_text: Custom text visitor function for advanced processing
extraction_mode: Text extraction mode ("plain" or "layout", default: "plain")
- "plain": Simple text extraction without layout preservation (default)
- "layout": Preserves spatial layout and formatting
Returns:
Extracted text as string
"""Custom text processing through visitor functions for advanced text extraction scenarios.
def mult(m: list[float], n: list[float]) -> list[float]:
    """
    Matrix multiplication utility for text transformation calculations.

    This is a signature-only stub: it documents the helper's interface and
    carries no implementation here.

    Args:
        m: First matrix as list of floats
            (presumably a flat PDF transformation matrix such as the ``cm``
            or ``tm`` value handed to a text visitor; confirm against pypdf)
        n: Second matrix as list of floats (same presumed layout as ``m``)

    Returns:
        Result of matrix multiplication
    """
from pypdf import PdfReader
reader = PdfReader("document.pdf")

# Text of the first page only
page = reader.pages[0]
text = page.extract_text()
print(text)

# Text of every page; each page's text is followed by a blank-line separator
full_text = "".join(f"{page.extract_text()}\n\n" for page in reader.pages)
print(full_text)
from pypdf import PdfReader
reader = PdfReader("formatted_document.pdf")

for index, page in enumerate(reader.pages):
    # Layout mode has to be requested explicitly (the default mode is
    # "plain"); it keeps the spatial arrangement of the text.
    layout_options = {
        "layout_mode_space_vertically": True,
        "layout_mode_scale_weight": 1.25,
    }
    layout_text = page.extract_text(extraction_mode="layout", **layout_options)

    print(f"Page {index + 1}:")
    print(layout_text)
    print("-" * 50)
from pypdf import PdfReader
reader = PdfReader("document.pdf")

# Plain mode: text only, with no attempt to reproduce the page layout.
# The generator is lazy, so each page is still extracted right before it
# is printed.
plain_texts = (page.extract_text(extraction_mode="plain") for page in reader.pages)
for plain_text in plain_texts:
    print(plain_text)
from pypdf import PdfReader
reader = PdfReader("rotated_content.pdf")

for page in reader.pages:
    # In the default "plain" mode, rotated text is selected with the
    # `orientations` argument. The layout_mode_* keyword arguments
    # (e.g. layout_mode_strip_rotated) are only consulted when
    # extraction_mode="layout", so passing them here would have no effect.

    # Include all text orientations
    text_all_orientations = page.extract_text(orientations=(0, 90, 180, 270))

    # Only horizontal text
    text_horizontal_only = page.extract_text(orientations=(0,))

    print("All orientations:")
    print(text_all_orientations)
    print("\nHorizontal only:")
    print(text_horizontal_only)
from pypdf import PdfReader
reader = PdfReader("document.pdf")

for page in reader.pages:
    # Extract the same page twice, with a smaller (100.0) and a larger
    # (300.0) space_width than the 200.0 default.
    # NOTE(review): the "tight"/"loose" labels come from the original
    # example; whether a smaller space_width results in fewer or more
    # inserted spaces should be confirmed against the pypdf documentation.
    tight_spacing, loose_spacing = (
        page.extract_text(space_width=width) for width in (100.0, 300.0)
    )

    # Show only the first 200 characters of each result
    print("Tight spacing:")
    print(tight_spacing[:200], "...")
    print("\nLoose spacing:")
    print(loose_spacing[:200], "...")
from pypdf import PdfReader
# pypdf does not use the value returned by a text visitor, so the visitor
# has to store the text it wants to keep. This list is that store.
large_text_parts = []


def custom_text_visitor(text, cm, tm, font_dict, font_size):
    """
    Custom text visitor that keeps only text of 12pt or larger.

    Kept text is appended to the module-level ``large_text_parts`` list,
    because ``extract_text`` ignores the visitor's return value.

    Args:
        text: Extracted text
        cm: Current transformation matrix
        tm: Text matrix
        font_dict: Font dictionary
        font_size: Font size

    Returns:
        ``text`` when it is kept, otherwise ``""``. The return value is only
        useful to code that calls this function directly.
    """
    # Example: Only keep text of 12pt or larger
    if font_size >= 12:
        large_text_parts.append(text)
        return text
    return ""


reader = PdfReader("document.pdf")
for page in reader.pages:
    large_text_parts.clear()  # start from an empty store for every page
    # The return value of extract_text() is the unfiltered page text, so it
    # is deliberately not used here.
    page.extract_text(visitor_text=custom_text_visitor)
    large_text_only = "".join(large_text_parts)
    print(large_text_only)
from pypdf import PdfReader, PageObject
def extract_text_from_region(page: "PageObject", x1: float, y1: float, x2: float, y2: float) -> str:
    """
    Extract the text whose origin lies inside a rectangular region of a page.

    The page is not modified. Filtering is done with a text visitor, because
    changing the cropbox only changes what is displayed and does not remove
    text from what ``extract_text`` returns.

    Args:
        page: PageObject to extract from
        x1, y1: Bottom-left coordinates
        x2, y2: Top-right coordinates
            (the two corners may be given in either order)

    Returns:
        Extracted text from the region. A text chunk is kept or dropped as a
        whole, based on where it starts; a chunk that starts inside the
        region and runs past its edge is kept in full.

    NOTE(review): the position is computed by applying the text matrix and
    then the current transformation matrix to the text-space origin. Confirm
    that the pypdf version in use passes both matrices unmodified.
    """
    # Normalise the corners so swapped arguments still describe the same box
    left, right = sorted((x1, x2))
    bottom, top = sorted((y1, y2))
    kept = []

    def collect_if_inside(text, cm, tm, font_dict, font_size):
        # Origin of the text chunk in page coordinates: (tm[4], tm[5]) is the
        # translation of the text matrix, which is then mapped through cm.
        x = tm[4] * cm[0] + tm[5] * cm[2] + cm[4]
        y = tm[4] * cm[1] + tm[5] * cm[3] + cm[5]
        if left <= x <= right and bottom <= y <= top:
            kept.append(text)

    # The return value (the whole page text) is ignored on purpose; only the
    # chunks accepted by the visitor are returned.
    page.extract_text(visitor_text=collect_if_inside)
    return "".join(kept)
reader = PdfReader("document.pdf")
page = reader.pages[0]

# The top-left quarter is the left half of the width combined with the
# upper half of the height of the media box.
media_box = page.mediabox
width, height = float(media_box.width), float(media_box.height)
top_left_text = extract_text_from_region(page, 0, height/2, width/2, height)

print("Top-left quarter text:")
print(top_left_text)
from pypdf import PdfReader
from pypdf.errors import PdfReadError, PdfStreamError
def safe_extract_text(pdf_path: str) -> list[str]:
    """
    Extract the text of every page, reporting errors instead of raising.

    Args:
        pdf_path: Path to PDF file

    Returns:
        List of extracted text strings in page order. A page whose
        extraction raises PdfReadError or PdfStreamError contributes an
        empty string. Any other exception (including a failure to open the
        file) is reported and ends processing, so the list then holds only
        the pages handled up to that point.
    """
    collected = []
    try:
        document = PdfReader(pdf_path)
        for idx, current_page in enumerate(document.pages):
            try:
                page_text = current_page.extract_text()
            except (PdfReadError, PdfStreamError) as page_error:
                print(f"Error extracting text from page {idx + 1}: {page_error}")
                page_text = ""  # Empty string for failed pages
            collected.append(page_text)
    except Exception as open_error:
        print(f"Error opening PDF {pdf_path}: {open_error}")
    return collected
# Run the safe extraction and report the outcome page by page.
# NOTE(review): an empty string is reported as a failure; a page that simply
# has no extractable text presumably ends up in the same branch.
page_texts = safe_extract_text("problematic.pdf")
for idx, text in enumerate(page_texts):
    print(
        f"Page {idx + 1}: {len(text)} characters extracted"
        if text
        else f"Page {idx + 1}: Text extraction failed"
    )
from pypdf import PdfReader
import os
from pathlib import Path
def extract_text_from_directory(directory_path: str, output_dir: str = None) -> dict[str, str]:
"""
Extract text from all PDF files in a directory.
Args:
directory_path: Directory containing PDF files
output_dir: Optional directory to save text files
Returns:
Dictionary mapping PDF filenames to extracted text
"""
pdf_texts = {}
for file_path in Path(directory_path).glob("*.pdf"):
try:
reader = PdfReader(str(file_path))
# Extract all text
full_text = ""
for page in reader.pages:
full_text += page.extract_text()
full_text += "\n\n"
pdf_texts[file_path.name] = full_text
# Optionally save to text file
if output_dir:
output_path = Path(output_dir) / f"{file_path.stem}.txt"
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(full_text, encoding='utf-8')
except Exception as e:
print(f"Error processing {file_path.name}: {e}")
pdf_texts[file_path.name] = ""
return pdf_texts
# Extract text from all PDFs in "pdf_documents/"; because an output directory
# is given, each PDF's text is also saved as a .txt file in "extracted_text/"
texts = extract_text_from_directory("pdf_documents/", "extracted_text/")
print(f"Processed {len(texts)} PDF files")

Install with Tessl CLI:

npx tessl i tessl/pypi-pypdf