Simple PDF text extraction library using Poppler backend
npx @tessl/cli install tessl/pypi-pdftotext@3.0.0

Simple Python library for extracting text from PDF documents using the Poppler backend. The library provides a minimal but complete API through a single PDF class that supports sequential access to pages, password-protected documents, and multiple text extraction modes for optimal readability.
pip install pdftotext

import pdftotext

import pdftotext
# Load a PDF file
with open("document.pdf", "rb") as f:
pdf = pdftotext.PDF(f)
# Check page count
print(f"Document has {len(pdf)} pages")
# Read individual pages
print("First page:")
print(pdf[0])
print("Last page:")
print(pdf[-1])
# Iterate through all pages
for page_num, page_text in enumerate(pdf):
print(f"--- Page {page_num + 1} ---")
print(page_text)
# Read all text as single string
full_text = "\n\n".join(pdf)
print(full_text)

Load PDF documents from file-like objects with optional password authentication and text extraction mode configuration.
class PDF:
def __init__(self, pdf_file, password="", raw=False, physical=False):
"""
Initialize PDF object for text extraction.
Args:
pdf_file: A file-like object opened in binary mode containing PDF data
password (str, optional): Password to unlock encrypted PDFs. Both owner and user passwords work. Defaults to "".
raw (bool, optional): Extract text in content stream order (as stored in PDF). Defaults to False.
physical (bool, optional): Extract text in physical layout order (spatial arrangement on page). Defaults to False.
Raises:
pdftotext.Error: If PDF is invalid, corrupted, or password-protected without correct password
TypeError: If pdf_file is not a file-like object or opened in text mode
ValueError: If both raw and physical are True, or if raw/physical values are invalid
Note:
The raw and physical parameters are mutually exclusive. Default mode provides most readable output
by respecting logical document structure. Usually this is preferred over raw or physical modes.
"""

Access individual pages as strings using a sequence-like interface with support for indexing and iteration.
def __len__(self) -> int:
"""
Return the number of pages in the PDF document.
Returns:
int: Number of pages in the document
"""
def __getitem__(self, index: int) -> str:
"""
Get text content of a specific page.
Args:
index (int): Page index (0-based). Supports negative indexing.
Returns:
str: Text content of the page as a Unicode string (Python str)
Raises:
IndexError: If index is out of range
pdftotext.Error: If page cannot be read due to corruption
"""
def __iter__(self):
"""
Enable iteration over pages, yielding page text.
Yields:
str: Text content of each page in sequence
Example:
for page in pdf:
print(page)
"""

Configure how text is extracted from PDF pages to optimize for different document layouts and reading requirements.
Default Mode (recommended): Most readable output that respects logical document structure. Handles multi-column layouts, reading order, and text flow intelligently.
Raw Mode (raw=True): Extracts text in the order it appears in the PDF content stream. Useful for debugging or when document structure is less important than preserving original ordering.
Physical Mode (physical=True): Extracts text in physical layout order based on spatial arrangement on the page. Can be useful for documents with complex layouts where spatial positioning matters.
Usage examples:
# Default mode - most readable
with open("document.pdf", "rb") as f:
pdf = pdftotext.PDF(f)
text = pdf[0] # Respects logical structure
# Raw mode - content stream order
with open("document.pdf", "rb") as f:
pdf = pdftotext.PDF(f, raw=True)
text = pdf[0] # Order as stored in PDF
# Physical mode - spatial order
with open("document.pdf", "rb") as f:
pdf = pdftotext.PDF(f, physical=True)
text = pdf[0] # Spatial arrangement on page

Handle encrypted PDF documents using owner or user passwords.
# Unlock with password
with open("secure_document.pdf", "rb") as f:
pdf = pdftotext.PDF(f, password="secret123")
text = pdf[0]
# Both owner and user passwords work
with open("encrypted.pdf", "rb") as f:
# This works with either password type
pdf = pdftotext.PDF(f, password="owner_password")
# or
pdf = pdftotext.PDF(f, password="user_password")

Handle PDF-related errors and edge cases gracefully.
class Error(Exception):
"""
Exception raised for PDF-related errors.
Raised when:
- PDF file is invalid or corrupted
- PDF is password-protected and no/wrong password provided
- Poppler library encounters errors during processing
- Page cannot be read due to corruption
"""

Example error handling:
import pdftotext
try:
with open("document.pdf", "rb") as f:
pdf = pdftotext.PDF(f)
text = pdf[0]
except pdftotext.Error as e:
print(f"PDF error: {e}")
except FileNotFoundError:
print("PDF file not found")
except IndexError as e:
print(f"Page index error: {e}")

class PDF:
"""
Main class for PDF text extraction with sequence-like interface.
Provides:
- Sequential access to pages via indexing (pdf[0], pdf[1], etc.)
- Length operation (len(pdf))
- Iteration support (for page in pdf)
- Password authentication for encrypted PDFs
- Multiple text extraction modes (default, raw, physical)
"""
class Error(Exception):
"""
Custom exception class for PDF-related errors.
Inherits from built-in Exception class and is raised for:
- Invalid or corrupted PDF files
- Authentication failures on password-protected PDFs
- Poppler library processing errors
- Page reading errors due to corruption
"""

import pdftotext
with open("report.pdf", "rb") as f:
pdf = pdftotext.PDF(f)
# Process each page
for i, page in enumerate(pdf):
print(f"=== Page {i + 1} ===")
print(page[:100] + "..." if len(page) > 100 else page)
# Or get all text at once
full_document = "\n\n".join(pdf)

# Regular document
with open("document.pdf", "rb") as f:
pdf = pdftotext.PDF(f)
# Password-protected document
with open("secure.pdf", "rb") as f:
pdf = pdftotext.PDF(f, password="mypassword")
# Multi-column document (try physical mode)
with open("newspaper.pdf", "rb") as f:
pdf = pdftotext.PDF(f, physical=True)
# Document with complex layout (try raw mode)
with open("form.pdf", "rb") as f:
pdf = pdftotext.PDF(f, raw=True)

import pdftotext
def extract_pdf_text(filepath, password=None):
    """Extract the text of every page of a PDF, reporting failures on stdout.

    Args:
        filepath: Path of the PDF file to open (opened in binary mode).
        password: Optional password for an encrypted PDF. A falsy value
            (``None`` or ``""``) means "no password".

    Returns:
        A list with one string per page, or ``None`` if the file is missing,
        the PDF cannot be processed, or any other error occurs. In every
        failure case a short message is printed instead of raising.
    """
    try:
        with open(filepath, "rb") as f:
            # An empty password is the PDF constructor's own default, so a
            # single call covers both the protected and unprotected cases.
            pdf = pdftotext.PDF(f, password=password or "")
            # Materialize the pages while the file is still open.
            return list(pdf)
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return None
    except pdftotext.Error as e:
        print(f"PDF processing error: {e}")
        return None
    except Exception as e:
        # Deliberate catch-all: this helper is a best-effort boundary that
        # reports the problem and signals failure with None.
        print(f"Unexpected error: {e}")
        return None