A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Read and parse PDF files, access document structure, extract text and metadata, and handle encrypted documents with password protection. The PdfReader class provides the primary interface for reading PDF files.
Main class for reading PDF files with comprehensive access to document structure, pages, metadata, and content.
class PdfReader:
def __init__(self, stream: Union[str, IO[Any], Path], strict: bool = False, password: Union[None, str, bytes] = None):
"""
Initialize a PdfReader instance.
Args:
stream: PDF file path or file-like object
strict: Whether to raise exceptions for correctable problems (default: False)
password: Password for encrypted PDFs
Raises:
PdfReadError: If PDF cannot be read
WrongPasswordError: If password is incorrect
"""
@property
def pages(self) -> List[PageObject]:
"""List of all pages in the PDF document."""
@property
def metadata(self) -> DocumentInformation:
"""Document metadata including title, author, subject, etc."""
@property
def pdf_header(self) -> str:
"""PDF version string from document header."""
@property
def xmp_metadata(self) -> Optional[XmpInformation]:
"""XMP metadata if present in the document."""
@property
def is_encrypted(self) -> bool:
"""True if the PDF is encrypted."""
@property
def outline(self) -> OutlineType:
"""Document outline/bookmarks structure."""
@property
def named_destinations(self) -> Dict[str, Any]:
"""Named destinations in the document."""
@property
def page_layout(self) -> Optional[str]:
"""Page layout preference."""
@property
def page_mode(self) -> Optional[PagemodeType]:
"""Page mode preference."""
@property
def threads(self) -> Optional[ArrayObject]:
"""Article threads if present."""
@property
def xfa(self) -> Optional[Dict[str, Any]]:
"""XFA (XML Forms Architecture) data if present."""
def get_page(self, page_number: int) -> PageObject:
"""
Get a specific page by number.
Args:
page_number (int): Zero-based page index
Returns:
PageObject: The requested page
Raises:
IndexError: If page number is out of range
"""
def get_fields(self, tree: Optional[TreeObject] = None, retval: Optional[Dict[Any, Any]] = None, fileobj: Optional[Any] = None) -> Optional[Dict[str, Any]]:
"""
Get form fields from the PDF.
Returns:
dict: Form field data, or None if no fields present
"""
def get_form_text_fields(self) -> Dict[str, Any]:
"""
Get text form fields and their values.
Returns:
dict: Text field names and values
"""
def get_page_number(self, page: PageObject) -> int:
"""
Get the page number for a given PageObject.
Args:
page (PageObject): Page object to find
Returns:
int: Zero-based page number
Raises:
ValueError: If page is not in this document
"""
def get_destination_page_number(self, destination: Destination) -> int:
"""
Get page number for a destination.
Args:
destination (Destination): Destination object
Returns:
int: Zero-based page number
"""
def decrypt(self, password: Union[str, bytes]) -> PasswordType:
"""
Decrypt an encrypted PDF.
Args:
password (str or bytes): Password to try
Returns:
PasswordType: Type of password that matched (USER_PASSWORD or OWNER_PASSWORD), or NOT_DECRYPTED if the password is incorrect
Raises:
PdfReadError: If the document is not encrypted
"""
def decode_permissions(self, permissions_code: int) -> Dict[str, bool]:
"""
Decode permission flags from encryption dictionary.
Args:
permissions_code (int): Raw permissions integer
Returns:
dict: Human-readable permission flags
"""
Container for PDF document metadata with standardized fields.
class DocumentInformation(DictionaryObject):
"""PDF document metadata container."""
@property
def title(self) -> Optional[str]:
"""Document title."""
@property
def title_raw(self) -> Optional[str]:
"""Raw document title (unprocessed)."""
@property
def author(self) -> Optional[str]:
"""Document author."""
@property
def author_raw(self) -> Optional[str]:
"""Raw document author (unprocessed)."""
@property
def subject(self) -> Optional[str]:
"""Document subject."""
@property
def subject_raw(self) -> Optional[str]:
"""Raw document subject (unprocessed)."""
@property
def creator(self) -> Optional[str]:
"""Application that created the document."""
@property
def creator_raw(self) -> Optional[str]:
"""Raw document creator (unprocessed)."""
@property
def producer(self) -> Optional[str]:
"""Application that produced the PDF."""
@property
def producer_raw(self) -> Optional[str]:
"""Raw document producer (unprocessed)."""
@property
def creation_date(self) -> Optional[datetime]:
"""Document creation date, parsed into a datetime."""
@property
def creation_date_raw(self) -> Optional[str]:
"""Raw document creation date (unprocessed)."""
@property
def modification_date(self) -> Optional[datetime]:
"""Document modification date, parsed into a datetime."""
@property
def modification_date_raw(self) -> Optional[str]:
"""Raw document modification date (unprocessed)."""
Extended metadata in XMP format for documents that include it.
class XmpInformation:
"""XMP metadata information handler."""
# Methods for parsing and accessing XMP metadata
# Implementation varies based on XMP content structure
from PyPDF2 import PdfReader
# Open and read a PDF file
reader = PdfReader("document.pdf")
# Access basic information
print(f"Number of pages: {len(reader.pages)}")
print(f"PDF version: {reader.pdf_header}")
print(f"Is encrypted: {reader.is_encrypted}")
# Access metadata
if reader.metadata:
print(f"Title: {reader.metadata.title}")
print(f"Author: {reader.metadata.author}")
print(f"Subject: {reader.metadata.subject}")
from PyPDF2 import PdfReader
from PyPDF2.errors import WrongPasswordError
try:
# Try to open encrypted PDF
reader = PdfReader("encrypted.pdf")
if reader.is_encrypted:
# Decrypt with password
password_type = reader.decrypt("user_password")
print(f"Decrypted with: {password_type}")
# Check permissions
permissions = reader.decode_permissions(reader.trailer["/Encrypt"]["/P"])
print(f"Can print: {permissions.get('print', False)}")
print(f"Can modify: {permissions.get('modify', False)}")
except WrongPasswordError:
print("Incorrect password provided")
from PyPDF2 import PdfReader
reader = PdfReader("document.pdf")
full_text = ""
for page_num, page in enumerate(reader.pages):
text = page.extract_text()
full_text += f"\n--- Page {page_num + 1} ---\n{text}"
print(full_text)
from PyPDF2 import PdfReader
reader = PdfReader("form.pdf")
# Get all form fields
fields = reader.get_fields()
if fields:
for field_name, field_info in fields.items():
print(f"Field: {field_name}, Value: {field_info.get('/V', 'N/A')}")
# Get only text fields
text_fields = reader.get_form_text_fields()
for field_name, value in text_fields.items():
print(f"Text field: {field_name} = {value}")
class PdfFileReader:
"""DEPRECATED: Use PdfReader instead. Will be removed in PyPDF2 3.0.0."""
This class is deprecated and should not be used in new code. All functionality has been moved to PdfReader with the same API.
Install with Tessl CLI
npx tessl i tessl/pypi-py-pdf2