A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Exception classes for comprehensive error handling, utility functions for specialized operations, and helper classes that support PyPDF2's core functionality.
Comprehensive exception hierarchy for handling various PDF processing errors.
class PyPdfError(Exception):
    """Base exception class for PyPDF2 errors.

    Note that ``ParseError`` and ``DependencyError`` below derive directly
    from ``Exception`` and are therefore not caught by ``except PyPdfError``.
    """


class PdfReadError(PyPdfError):
    """Raised when there's an error reading a PDF file."""


class PdfStreamError(PdfReadError):
    """Raised when there's an error processing PDF streams."""


class PageSizeNotDefinedError(PyPdfError):
    """Raised when page size cannot be determined."""


class ParseError(Exception):
    """Raised when there's an error parsing PDF content.

    Derives from ``Exception``, not from ``PyPdfError``.
    """


class FileNotDecryptedError(PdfReadError):
    """Raised when attempting to access encrypted content without decryption."""


class WrongPasswordError(PdfReadError):
    """Raised when an incorrect password is provided for an encrypted PDF."""


class EmptyFileError(PdfReadError):
    """Raised when attempting to read an empty or corrupt PDF file."""


class DependencyError(Exception):
    """Raised when a required dependency is missing.

    Derives from ``Exception``, not from ``PyPdfError``.
    """


# Warning classes for non-fatal issues during PDF processing.
class PdfReadWarning(UserWarning):
    """Warning issued during PDF reading for recoverable issues."""


# Standard paper size definitions and utilities.
class PaperSize:
    """Standard paper size constants with dimensions in points.

    This listing declares each constant's type only (``Dimensions``); the
    concrete values are given in the trailing comments as
    ``width x height`` in points, followed by the size in inches.
    """

    # ISO A-series paper sizes
    A0: 'Dimensions'  # 2384 x 3371 points (33.1" x 46.8")
    A1: 'Dimensions'  # 1685 x 2384 points (23.4" x 33.1")
    A2: 'Dimensions'  # 1190 x 1685 points (16.5" x 23.4")
    A3: 'Dimensions'  # 842 x 1190 points (11.7" x 16.5")
    A4: 'Dimensions'  # 595 x 842 points (8.3" x 11.7")
    A5: 'Dimensions'  # 420 x 595 points (5.8" x 8.3")
    A6: 'Dimensions'  # 298 x 420 points (4.1" x 5.8")
    A7: 'Dimensions'  # 210 x 298 points (2.9" x 4.1")
    A8: 'Dimensions'  # 147 x 210 points (2.0" x 2.9")

    # Envelope sizes
    C4: 'Dimensions'  # 649 x 918 points (9.0" x 12.8")
class Dimensions:
    """Represents paper dimensions in points.

    Attributes:
        width (float): Width in points (72 points = 1 inch).
        height (float): Height in points (72 points = 1 inch).
    """

    # Unit conversion factors shared by every derived property.
    _POINTS_PER_INCH = 72.0
    _MM_PER_INCH = 25.4

    def __init__(self, width: float, height: float):
        """
        Initialize dimensions.

        Args:
            width (float): Width in points (72 points = 1 inch)
            height (float): Height in points (72 points = 1 inch)
        """
        self.width = width
        self.height = height

    def __repr__(self) -> str:
        """Return a constructor-style representation for debugging."""
        return f"{type(self).__name__}(width={self.width!r}, height={self.height!r})"

    @property
    def width_inches(self) -> float:
        """Width in inches."""
        return self.width / self._POINTS_PER_INCH

    @property
    def height_inches(self) -> float:
        """Height in inches."""
        return self.height / self._POINTS_PER_INCH

    @property
    def width_mm(self) -> float:
        """Width in millimeters."""
        # Same evaluation order as width / 72.0 * 25.4, so results are identical.
        return self.width_inches * self._MM_PER_INCH

    @property
    def height_mm(self) -> float:
        """Height in millimeters."""
        return self.height_inches * self._MM_PER_INCH


# Compression and encoding filters for PDF content streams.
class FlateDecode:
    """Flate/ZIP compression filter (most common).

    Signature listing only: no method bodies are shown here.
    """

    @staticmethod
    def decode(data: bytes, decode_parms: dict = None) -> bytes:
        """
        Decode Flate-compressed data.

        Args:
            data (bytes): Compressed data
            decode_parms (dict, optional): Decode parameters

        Returns:
            bytes: Decompressed data
        """

    @staticmethod
    def encode(data: bytes) -> bytes:
        """
        Encode data with Flate compression.

        Args:
            data (bytes): Data to compress

        Returns:
            bytes: Compressed data
        """
class ASCIIHexDecode:
    """ASCII hexadecimal encoding filter.

    Signature listing only: no method body is shown here.
    """

    @staticmethod
    def decode(data: bytes, decode_parms: dict = None) -> bytes:
        """
        Decode ASCII hex encoded data.

        Args:
            data (bytes): Hex-encoded data
            decode_parms (dict, optional): Decode parameters

        Returns:
            bytes: Decoded data
        """
class LZWDecode:
    """LZW compression filter.

    Signature listing only: no method body is shown here.
    """

    @staticmethod
    def decode(data: bytes, decode_parms: dict = None) -> bytes:
        """
        Decode LZW compressed data.

        Args:
            data (bytes): LZW compressed data
            decode_parms (dict, optional): Decode parameters

        Returns:
            bytes: Decompressed data
        """
class DCTDecode:
    """JPEG compression filter.

    Signature listing only: no method body is shown here.
    """

    @staticmethod
    def decode(data: bytes, decode_parms: dict = None) -> bytes:
        """
        Decode JPEG compressed data.

        Args:
            data (bytes): JPEG data
            decode_parms (dict, optional): Decode parameters

        Returns:
            bytes: Image data
        """
class JPXDecode:
    """JPEG 2000 compression filter.

    Signature listing only: no method body is shown here.
    """

    @staticmethod
    def decode(data: bytes, decode_parms: dict = None) -> bytes:
        """
        Decode JPEG 2000 compressed data.

        Args:
            data (bytes): JPEG 2000 data
            decode_parms (dict, optional): Decode parameters

        Returns:
            bytes: Image data
        """
class CCITTFaxDecode:
    """CCITT fax compression filter.

    Signature listing only: no method body is shown here.
    """

    @staticmethod
    def decode(data: bytes, decode_parms: dict = None) -> bytes:
        """
        Decode CCITT fax compressed data.

        Args:
            data (bytes): CCITT compressed data
            decode_parms (dict, optional): Decode parameters with Width, Height, etc.

        Returns:
            bytes: Decompressed image data
        """


# Extended metadata support for documents that include XMP information.
class XmpInformation:
    """Handler for XMP (Extensible Metadata Platform) information."""

    def __init__(self, stream):
        """
        Initialize XMP information from stream.

        Args:
            stream: XMP metadata stream
        """

    # Methods for accessing XMP metadata
    # Implementation varies based on XMP schema and content
    # Provides access to Dublin Core, PDF, and custom metadata


__version__: str  # Current PyPDF2 version string "2.12.1"


# General utility functions used throughout the library.
def parse_filename_page_ranges(
    args: List[Union[str, PageRange, None]],
) -> List[Tuple[str, PageRange]]:
    """
    Parse command-line style filename and page range arguments.

    Signature listing only: no function body is shown here.

    Args:
        args: Arguments to parse (e.g., ["file1.pdf", "1:5", "file2.pdf", "::2"])

    Returns:
        list: List of (filename, page_range) tuples
    """


from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.errors import (
PdfReadError, WrongPasswordError, FileNotDecryptedError,
EmptyFileError, DependencyError
)
def safe_pdf_operation(filename, password="password"):
    """Copy every page of a PDF into a new writer, reporting errors on stdout.

    Args:
        filename: Path of the PDF to read.
        password: Password tried when the document is encrypted. Defaults to
            the placeholder ``"password"`` used by the original example.

    Returns:
        A ``PdfWriter`` holding all pages of the input, or ``None`` when the
        file could not be processed (the reason is printed).
    """
    try:
        reader = PdfReader(filename)
        if reader.is_encrypted and not reader.decrypt(password):
            # Per the PyPDF2 docs, decrypt() signals a wrong password with a
            # falsy return value instead of raising, so check it explicitly;
            # otherwise the failure surfaces later as "needs a password".
            print(f"Error: Incorrect password for {filename}")
            return None
        # Perform operations
        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)
        return writer
    except EmptyFileError:
        print(f"Error: {filename} is empty or corrupted")
    except WrongPasswordError:
        print(f"Error: Incorrect password for {filename}")
    except FileNotDecryptedError:
        print(f"Error: {filename} is encrypted and needs a password")
    except PdfReadError as e:
        # Must come after its subclasses above so they keep their own messages.
        print(f"Error reading {filename}: {e}")
    except DependencyError as e:
        print(f"Missing dependency: {e}")
    except Exception as e:
        # Top-level boundary of the example: report and fall through to None.
        print(f"Unexpected error: {e}")
    return None
# Usage: write the copied document only when the operation produced a writer.
result = safe_pdf_operation("document.pdf")
if result:
    with open("processed.pdf", "wb") as output_file:
        result.write(output_file)
from PyPDF2 import PdfWriter
from PyPDF2.papersizes import PaperSize
writer = PdfWriter()

# Create pages with different standard sizes
sizes_to_create = [
    ("A4", PaperSize.A4),
    ("A3", PaperSize.A3),
    ("A5", PaperSize.A5),
    ("C4 Envelope", PaperSize.C4),
]
for name, size in sizes_to_create:
    # The new page object is not needed here; the writer keeps it.
    writer.add_blank_page(size.width, size.height)
    print(f"{name}: {size.width} x {size.height} points")
    print(f" {size.width_inches:.1f}\" x {size.height_inches:.1f}\"")
    print(f" {size.width_mm:.0f}mm x {size.height_mm:.0f}mm")
with open("standard_sizes.pdf", "wb") as output_file:
    writer.write(output_file)
from PyPDF2.papersizes import Dimensions
# Create custom paper sizes (dimensions are given in points)
us_letter = Dimensions(612, 792)  # 8.5" x 11"
us_legal = Dimensions(612, 1008)  # 8.5" x 14"
tabloid = Dimensions(792, 1224)  # 11" x 17"
custom_sizes = [
    ("US Letter", us_letter),
    ("US Legal", us_legal),
    ("Tabloid", tabloid),
]
for name, size in custom_sizes:
    print(f"{name}:")
    print(f" Points: {size.width} x {size.height}")
    print(f" Inches: {size.width_inches:.1f}\" x {size.height_inches:.1f}\"")
    print(f" mm: {size.width_mm:.0f} x {size.height_mm:.0f}")
from PyPDF2.filters import FlateDecode
import zlib
# Example of manual filter usage (rarely needed)
original_data = b"Hello, World! This is test data for compression."

# Compress data
compressed = FlateDecode.encode(original_data)
print(f"Original size: {len(original_data)} bytes")
print(f"Compressed size: {len(compressed)} bytes")
print(f"Compression ratio: {len(compressed)/len(original_data):.2%}")

# Decompress data and confirm the round trip is lossless
decompressed = FlateDecode.decode(compressed)
print(f"Decompressed: {decompressed.decode('utf-8')}")
print(f"Data integrity: {original_data == decompressed}")
from PyPDF2 import __version__
from packaging import version
print(f"PyPDF2 version: {__version__}")

# Check if version meets requirements (parsed comparison, not string comparison)
required_version = "2.10.0"
if version.parse(__version__) >= version.parse(required_version):
    print(f"PyPDF2 version {__version__} meets requirement >= {required_version}")
else:
    print(f"PyPDF2 version {__version__} is below requirement >= {required_version}")
    print("Consider upgrading with: pip install --upgrade PyPDF2")
from PyPDF2 import parse_filename_page_ranges, PdfMerger
import sys
def merge_from_args(args, output_path="merged_output.pdf"):
    """Merge PDFs based on command line arguments.

    Args:
        args: Alternating filenames and optional page ranges, e.g.
            ["file1.pdf", "1:5", "file2.pdf", "::2", "file3.pdf"].
        output_path: File the merged document is written to. Defaults to
            "merged_output.pdf".
    """
    file_ranges = parse_filename_page_ranges(args)
    merger = PdfMerger()
    try:
        for filename, page_range in file_ranges:
            print(f"Adding {filename} with pages {page_range}")
            merger.append(filename, pages=page_range)
        merger.write(output_path)
    finally:
        # Close the merger even when append() or write() raises, so a bad
        # input file does not leave it open.
        merger.close()
    print(f"Merge completed: {output_path}")
# Example usage: everything after the script name is treated as merge input.
if __name__ == "__main__":
    if len(sys.argv) > 1:
        merge_from_args(sys.argv[1:])
    else:
        print("Usage: python script.py file1.pdf 1:5 file2.pdf ::2 file3.pdf")
from PyPDF2 import PdfReader
reader = PdfReader("document.pdf")

# Check for XMP metadata; read the property once and reuse the result.
xmp = reader.xmp_metadata
if xmp:
    print("XMP metadata found:")
    # XMP access depends on the specific XMP schema and content
    # Common patterns:
    try:
        print(f"Dublin Core title: {xmp.dc_title}")
        print(f"Dublin Core creator: {xmp.dc_creator}")
        print(f"Dublin Core subject: {xmp.dc_subject}")
    except AttributeError:
        print("Standard Dublin Core fields not available")
    # Raw XMP data
    print("Raw XMP metadata available for custom parsing")
else:
    print("No XMP metadata found")

# Standard metadata is available through reader.metadata when present
if reader.metadata:
    print(f"Standard metadata title: {reader.metadata.title}")
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.errors import PdfReadError, PdfStreamError
import warnings
def repair_pdf_attempt(filename):
    """Try progressively more lenient ways of opening a damaged PDF.

    Returns:
        A ``(result, status)`` tuple. ``result`` is a ``PdfReader`` when the
        file opened in strict or non-strict mode, a ``PdfWriter`` holding the
        salvaged pages when only partial recovery worked, or ``None`` when
        every attempt failed. ``status`` is a human-readable description.
    """
    try:
        # First choice: the file is fine under strict parsing.
        return PdfReader(filename, strict=True), "No repair needed"
    except PdfReadError:
        try:
            # Second choice: lenient parsing with warnings silenced.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                return PdfReader(filename, strict=False), "Recovered in non-strict mode"
        except PdfReadError:
            # Last resort: keep only the pages whose text can be extracted.
            try:
                lenient_reader = PdfReader(filename, strict=False)
                salvaged = PdfWriter()
                recovered_count = 0
                for page_number, page in enumerate(lenient_reader.pages, start=1):
                    try:
                        page.extract_text()  # readability probe; result unused
                        salvaged.add_page(page)
                        recovered_count += 1
                    except Exception:
                        print(f"Skipping corrupted page {page_number}")
                return salvaged, f"Partially recovered {recovered_count} pages"
            except Exception as e:
                return None, f"Recovery failed: {e}"
# Usage: the first tuple element may be a reader, a writer, or None.
pdf_reader, status = repair_pdf_attempt("corrupted.pdf")
print(f"Recovery status: {status}")
if pdf_reader:
    if hasattr(pdf_reader, 'write'):  # It's a writer
        with open("repaired.pdf", "wb") as output_file:
            pdf_reader.write(output_file)
    else:  # It's a reader
        print(f"Successfully opened PDF with {len(pdf_reader.pages)} pages")

# Install with Tessl CLI:
#     npx tessl i tessl/pypi-py-pdf2