A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Supporting utilities including page ranges, standard paper sizes, constants, error handling, and type definitions for enhanced developer experience. These utilities provide convenient functionality for common PDF operations.
The PageRange class provides flexible page selection and range specification for PDF operations.
class PageRange:
    """
    Slice-like page selection for PDF operations (API reference stub).

    Only signatures and docstrings are listed here; the methods carry no
    implementation in this reference.
    """

    def __init__(self, arg):
        """
        Initialize a page range from various input formats.

        Args:
            arg: Range specification - can be:
                - slice object (e.g., slice(0, 10, 2))
                - PageRange object (copy constructor)
                - string in slice notation with zero-based indices
                  (e.g., "0:5", ":", "::2", "1:6:2")
                - string holding a single index (e.g., "5", or "-1" for the
                  last page)

        NOTE(review): dash and comma forms such as "1-5" or "2,4,6" are not
        part of pypdf's page-range syntax - confirm against the installed
        pypdf version before relying on them.
        """

    @staticmethod
    def valid(input) -> bool:
        """
        Check if input is a valid page range specification.

        Args:
            input: Input to validate

        Returns:
            True if input is valid for PageRange
        """

    def to_slice(self) -> slice:
        """
        Convert page range to a slice object.

        Returns:
            Equivalent slice object
        """

    def indices(self, n: int) -> tuple[int, int, int]:
        """
        Get slice indices for a given length.

        Args:
            n: Total number of items

        Returns:
            Tuple of (start, stop, step) indices; pass it to ``range(*...)``
            to iterate over the selected positions.
        """

    def __str__(self) -> str:
        """String representation of the page range."""

    def __repr__(self) -> str:
        """Developer representation of the page range."""

    def __eq__(self, other) -> bool:
        """Check equality with another PageRange."""

    def __hash__(self) -> int:
        """Hash function for use in sets and dictionaries."""

    def __add__(self, other):
        """Add two page ranges together."""


# Utility function for parsing filename and page range combinations.
def parse_filename_page_ranges(fnprs: list[str]) -> tuple[list[str], list[PageRange]]:
    """
    Parse filename and page range strings (API reference stub).

    Args:
        fnprs: List of strings in format "filename[pages]" or just "filename"
            Examples: ["doc.pdf[1-5]", "other.pdf", "file.pdf[2,4,6-8]"]

    Returns:
        Tuple of (filenames, page_ranges):
            - filenames: List of extracted filenames
            - page_ranges: List of corresponding PageRange objects

    NOTE(review): the installed pypdf helper may instead expect filenames and
    page ranges as separate list items and return (filename, page_range)
    pairs - verify the input format and return shape before use.
    """


# Standard paper size definitions for creating properly sized documents.
class PaperSize:
    """Standard paper size definitions in points (72 points = 1 inch).

    Every attribute is a (width, height) pair in portrait orientation.
    """

    # ISO A series (most common internationally)
    A0: tuple[float, float] = (2384, 3370)  # 841 × 1189 mm
    A1: tuple[float, float] = (1684, 2384)  # 594 × 841 mm
    A2: tuple[float, float] = (1191, 1684)  # 420 × 594 mm
    A3: tuple[float, float] = (842, 1191)  # 297 × 420 mm
    A4: tuple[float, float] = (595, 842)  # 210 × 297 mm
    A5: tuple[float, float] = (420, 595)  # 148 × 210 mm
    A6: tuple[float, float] = (298, 420)  # 105 × 148 mm
    A7: tuple[float, float] = (210, 298)  # 74 × 105 mm
    A8: tuple[float, float] = (147, 210)  # 52 × 74 mm

    # Envelope sizes
    C4: tuple[float, float] = (649, 918)  # 229 × 324 mm envelope


# PDF-specific constants, enums, and flags for various operations.
from enum import IntEnum, IntFlag
class PasswordType(IntEnum):
    """Types of PDF passwords."""

    NOT_DECRYPTED = 0
    USER_PASSWORD = 1
    OWNER_PASSWORD = 2
class ImageType(IntFlag):
    """Types of images that can be extracted or processed."""

    NONE = 0
    XOBJECT_IMAGES = 1  # Images stored as XObjects
    INLINE_IMAGES = 2  # Inline images in content streams
    DRAWING_IMAGES = 4  # Images created by drawing operations
    ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES  # All image types
    # Alias of ALL, so IMAGES covers the same three kinds here as
    # ObjectDeletionFlag.IMAGES does.
    IMAGES = ALL
class ObjectDeletionFlag(IntFlag):
    """Flags for controlling object deletion in PDFs."""

    NONE = 0
    TEXT = 1  # Text objects
    LINKS = 2  # Link annotations
    ATTACHMENTS = 4  # File attachments
    OBJECTS_3D = 8  # 3D objects
    ALL_ANNOTATIONS = 16  # All annotation types
    XOBJECT_IMAGES = 32  # Images stored as XObjects
    INLINE_IMAGES = 64  # Inline images
    DRAWING_IMAGES = 128  # Drawing-based images
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES  # All images


# Comprehensive exception hierarchy for proper error handling in PDF operations.
class PyPdfError(Exception):
    """Base exception for all pypdf errors."""


# Dependency and deprecation errors derive directly from Exception, so an
# ``except PyPdfError`` handler does not catch them.
class DeprecationError(Exception):
    """Raised when deprecated functionality is used."""


class DependencyError(Exception):
    """Raised when required dependencies are missing."""


class PdfReadError(PyPdfError):
    """Raised when PDF reading fails."""


class PdfStreamError(PdfReadError):
    """Raised when PDF stream processing fails."""


class FileNotDecryptedError(PdfReadError):
    """Raised when trying to access encrypted content without decryption."""


# Subclass of FileNotDecryptedError: catch WrongPasswordError first when both
# need separate handling.
class WrongPasswordError(FileNotDecryptedError):
    """Raised when incorrect password is provided for encrypted PDF."""


class EmptyFileError(PdfReadError):
    """Raised when PDF file is empty or invalid."""


class ParseError(PyPdfError):
    """Raised when PDF parsing fails."""


class PageSizeNotDefinedError(PyPdfError):
    """Raised when page size cannot be determined."""


class EmptyImageDataError(PyPdfError):
    """Raised when image data is empty or invalid."""


class LimitReachedError(PyPdfError):
    """Raised when processing limits are exceeded."""
class PdfReadWarning(UserWarning):
    """Warning for non-fatal PDF reading issues."""


from pypdf import PdfReader, PdfWriter, PageRange
reader = PdfReader("document.pdf")
writer = PdfWriter()

# Create page ranges in different ways. Range strings use slice notation with
# zero-based indices, like Python slices.
range1 = PageRange("0:5")  # Pages 1 through 5 (indices 0-4)
range2 = PageRange("1:6:2")  # Pages 2, 4, and 6 (indices 1, 3, 5)
range3 = PageRange(slice(0, 10, 2))  # Every other page from 0 to 9

# Use page range to select pages. indices() returns a (start, stop, step)
# tuple, so expand it with range() to obtain the selected page indices.
page_count = len(reader.pages)
for page_num in range(*range1.indices(page_count)):
    writer.add_page(reader.pages[page_num])

with open("selected_pages.pdf", "wb") as output:
    writer.write(output)


from pypdf import PageRange
# Validate page range inputs (strings use slice notation; "invalid" is kept
# as a deliberately bad value to show the rejection branch)
inputs = ["0:10", "1:6:2", "invalid", slice(0, 5)]

for inp in inputs:
    if PageRange.valid(inp):
        pr = PageRange(inp)
        print(f"Valid range: {inp} -> {pr}")
        print(f" As slice: {pr.to_slice()}")
        print(f" Indices for 20 pages: {pr.indices(20)}")
    else:
        print(f"Invalid range: {inp}")


from pypdf import parse_filename_page_ranges
# Parse combined filename and page specifications
# NOTE(review): the "file.pdf[pages]" format and the (filenames, page_ranges)
# return shape follow the signature documented on this page - verify both
# against the installed pypdf version.
file_specs = [
    "document.pdf[1-10]",
    "report.pdf[2,4,6-8]",
    "book.pdf",  # No page range specified
    "chapter1.pdf[5-]",  # From page 5 to end
]

filenames, page_ranges = parse_filename_page_ranges(file_specs)

for filename, page_range in zip(filenames, page_ranges):
    print(f"File: {filename}")
    if page_range:
        print(f" Pages: {page_range}")
    else:
        print(" Pages: All")


from pypdf import PdfWriter, PageObject, PaperSize
writer = PdfWriter()

# Create pages with standard sizes
sizes_to_create = [
    ("Letter", (612, 792)),  # US Letter
    ("A4", PaperSize.A4),  # ISO A4
    ("A3", PaperSize.A3),  # ISO A3
    ("Legal", (612, 1008)),  # US Legal
]

for name, (width, height) in sizes_to_create:
    # Pass the dimensions by keyword: the first positional parameter of
    # create_blank_page is the owning PDF, not the width.
    page = PageObject.create_blank_page(width=width, height=height)
    writer.add_page(page)
    print(f"Created {name} page: {width} x {height} points")

with open("standard_sizes.pdf", "wb") as output:
    writer.write(output)


from pypdf import PdfReader, PdfWriter
from pypdf.errors import (
PdfReadError, FileNotDecryptedError, WrongPasswordError,
EmptyFileError, ParseError
)
def safe_pdf_operation(pdf_path: str, password: "str | None" = None):
    """Safely perform PDF operations with comprehensive error handling.

    Reads ``pdf_path``, copies every page whose text can be extracted into a
    new document (an unreadable page is replaced by a blank US Letter page)
    and writes the result next to the input with ``_processed`` appended to
    the file stem.

    Args:
        pdf_path: Path of the PDF to process.
        password: Password for an encrypted PDF, or None if none is supplied.

    Returns:
        True when the processed file was written, False when any error
        occurred (the error is reported with ``print``, not raised).
    """
    import os

    try:
        reader = PdfReader(pdf_path, password=password)

        # An empty string can be a real password, so test for None rather
        # than falsiness.
        if reader.is_encrypted and password is None:
            raise FileNotDecryptedError("PDF is encrypted but no password provided")

        writer = PdfWriter()

        # Process each page safely
        for page_num, page in enumerate(reader.pages, start=1):
            try:
                # Attempt to extract text to verify page is readable
                text = page.extract_text()
                writer.add_page(page)
                print(f"Processed page {page_num}: {len(text)} characters")
            except ParseError as e:
                print(f"Warning: Could not process page {page_num}: {e}")
                # Replace the problematic page with a blank one. PageObject is
                # not among this example's imports, so bring it in here.
                from pypdf import PageObject

                # Keyword arguments: the first positional parameter of
                # create_blank_page is the owning PDF, not the width.
                blank_page = PageObject.create_blank_page(width=612, height=792)
                writer.add_page(blank_page)

        # Save result. Build the name from the split extension so the output
        # can never be the input path itself (str.replace leaves paths without
        # a lowercase ".pdf" unchanged and rewrites every occurrence).
        stem, extension = os.path.splitext(pdf_path)
        output_path = f"{stem}_processed{extension or '.pdf'}"
        with open(output_path, "wb") as output:
            writer.write(output)

        print(f"Successfully processed {pdf_path}")
        return True

    # WrongPasswordError derives from FileNotDecryptedError, so it has to be
    # handled first or its branch is unreachable.
    except WrongPasswordError:
        print(f"Error: Incorrect password for {pdf_path}")
        return False
    except FileNotDecryptedError:
        print(f"Error: {pdf_path} is encrypted. Please provide password.")
        return False
    except EmptyFileError:
        print(f"Error: {pdf_path} is empty or corrupted")
        return False
    except PdfReadError as e:
        print(f"Error reading {pdf_path}: {e}")
        return False
    except Exception as e:
        # Top-level boundary of the example: report and signal failure.
        print(f"Unexpected error processing {pdf_path}: {e}")
        return False
# Use the safe operation: try without a password first, then retry with one
success = safe_pdf_operation("document.pdf")
if not success:
    success = safe_pdf_operation("document.pdf", password="secret")


from pypdf import PdfReader, ImageType
reader = PdfReader("document_with_images.pdf")

for page_num, page in enumerate(reader.pages):
    print(f"Page {page_num + 1}:")

    # Extract different types of images
    try:
        # All images
        all_images = page.images
        print(f" Total images: {len(all_images)}")

        # You can specify image types when working with image extraction
        # (This would be used in specific image extraction methods)
        print(f" Image types available: {list(ImageType)}")
    except Exception as e:
        # Best effort per page: report the failure and move on.
        print(f" Error accessing images: {e}")


from pypdf import PdfReader, PdfWriter, PageRange, PaperSize
from pypdf.errors import PyPdfError
def extract_page_range(input_pdf: str, output_pdf: str, page_range_str: str):
    """Extract specific pages to new PDF.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the selected pages are written to.
        page_range_str: Page range specification passed to ``PageRange``.

    Returns:
        None. Errors are reported with ``print`` instead of being raised.
    """
    try:
        reader = PdfReader(input_pdf)
        writer = PdfWriter()
        page_count = len(reader.pages)

        # Parse page range
        page_range = PageRange(page_range_str)
        start, stop, step = page_range.indices(page_count)

        # Extract pages
        for i in range(start, stop, step):
            if i < page_count:
                writer.add_page(reader.pages[i])

        with open(output_pdf, "wb") as output:
            writer.write(output)

        print(f"Extracted pages {page_range_str} to {output_pdf}")
    except PyPdfError as e:
        print(f"PDF Error: {e}")
    except Exception as e:
        print(f"Error: {e}")
def create_blank_document(output_pdf: str, page_count: int = 1, size: str = "A4"):
    """Create a blank PDF document.

    Args:
        output_pdf: Path the new PDF is written to.
        page_count: Number of blank pages to create.
        size: Name of a ``PaperSize`` attribute (e.g. "A4"); unknown names
            fall back to A4 with a printed notice.
    """
    # PageObject is not among this example's imports, so bring it in here.
    from pypdf import PageObject

    writer = PdfWriter()

    # Get paper size. Accept only public attributes holding a (width, height)
    # pair; hasattr() alone would also accept names such as "__doc__".
    dimensions = None if size.startswith("_") else getattr(PaperSize, size, None)
    if isinstance(dimensions, tuple) and len(dimensions) == 2:
        width, height = dimensions
    else:
        # Default to A4 if size not found
        width, height = PaperSize.A4
        print(f"Unknown size '{size}', using A4")
        size = "A4"  # keep the summary line below truthful

    # Create blank pages. Keyword arguments: the first positional parameter
    # of create_blank_page is the owning PDF, not the width.
    for _ in range(page_count):
        page = PageObject.create_blank_page(width=width, height=height)
        writer.add_page(page)

    with open(output_pdf, "wb") as output:
        writer.write(output)

    print(f"Created {page_count} blank {size} pages in {output_pdf}")
def get_pdf_info(pdf_path: str) -> dict:
    """Get comprehensive PDF information.

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        On success, a dict with the keys "filename", "page_count",
        "is_encrypted", "pdf_version", "metadata" and "page_sizes". If the
        file cannot be processed, a dict with only "filename" and "error"
        (the exception message).
    """
    try:
        reader = PdfReader(pdf_path)

        info = {
            "filename": pdf_path,
            "page_count": len(reader.pages),
            "is_encrypted": reader.is_encrypted,
            "pdf_version": reader.pdf_header,
            "metadata": {},
            "page_sizes": [],
        }

        # Get metadata
        if reader.metadata:
            info["metadata"] = {
                "title": reader.metadata.title,
                "author": reader.metadata.author,
                "subject": reader.metadata.subject,
                "creator": reader.metadata.creator,
                "producer": reader.metadata.producer,
            }

        # Get page sizes
        for i, page in enumerate(reader.pages):
            try:
                width = float(page.mediabox.width)
                height = float(page.mediabox.height)
                info["page_sizes"].append({
                    "page": i + 1,
                    "width": width,
                    "height": height,
                    "size_points": f"{width} x {height}",
                })
            except Exception:
                # Best effort per page. Exception (not a bare except) lets
                # KeyboardInterrupt and SystemExit propagate.
                info["page_sizes"].append({
                    "page": i + 1,
                    "error": "Could not determine size",
                })

        return info

    except Exception as e:
        return {
            "filename": pdf_path,
            "error": str(e),
        }
# Use utility functions
# Page ranges use zero-based slice notation: "0:5" selects pages 1 to 5.
extract_page_range("document.pdf", "pages_1_to_5.pdf", "0:5")
create_blank_document("blank.pdf", 10, "A4")
info = get_pdf_info("document.pdf")
print(f"PDF Info: {info}")


# pypdf provides a comprehensive exception hierarchy for different types of PDF processing errors.
# Base exception classes
class PyPdfError(Exception):
    """Base class for all exceptions raised by pypdf."""


class PdfReadError(PyPdfError):
    """Raised when there is an issue reading a PDF file."""


class PdfStreamError(PdfReadError):
    """Raised when there is an issue reading the stream of data in a PDF file."""


class ParseError(PyPdfError):
    """Raised when there is an issue parsing a PDF file."""


# File access and decryption errors
class FileNotDecryptedError(PdfReadError):
    """Raised when an encrypted PDF has not been successfully decrypted."""


class WrongPasswordError(FileNotDecryptedError):
    """Raised when the wrong password is used to decrypt an encrypted PDF."""


class EmptyFileError(PdfReadError):
    """Raised when a PDF file is empty or has no content."""


# Specific operation errors
class PageSizeNotDefinedError(PyPdfError):
    """Raised when the page size of a PDF document is not defined."""


class EmptyImageDataError(PyPdfError):
    """Raised when trying to process an image that has no data."""


class LimitReachedError(PyPdfError):
    """Raised when a limit is reached."""


# Dependency and deprecation errors (these do not derive from PyPdfError)
class DependencyError(Exception):
    """Raised when a required dependency is not available."""


class DeprecationError(Exception):
    """Raised when a deprecated feature is used."""
# Warnings
class PdfReadWarning(UserWarning):
    """Issued when there is a potential issue reading a PDF file, but it can still be read."""


from pypdf.constants import UserAccessPermissions
class UserAccessPermissions(IntFlag):
    """PDF user access permissions for encryption.

    API reference stub: the three methods list only their signatures and
    docstrings, not an implementation.
    """

    PRINT = 4  # Allow printing
    MODIFY = 8  # Allow document modification
    EXTRACT = 16  # Allow text/graphics extraction
    ADD_OR_MODIFY = 32  # Allow annotations/form fields
    FILL_FORM_FIELDS = 256  # Allow form field filling
    EXTRACT_TEXT_AND_GRAPHICS = 512  # Allow accessibility extraction
    ASSEMBLE_DOC = 1024  # Allow document assembly
    PRINT_TO_REPRESENTATION = 2048  # Allow high-quality printing

    @classmethod
    def all(cls) -> "UserAccessPermissions":
        """Get all permissions enabled."""

    def to_dict(self) -> dict[str, bool]:
        """Convert permissions to dictionary format."""

    @classmethod
    def from_dict(cls, value: dict[str, bool]) -> "UserAccessPermissions":
        """Create permissions from dictionary format."""


# Stream processing constants
STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"


# Core PDF structure constants
class Core:
    """String constants for core PDF structure names."""

    OUTLINES = "/Outlines"
    THREADS = "/Threads"
    PAGE = "/Page"
    PAGES = "/Pages"
    CATALOG = "/Catalog"
class TrailerKeys:
    """String constants for trailer keys."""

    SIZE = "/Size"
    PREV = "/Prev"
    ROOT = "/Root"
    ENCRYPT = "/Encrypt"
    INFO = "/Info"
    ID = "/ID"


# Install with Tessl CLI
npx tessl i tessl/pypi-pypdf