tessl/pypi-pypdf

A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Utilities

Name: tessl/pypi-pypdf
Author: tessl

Supporting utilities including page ranges, standard paper sizes, constants, error handling, and type definitions for enhanced developer experience. These utilities provide convenient functionality for common PDF operations.

Capabilities

Page Ranges

The PageRange class provides flexible page selection and range specification for PDF operations.

class PageRange:
    def __init__(self, arg) -> None:
        """
        Initialize a page range from one of several input formats.

        Args:
            arg: Range specification. The forms listed for this constructor are:
                - slice object (e.g., slice(0, 10, 2))
                - PageRange object (copy constructor)
                - string expression
                - integer (single page)

        NOTE(review): pypdf's own PageRange documentation describes string
        expressions in zero-based slice syntax (for example "0:5", "::2",
        "-1") and does not list dash- or comma-separated forms such as
        "1-5" or "2,4,6"; it also lists only slice, PageRange and str
        inputs. Confirm the accepted forms against the installed pypdf
        version before relying on them.
        """

    @staticmethod
    def valid(input) -> bool:
        """
        Check if input is a valid page range specification.

        Intended as a pre-check before calling the constructor.

        Args:
            input: Input to validate

        Returns:
            True if input is valid for PageRange
        """

    def to_slice(self) -> slice:
        """
        Convert page range to a slice object.

        Returns:
            Equivalent slice object
        """

    def indices(self, n: int) -> tuple[int, int, int]:
        """
        Get slice indices for a given length.

        Args:
            n: Total number of items

        Returns:
            Tuple of (start, stop, step) indices. This is a 3-tuple, not the
            selected page numbers themselves; expand it with range(*...) to
            enumerate them (presumably mirrors slice.indices - confirm).
        """

    def __str__(self) -> str:
        """String representation of the page range."""

    def __repr__(self) -> str:
        """Developer representation of the page range."""

    def __eq__(self, other: object) -> bool:
        """Check equality with another PageRange."""

    def __hash__(self) -> int:
        """Hash function for use in sets and dictionaries."""

    def __add__(self, other):
        """
        Add two page ranges together.

        NOTE(review): the conditions under which two ranges can be combined
        (for example contiguity or step restrictions) are not shown here -
        check the pypdf reference before relying on this operator.
        """

Page Range Parsing

Utility function for parsing filename and page range combinations.

def parse_filename_page_ranges(fnprs: list[str]) -> list[tuple[str, PageRange]]:
    """
    Pair filenames with the page ranges that apply to them.

    NOTE(review): this entry previously described a "filename[pages]"
    input format and a (filenames, page_ranges) tuple-of-lists result.
    pypdf's pagerange documentation instead describes a flat argument list
    in which each filename may be followed by a page-range expression, and
    a result that is a list of (filename, PageRange) pairs. The text below
    follows the pypdf documentation; confirm against the installed version.

    Args:
        fnprs: Flat list whose first element is a filename. Each later
               element is either another filename or a page-range
               expression applying to the filename before it.
               Example: ["doc.pdf", "0:5", "other.pdf", "file.pdf", "::2"]

    Returns:
        List of (filename, page_range) pairs. A filename that is not
        followed by a page range is paired with a range covering all pages.
    """

Paper Sizes

Standard paper size definitions for creating properly sized documents.

class PaperSize:
    """Standard paper size definitions in points (72 points = 1 inch)."""

    # Every entry is a (width, height) pair with the shorter side first.
    # The trailing comment gives the nominal size in millimetres; the point
    # values are those dimensions rounded to whole points.

    # ISO A series (most common internationally)
    A0: tuple[float, float] = (2384, 3370)  # 841 × 1189 mm
    A1: tuple[float, float] = (1684, 2384)  # 594 × 841 mm
    A2: tuple[float, float] = (1191, 1684)  # 420 × 594 mm
    A3: tuple[float, float] = (842, 1191)   # 297 × 420 mm
    A4: tuple[float, float] = (595, 842)    # 210 × 297 mm
    A5: tuple[float, float] = (420, 595)    # 148 × 210 mm
    A6: tuple[float, float] = (298, 420)    # 105 × 148 mm
    A7: tuple[float, float] = (210, 298)    # 74 × 105 mm
    A8: tuple[float, float] = (147, 210)    # 52 × 74 mm

    # Envelope sizes
    C4: tuple[float, float] = (649, 918)    # 229 × 324 mm envelope

Constants and Enums

PDF-specific constants, enums, and flags for various operations.

from enum import IntEnum, IntFlag

class PasswordType(IntEnum):
    """Types of PDF passwords."""
    # The zero member is falsy, so a value of this enum can be used directly
    # in a boolean test to tell "not decrypted" apart from the other two.
    NOT_DECRYPTED = 0
    USER_PASSWORD = 1
    OWNER_PASSWORD = 2

class ImageType(IntFlag):
    """Bit flags selecting which kinds of images an operation applies to.

    Members can be combined with ``|``. ``ALL`` is the union of the three
    single-bit members, and ``IMAGES`` is an alias of ``ALL`` so that the
    name means the same thing here as ``ObjectDeletionFlag.IMAGES`` (which
    also includes drawing images).
    """
    NONE = 0
    XOBJECT_IMAGES = 1      # Images stored as XObjects
    INLINE_IMAGES = 2       # Inline images in content streams
    DRAWING_IMAGES = 4      # Images created by drawing operations
    ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES  # All image types
    IMAGES = ALL            # Alias of ALL, matching ObjectDeletionFlag.IMAGES

class ObjectDeletionFlag(IntFlag):
    """Bit flags naming the categories of objects a deletion can target.

    Each single-category member occupies its own bit, so members can be
    combined with ``|``.
    """
    NONE = 0
    TEXT = 1 << 0             # Text objects
    LINKS = 1 << 1            # Link annotations
    ATTACHMENTS = 1 << 2      # File attachments
    OBJECTS_3D = 1 << 3       # 3D objects
    ALL_ANNOTATIONS = 1 << 4  # All annotation types
    XOBJECT_IMAGES = 1 << 5   # Form XObject images
    INLINE_IMAGES = 1 << 6    # Inline images
    DRAWING_IMAGES = 1 << 7   # Drawing-based images
    # Union of the three image categories above.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES

Error Handling

Comprehensive exception hierarchy for proper error handling in PDF operations.

# Parent classes below follow the "Exception Hierarchy" listing later in
# this document, so both listings describe the same inheritance tree.

class PyPdfError(Exception):
    """Base exception for pypdf-specific errors.

    DependencyError and DeprecationError derive directly from Exception
    and are therefore NOT caught by ``except PyPdfError``.
    """

class DeprecationError(Exception):
    """Raised when deprecated functionality is used."""

class DependencyError(Exception):
    """Raised when required dependencies are missing."""

class PdfReadError(PyPdfError):
    """Raised when PDF reading fails."""

class PdfStreamError(PdfReadError):
    """Raised when PDF stream processing fails."""

class FileNotDecryptedError(PdfReadError):
    """Raised when trying to access encrypted content without decryption."""

class WrongPasswordError(FileNotDecryptedError):
    """Raised when incorrect password is provided for encrypted PDF.

    Subclass of FileNotDecryptedError: an ``except WrongPasswordError``
    clause must come before ``except FileNotDecryptedError`` to be reached.
    """

class EmptyFileError(PdfReadError):
    """Raised when PDF file is empty or invalid."""

class ParseError(PyPdfError):
    """Raised when PDF parsing fails."""

class PageSizeNotDefinedError(PyPdfError):
    """Raised when page size cannot be determined."""

class EmptyImageDataError(PyPdfError):
    """Raised when image data is empty or invalid."""

class LimitReachedError(PyPdfError):
    """Raised when processing limits are exceeded."""

class PdfReadWarning(UserWarning):
    """Warning for non-fatal PDF reading issues."""

Usage Examples

Working with Page Ranges

from pypdf import PdfReader, PdfWriter, PageRange

reader = PdfReader("document.pdf")
writer = PdfWriter()

# Create page ranges in different ways.
# String expressions use zero-based slice syntax ("start:stop:step");
# 1-based dash or comma forms such as "1-5" or "2,4,6" are not parsed.
range1 = PageRange("0:5")      # Pages 1 through 5 (indices 0-4)
range2 = PageRange("1:6:2")    # Pages 2, 4, and 6 (indices 1, 3, 5)
range3 = PageRange(slice(0, 10, 2))  # Every other page from index 0 to 9

# Use page range to select pages.
# indices() returns a (start, stop, step) tuple, so it is expanded into a
# range; testing `page_num in <tuple>` would only compare against those
# three numbers.
page_count = len(reader.pages)
for page_num in range(*range1.indices(page_count)):
    writer.add_page(reader.pages[page_num])

with open("selected_pages.pdf", "wb") as output:
    writer.write(output)

Page Range Validation and Conversion

from pypdf import PageRange

# Candidate specifications; each is screened with PageRange.valid() before
# a PageRange is built from it.
# NOTE(review): pypdf documents slice-style strings such as "0:10"; the
# dash and comma strings here are expected to take the "Invalid" path -
# confirm against the installed version.
inputs = ["1-10", "2,4,6", "invalid", slice(0, 5)]

for inp in inputs:
    # Guard clause: report rejected inputs and move on.
    if not PageRange.valid(inp):
        print(f"Invalid range: {inp}")
        continue

    pr = PageRange(inp)
    print(f"Valid range: {inp} -> {pr}")
    print(f"  As slice: {pr.to_slice()}")
    print(f"  Indices for 20 pages: {pr.indices(20)}")

Parsing Filename and Page Ranges

from pypdf import parse_filename_page_ranges

# Parse filenames and page specifications.
# The function takes a flat argument list in which each filename may be
# followed by one page-range expression (zero-based slice syntax), and it
# returns a list of (filename, PageRange) pairs - not two parallel lists.
file_specs = [
    "document.pdf", "0:10",   # Pages 1 through 10
    "report.pdf", "1:8",      # Pages 2 through 8
    "book.pdf",               # No page range specified: all pages
    "chapter1.pdf", "4:",     # From page 5 to end
]

for filename, page_range in parse_filename_page_ranges(file_specs):
    print(f"File: {filename}")
    # A file given without a range is paired with an all-pages range, so
    # there is always a PageRange to print here.
    print(f"  Pages: {page_range}")

Using Standard Paper Sizes

from pypdf import PdfWriter, PageObject, PaperSize

writer = PdfWriter()

# Create pages with standard sizes. Each entry is (label, (width, height))
# with dimensions in points.
sizes_to_create = [
    ("Letter", (612, 792)),     # US Letter
    ("A4", PaperSize.A4),       # ISO A4
    ("A3", PaperSize.A3),       # ISO A3
    ("Legal", (612, 1008)),     # US Legal
]

for name, (width, height) in sizes_to_create:
    # create_blank_page's first positional parameter is the owning PDF, so
    # the dimensions must be passed by keyword.
    page = PageObject.create_blank_page(width=width, height=height)
    writer.add_page(page)
    print(f"Created {name} page: {width} x {height} points")

with open("standard_sizes.pdf", "wb") as output:
    writer.write(output)

Error Handling Best Practices

from pypdf import PdfReader, PdfWriter
from pypdf.errors import (
    PdfReadError, FileNotDecryptedError, WrongPasswordError,
    EmptyFileError, ParseError
)

def safe_pdf_operation(pdf_path: str, password: "str | None" = None) -> bool:
    """Copy a PDF page by page, reporting failures instead of raising.

    Every page whose text can be extracted is copied to a new document; a
    page that raises ParseError is replaced by a blank US Letter page. The
    result is written next to the input as "<stem>_processed.pdf".

    Args:
        pdf_path: Path of the PDF to read.
        password: Password for an encrypted PDF, or None.

    Returns:
        True when the processed copy was written, False when any error was
        caught (a message is printed in that case).
    """
    # Local imports: the import header of this example does not bring
    # these names into scope.
    from pathlib import Path

    from pypdf import PageObject

    try:
        reader = PdfReader(pdf_path, password=password)

        if reader.is_encrypted and not password:
            raise FileNotDecryptedError("PDF is encrypted but no password provided")

        writer = PdfWriter()

        # Process each page safely
        for page_num, page in enumerate(reader.pages):
            try:
                # Attempt to extract text to verify page is readable
                text = page.extract_text()
                writer.add_page(page)
                print(f"Processed page {page_num + 1}: {len(text)} characters")

            except ParseError as e:
                print(f"Warning: Could not process page {page_num + 1}: {e}")
                # Substitute a blank page so page numbering is preserved.
                # Dimensions go by keyword: the first positional parameter
                # of create_blank_page is the owning PDF.
                blank_page = PageObject.create_blank_page(width=612, height=792)
                writer.add_page(blank_page)

        # Build the output name from the stem rather than str.replace():
        # replace() rewrites every ".pdf" occurrence in the path and leaves
        # names such as "scan.PDF" unchanged, which would overwrite the input.
        source = Path(pdf_path)
        output_path = source.with_name(f"{source.stem}_processed.pdf")
        with open(output_path, "wb") as output:
            writer.write(output)

        print(f"Successfully processed {pdf_path}")
        return True

    # WrongPasswordError is listed as a subclass of FileNotDecryptedError
    # in the exception hierarchy, so it has to be handled first.
    except WrongPasswordError:
        print(f"Error: Incorrect password for {pdf_path}")
        return False

    except FileNotDecryptedError:
        print(f"Error: {pdf_path} is encrypted. Please provide password.")
        return False

    except EmptyFileError:
        print(f"Error: {pdf_path} is empty or corrupted")
        return False

    except PdfReadError as e:
        print(f"Error reading {pdf_path}: {e}")
        return False

    except Exception as e:
        # Top-level boundary of the example: report and signal failure.
        print(f"Unexpected error processing {pdf_path}: {e}")
        return False

# Try without a password first; only when that attempt reports failure is
# the call repeated with a password (the second call is short-circuited
# away on success).
success = safe_pdf_operation("document.pdf") or safe_pdf_operation(
    "document.pdf", password="secret"
)

Working with Image Types

from pypdf import PdfReader, ImageType

reader = PdfReader("document_with_images.pdf")


def _report_page_images(page_num, page):
    """Print the image summary for one page; errors are printed, not raised."""
    print(f"Page {page_num + 1}:")

    try:
        # Reading page.images happens inside the try so that a failure on
        # one page is reported and the remaining pages are still visited.
        all_images = page.images
        print(f"  Total images: {len(all_images)}")

        # The ImageType members are listed for reference only; they are not
        # passed to any extraction call in this example.
        print(f"  Image types available: {list(ImageType)}")

    except Exception as e:
        print(f"  Error accessing images: {e}")


for page_num, page in enumerate(reader.pages):
    _report_page_images(page_num, page)

Utility Functions for Common Operations

from pypdf import PdfReader, PdfWriter, PageRange, PaperSize
from pypdf.errors import PyPdfError

def extract_page_range(input_pdf: str, output_pdf: str, page_range_str: str):
    """Extract specific pages to new PDF.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the selected pages are written to.
        page_range_str: Page-range expression handed to PageRange. pypdf
            documents zero-based slice syntax here, e.g. "0:5" for the
            first five pages or "::2" for every other page.

    Errors are caught and printed rather than raised; nothing is returned.
    """
    try:
        reader = PdfReader(input_pdf)
        writer = PdfWriter()

        # Parse page range and resolve it against the actual page count.
        # The count is taken once, and the resolved (start, stop, step) is
        # already bounded by it, so no per-index bounds check is needed
        # (indices() is documented as returning slice indices for a given
        # length).
        page_range = PageRange(page_range_str)
        page_count = len(reader.pages)

        # Extract pages
        for i in range(*page_range.indices(page_count)):
            writer.add_page(reader.pages[i])

        with open(output_pdf, "wb") as output:
            writer.write(output)

        print(f"Extracted pages {page_range_str} to {output_pdf}")

    except PyPdfError as e:
        print(f"PDF Error: {e}")
    except Exception as e:
        print(f"Error: {e}")

def create_blank_document(output_pdf: str, page_count: int = 1, size: str = "A4"):
    """Create a blank PDF document.

    Args:
        output_pdf: Path of the PDF to write.
        page_count: Number of blank pages to create.
        size: Name of a PaperSize attribute (e.g. "A4", "A3"). Unknown
            names fall back to A4 with a printed notice.
    """
    # Local import: the import header of this example does not include
    # PageObject.
    from pypdf import PageObject

    writer = PdfWriter()

    # Get paper size. Only accept attributes that really are a
    # (width, height) pair: hasattr() alone would also accept names such
    # as "__doc__" and then fail while unpacking.
    dimensions = getattr(PaperSize, size, None)
    if isinstance(dimensions, tuple) and len(dimensions) == 2:
        width, height = dimensions
        used_size = size
    else:
        # Default to A4 if size not found
        width, height = PaperSize.A4
        used_size = "A4"
        print(f"Unknown size '{size}', using A4")

    # Create blank pages. Dimensions go by keyword: the first positional
    # parameter of create_blank_page is the owning PDF.
    for _ in range(page_count):
        page = PageObject.create_blank_page(width=width, height=height)
        writer.add_page(page)

    with open(output_pdf, "wb") as output:
        writer.write(output)

    # Report the size that was actually used, not the requested name.
    print(f"Created {page_count} blank {used_size} pages in {output_pdf}")

def get_pdf_info(pdf_path: str) -> dict:
    """Get comprehensive PDF information.

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        On success, a dict with the keys "filename", "page_count",
        "is_encrypted", "pdf_version", "metadata" and "page_sizes".
        "page_sizes" holds one dict per page with either its dimensions or
        an "error" entry. If the file cannot be read at all, the result is
        {"filename": pdf_path, "error": <message>} instead.
    """
    try:
        reader = PdfReader(pdf_path)

        info = {
            "filename": pdf_path,
            "page_count": len(reader.pages),
            "is_encrypted": reader.is_encrypted,
            "pdf_version": reader.pdf_header,
            "metadata": {},
            "page_sizes": []
        }

        # Get metadata
        if reader.metadata:
            info["metadata"] = {
                "title": reader.metadata.title,
                "author": reader.metadata.author,
                "subject": reader.metadata.subject,
                "creator": reader.metadata.creator,
                "producer": reader.metadata.producer
            }

        # Get page sizes
        for i, page in enumerate(reader.pages):
            try:
                width = float(page.mediabox.width)
                height = float(page.mediabox.height)
                info["page_sizes"].append({
                    "page": i + 1,
                    "width": width,
                    "height": height,
                    "size_points": f"{width} x {height}"
                })
            except Exception:
                # Best effort per page. `except Exception` instead of a
                # bare `except:` so KeyboardInterrupt and SystemExit still
                # propagate.
                info["page_sizes"].append({
                    "page": i + 1,
                    "error": "Could not determine size"
                })

        return info

    except Exception as e:
        # Boundary of the helper: any failure is folded into the result.
        return {
            "filename": pdf_path,
            "error": str(e)
        }

# Use utility functions
# Page ranges use zero-based slice syntax: "0:5" selects pages 1 to 5.
# A dash form such as "1-5" is rejected by PageRange, which would make
# extract_page_range print an error instead of writing the file.
extract_page_range("document.pdf", "pages_1_to_5.pdf", "0:5")
create_blank_document("blank.pdf", 10, "A4")
info = get_pdf_info("document.pdf")
print(f"PDF Info: {info}")

Error Classes and Exception Handling

Exception Hierarchy

pypdf provides a comprehensive exception hierarchy for different types of PDF processing errors.

# Root of the pypdf-specific hierarchy
class PyPdfError(Exception):
    """Common base class for the exceptions pypdf raises."""

# Direct children of PyPdfError
class PdfReadError(PyPdfError):
    """Signals a problem while reading a PDF file."""

class ParseError(PyPdfError):
    """Signals a problem while parsing a PDF file."""

class PageSizeNotDefinedError(PyPdfError):
    """Signals that a PDF document's page size is not defined."""

class EmptyImageDataError(PyPdfError):
    """Signals an attempt to process an image that carries no data."""

class LimitReachedError(PyPdfError):
    """Signals that a limit has been reached."""

# Read errors (children of PdfReadError)
class PdfStreamError(PdfReadError):
    """Signals a problem while reading a data stream inside a PDF file."""

class EmptyFileError(PdfReadError):
    """Signals that a PDF file is empty or has no content."""

class FileNotDecryptedError(PdfReadError):
    """Signals that an encrypted PDF has not been decrypted successfully."""

# Decryption errors (child of FileNotDecryptedError)
class WrongPasswordError(FileNotDecryptedError):
    """Signals that the wrong password was used to decrypt an encrypted PDF."""

# Outside the PyPdfError tree: these derive from Exception directly
class DependencyError(Exception):
    """Signals that a required dependency is not available."""

class DeprecationError(Exception):
    """Signals use of a deprecated feature."""

# Warnings
class PdfReadWarning(UserWarning):
    """Warns of a potential problem reading a PDF file that can still be read."""

User Access Permission Constants

from pypdf.constants import UserAccessPermissions

class UserAccessPermissions(IntFlag):
    """Permission bits that encryption can grant to a PDF user."""

    # Each permission occupies a single bit; the shift amount is its
    # zero-based bit position. Positions 0-1 and 6-7 have no named member
    # in this listing.
    PRINT = 1 << 2                        # Allow printing
    MODIFY = 1 << 3                       # Allow document modification
    EXTRACT = 1 << 4                      # Allow text/graphics extraction
    ADD_OR_MODIFY = 1 << 5                # Allow annotations/form fields
    FILL_FORM_FIELDS = 1 << 8             # Allow form field filling
    EXTRACT_TEXT_AND_GRAPHICS = 1 << 9    # Allow accessibility extraction
    ASSEMBLE_DOC = 1 << 10                # Allow document assembly
    PRINT_TO_REPRESENTATION = 1 << 11     # Allow high-quality printing

    @classmethod
    def all(cls) -> "UserAccessPermissions":
        """Return a value with all permissions enabled."""

    def to_dict(self) -> dict[str, bool]:
        """Return the permissions as a dictionary of name to enabled flag."""

    @classmethod
    def from_dict(cls, value: dict[str, bool]) -> "UserAccessPermissions":
        """Build a permissions value from its dictionary form."""

Stream and Parsing Constants

# Stream processing constants
# Message text for a stream that ends before the expected data was read
# (presumably used as the message of a stream error - confirm at the
# raise site).
STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"

# Core PDF structure constants
class Core:
    # String constants holding slash-prefixed names; presumably the PDF
    # name tokens used for document-structure keys and types - confirm
    # against the PDF specification before adding entries.
    OUTLINES = "/Outlines"
    THREADS = "/Threads"
    PAGE = "/Page"
    PAGES = "/Pages"
    CATALOG = "/Catalog"

class TrailerKeys:
    # String constants holding slash-prefixed names; going by the class
    # name these are the keys looked up in a PDF trailer dictionary
    # (NOTE(review): confirm against the code that reads the trailer).
    SIZE = "/Size"
    PREV = "/Prev"
    ROOT = "/Root"
    ENCRYPT = "/Encrypt"
    INFO = "/Info"
    ID = "/ID"

Install with Tessl CLI