tessl/pypi-pypdfium2

Python bindings to PDFium for comprehensive PDF manipulation, rendering, and processing

—

Pending

Overview

Eval results

Files

Document Management

Name: tessl/pypi-pypdfium2
Author: tessl

Core PDF document operations including loading, creating, saving, metadata handling, and document-level manipulation. The PdfDocument class serves as the primary entry point for all PDF operations.

Capabilities

Document Creation and Loading

Create new PDF documents or load existing ones from various sources including file paths, bytes, and file-like objects.

class PdfDocument:
    def __init__(self, input, password=None, autoclose=False):
        """
        Open a PDF document from one of several input sources.

        Parameters:
        - input: str (file path), bytes, or file-like object (binary buffer)
        - password: str, optional password for encrypted PDFs
        - autoclose: bool, whether a file-like (buffer) input should be closed
          automatically when the document is finalized; only relevant for
          buffer input
        """

    @classmethod
    def new(cls) -> "PdfDocument":
        """Create a new empty PDF document."""

Example usage:

import pypdfium2 as pdfium

# Open a document by its path on disk
pdf = pdfium.PdfDocument("document.pdf")

# Open an encrypted document by supplying its password
pdf = pdfium.PdfDocument("encrypted.pdf", password="secret")

# Open a document from raw bytes held in memory
with open("document.pdf", "rb") as fh:
    raw_data = fh.read()
pdf = pdfium.PdfDocument(raw_data)

# Start a new, empty document
new_pdf = pdfium.PdfDocument.new()

Document Information

Access and modify document metadata, version information, and properties.

def __len__(self) -> int:
    """Return how many pages the document contains."""

def get_version(self) -> int | None:
    """Return the PDF version as an integer (for example, 14 means PDF 1.4)."""

def get_identifier(self, type=...) -> bytes:
    """Return the file identifier of the document."""

def is_tagged(self) -> bool:
    """Tell whether the document is a tagged PDF (tagging serves accessibility)."""

def get_pagemode(self) -> int:
    """Return the page mode, i.e. how the document should be displayed."""

def get_formtype(self) -> int:
    """Return the form type, if the document contains interactive forms."""

Metadata Management

Read and write PDF metadata including title, author, subject, keywords, and creation information.

def get_metadata_value(self, key: str) -> str:
    """
    Look up a single metadata entry.

    Parameters:
    - key: str, the metadata key to read (Title, Author, Subject, Keywords, Creator, Producer, CreationDate, ModDate)

    Returns:
    str: The value stored under the key, or an empty string if not found
    """

def get_metadata_dict(self, skip_empty=False) -> dict:
    """
    Collect every metadata entry into a dictionary.

    Parameters:
    - skip_empty: bool, leave out entries whose value is empty

    Returns:
    dict: Mapping of metadata keys to their values
    """

# Metadata keys that can be queried
METADATA_KEYS = (
    "Title",
    "Author",
    "Subject",
    "Keywords",
    "Creator",
    "Producer",
    "CreationDate",
    "ModDate",
)

Example:

pdf = pdfium.PdfDocument("document.pdf")

# Look up individual metadata entries
title = pdf.get_metadata_value("Title")
author = pdf.get_metadata_value("Author")

# Fetch every metadata entry at once, then report a short summary
metadata = pdf.get_metadata_dict()
shown_title = metadata.get('Title', 'Unknown')
print(f"Title: {shown_title}")
page_count = len(pdf)
print(f"Pages: {page_count}")
pdf_version = pdf.get_version()
print(f"PDF Version: {pdf_version}")

Page Management

Access, create, delete, and manipulate pages within the document.

def __iter__(self) -> Iterator[PdfPage]:
    """Yield each page of the document in turn."""

def __getitem__(self, index: int) -> PdfPage:
    """Return the page at the given 0-based index."""

def __delitem__(self, index: int):
    """Remove the page at the given index."""

def get_page(self, index: int) -> PdfPage:
    """Return the page at the given index (explicit method form of indexing)."""

def new_page(self, width: float, height: float, index: int | None = None) -> PdfPage:
    """
    Create a new page in the document.

    Parameters:
    - width: float, page width in PDF units (1/72 inch)
    - height: float, page height in PDF units (1/72 inch)
    - index: int | None, insertion index; None (the default) appends the page

    Returns:
    PdfPage: New page object
    """

def del_page(self, index: int):
    """Remove the page at the given index."""

def import_pages(self, pdf: PdfDocument, pages=None, index=None):
    """
    Copy pages from another PDF document into this one.

    Parameters:
    - pdf: PdfDocument, the document to take pages from
    - pages: list of int, indices of the pages to import (None = all pages)
    - index: int, position in this document to insert at (None = append)
    """

def get_page_size(self, index: int) -> tuple[float, float]:
    """Return the dimensions of a page as a (width, height) tuple."""

def get_page_label(self, index: int) -> str:
    """Return the label of a page (with custom numbering it may differ from the index)."""

def page_as_xobject(self, index: int, dest_pdf: PdfDocument) -> PdfXObject:
    """Turn a page into a Form XObject that can be embedded in another document."""

Example usage:

pdf = pdfium.PdfDocument("document.pdf")

# Access pages by 0-based index. The last page is addressed explicitly via
# len(), since negative indices are not guaranteed to be accepted.
first_page = pdf[0]
last_page = pdf[len(pdf) - 1]

# Iterate pages, numbering them from 1 for display
for page_number, page in enumerate(pdf, start=1):
    print(f"Page {page_number}: {page.get_size()}")

# Create new page
new_page = pdf.new_page(612, 792)  # US Letter size

# Import pages from another PDF
source_pdf = pdfium.PdfDocument("source.pdf")
pdf.import_pages(source_pdf, pages=[0, 2, 4])  # Import pages 1, 3, 5

# Delete a page (index 5 is the sixth page)
del pdf[5]

File Attachments

Manage embedded file attachments within the PDF document.

def count_attachments(self) -> int:
    """Return how many file attachments the document has."""

def get_attachment(self, index: int) -> PdfAttachment:
    """Return the attachment at the given index."""

def new_attachment(self, name: str) -> PdfAttachment:
    """
    Add a new file attachment to the document.

    Parameters:
    - name: str, filename of the attachment

    Returns:
    PdfAttachment: The newly created attachment object
    """

def del_attachment(self, index: int):
    """Remove the attachment at the given index."""

Document Outline and Bookmarks

Navigate and extract the document's table of contents structure, including nested bookmarks.

def get_toc(self, max_depth=15, parent=None, level=0, seen=None) -> Iterator[PdfOutlineItem]:
    """
    Walk the bookmarks that make up the document's table of contents.

    Parameters:
    - max_depth: int, deepest recursion level to descend into (default: 15)
    - parent: parent bookmark, for internal use (typically None at the root level)
    - level: nesting level, for internal use (typically 0 at the root)
    - seen: set used internally to detect circular references

    Yields:
    PdfOutlineItem: One bookmark information object per entry

    Every bookmark carries its title, page reference and view settings,
    along with hierarchical details such as nesting level and child count.
    """

PdfOutlineItem Class

Bookmark information structure for PDF table of contents entries.

class PdfOutlineItem:
    """
    Namedtuple describing one bookmark of a PDF outline.

    Each instance stands for a single entry in a PDF's table of contents and
    holds both its place in the hierarchy and the page it points to.

    Attributes:
    - level: int, nesting depth (number of parent items)
    - title: str, the bookmark's title string
    - is_closed: bool | None, True when children should be shown collapsed,
                             False when expanded, None when there are no children
    - n_kids: int, absolute number of child items
    - page_index: int | None, zero-based index of the target page (None if no target)
    - view_mode: int, view mode constant that defines how the coordinates are interpreted
    - view_pos: list[float], coordinates of the target position on the page
    """

    level: int
    title: str
    is_closed: bool | None
    n_kids: int
    page_index: int | None
    view_mode: int
    view_pos: list[float]

Example usage:

pdf = pdfium.PdfDocument("document_with_bookmarks.pdf")

# Print the table of contents as an indented tree
for item in pdf.get_toc():
    pad = "  " * item.level  # two spaces per nesting level
    print(f"{pad}{item.title}")

    if item.page_index is not None:
        print(f"{pad}  → Page {item.page_index + 1}")
        print(f"{pad}  → Position: {item.view_pos}")

    if item.n_kids > 0:
        icon = "📁" if item.is_closed else "📂"
        print(f"{pad}  {icon} ({item.n_kids} children)")

# Find the first bookmark mentioning "Chapter 1" that points at a page
for item in pdf.get_toc():
    if "Chapter 1" not in item.title or item.page_index is None:
        continue
    # Load the page the bookmark targets
    target_page = pdf[item.page_index]
    break

Interactive Forms

Initialize the interactive form environment used to handle PDF forms and annotations.

def init_forms(self, config=None):
    """
    Initialize the interactive form environment.

    Parameters:
    - config: optional form configuration

    Sets up the form environment for handling interactive elements,
    annotations, and form fields. Nothing is done if the document has no
    forms or if the form environment has already been initialized.

    Note: if forms should be rendered, call this method directly after
    constructing the document, before any page handles are obtained.
    """

PdfFormEnv Class

Form environment helper class for managing interactive PDF forms.

class PdfFormEnv:
    """
    Helper class wrapping the form environment of an interactive PDF.

    It supplies the form environment context required to render and interact
    with PDF forms. An instance is created automatically when init_forms()
    is called on a document that contains forms.

    Attributes:
    - raw: FPDF_FORMHANDLE, the underlying PDFium form env handle
    - config: FPDF_FORMFILLINFO, the form configuration interface
    - pdf: PdfDocument, the parent document that owns this form env
    """

    def __init__(self, raw, config, pdf):
        """
        Set up the form environment.

        Parameters:
        - raw: FPDF_FORMHANDLE, PDFium form handle
        - config: FPDF_FORMFILLINFO, form configuration
        - pdf: PdfDocument, parent document

        Note: Instances are normally produced by PdfDocument.init_forms();
        direct instantiation is not the typical path.
        """

    def close(self):
        """Close the form environment and clean up its resources."""

Example usage:

pdf = pdfium.PdfDocument("form.pdf")

# Set up the form environment (created only if the document contains forms)
pdf.init_forms()

# formenv is None unless a form environment was initialized
if pdf.formenv:
    print("Form environment is active")
    # The form environment is then used automatically during page rendering
    # to handle interactive form elements

Document Saving

Save PDF documents to files or buffers with version control and optimization options.

def save(self, dest, version=None, flags=...):
    """
    Write the document to a file or buffer.

    Parameters:
    - dest: str (file path) or file-like object that receives the output
    - version: int, optional PDF version to save the document as
    - flags: save options and optimization flags

    The document is written in its current state, so every modification,
    newly added page, and metadata change is included.
    """

Example:

import io

pdf = pdfium.PdfDocument("input.pdf")

# Change the document by adding a page
pdf.new_page(612, 792)

# Write the document to a new file on disk
pdf.save("output.pdf")

# Write the document into an in-memory buffer and take its bytes
buffer = io.BytesIO()
pdf.save(buffer)
pdf_bytes = buffer.getvalue()

Resource Management

Proper cleanup and resource management for PDF documents.

def close(self):
    """Close the document and free the resources it holds."""

def __enter__(self) -> PdfDocument:
    """Enter the context manager."""

def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave the context manager, performing cleanup."""

Always close documents when you are done with them, or use a context manager:

# Manual cleanup
pdf = pdfium.PdfDocument("document.pdf")
# ... work with PDF
pdf.close()

# Context manager (recommended)
with pdfium.PdfDocument("document.pdf") as pdf:
    # ... work with PDF
    pass  # Automatically closed

Properties

@property
def raw(self) -> FPDF_DOCUMENT:
    """The raw PDFium document handle, for low-level operations."""

@property
def formenv(self) -> PdfFormEnv | None:
    """The form environment if one was initialized, otherwise None."""

Advanced Features

Unsupported Feature Handling

Handle notifications about PDF features not supported by the PDFium library.

PdfUnspHandler Class

Unsupported feature handler for managing notifications about PDF features not available in PDFium.

class PdfUnspHandler:
    """
    Helper class that handles unsupported feature notifications.

    It manages the callbacks invoked when PDFium comes across PDF features
    that the current build does not support. This is handy for logging,
    debugging, and telling users about a document's limitations.

    Attributes:
    - handlers: dict[str, callable], named handler functions, each called
                with the unsupported feature code (FPDF_UNSP_*)
    """

    def __init__(self):
        """Create the unsupported feature handler."""

    def setup(self, add_default=True):
        """
        Hook the handler into PDFium and register an exit function.

        Parameters:
        - add_default: bool, when True a default warning callback is added

        Once set up, the handler receives PDFium's notifications about
        unsupported features met while documents are processed.
        """

    def __call__(self, _, type: int):
        """
        React to an unsupported feature notification.

        Parameters:
        - _: unused parameter (PDFium context)
        - type: int, unsupported feature code (FPDF_UNSP_*)

        PDFium invokes this automatically when it finds an unsupported
        feature; every registered handler function is then run with the
        feature code.
        """

Example usage:

import pypdfium2 as pdfium

# Create and setup unsupported feature handler
unsp_handler = pdfium.PdfUnspHandler()

# Add custom handler for unsupported features
def my_handler(feature_code):
    """
    Report an unsupported PDF feature on standard output.

    Parameters:
    - feature_code: int, unsupported feature code (FPDF_UNSP_*)

    The numeric codes follow PDFium's FPDF_UNSP_* constants. Codes 9 and 10
    are not assigned; the annotation features start at 11. Any code missing
    from the table is reported as an unknown feature.
    """
    feature_names = {
        # Document-level features (FPDF_UNSP_DOC_*)
        1: "Document XFA",
        2: "Portable Collection",
        3: "Attachment",
        4: "Security",
        5: "Shared Review",
        6: "Shared Form Acrobat",
        7: "Shared Form Filesystem",
        8: "Shared Form Email",
        # Annotation-level features (FPDF_UNSP_ANNOT_*)
        11: "3D Annotation",
        12: "Movie Annotation",
        13: "Sound Annotation",
        14: "Screen Media",
        15: "Screen Rich Media",
        16: "Attachment Annotation",
        17: "Signature Annotation",
    }
    feature_name = feature_names.get(feature_code, f"Unknown feature {feature_code}")

    print(f"Warning: Unsupported PDF feature detected: {feature_name}")

# Register the custom callback under a name of our choosing
unsp_handler.handlers["custom"] = my_handler

# Attach the handler to PDFium; add_default=True also adds the default warning callback
unsp_handler.setup(add_default=True)

# From here on, unsupported features met while processing PDFs are reported
pdf = pdfium.PdfDocument("document_with_unsupported_features.pdf")
# Each registered handler is called with the unsupported feature code

Install with Tessl CLI