tessl/pypi-pypdfium2

Python bindings to PDFium for comprehensive PDF manipulation, rendering, and processing

—

Pending

Overview

Eval results

Files

Page Objects and Graphics

Name: tessl/pypi-pypdfium2
Author: tessl

Manipulation of PDF page objects including images, text, and vector graphics. Supports object transformation, insertion, removal, and detailed analysis of page content structure.

Capabilities

Page Object Base Class

The PdfObject class serves as the base for all page objects including text, images, and vector graphics.

class PdfObject:
    """Base class shared by all page objects (text, images, vector graphics)."""

    def get_pos(self) -> tuple:
        """
        Get the object's bounding rectangle.

        Returns:
        tuple: (left, bottom, right, top) bounding rectangle in PDF units
        """

    def get_matrix(self) -> PdfMatrix:
        """
        Get the object's current transformation matrix.

        Returns:
        PdfMatrix: Current transformation matrix
        """

    def set_matrix(self, matrix: PdfMatrix) -> None:
        """
        Set the object's transformation matrix.

        Parameters:
        - matrix: PdfMatrix, new transformation matrix for the object
        """

    def transform(self, matrix: PdfMatrix) -> None:
        """
        Apply a transformation matrix to the object.

        Parameters:
        - matrix: PdfMatrix, transformation to apply
        """

Basic object manipulation:

import pypdfium2 as pdfium

pdf = pdfium.PdfDocument("document.pdf")
page = pdf[0]

# Walk the page objects by index and report the basics for each one
object_count = page.count_objects()
for index in range(object_count):
    page_object = page.get_object(index)

    # Query geometry up front, then print the summary
    position = page_object.get_pos()
    transform = page_object.get_matrix()

    print(f"Object {index}:")
    print(f"  Type: {page_object.type}")
    print(f"  Bounds: {position}")
    print(f"  Matrix: {transform.get()}")
    print(f"  Level: {page_object.level}")

Page Object Properties

Access object metadata and relationships.

@property
def raw(self) -> FPDF_PAGEOBJECT:
    """Raw PDFium page object handle (FPDF_PAGEOBJECT) behind this object."""

@property
def type(self) -> int:
    """Object type as an integer FPDF_PAGEOBJ_* constant (text, image, path, etc.)."""

@property
def page(self) -> PdfPage:
    """Parent page containing this object."""

@property
def pdf(self) -> PdfDocument:
    """Parent document containing this object."""

@property
def level(self) -> int:
    """Nesting level of the object."""

Image Objects

The PdfImage class provides specialized handling for image objects within PDF pages.

class PdfImage(PdfObject):
    """Page object specialization for image objects within PDF pages."""

    @classmethod
    def new(cls, pdf: PdfDocument) -> PdfImage:
        """
        Create a new, empty image object.

        Parameters:
        - pdf: PdfDocument, parent document

        Returns:
        PdfImage: New image object (not yet inserted into a page)
        """

    def get_metadata(self) -> ImageInfo:
        """
        Get image metadata information.

        Returns:
        ImageInfo: Named tuple with the fields format, mode, metadata,
        all_filters and complex_filters
        """

    def get_size(self) -> tuple[int, int]:
        """
        Get image dimensions.

        Returns:
        tuple: (width, height) in pixels
        """

    def get_filters(self, skip_simple: bool = False) -> list:
        """
        Get the list of filters applied to the image data.

        Parameters:
        - skip_simple: bool, skip simple/common filters

        Returns:
        list: Filter names applied to the image
        """

Image metadata structure:

class ImageInfo(NamedTuple):
    """Image metadata record returned by PdfImage.get_metadata()."""

    format: str            # Image format (JPEG, PNG, etc.)
    mode: str              # Color mode (RGB, RGBA, L, etc.)
    metadata: dict         # Additional metadata
    all_filters: list      # All filters applied to the image data
    complex_filters: list  # Complex/uncommon filters only

Image Data Access

Extract and manipulate image data from PDF image objects.

def get_bitmap(self, render: bool = False) -> PdfBitmap:
    """
    Get the image as a bitmap.

    Parameters:
    - render: bool, render the image through PDFium (may change appearance)

    Returns:
    PdfBitmap: Image data as bitmap object
    """

def get_data(self, decode_simple: bool = False) -> bytes:
    """
    Get the raw image data.

    Parameters:
    - decode_simple: bool, decode simple filters (like FlateDecode)

    Returns:
    bytes: Raw image data (may still be compressed)
    """

def extract(self, dest: str, *args, **kwargs):
    """
    Extract the image to a file.

    Parameters:
    - dest: str, output file path
    - *args, **kwargs: additional parameters for format-specific options

    Automatically detects the image format and saves appropriately.
    """

Image processing examples:

pdf = pdfium.PdfDocument("document.pdf")
page = pdf[0]

# Visit every object on the page and handle only the image objects
for index in range(page.count_objects()):
    candidate = page.get_object(index)
    if not isinstance(candidate, pdfium.PdfImage):
        continue

    print(f"\nProcessing image {index}:")

    # Report the image metadata
    info = candidate.get_metadata()
    print(f"  Format: {info.format}")
    print(f"  Mode: {info.mode}")
    print(f"  Size: {candidate.get_size()}")
    print(f"  Filters: {info.all_filters}")

    # Direct extraction first; on failure fall back to a bitmap conversion
    output_path = f"extracted_image_{index}.png"
    try:
        candidate.extract(output_path)
        print(f"  Extracted to: {output_path}")
    except Exception as extract_error:
        print(f"  Extraction failed: {extract_error}")

        try:
            candidate.get_bitmap().to_pil().save(output_path)
            print(f"  Converted and saved to: {output_path}")
        except Exception as bitmap_error:
            print(f"  Bitmap conversion failed: {bitmap_error}")

    # Report where the image sits on the page and how it is transformed
    position = candidate.get_pos()
    transform = candidate.get_matrix()
    print(f"  Position: {position}")
    print(f"  Transform: {transform.get()}")

Image Modification

Modify existing images or create new image objects.

def load_jpeg(self, source, pages=None, inline: bool = False, autoclose: bool = True):
    """
    Load JPEG data into the image object.

    Parameters:
    - source: file path, bytes, or file-like object containing JPEG data
    - pages: list of pages to apply to (None = current page)
    - inline: bool, embed as inline image
    - autoclose: bool, close source if it is a file-like object
    """

def set_bitmap(self, bitmap: PdfBitmap, pages=None):
    """
    Set the image data from a bitmap.

    Parameters:
    - bitmap: PdfBitmap, source bitmap data
    - pages: list of pages to apply to (None = current page)
    """

Creating and modifying images:

pdf = pdfium.PdfDocument.new()
page = pdf.new_page(612, 792)  # US Letter

# Create new image object
img_obj = pdfium.PdfImage.new(pdf)

# Load JPEG data
img_obj.load_jpeg("photo.jpg")

# Position the image on page.
# Order matters: every PdfMatrix call appends to the transformations already
# recorded, so scale first and translate afterwards. Translating first would
# multiply the offset by the scale factors (100*200, 400*150) and push the
# image far off the page.
transform = pdfium.PdfMatrix()
transform = transform.scale(200, 150)      # Size
transform = transform.translate(100, 400)  # Position

img_obj.set_matrix(transform)

# Insert into page (same method name as used by the XObject examples)
page.insert_obj(img_obj)

# Generate content stream
page.gen_content()

# Save document
pdf.save("document_with_image.pdf")

Object Transformation

Apply geometric transformations to page objects including rotation, scaling, and translation.

def transform_objects_example(page):
    """Scale, rotate and shift every image object found on ``page``.

    The extra transformation is appended to each image's existing matrix;
    the matrix before and after is printed. Non-image objects are skipped.
    """
    for index in range(page.count_objects()):
        candidate = page.get_object(index)
        if not isinstance(candidate, pdfium.PdfImage):
            continue

        # Show the matrix the image currently carries
        existing = candidate.get_matrix()
        print(f"Current matrix: {existing.get()}")

        # 150% scale, then 15 degree rotation, then move to the new position
        extra = pdfium.PdfMatrix().scale(1.5, 1.5).rotate(15).translate(50, 100)

        # Append the extra transformation to the existing one and apply it
        result = existing.multiply(extra)
        candidate.set_matrix(result)

        print(f"New matrix: {result.get()}")

# Usage
document = pdfium.PdfDocument("document.pdf")
first_page = document[0]
transform_objects_example(first_page)

# The content stream must be regenerated for the new matrices to be saved
first_page.gen_content()
document.save("transformed_document.pdf")

Object Analysis

Analyze page objects for content extraction and document understanding.

def analyze_page_objects(page):
    """Summarize the objects on a page.

    Parameters:
    - page: page whose objects are enumerated via count_objects()/get_object()

    Returns:
    dict with the keys:
    - 'total_objects': number of objects on the page
    - 'images': list of dicts (index, size, bounds, metadata) per image
    - 'text_objects', 'path_objects', 'other_objects': per-category counts
    - 'coverage_area': sum of the objects' bounding-box areas
    - 'coverage_percentage': coverage_area relative to the page area

    Bounding boxes are summed without merging overlaps, so overlapping
    objects are counted more than once and the percentage can exceed 100.
    A page with zero area reports a coverage percentage of 0.0 instead of
    raising ZeroDivisionError.
    """
    # Query the count once so the total and the loop bound cannot disagree
    object_count = page.count_objects()

    analysis = {
        'total_objects': object_count,
        'images': [],
        'text_objects': 0,
        'path_objects': 0,
        'other_objects': 0,
        'coverage_area': 0,
    }

    page_width, page_height = page.get_size()
    page_area = page_width * page_height

    for i in range(object_count):
        obj = page.get_object(i)
        bounds = obj.get_pos()

        # Calculate object area
        if bounds:
            left, bottom, right, top = bounds
            analysis['coverage_area'] += (right - left) * (top - bottom)

        # Categorize objects
        if isinstance(obj, pdfium.PdfImage):
            analysis['images'].append({
                'index': i,
                'size': obj.get_size(),
                'bounds': bounds,
                'metadata': obj.get_metadata()._asdict(),
            })
        elif obj.type == pdfium.raw.FPDF_PAGEOBJ_TEXT:
            analysis['text_objects'] += 1
        elif obj.type == pdfium.raw.FPDF_PAGEOBJ_PATH:
            analysis['path_objects'] += 1
        else:
            analysis['other_objects'] += 1

    # Calculate coverage percentage; a degenerate page has nothing to cover
    if page_area:
        analysis['coverage_percentage'] = (analysis['coverage_area'] / page_area) * 100
    else:
        analysis['coverage_percentage'] = 0.0

    return analysis

# Usage
pdf = pdfium.PdfDocument("document.pdf")

# Print a per-page report, numbering pages from 1
for page_number, current_page in enumerate(pdf, start=1):
    print(f"\n--- Page {page_number} Object Analysis ---")
    report = analyze_page_objects(current_page)

    print(f"Total objects: {report['total_objects']}")
    print(f"Images: {len(report['images'])}")
    print(f"Text objects: {report['text_objects']}")
    print(f"Path objects: {report['path_objects']}")
    print(f"Other objects: {report['other_objects']}")
    print(f"Coverage: {report['coverage_percentage']:.1f}%")

    # One short entry per image found on the page
    for entry in report['images']:
        print(f"  Image {entry['index']}: {entry['size']} pixels")
        print(f"    Format: {entry['metadata']['format']}")
        print(f"    Bounds: {entry['bounds']}")

Form XObjects

Handle PDF Form XObjects for reusable content and complex graphics. Form XObjects are reusable page content that can be embedded multiple times within documents.

class PdfXObject:
    """
    XObject helper class for managing reusable PDF content.

    Form XObjects are self-contained graphic objects that can be referenced
    multiple times within a PDF document. They're useful for templates,
    logos, headers, footers, and other repeated content.

    Attributes:
    - raw: FPDF_XOBJECT, underlying PDFium XObject handle
    - pdf: PdfDocument, reference to the document this XObject belongs to
    """

    def __init__(self, raw, pdf):
        """
        Initialize the XObject wrapper.

        Parameters:
        - raw: FPDF_XOBJECT, PDFium XObject handle
        - pdf: PdfDocument, parent document

        Note: XObjects are typically created via PdfDocument.page_as_xobject()
        rather than direct instantiation.
        """

    def as_pageobject(self) -> PdfObject:
        """
        Convert the Form XObject to an independent page object.

        Returns:
        PdfObject: Page object representation of the XObject content

        Each call creates a separate page object that can be inserted into
        a page. Multiple page objects can share the same XObject resources.
        Page objects remain valid after the XObject is closed.
        """

    def close(self) -> None:
        """Close and release XObject resources."""

Creating and using XObjects:

import pypdfium2 as pdfium

# Load source document
source_pdf = pdfium.PdfDocument("source_document.pdf")
target_pdf = pdfium.PdfDocument.new()

# Convert a page to XObject for reuse
page_xobject = source_pdf.page_as_xobject(0, target_pdf)

# Create target page
target_page = target_pdf.new_page(612, 792)

# Create multiple page objects from the same XObject
header_obj = page_xobject.as_pageobject()
footer_obj = page_xobject.as_pageobject()

# Matrix calls append to what is already recorded, so scale first and move
# afterwards. With translate first, the offset itself would be scaled and
# the objects would not land at the positions given below.

# Position header at top of page
header_matrix = pdfium.PdfMatrix()
header_matrix = header_matrix.scale(0.5, 0.5)  # Scale down
header_matrix = header_matrix.translate(50, 700)
header_obj.set_matrix(header_matrix)

# Position footer at bottom
footer_matrix = pdfium.PdfMatrix()
footer_matrix = footer_matrix.scale(0.3, 0.3)  # Scale down more
footer_matrix = footer_matrix.translate(50, 50)
footer_obj.set_matrix(footer_matrix)

# Insert both objects into page
target_page.insert_obj(header_obj)
target_page.insert_obj(footer_obj)

# Generate content and save
target_page.gen_content()
target_pdf.save("document_with_reused_content.pdf")

# XObject can be closed after page objects are created
page_xobject.close()

XObject reuse patterns:

def create_template_document():
    """Create a five-page document that carries the same logo on every page.

    The first page of "company_logo.pdf" is converted to a Form XObject
    once, and each new page receives its own page object created from it.

    Returns:
    PdfDocument: the new document; the caller is responsible for saving
    and closing it.
    """
    # Source document with logo/header content
    logo_pdf = pdfium.PdfDocument("company_logo.pdf")
    main_pdf = pdfium.PdfDocument.new()

    # Convert logo page to reusable XObject
    logo_xobject = logo_pdf.page_as_xobject(0, main_pdf)

    # The placement is identical on every page, so build the matrix once.
    # Scale before translating: matrix calls append to what is already
    # recorded, so a translation made first would be shrunk by the scale.
    logo_matrix = pdfium.PdfMatrix()
    logo_matrix = logo_matrix.scale(0.2, 0.2)      # Small size
    logo_matrix = logo_matrix.translate(450, 720)  # Top-right

    # Create multiple pages with logo
    for _ in range(5):
        page = main_pdf.new_page(612, 792)

        # Each page needs its own page object created from the XObject
        logo_obj = logo_xobject.as_pageobject()
        logo_obj.set_matrix(logo_matrix)
        page.insert_obj(logo_obj)

        # Add page-specific content here
        # ... (text, images, etc.)

        page.gen_content()

    # Clean up
    logo_xobject.close()
    logo_pdf.close()

    return main_pdf

# Usage
# Build the document, write it out, then release it
generated = create_template_document()
generated.save("template_document.pdf")
generated.close()

Exception Handling

Handle errors that may occur during image extraction and processing.

class ImageNotExtractableError(Exception):
    """
    Raised when an image cannot be extracted from a PDF.

    This may occur due to:
    - Unsupported image formats
    - Corrupted image data
    - Complex filter combinations
    - Encrypted or protected images

    Callers can catch this error and fall back to rendering the image,
    e.g. via get_bitmap(render=True).
    """

Safe image extraction:

def safe_extract_images(page, output_dir):
    """Extract all images from a page without aborting on single failures.

    Parameters:
    - page: page whose objects are enumerated via count_objects()/get_object()
    - output_dir: directory receiving the image files; created if missing

    Returns:
    tuple: (extracted_count, failed_count)

    Each image is first extracted directly. If that raises
    ImageNotExtractableError, the image is rendered to a bitmap and saved
    as "image_<i>_rendered.png" instead. Failures are printed and counted
    rather than raised.
    """
    import os

    # Without the directory every write would fail and be misreported as an
    # extraction problem, so make sure it exists before touching any image.
    os.makedirs(output_dir, exist_ok=True)

    extracted_count = 0
    failed_count = 0

    for i in range(page.count_objects()):
        obj = page.get_object(i)

        if isinstance(obj, pdfium.PdfImage):
            try:
                # Try direct extraction first
                output_path = os.path.join(output_dir, f"image_{i}.png")
                obj.extract(output_path)
                extracted_count += 1
                print(f"Extracted image {i}")

            except pdfium.ImageNotExtractableError:
                # Try bitmap conversion
                try:
                    bitmap = obj.get_bitmap(render=True)
                    pil_image = bitmap.to_pil()
                    output_path = os.path.join(output_dir, f"image_{i}_rendered.png")
                    pil_image.save(output_path)
                    extracted_count += 1
                    print(f"Rendered and extracted image {i}")

                except Exception as e:
                    failed_count += 1
                    print(f"Failed to extract image {i}: {e}")

            except Exception as e:
                # Deliberately broad: one bad image must not stop the batch
                failed_count += 1
                print(f"Unexpected error extracting image {i}: {e}")

    return extracted_count, failed_count

# Usage
import os
output_dir = "extracted_images"
os.makedirs(output_dir, exist_ok=True)

pdf = pdfium.PdfDocument("document.pdf")
page = pdf[0]

# Report how many images were written and how many could not be saved
success_count, failure_count = safe_extract_images(page, output_dir)
print(f"Successfully extracted: {success_count}")
print(f"Failed extractions: {failure_count}")

Object Type Constants

Common object type constants available through the raw module:

# Available through pypdfium2.raw
# The values are consecutive integers starting at 0, in this order.
(
    FPDF_PAGEOBJ_UNKNOWN,  # 0: Unknown object type
    FPDF_PAGEOBJ_TEXT,     # 1: Text object
    FPDF_PAGEOBJ_PATH,     # 2: Path/vector graphics
    FPDF_PAGEOBJ_IMAGE,    # 3: Image object
    FPDF_PAGEOBJ_SHADING,  # 4: Shading object
    FPDF_PAGEOBJ_FORM,     # 5: Form XObject
) = range(6)

Object type identification:

# The lookup table does not depend on the object, so build it once up front
# instead of re-creating it on every loop iteration.
type_names = {
    pdfium.raw.FPDF_PAGEOBJ_TEXT: "Text",
    pdfium.raw.FPDF_PAGEOBJ_PATH: "Path",
    pdfium.raw.FPDF_PAGEOBJ_IMAGE: "Image",
    pdfium.raw.FPDF_PAGEOBJ_SHADING: "Shading",
    pdfium.raw.FPDF_PAGEOBJ_FORM: "Form",
}

for i in range(page.count_objects()):
    obj = page.get_object(i)

    # Any type missing from the table (including FPDF_PAGEOBJ_UNKNOWN)
    # is reported as "Unknown"
    type_name = type_names.get(obj.type, "Unknown")
    print(f"Object {i}: {type_name} (type {obj.type})")

Install with Tessl CLI