tessl/pypi-pypdfium2

Python bindings to PDFium for comprehensive PDF manipulation, rendering, and processing

—

Pending

Overview

Eval results

Files

Page Manipulation

Name: tessl/pypi-pypdfium2
Author: tessl

Page-level operations including rendering, rotation, dimension management, bounding box manipulation, and content processing. The PdfPage class provides comprehensive access to individual PDF pages.

Capabilities

Page Dimensions

Access and manage page dimensions in PDF coordinate units (1/72 inch).

def get_width(self) -> float:
    """Get page width in PDF units."""

def get_height(self) -> float:
    """Get page height in PDF units."""

def get_size(self) -> tuple[float, float]:
    """Get page dimensions as (width, height) tuple."""

Example:

import pypdfium2 as pdfium

pdf = pdfium.PdfDocument("document.pdf")
page = pdf[0]

width = page.get_width()
height = page.get_height()
w, h = page.get_size()

print(f"Page size: {width} x {height} PDF units")
print(f"Page size: {width/72:.1f} x {height/72:.1f} inches")

Page Rotation

Manage page rotation in 90-degree increments.

def get_rotation(self) -> int:
    """
    Get current page rotation in degrees.
    
    Returns:
    int: Clockwise rotation (0, 90, 180, or 270 degrees)
    """

def set_rotation(self, rotation: int):
    """
    Set page rotation.
    
    Parameters:
    - rotation: int, clockwise rotation in degrees (0, 90, 180, 270)
    """

Example:

page = pdf[0]

# Check current rotation
current_rotation = page.get_rotation()
print(f"Current rotation: {current_rotation} degrees")

# Rotate page 90 degrees clockwise
page.set_rotation(90)

# Rotate back to normal
page.set_rotation(0)

Bounding Boxes

Access and modify various page bounding boxes that define page geometry and layout.

def get_mediabox(self, fallback_ok=True) -> tuple | None:
    """
    Get media box coordinates.
    
    Parameters:
    - fallback_ok: bool, if the media box is not defined, return the default US Letter box (0, 0, 612, 792) instead of None
    
    Returns:  
    tuple: (left, bottom, right, top) coordinates or None
    """

def set_mediabox(self, left: float, bottom: float, right: float, top: float):
    """Set media box coordinates."""

def get_cropbox(self, fallback_ok=True) -> tuple | None:
    """Get crop box coordinates (visible area)."""

def set_cropbox(self, left: float, bottom: float, right: float, top: float):
    """Set crop box coordinates."""

def get_bleedbox(self, fallback_ok=True) -> tuple | None:
    """Get bleed box coordinates (printing bleed area)."""

def set_bleedbox(self, left: float, bottom: float, right: float, top: float):
    """Set bleed box coordinates."""

def get_trimbox(self, fallback_ok=True) -> tuple | None:
    """Get trim box coordinates (final trimmed page)."""

def set_trimbox(self, left: float, bottom: float, right: float, top: float):
    """Set trim box coordinates."""

def get_artbox(self, fallback_ok=True) -> tuple | None:
    """Get art box coordinates (meaningful content area)."""

def set_artbox(self, left: float, bottom: float, right: float, top: float):
    """Set art box coordinates."""

Box hierarchy and usage:

page = pdf[0]

# Get media box (full page dimensions)
mediabox = page.get_mediabox()
print(f"Media box: {mediabox}")

# Get crop box (visible area when displayed)
cropbox = page.get_cropbox()
if cropbox:
    print(f"Crop box: {cropbox}")

# Set custom crop area
page.set_cropbox(50, 50, 550, 750)  # Visible area from (50, 50) to (550, 750); a 50-unit inset only on a 600 x 800 page

Page Rendering

Render pages to various image formats with extensive customization options.

def render(self, rotation=0, scale=1, ...) -> PdfBitmap:
    """
    Render page to bitmap.
    
    Parameters:
    - rotation: int, additional rotation in degrees (0, 90, 180, 270)
    - scale: float, scaling factor (1.0 = 72 DPI, 2.0 = 144 DPI)
    - crop: tuple, amounts to cut off from the page borders as (left, bottom, right, top), in PDF units
    - colour: tuple, background color as (R, G, B) or (R, G, B, A)
    - fill_to_size: bool, scale to exact size maintaining aspect ratio
    - optimize_mode: str, rendering optimization mode
    - draw_annots: bool, include annotations in rendering
    - draw_forms: bool, include form fields in rendering
    - grayscale: bool, render in grayscale
    - rev_byteorder: bool, reverse byte order for pixel data
    - prefer_bgrx: bool, prefer BGRX pixel format
    - no_smoothing: bool, disable anti-aliasing
    - force_halftone: bool, force halftone for images
    - limit_image_cache: bool, limit image cache usage
    - force_text_matrix: bool, force text matrix transformation
    
    Returns:
    PdfBitmap: Rendered bitmap object
    """

def render_topil(self, **kwargs) -> PIL.Image:
    """
    Render page directly to PIL Image.
    
    Same parameters as render() method.
    
    Returns:
    PIL.Image: PIL Image object
    """

def render_tonumpy(self, **kwargs) -> numpy.ndarray:
    """
    Render page directly to NumPy array.
    
    Same parameters as render() method.
    
    Returns:
    numpy.ndarray: Image data as NumPy array
    """

Rendering examples:

page = pdf[0]

# Basic rendering at 150 DPI
bitmap = page.render(scale=150/72)
pil_image = bitmap.to_pil()
pil_image.save("page_150dpi.png")

# Direct PIL rendering with custom background
pil_image = page.render_topil(
    scale=2.0,
    colour=(255, 255, 255, 255),  # White background
    draw_annots=True
)
pil_image.save("page_with_annotations.png")

# Render rotated page
rotated_bitmap = page.render(rotation=90, scale=1.5)

# Render with 50 units trimmed from every page border.
# The crop values are amounts to cut off per side (left, bottom, right, top),
# not box coordinates, so their sums must stay below the page width/height.
cropped_bitmap = page.render(
    crop=(50, 50, 50, 50),
    scale=2.0
)

# Grayscale rendering
gray_array = page.render_tonumpy(
    grayscale=True,
    scale=1.0
)

Text Processing

Access text content and structure within the page.

def get_textpage(self) -> PdfTextPage:
    """
    Get text page object for text extraction and analysis.
    
    Returns:
    PdfTextPage: Text page object providing text extraction capabilities
    """

Example:

page = pdf[0]
textpage = page.get_textpage()

# Extract all text
all_text = textpage.get_text_range()
print(f"Page text: {all_text}")

# Extract text from specific area
bounded_text = textpage.get_text_bounded(
    left=100, bottom=100, right=500, top=700
)

Page Objects

Access and manipulate individual objects within the page (text, images, graphics).

def count_objects(self) -> int:
    """Get number of page objects."""

def get_object(self, index: int) -> PdfObject:
    """
    Get page object by index.
    
    Parameters:
    - index: int, object index (0-based)
    
    Returns:
    PdfObject: Page object (may be PdfImage, PdfText, etc.)
    """

def insert_object(self, obj: PdfObject):
    """
    Insert page object into page.
    
    Parameters:
    - obj: PdfObject, object to insert
    """

def remove_object(self, obj: PdfObject):
    """
    Remove page object from page.
    
    Parameters:
    - obj: PdfObject, object to remove
    """

def gen_content(self):
    """Generate content stream from page objects."""

Working with page objects:

page = pdf[0]

# Count objects on page
obj_count = page.count_objects()
print(f"Page has {obj_count} objects")

# Iterate through objects
for i in range(obj_count):
    obj = page.get_object(i)
    print(f"Object {i}: type {obj.type}")
    
    # Check if it's an image
    if isinstance(obj, pdfium.PdfImage):
        print(f"  Image size: {obj.get_size()}")
        # Extract image
        obj.extract(f"image_{i}.png")

# Create new image object (requires raw image data)
# new_image = pdfium.PdfImage.new(pdf)
# page.insert_object(new_image)

Properties

@property
def raw(self) -> FPDF_PAGE:
    """Raw PDFium page handle for low-level operations."""

@property
def pdf(self) -> PdfDocument:
    """Parent document containing this page."""

@property  
def formenv(self):
    """Form environment reference for interactive elements."""

Common Page Operations

Page Size Conversion

# Convert between units
def pdf_to_inches(pdf_units):
    """Convert a length in PDF units (1/72 inch) to inches."""
    units_per_inch = 72
    return pdf_units / units_per_inch

def inches_to_pdf(inches):
    """Convert a length in inches to PDF units (72 units per inch)."""
    units_per_inch = 72
    return inches * units_per_inch

def pdf_to_mm(pdf_units):
    """Convert a length in PDF units (1/72 inch) to millimetres."""
    # Multiply first, then divide, so results match the original expression exactly.
    scaled = pdf_units * 25.4
    return scaled / 72

# Standard page sizes in PDF units (1/72 inch), given as (width, height)
# in portrait orientation
PAGE_SIZES = {
    'letter': (612, 792),  # 8.5 x 11 in
    'a4': (595, 842),  # 210 x 297 mm, rounded to whole units
    'legal': (612, 1008),  # 8.5 x 14 in
    'tabloid': (792, 1224)  # 11 x 17 in
}

Aspect Ratio Calculations

page = pdf[0]
width, height = page.get_size()
aspect_ratio = width / height

print(f"Aspect ratio: {aspect_ratio:.2f}")
if abs(aspect_ratio - 8.5/11) < 0.01:
    print("This is likely US Letter size")
elif abs(aspect_ratio - 210/297) < 0.01:
    print("This is likely A4 size")

Install with Tessl CLI