Python bindings to PDFium for comprehensive PDF manipulation, rendering, and processing
—
Manipulation of PDF page objects including images, text, and vector graphics. Supports object transformation, insertion, removal, and detailed analysis of page content structure.
The PdfObject class serves as the base for all page objects including text, images, and vector graphics.
class PdfObject:
def get_pos(self) -> tuple:
"""
Get object position bounds.
Returns:
tuple: (left, bottom, right, top) bounding rectangle in PDF units
"""
def get_matrix(self) -> PdfMatrix:
"""
Get object transformation matrix.
Returns:
PdfMatrix: Current transformation matrix
"""
def set_matrix(self, matrix: PdfMatrix):
"""
Set object transformation matrix.
Parameters:
- matrix: PdfMatrix, new transformation matrix
"""
def transform(self, matrix: PdfMatrix):
"""
Apply transformation matrix to object.
Parameters:
- matrix: PdfMatrix, transformation to apply
"""

Basic object manipulation:
import pypdfium2 as pdfium
pdf = pdfium.PdfDocument("document.pdf")
page = pdf[0]
# Iterate through page objects
for i in range(page.count_objects()):
obj = page.get_object(i)
# Get object information
bounds = obj.get_pos()
matrix = obj.get_matrix()
print(f"Object {i}:")
print(f" Type: {obj.type}")
print(f" Bounds: {bounds}")
print(f" Matrix: {matrix.get()}")
print(f" Level: {obj.level}")

Access object metadata and relationships.
@property
def raw(self) -> FPDF_PAGEOBJECT:
"""Raw PDFium page object handle."""
@property
def type(self) -> int:
"""Object type constant (text, image, path, etc.)."""
@property
def page(self) -> PdfPage:
"""Parent page containing this object."""
@property
def pdf(self) -> PdfDocument:
"""Parent document containing this object."""
@property
def level(self) -> int:
"""Nesting level of the object."""

The PdfImage class provides specialized handling for image objects within PDF pages.
class PdfImage(PdfObject):
@classmethod
def new(cls, pdf: PdfDocument) -> PdfImage:
"""
Create new image object.
Parameters:
- pdf: PdfDocument, parent document
Returns:
PdfImage: New image object (not yet inserted into page)
"""
def get_metadata(self) -> ImageInfo:
"""
Get image metadata information.
Returns:
ImageInfo: Named tuple with image format, mode, and filter information
"""
def get_size(self) -> tuple[int, int]:
"""
Get image dimensions.
Returns:
tuple: (width, height) in pixels
"""
def get_filters(self, skip_simple=False) -> list:
"""
Get list of filters applied to image data.
Parameters:
- skip_simple: bool, skip simple/common filters
Returns:
list: Filter names applied to image
"""

Image metadata structure:
class ImageInfo(NamedTuple):
format: str # Image format (JPEG, PNG, etc.)
mode: str # Color mode (RGB, RGBA, L, etc.)
metadata: dict # Additional metadata
all_filters: list # All filters applied
complex_filters: list # Complex/uncommon filters

Extract and manipulate image data from PDF image objects.
def get_bitmap(self, render=False) -> PdfBitmap:
"""
Get image as bitmap.
Parameters:
- render: bool, render image through PDFium (may change appearance)
Returns:
PdfBitmap: Image data as bitmap object
"""
def get_data(self, decode_simple=False) -> bytes:
"""
Get raw image data.
Parameters:
- decode_simple: bool, decode simple filters (like FlateDecode)
Returns:
bytes: Raw image data (may be compressed)
"""
def extract(self, dest: str, *args, **kwargs):
"""
Extract image to file.
Parameters:
- dest: str, output file path
- Additional parameters for format-specific options
Automatically detects image format and saves appropriately.
"""

Image processing examples:
pdf = pdfium.PdfDocument("document.pdf")
page = pdf[0]
# Find and process all images on page
for i in range(page.count_objects()):
obj = page.get_object(i)
if isinstance(obj, pdfium.PdfImage):
print(f"\nProcessing image {i}:")
# Get image metadata
metadata = obj.get_metadata()
print(f" Format: {metadata.format}")
print(f" Mode: {metadata.mode}")
print(f" Size: {obj.get_size()}")
print(f" Filters: {metadata.all_filters}")
# Extract image to file
output_path = f"extracted_image_{i}.png"
try:
obj.extract(output_path)
print(f" Extracted to: {output_path}")
except Exception as e:
print(f" Extraction failed: {e}")
# Try getting as bitmap instead
try:
bitmap = obj.get_bitmap()
pil_image = bitmap.to_pil()
pil_image.save(output_path)
print(f" Converted and saved to: {output_path}")
except Exception as e2:
print(f" Bitmap conversion failed: {e2}")
# Analyze image position and transformation
bounds = obj.get_pos()
matrix = obj.get_matrix()
print(f" Position: {bounds}")
print(f" Transform: {matrix.get()}")

Modify existing images or create new image objects.
def load_jpeg(self, source, pages=None, inline=False, autoclose=True):
"""
Load JPEG data into image object.
Parameters:
- source: file path, bytes, or file-like object containing JPEG data
- pages: list of pages to apply to (None = current page)
- inline: bool, embed as inline image
- autoclose: bool, close source if file-like object
"""
def set_bitmap(self, bitmap: PdfBitmap, pages=None):
"""
Set image data from bitmap.
Parameters:
- bitmap: PdfBitmap, source bitmap data
- pages: list of pages to apply to (None = current page)
"""

Creating and modifying images:
pdf = pdfium.PdfDocument.new()
page = pdf.new_page(612, 792)  # US Letter
# Create new image object
img_obj = pdfium.PdfImage.new(pdf)
# Load JPEG data
img_obj.load_jpeg("photo.jpg")
# Position the image on page.
# PdfMatrix operations are applied in call order, so the size must be set
# BEFORE the offset: translating first and scaling afterwards would also
# scale the offset (100 * 200, 400 * 150) and push the image off the page.
transform = pdfium.PdfMatrix()
transform = transform.scale(200, 150)  # Size
transform = transform.translate(100, 400)  # Position
img_obj.set_matrix(transform)
# Insert into page
page.insert_obj(img_obj)
# Generate content stream
page.gen_content()
# Save document
pdf.save("document_with_image.pdf")

Apply geometric transformations to page objects including rotation, scaling, and translation.
def transform_objects_example(page):
    """
    Scale, rotate and shift every image object found on a page.
    Parameters:
    - page: page whose objects are visited via count_objects() / get_object(i)
    Each image keeps its existing matrix, which is multiplied by a new
    matrix (scale 1.5x, rotate 15, translate by (50, 100)). The matrix is
    printed before and after the change. Non-image objects are left alone.
    """
    object_total = page.count_objects()
    for index in range(object_total):
        candidate = page.get_object(index)
        # Only image objects are transformed
        if not isinstance(candidate, pdfium.PdfImage):
            continue
        # Report the matrix the image currently carries
        existing = candidate.get_matrix()
        print(f"Current matrix: {existing.get()}")
        # Build the additional transformation in one chain:
        # 150% size, then 15 degree rotation, then the move
        delta = pdfium.PdfMatrix().scale(1.5, 1.5).rotate(15).translate(50, 100)
        # Combine with the existing transformation and apply it
        result = existing.multiply(delta)
        candidate.set_matrix(result)
        print(f"New matrix: {result.get()}")
# Usage: run transform_objects_example on the page at index 0 of an existing file
pdf = pdfium.PdfDocument("document.pdf")
page = pdf[0]
transform_objects_example(page)
# Regenerate the page's content stream after the transformations,
# before the document is saved
page.gen_content()
pdf.save("transformed_document.pdf")

Analyze page objects for content extraction and document understanding.
def analyze_page_objects(page):
    """
    Comprehensive page object analysis.
    Parameters:
    - page: page providing count_objects(), get_object(i) and get_size()
    Returns:
    dict: with the keys
      'total_objects'       - number of objects reported by the page
      'images'              - list of dicts ('index', 'size', 'bounds', 'metadata')
      'text_objects'        - count of text objects
      'path_objects'        - count of path objects
      'other_objects'       - count of everything else
      'coverage_area'       - sum of the objects' bounding-box areas
      'coverage_percentage' - coverage_area relative to the page area, in percent
    Bounding boxes are summed individually, so overlapping objects are
    counted once each and the percentage can exceed 100. A page with zero
    area reports a coverage of 0.0 instead of raising ZeroDivisionError.
    """
    # Ask the page once and reuse the value for both the summary and the loop
    total_objects = page.count_objects()
    analysis = {
        'total_objects': total_objects,
        'images': [],
        'text_objects': 0,
        'path_objects': 0,
        'other_objects': 0,
        'coverage_area': 0,
    }
    page_width, page_height = page.get_size()
    page_area = page_width * page_height
    for i in range(total_objects):
        obj = page.get_object(i)
        bounds = obj.get_pos()
        # Calculate object area
        if bounds:
            left, bottom, right, top = bounds
            analysis['coverage_area'] += (right - left) * (top - bottom)
        # Categorize objects
        if isinstance(obj, pdfium.PdfImage):
            analysis['images'].append({
                'index': i,
                'size': obj.get_size(),
                'bounds': bounds,
                'metadata': obj.get_metadata()._asdict(),
            })
        elif obj.type == pdfium.raw.FPDF_PAGEOBJ_TEXT:
            analysis['text_objects'] += 1
        elif obj.type == pdfium.raw.FPDF_PAGEOBJ_PATH:
            analysis['path_objects'] += 1
        else:
            analysis['other_objects'] += 1
    # Calculate coverage percentage; a degenerate page has nothing to cover
    if page_area:
        analysis['coverage_percentage'] = (analysis['coverage_area'] / page_area) * 100
    else:
        analysis['coverage_percentage'] = 0.0
    return analysis
# Usage
pdf = pdfium.PdfDocument("document.pdf")
for i, page in enumerate(pdf):
print(f"\n--- Page {i+1} Object Analysis ---")
analysis = analyze_page_objects(page)
print(f"Total objects: {analysis['total_objects']}")
print(f"Images: {len(analysis['images'])}")
print(f"Text objects: {analysis['text_objects']}")
print(f"Path objects: {analysis['path_objects']}")
print(f"Other objects: {analysis['other_objects']}")
print(f"Coverage: {analysis['coverage_percentage']:.1f}%")
# Detail image information
for img_info in analysis['images']:
print(f" Image {img_info['index']}: {img_info['size']} pixels")
print(f" Format: {img_info['metadata']['format']}")
print(f" Bounds: {img_info['bounds']}")

Handle PDF Form XObjects for reusable content and complex graphics. Form XObjects are reusable page content that can be embedded multiple times within documents.
class PdfXObject:
"""
XObject helper class for managing reusable PDF content.
Form XObjects are self-contained graphic objects that can be referenced
multiple times within a PDF document. They're useful for templates,
logos, headers, footers, and other repeated content.
Attributes:
- raw: FPDF_XOBJECT, underlying PDFium XObject handle
- pdf: PdfDocument, reference to document this XObject belongs to
"""
def __init__(self, raw, pdf):
"""
Initialize XObject wrapper.
Parameters:
- raw: FPDF_XOBJECT, PDFium XObject handle
- pdf: PdfDocument, parent document
Note: XObjects are typically created via PdfDocument.page_as_xobject()
rather than direct instantiation.
"""
def as_pageobject(self) -> PdfObject:
"""
Convert Form XObject to independent page object.
Returns:
PdfObject: Page object representation of the XObject content
Creates an independent page object from the XObject that can be
inserted into pages. Multiple page objects can share the same
XObject resources. Page objects remain valid after XObject closure.
"""
def close(self):
"""Close and release XObject resources."""

Creating and using XObjects:
import pypdfium2 as pdfium
# Load source document
source_pdf = pdfium.PdfDocument("source_document.pdf")
target_pdf = pdfium.PdfDocument.new()
# Convert a page to XObject for reuse
page_xobject = source_pdf.page_as_xobject(0, target_pdf)
# Create target page
target_page = target_pdf.new_page(612, 792)
# Create multiple page objects from the same XObject
header_obj = page_xobject.as_pageobject()
footer_obj = page_xobject.as_pageobject()
# Position header at top of page.
# Matrix operations are applied in call order: scale first, then translate,
# otherwise the offset is scaled too (50 * 0.5, 700 * 0.5) and the header
# ends up in the middle of the page instead of at (50, 700).
header_matrix = pdfium.PdfMatrix()
header_matrix = header_matrix.scale(0.5, 0.5)  # Scale down
header_matrix = header_matrix.translate(50, 700)
header_obj.set_matrix(header_matrix)
# Position footer at bottom (same order: scale, then translate)
footer_matrix = pdfium.PdfMatrix()
footer_matrix = footer_matrix.scale(0.3, 0.3)  # Scale down more
footer_matrix = footer_matrix.translate(50, 50)
footer_obj.set_matrix(footer_matrix)
# Insert both objects into page
target_page.insert_obj(header_obj)
target_page.insert_obj(footer_obj)
# Generate content and save
target_page.gen_content()
target_pdf.save("document_with_reused_content.pdf")
# XObject can be closed after page objects are created
page_xobject.close()

XObject reuse patterns:
def create_template_document():
    """
    Create document with repeated template content.
    Returns:
    PdfDocument: new five-page document with the logo from page 0 of
    "company_logo.pdf" placed on every page. The caller is responsible
    for saving and closing the returned document.
    The logo XObject and the logo source document are closed before
    returning, also when building a page fails.
    """
    # Source document with logo/header content
    logo_pdf = pdfium.PdfDocument("company_logo.pdf")
    try:
        main_pdf = pdfium.PdfDocument.new()
        # Convert logo page to reusable XObject
        logo_xobject = logo_pdf.page_as_xobject(0, main_pdf)
        try:
            # Create multiple pages with logo
            for _ in range(5):
                page = main_pdf.new_page(612, 792)
                # Add logo to each page
                logo_obj = logo_xobject.as_pageobject()
                # Position logo in top-right corner.
                # Matrix operations are applied in call order: scale first,
                # then translate. Translating first would scale the offset
                # as well (450 * 0.2, 720 * 0.2) and misplace the logo.
                logo_matrix = pdfium.PdfMatrix()
                logo_matrix = logo_matrix.scale(0.2, 0.2)  # Small size
                logo_matrix = logo_matrix.translate(450, 720)  # Top-right
                logo_obj.set_matrix(logo_matrix)
                page.insert_obj(logo_obj)
                # Add page-specific content here
                # ... (text, images, etc.)
                page.gen_content()
        finally:
            # Clean up the XObject before the document it was taken from
            logo_xobject.close()
    finally:
        logo_pdf.close()
    return main_pdf
# Usage
template_doc = create_template_document()
template_doc.save("template_document.pdf")
template_doc.close()

Handle errors that may occur during image extraction and processing.
class ImageNotExtractableError(Exception):
"""
Raised when image cannot be extracted from PDF.
This may occur due to:
- Unsupported image formats
- Corrupted image data
- Complex filter combinations
- Encrypted or protected images
"""

Safe image extraction:
def safe_extract_images(page, output_dir):
    """
    Safely extract all images from page.
    Parameters:
    - page: page whose objects are visited via count_objects() / get_object(i)
    - output_dir: str, directory receiving the image files; created if missing
    Returns:
    tuple: (extracted_count, failed_count)
    Direct extraction is tried first. When it raises
    ImageNotExtractableError, the image is rendered to a bitmap and saved
    through PIL instead. Any other error is reported and counted as a
    failure; no exception escapes the per-image handling.
    """
    import os
    # Without the directory every write below would fail and each image
    # would be reported as an "unexpected error"
    os.makedirs(output_dir, exist_ok=True)
    extracted_count = 0
    failed_count = 0
    for i in range(page.count_objects()):
        obj = page.get_object(i)
        if not isinstance(obj, pdfium.PdfImage):
            continue
        try:
            # Try direct extraction first
            output_path = os.path.join(output_dir, f"image_{i}.png")
            obj.extract(output_path)
            extracted_count += 1
            print(f"Extracted image {i}")
        except pdfium.ImageNotExtractableError:
            # Try bitmap conversion
            try:
                bitmap = obj.get_bitmap(render=True)
                pil_image = bitmap.to_pil()
                output_path = os.path.join(output_dir, f"image_{i}_rendered.png")
                pil_image.save(output_path)
                extracted_count += 1
                print(f"Rendered and extracted image {i}")
            except Exception as e:
                failed_count += 1
                print(f"Failed to extract image {i}: {e}")
        except Exception as e:
            # Best-effort batch: report and keep going with the next image
            failed_count += 1
            print(f"Unexpected error extracting image {i}: {e}")
    return extracted_count, failed_count
# Usage
import os
os.makedirs("extracted_images", exist_ok=True)
pdf = pdfium.PdfDocument("document.pdf")
page = pdf[0]
extracted, failed = safe_extract_images(page, "extracted_images")
print(f"Successfully extracted: {extracted}")
print(f"Failed extractions: {failed}")

Common object type constants available through the raw module:
# Available through pypdfium2.raw
FPDF_PAGEOBJ_UNKNOWN = 0 # Unknown object type
FPDF_PAGEOBJ_TEXT = 1 # Text object
FPDF_PAGEOBJ_PATH = 2 # Path/vector graphics
FPDF_PAGEOBJ_IMAGE = 3 # Image object
FPDF_PAGEOBJ_SHADING = 4 # Shading object
FPDF_PAGEOBJ_FORM = 5 # Form XObject

Object type identification:
for i in range(page.count_objects()):
obj = page.get_object(i)
type_names = {
pdfium.raw.FPDF_PAGEOBJ_TEXT: "Text",
pdfium.raw.FPDF_PAGEOBJ_PATH: "Path",
pdfium.raw.FPDF_PAGEOBJ_IMAGE: "Image",
pdfium.raw.FPDF_PAGEOBJ_SHADING: "Shading",
pdfium.raw.FPDF_PAGEOBJ_FORM: "Form"
}
type_name = type_names.get(obj.type, "Unknown")
print(f"Object {i}: {type_name} (type {obj.type})")

Install with Tessl CLI:
npx tessl i tessl/pypi-pypdfium2