Python bindings to PDFium for comprehensive PDF manipulation, rendering, and processing
npx @tessl/cli install tessl/pypi-pypdfium2@4.30.0
Python bindings to PDFium for comprehensive PDF manipulation, rendering, and processing. Built on Google's powerful PDFium library, pypdfium2 provides both high-level helper classes for common PDF operations and low-level raw bindings for advanced functionality.
pip install pypdfium2
import pypdfium2 as pdfium
For direct access to specific classes:
from pypdfium2 import PdfDocument, PdfPage, PdfBitmap
For version information:
from pypdfium2 import PYPDFIUM_INFO, PDFIUM_INFO
import pypdfium2 as pdfium
# Open a PDF document
pdf = pdfium.PdfDocument("document.pdf")
# Get basic information
print(f"Pages: {len(pdf)}")
print(f"Version: {pdf.get_version()}")
print(f"Metadata: {pdf.get_metadata_dict()}")
# Render first page to image
page = pdf[0]
bitmap = page.render(scale=2.0)
pil_image = bitmap.to_pil()
pil_image.save("page1.png")
# Extract text from page
textpage = page.get_textpage()
text = textpage.get_text_range()
print(f"Page text: {text}")
# Clean up
pdf.close()
pypdfium2 follows a layered architecture design:
This design enables both simple high-level operations and advanced low-level manipulation while maintaining compatibility with the broader Python ecosystem.
Core PDF document operations including loading, creating, saving, and metadata manipulation. Supports password-protected PDFs, form handling, and file attachments.
class PdfDocument:
def __init__(self, input_data, password=None, autoclose=False): ...
@classmethod
def new(cls): ...
def __len__(self) -> int: ...
def save(self, dest, version=None, flags=...): ...
def get_metadata_dict(self, skip_empty=False) -> dict: ...
def is_tagged(self) -> bool: ...
Page-level operations including rendering, rotation, dimension management, and bounding box manipulation. Supports various rendering formats and customization options.
class PdfPage:
def get_size(self) -> tuple[float, float]: ...
def render(self, rotation=0, scale=1, ...) -> PdfBitmap: ...
def get_rotation(self) -> int: ...
def set_rotation(self, rotation): ...
def get_mediabox(self, fallback_ok=True) -> tuple | None: ...
Comprehensive text extraction and search capabilities with support for bounded text extraction, character-level positioning, and full-text search.
class PdfTextPage:
def get_text_range(self, index=0, count=-1, errors="ignore", force_this=False) -> str: ...
def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore") -> str: ...
def search(self, text, index=0, match_case=False, match_whole_word=False, consecutive=False) -> PdfTextSearcher: ...
def get_charbox(self, index, loose=False) -> tuple: ...
Image rendering, manipulation, and extraction with support for multiple output formats including PIL Images, NumPy arrays, and raw bitmaps.
class PdfBitmap:
@classmethod
def from_pil(cls, pil_image, recopy=False) -> PdfBitmap: ...
def to_numpy(self) -> numpy.ndarray: ...
def to_pil(self) -> PIL.Image: ...
def fill_rect(self, left, top, width, height, color): ...
Manipulation of PDF page objects including images, text, and vector graphics. Supports object transformation, insertion, and removal.
class PdfObject:
def get_pos(self) -> tuple: ...
def get_matrix(self) -> PdfMatrix: ...
def transform(self, matrix): ...
class PdfImage(PdfObject):
def get_metadata(self) -> ImageInfo: ...
def extract(self, dest, *args, **kwargs): ...
Management of embedded file attachments with support for attachment metadata, data extraction, and modification.
class PdfAttachment:
def get_name(self) -> str: ...
def get_data(self) -> ctypes.Array: ...
def set_data(self, data): ...
def get_str_value(self, key) -> str: ...
2D transformation matrices for coordinate system manipulation, rotation, scaling, and translation operations.
class PdfMatrix:
def __init__(self, a=1, b=0, c=0, d=1, e=0, f=0): ...
def translate(self, x, y) -> PdfMatrix: ...
def scale(self, x, y) -> PdfMatrix: ...
def rotate(self, angle, ccw=False, rad=False) -> PdfMatrix: ...
def on_point(self, x, y) -> tuple: ...
Access to pypdfium2 and PDFium version information, build details, and feature flags.
PYPDFIUM_INFO: _version_pypdfium2
PDFIUM_INFO: _version_pdfium
# Version properties
version: str
api_tag: tuple[int]
major: int
minor: int
patch: int
build: int # PDFIUM_INFO only
Version and Library Information
Access to pypdfium2's comprehensive command-line tools for batch processing, text extraction, image operations, and document manipulation.
def cli_main(raw_args=None) -> int:
"""Main CLI entry point for pypdfium2 command-line tools."""
def api_main(raw_args=None) -> int:
"""Alternative API entry point with same functionality as cli_main."""
class PdfiumError(RuntimeError):
"""Main exception for PDFium library errors"""
class ImageNotExtractableError(Exception):
"""Raised when image cannot be extracted from PDF"""
Common error scenarios include invalid PDF files, unsupported operations, memory allocation failures, and file I/O errors. Always handle exceptions when working with external PDF files or performing complex operations.
For advanced use cases requiring direct PDFium API access:
from pypdfium2 import raw
# Access low-level PDFium functions
doc_handle = raw.FPDF_LoadDocument(file_path, password)
page_count = raw.FPDF_GetPageCount(doc_handle)
The raw module provides complete access to PDFium's C API with all functions, constants, and structures available for advanced manipulation.