High performance Python library for data extraction, analysis, conversion & manipulation of PDF and other documents.
npx @tessl/cli install tessl/pypi-pymupdf@1.26.0

A high-performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents. PyMuPDF provides comprehensive PDF processing capabilities built on top of the MuPDF C++ library, enabling developers to extract text, images, and metadata, manipulate document content, and render pages to various formats.
pip install PyMuPDF

import pymupdf

Legacy compatibility (still supported):
import fitz # Maps to pymupdf

import pymupdf
# Open a document
doc = pymupdf.open("document.pdf") # Same as pymupdf.Document("document.pdf")
# Extract text from all pages using standalone function
text = ""
for page in doc:
text += pymupdf.get_text(page)
# Get document metadata
metadata = doc.metadata
# Save and close
doc.save("output.pdf")
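doc.close()

PyMuPDF follows a hierarchical document model: a Document is made up of Pages (obtained with `load_page` or by iterating over the document), and a Page in turn yields derived objects such as a TextPage (`get_textpage`) or a Pixmap (`get_pixmap`).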
The library provides both high-level convenience methods and low-level access to document structures, enabling everything from simple text extraction to complex document manipulation and rendering.
Core document handling including opening, saving, and metadata management. Supports PDF, XPS, EPUB, MOBI, CBZ, SVG and other formats with comprehensive document manipulation capabilities.
# Note: open() is an alias for Document constructor
open = Document
class Document:
def __init__(self, filename: str = None, stream: bytes = None, filetype: str = None,
rect: Rect = None, width: int = 0, height: int = 0, fontsize: int = 11): ...
def save(self, filename: str, **kwargs) -> None: ...
def close(self) -> None: ...
def load_page(self, page_num: int) -> Page: ...
@property
def page_count(self) -> int: ...
@property
def metadata(self) -> dict: ...

Text and image extraction from document pages with multiple output formats, search capabilities, and layout analysis. Includes support for structured text extraction with formatting information.
# Standalone text extraction functions
def get_text(page: Page, option: str = "text", **kwargs) -> str: ...
def get_text_blocks(page: Page, **kwargs) -> list: ...
def get_text_words(page: Page, **kwargs) -> list: ...
def get_textbox(page: Page, rect: Rect, **kwargs) -> str: ...
class Page:
def get_textpage(self, **kwargs) -> TextPage: ...
def search_for(self, needle: str, **kwargs) -> list: ...
def get_images(self, **kwargs) -> list: ...
def get_links(self) -> list: ...

High-performance rendering of document pages to various image formats, including PNG and JPEG. Supports custom resolutions, color spaces, and rendering options.
class Page:
def get_pixmap(self, **kwargs) -> Pixmap: ...
class Pixmap:
def save(self, filename: str, **kwargs) -> None: ...
def tobytes(self, output: str = "png") -> bytes: ...
@property
def width(self) -> int: ...
@property
def height(self) -> int: ...

Comprehensive annotation handling, including creation, modification, and deletion of various annotation types. Supports interactive forms and form field manipulation.
class Annot:
def set_info(self, content: str = None, **kwargs) -> None: ...
def set_rect(self, rect: Rect) -> None: ...
def update(self) -> None: ...
def delete(self) -> None: ...
@property
def type(self) -> list: ...

Coordinate system handling with matrices, rectangles, points, and quads for precise positioning and transformations. Essential for layout manipulation and coordinate calculations.
class Matrix:
def __init__(self, a: float = 1.0, b: float = 0.0, c: float = 0.0,
d: float = 1.0, e: float = 0.0, f: float = 0.0): ...
def prerotate(self, deg: float) -> Matrix: ...
def prescale(self, sx: float, sy: float) -> Matrix: ...
class Rect:
def __init__(self, x0: float, y0: float, x1: float, y1: float): ...
def transform(self, matrix: Matrix) -> Rect: ...
@property
def width(self) -> float: ...
@property
def height(self) -> float: ...

Advanced table detection and extraction capabilities with support for table structure analysis, cell content extraction, and export to various formats, including pandas DataFrames.
class Table:
def extract(self) -> list: ...
def to_pandas(self) -> 'pandas.DataFrame': ...
class TableFinder:
def __init__(self, page: Page): ...
def find_tables(self, **kwargs) -> list: ...

Creating new documents and modifying existing ones, including page insertion, deletion, and content manipulation. Supports adding text, images, and other content elements.
class Document:
def new_page(self, width: float = 595, height: float = 842, **kwargs) -> Page: ...
def delete_page(self, pno: int) -> None: ...
def insert_pdf(self, docsrc: Document, **kwargs) -> int: ...
class Page:
def insert_text(self, point: Point, text: str, **kwargs) -> int: ...
def insert_image(self, rect: Rect, **kwargs) -> None: ...

Document Creation and Modification
class Document:
"""Main document class for PDF and other document formats."""
class Page:
"""Represents a single page in a document."""
class Pixmap:
"""Raster image representation with pixel data."""
class TextPage:
"""Text extraction with layout and formatting information."""
class Annot:
"""Document annotation (note, highlight, etc.)."""
class Matrix:
"""2D transformation matrix for coordinate transformations."""
class Rect:
"""Rectangle defined by four coordinates (x0, y0, x1, y1)."""
class Point:
"""2D point with x and y coordinates."""
class Quad:
"""Quadrilateral defined by four corner points."""
class Font:
"""Font representation for text operations."""
class Archive:
"""Archive file handling for compressed documents."""
class TextWriter:
"""Utility for writing text with advanced formatting."""
class Shape:
"""Drawing operations for vector graphics."""
# Exception types
class FileDataError(RuntimeError):
"""Raised when file data is corrupted or invalid."""
class FileNotFoundError(RuntimeError):
"""Raised when requested file cannot be found."""
class EmptyFileError(FileDataError):
"""Raised when file is empty or contains no data."""