CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pymupdf

High performance Python library for data extraction, analysis, conversion & manipulation of PDF and other documents.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/document-operations.md

Document Operations

Core document handling for opening, saving, and managing PDF and other document formats. PyMuPDF supports a wide range of document types including PDF, XPS, EPUB, MOBI, CBZ, and SVG files.

Capabilities

Opening Documents

Open documents from files, bytes, or streams with automatic format detection or explicit format specification.

# Note: open() is an alias for the Document constructor, so open(...) and
# Document(...) are the same callable and accept the same arguments.
# The Document class is documented in the next section; in a real module this
# assignment would have to appear after the class definition.
# Binding the name `open` here also hides Python's built-in open() within
# this namespace.
open = Document

Document Class

Main document container with comprehensive document management capabilities.

class Document:
    """
    Main document container with comprehensive document management capabilities.

    This listing documents signatures and contracts only; method bodies are
    intentionally omitted.
    """

    def __init__(self, filename: str = None, stream: bytes = None, filetype: str = None,
                 rect: Rect = None, width: int = 0, height: int = 0, fontsize: int = 11):
        """
        Create a document object. open() is a synonym for this constructor.

        Parameters:
        - filename: path to the document file, or None for a new document
        - stream: document content as bytes (in-memory alternative to a file)
        - filetype: explicit file type ('pdf', 'xps', 'epub', etc.)
        - rect: Rect to crop pages (for reflowable documents)
        - width: page width for reflowable documents (default 0)
        - height: page height for reflowable documents (default 0)
        - fontsize: font size for reflowable documents (default 11)
        """

    def save(self, filename: str, **kwargs) -> None:
        """
        Save the document to a file.

        Parameters:
        - filename: output file path

        Recognized keyword arguments (passed via **kwargs):
        - garbage: remove unused objects (0-4, default 0)
        - clean: clean and sanitize document content
        - deflate: compress uncompressed streams
        - deflate_images: compress images
        - deflate_fonts: compress fonts
        - incremental: save incrementally (faster for small changes)
        - ascii: write in ASCII mode
        - expand: decompress streams
        - linear: create linearized PDF
        - permissions: set document permissions
        - encryption: encryption method (0-4)
        - owner_pw: owner password
        - user_pw: user password

        NOTE(review): upstream PyMuPDF selects the encryption method through
        PDF_ENCRYPT_* constants, which may extend beyond 4 - confirm the valid
        range against the targeted library version.
        """

    def saveIncr(self) -> None:
        """
        Save the document incrementally (in-place).

        Takes no arguments; use save() with the `incremental` keyword when an
        explicit output path or further options are needed.
        """

    def close(self) -> None:
        """Close the document and free its memory."""

    def load_page(self, page_num: int) -> Page:
        """
        Load a specific page by number.

        Parameters:
        - page_num: zero-based page number

        Returns:
        Page object
        """

    def new_page(self, pno: int = -1, width: float = 595, height: float = 842) -> Page:
        """
        Create a new page.

        Parameters:
        - pno: insertion point (-1, the default, appends at the end)
        - width: page width in points (default 595)
        - height: page height in points (default 842)

        Returns:
        New Page object
        """

    def delete_page(self, pno: int) -> None:
        """
        Delete a page.

        Parameters:
        - pno: page number to delete
        """

    def copy_page(self, pno: int, to: int = -1) -> None:
        """
        Copy a page within the document.

        Parameters:
        - pno: source page number
        - to: target position (-1, the default, appends at the end)
        """

    def move_page(self, pno: int, to: int) -> None:
        """
        Move a page to a different position.

        Parameters:
        - pno: source page number
        - to: target position
        """

    def insert_pdf(self, docsrc: Document, from_page: int = 0, to_page: int = -1, 
                   start_at: int = -1, rotate: int = -1, links: bool = True, 
                   annots: bool = True, show_progress: int = 0, final: bool = True) -> int:
        """
        Insert pages from another PDF document.

        Parameters:
        - docsrc: source Document object
        - from_page: first source page (0-based)
        - to_page: last source page (-1 for last)
        - start_at: insertion point (-1 for append)
        - rotate: rotation angle (0, 90, 180, 270)
        - links: copy links
        - annots: copy annotations
        - show_progress: progress callback frequency
        - final: finalize operation

        Returns:
        Number of pages inserted

        NOTE(review): the default rotate=-1 is not one of the listed angles;
        presumably it means "keep the source page rotation" - confirm.
        NOTE(review): upstream PyMuPDF appears to default from_page to -1
        rather than 0 - confirm against the targeted library version.
        """

    def authenticate(self, password: str) -> int:
        """
        Authenticate an encrypted document.

        Parameters:
        - password: document password

        Returns:
        Authentication result (0=failed, 1=user password, 2=owner password)

        NOTE(review): upstream PyMuPDF documents the non-zero result as a bit
        field (1 = no password required, 2 = user password, 4 = owner
        password, 6 = both passwords equal) - confirm the codes listed above.
        Because 0 denotes failure, testing the result for truthiness is a
        safe success check in either case.
        """

    @property
    def page_count(self) -> int:
        """Number of pages in the document."""

    @property
    def metadata(self) -> dict:
        """Document metadata dictionary (see set_metadata() for typical keys)."""

    def set_metadata(self, m: dict) -> None:
        """
        Set document metadata.

        Parameters:
        - m: metadata dictionary with keys like 'title', 'author', 'subject', 'creator', etc.
        """

    @property
    def needs_pass(self) -> bool:
        """True if the document requires password authentication (see authenticate())."""

    @property
    def is_encrypted(self) -> bool:
        """True if the document is encrypted."""

    @property
    def is_pdf(self) -> bool:
        """True if the document is in PDF format."""

    @property
    def is_form_pdf(self) -> bool:
        """True if the PDF contains interactive forms."""

    @property
    def is_reflowable(self) -> bool:
        """True if the document has a reflowable layout (EPUB, etc.)."""

    @property
    def is_closed(self) -> bool:
        """True if the document has been closed."""

    @property
    def name(self) -> str:
        """Document filename, or '<new document>' for new documents."""

    def can_save_incrementally(self) -> bool:
        """True if the document can be saved incrementally."""

    def chapter_count(self) -> int:
        """
        Number of chapters (for EPUB documents).

        NOTE(review): upstream PyMuPDF appears to expose this as a read-only
        property rather than a method - confirm.
        """

    def last_location(self) -> tuple:
        """
        Last location tuple for reflowable documents.

        NOTE(review): upstream PyMuPDF appears to expose this as a read-only
        property rather than a method - confirm.
        """

    def next_location(self, location: tuple) -> tuple:
        """
        Next location after the given location.

        Parameters:
        - location: current location tuple

        Returns:
        Next location tuple
        """

    def previous_location(self, location: tuple) -> tuple:
        """
        Previous location before the given location.

        Parameters:
        - location: current location tuple

        Returns:
        Previous location tuple

        NOTE(review): upstream PyMuPDF appears to name this method
        `prev_location` - confirm before calling it under this name.
        """

    def page_xref(self, pno: int) -> int:
        """
        Get the PDF cross-reference number for a page.

        Parameters:
        - pno: page number

        Returns:
        Cross-reference number
        """

Table of Contents Operations

Manage document bookmarks and navigation structure.

def get_toc(self, simple: bool = True) -> list:
    """
    Get the table of contents.

    Parameters:
    - simple: return simple format (default) or detailed format

    Returns:
    List of [level, title, page, dest] entries

    NOTE(review): the `dest` element is presumably present only in the
    detailed format (simple=False); simple entries are likely
    [level, title, page] - confirm.
    """

def set_toc(self, toc: list, collapse: int = 1) -> int:
    """
    Set (replace) the table of contents.

    Parameters:
    - toc: table of contents list (presumably in the entry layout returned
      by get_toc() - confirm)
    - collapse: collapse levels above this number (default 1)

    Returns:
    Number of items processed
    """

Embedded Files Operations

Handle files embedded within documents.

def embeddedFileNames(self) -> list:
    """
    Get the list of embedded file names.

    Returns:
    List of embedded file names

    NOTE(review): this camelCase name looks like a legacy alias; current
    PyMuPDF releases appear to expose the method as `embfile_names` -
    confirm against the targeted library version.
    """

def embeddedFileGet(self, name: str) -> bytes:
    """
    Extract the content of an embedded file.

    Parameters:
    - name: embedded file name

    Returns:
    File content as bytes

    NOTE(review): this camelCase name looks like a legacy alias; current
    PyMuPDF releases appear to expose the method as `embfile_get` -
    confirm against the targeted library version.
    """

def embeddedFileAdd(self, name: str, buffer: typing.Union[str, bytes], 
                   filename: str = None, ufilename: str = None, 
                   desc: str = None) -> None:
    """
    Add an embedded file to the document.

    Parameters:
    - name: reference name for the file
    - buffer: file content (str or bytes, per the annotation)
    - filename: original filename (optional)
    - ufilename: unicode filename (optional)
    - desc: file description (optional)

    NOTE(review): this camelCase name looks like a legacy alias; current
    PyMuPDF releases appear to expose the method as `embfile_add` -
    confirm against the targeted library version.
    """

def embeddedFileDel(self, name: str) -> None:
    """
    Delete an embedded file.

    Parameters:
    - name: embedded file name to delete

    NOTE(review): this camelCase name looks like a legacy alias; current
    PyMuPDF releases appear to expose the method as `embfile_del` -
    confirm against the targeted library version.
    """

Usage Examples

Basic Document Operations

import pymupdf

# Open the document as a context manager so that it is closed even if an
# exception (such as the password failure below) interrupts the script.
with pymupdf.open("input.pdf") as doc:
    # Encrypted documents must be unlocked before their content is usable.
    if doc.needs_pass:
        # authenticate() returns 0 on failure and a non-zero code on success.
        if not doc.authenticate("password"):
            raise ValueError("Invalid password")

    # Get basic info
    print(f"Pages: {doc.page_count}")
    print(f"Metadata: {doc.metadata}")

    # Save with compression: garbage=4 is the most thorough removal of
    # unused objects, deflate=True compresses uncompressed streams.
    doc.save("output.pdf", garbage=4, deflate=True)
# Leaving the `with` block closes the document; no explicit close() needed.

Document Merging

import pymupdf

# Open both documents as context managers so that each one is closed even
# if inserting or saving raises an exception.
with pymupdf.open("target.pdf") as target_doc:
    with pymupdf.open("source.pdf") as source_doc:
        # Insert all pages from source (default arguments append every
        # source page at the end of the target).
        target_doc.insert_pdf(source_doc)

        # Save merged document while both inputs are still open.
        target_doc.save("merged.pdf")
# Both documents are closed here; no explicit close() calls are needed.

Creating New Documents

import pymupdf

# Create a new, empty document (no filename). Using a context manager
# guarantees the document is closed even if saving fails.
with pymupdf.open() as doc:
    # Add pages
    page1 = doc.new_page()  # default size: 595 x 842 points
    page2 = doc.new_page(width=792, height=612)  # Letter size landscape

    # Set metadata
    doc.set_metadata({
        "title": "My Document",
        "author": "Author Name",
        "subject": "Document Subject",
        "creator": "PyMuPDF",
    })

    # Save new document
    doc.save("new_document.pdf")
# The document is closed on leaving the `with` block.

Install with Tessl CLI

npx tessl i tessl/pypi-pymupdf

docs

annotations-forms.md

document-creation-modification.md

document-operations.md

document-rendering.md

geometry-transformations.md

index.md

page-content-extraction.md

table-extraction.md

tile.json