High performance Python library for data extraction, analysis, conversion & manipulation of PDF and other documents.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Core document handling for opening, saving, and managing PDF and other document formats. PyMuPDF supports a wide range of document types including PDF, XPS, EPUB, MOBI, CBZ, and SVG files.
Open documents from files, bytes, or streams with automatic format detection or explicit format specification.
# Note: open() is an alias for the Document constructor
open = Document
Main document container with comprehensive document management capabilities.
class Document:
def __init__(self, filename: str = None, stream: bytes = None, filetype: str = None,
rect: Rect = None, width: int = 0, height: int = 0, fontsize: int = 11):
"""
Create document object. Use open() as a synonym.
Parameters:
- filename: path to document file, or None for new document
- stream: document content as bytes
- filetype: explicit file type ('pdf', 'xps', 'epub', etc.)
- rect: Rect specifying the desired page size (only meaningful for reflowable documents)
- width: page width for reflowable documents
- height: page height for reflowable documents
- fontsize: font size for reflowable documents
"""
def save(self, filename: str, **kwargs) -> None:
"""
Save document to file.
Parameters:
- filename: output file path
- garbage: remove unused objects (0-4, default 0)
- clean: clean and sanitize document content
- deflate: compress uncompressed streams
- deflate_images: compress images
- deflate_fonts: compress fonts
- incremental: save incrementally (faster for small changes)
- ascii: write in ASCII mode
- expand: decompress streams
- linear: create linearized PDF
- permissions: set document permissions
- encryption: encryption method (one of the PDF_ENCRYPT_* constants, e.g. PDF_ENCRYPT_AES_256)
- owner_pw: owner password
- user_pw: user password
"""
def saveIncr(self) -> None:
"""Save document incrementally (in-place)."""
def close(self) -> None:
"""Close document and free memory."""
def load_page(self, page_num: int) -> Page:
"""
Load a specific page by number.
Parameters:
- page_num: zero-based page number
Returns:
Page object
"""
def new_page(self, pno: int = -1, width: float = 595, height: float = 842) -> Page:
"""
Create a new page.
Parameters:
- pno: insertion point (-1 for append)
- width: page width in points
- height: page height in points
Returns:
New Page object
"""
def delete_page(self, pno: int) -> None:
"""
Delete a page.
Parameters:
- pno: page number to delete
"""
def copy_page(self, pno: int, to: int = -1) -> None:
"""
Copy a page within the document.
Parameters:
- pno: source page number
- to: target position (-1 for append)
"""
def move_page(self, pno: int, to: int) -> None:
"""
Move a page to different position.
Parameters:
- pno: source page number
- to: target position
"""
def insert_pdf(self, docsrc: Document, from_page: int = 0, to_page: int = -1,
start_at: int = -1, rotate: int = -1, links: bool = True,
annots: bool = True, show_progress: int = 0, final: bool = True) -> int:
"""
Insert pages from another PDF document.
Parameters:
- docsrc: source Document object
- from_page: first source page (0-based)
- to_page: last source page (-1 for last)
- start_at: insertion point (-1 for append)
- rotate: rotation in degrees applied to the copied pages (a multiple of 90; -1 keeps the source rotation)
- links: copy links
- annots: copy annotations
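- show_progress: print a progress message after every this many pages (0 disables messages)
- final: drop the list of already copied objects afterwards; set to False for all but the last of several insertions from the same source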
Returns:
Number of pages inserted
"""
def authenticate(self, password: str) -> int:
"""
Authenticate encrypted document.
Parameters:
- password: document password
Returns:
Authentication result: 0 = failed; otherwise a positive bit field (1 = no password required, 2 = user password, 4 = owner password, 6 = both passwords are equal)
"""
@property
def page_count(self) -> int:
"""Number of pages in document."""
@property
def metadata(self) -> dict:
"""Document metadata dictionary."""
def set_metadata(self, m: dict) -> None:
"""
Set document metadata.
Parameters:
- m: metadata dictionary with keys like 'title', 'author', 'subject', 'creator', etc.
"""
@property
def needs_pass(self) -> bool:
"""True if document requires password authentication."""
@property
def is_encrypted(self) -> bool:
"""True if document is encrypted."""
@property
def is_pdf(self) -> bool:
"""True if document is PDF format."""
@property
def is_form_pdf(self) -> bool:
"""True if PDF contains interactive forms."""
@property
def is_reflowable(self) -> bool:
"""True if document has reflowable layout (EPUB, etc.)."""
@property
def is_closed(self) -> bool:
"""True if document has been closed."""
@property
def name(self) -> str:
"""Document filename or '<new document>' for new documents."""
def can_save_incrementally(self) -> bool:
"""True if document can be saved incrementally."""
@property
def chapter_count(self) -> int:
"""Number of chapters (for EPUB documents)."""
@property
def last_location(self) -> tuple:
"""Last location tuple (chapter, page) for reflowable documents."""
def next_location(self, location: tuple) -> tuple:
"""
Next location after given location.
Parameters:
- location: current location tuple
Returns:
Next location tuple
"""
def previous_location(self, location: tuple) -> tuple:
"""
Previous location before given location.
Parameters:
- location: current location tuple
Returns:
Previous location tuple
"""
def page_xref(self, pno: int) -> int:
"""
Get PDF cross-reference number for page.
Parameters:
- pno: page number
Returns:
Cross-reference number
"""
Manage document bookmarks and navigation structure.
def get_toc(self, simple: bool = True) -> list:
"""
Get table of contents.
Parameters:
- simple: return simple format (default) or detailed format
Returns:
List of [level, title, page] entries; with simple=False each entry also includes a dest dictionary with link details
"""
def set_toc(self, toc: list, collapse: int = 1) -> int:
"""
Set table of contents.
Parameters:
- toc: table of contents list
- collapse: collapse levels above this number
Returns:
Number of items processed
"""
Handle files embedded within documents.
def embeddedFileNames(self) -> list:
"""
Get list of embedded file names.
Returns:
List of embedded file names
"""
def embeddedFileGet(self, name: str) -> bytes:
"""
Extract embedded file content.
Parameters:
- name: embedded file name
Returns:
File content as bytes
"""
def embeddedFileAdd(self, name: str, buffer: typing.Union[str, bytes],
filename: str = None, ufilename: str = None,
desc: str = None) -> None:
"""
Add embedded file to document.
Parameters:
- name: reference name for the file
- buffer: file content
- filename: original filename
- ufilename: unicode filename
- desc: file description
"""
def embeddedFileDel(self, name: str) -> None:
"""
Delete embedded file.
Parameters:
- name: embedded file name to delete
"""
import pymupdf
# Open document
doc = pymupdf.open("input.pdf")
# Check if password required
if doc.needs_pass:
success = doc.authenticate("password")
if not success:
raise ValueError("Invalid password")
# Get basic info
print(f"Pages: {doc.page_count}")
print(f"Metadata: {doc.metadata}")
# Save with compression
doc.save("output.pdf", garbage=4, deflate=True)
doc.close()
import pymupdf
# Open target document
target_doc = pymupdf.open("target.pdf")
# Open source document
source_doc = pymupdf.open("source.pdf")
# Insert all pages from source
target_doc.insert_pdf(source_doc)
# Save merged document
target_doc.save("merged.pdf")
# Clean up
target_doc.close()
source_doc.close()
import pymupdf
# Create new document
doc = pymupdf.open()
# Add pages
page1 = doc.new_page()
page2 = doc.new_page(width=792, height=612) # Letter size landscape
# Set metadata
doc.set_metadata({
"title": "My Document",
"author": "Author Name",
"subject": "Document Subject",
"creator": "PyMuPDF"
})
# Save new document
doc.save("new_document.pdf")
doc.close()
Install with Tessl CLI
npx tessl i tessl/pypi-pymupdf