High performance Python library for data extraction, analysis, conversion & manipulation of PDF and other documents.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Comprehensive text and image extraction from document pages with multiple output formats, search capabilities, and detailed layout analysis. PyMuPDF provides powerful extraction tools that preserve formatting and structural information.
Extract text in various formats with layout and formatting information.
def get_text(
    page: Page,
    option: str = "text",
    *,
    clip: Rect = None,
    flags: int = None,
    textpage: TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
) -> "str | list | dict":
    """
    Extract text from a page in the specified format (standalone utility function).

    This is a signature stub for documentation purposes; it has no body
    beyond this docstring.

    Parameters:
    - page: Page object to extract text from
    - option: output format ("text", "html", "dict", "json", "rawdict",
      "xml", "xhtml", "words", "blocks")
    - clip: optional Rect to limit the extraction area
    - flags: optional text extraction flags (TEXT_PRESERVE_LIGATURES, etc.)
    - textpage: optional existing TextPage object to reuse
    - sort: sort text by reading order
    - delimiters: characters to use as word delimiters (for the "words" option)
    - tolerance: consider words part of the same line if their coordinates
      don't differ by more than this

    Returns:
    Extracted content in the requested format. The Python type depends on
    `option`: it is not always a string. For example, "dict" yields a
    dictionary that can be indexed with "blocks"; "words" and "blocks"
    presumably yield lists like get_text_words / get_text_blocks
    (NOTE(review): confirm against the PyMuPDF reference).
    """
def get_text_blocks(
    page: Page,
    clip: Rect = None,
    flags: int = None,
    textpage: TextPage = None,
    sort: bool = False,
) -> list:
    """
    Return the text blocks on a page.

    This is a signature stub for documentation purposes; it has no body
    beyond this docstring.

    Parameters:
    - page: Page object to extract blocks from
    - clip: optional Rect to limit the extraction area
    - flags: optional text extraction flags
    - textpage: optional existing TextPage object to reuse
    - sort: sort blocks by reading order

    Returns:
    List of text blocks with coordinates and content
    """
def get_text_words(
    page: Page,
    clip: Rect = None,
    flags: int = None,
    textpage: TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
) -> list:
    """
    Return the words on a page as a list, with a bounding box for each word.

    This is a signature stub for documentation purposes; it has no body
    beyond this docstring.

    Parameters:
    - page: Page object to extract words from
    - clip: optional Rect to limit the extraction area
    - flags: optional text extraction flags
    - textpage: optional existing TextPage object to reuse
    - sort: sort words by reading order
    - delimiters: characters to use as word delimiters
    - tolerance: consider words part of the same line if their coordinates
      don't differ by more than this

    Returns:
    List of words with bounding rectangles
    """
def get_textbox(page: Page, rect: Rect, textpage: TextPage = None) -> str:
    """
    Extract text from a specific rectangular area.

    This is a signature stub for documentation purposes; it has no body
    beyond this docstring.

    Parameters:
    - page: Page object
    - rect: rectangular area to extract text from
    - textpage: optional existing TextPage object to reuse

    Returns:
    Text content within the specified rectangle
    """
def get_text_selection(
    page: Page,
    p1: Point,
    p2: Point,
    clip: Rect = None,
    textpage: TextPage = None,
) -> str:
    """
    Extract the text between two points on a page.

    This is a signature stub for documentation purposes; it has no body
    beyond this docstring.

    Parameters:
    - page: Page object
    - p1: start point for the text selection
    - p2: end point for the text selection
    - clip: optional Rect to limit the extraction area
    - textpage: optional existing TextPage object to reuse

    Returns:
    Selected text content
    """
class Page:
    """Page API for obtaining a TextPage (signature stub, docstrings only)."""

    def get_textpage(self, clip: Rect = None, flags: int = 0, matrix: Matrix = None) -> TextPage:
        """
        Get a TextPage object for detailed text analysis.

        Parameters:
        - clip: optional rectangle to limit text extraction
        - flags: extraction flags for text processing
        - matrix: optional Matrix; presumably a transformation applied when
          the text page is built (NOTE(review): confirm against the
          PyMuPDF reference)

        Returns:
        TextPage object with detailed text information
        """


# Detailed text extraction and analysis with layout information.
class TextPage:
    """Text extraction and search API of a text page (signature stub, docstrings only)."""

    def extractText(self, sort: bool = False) -> str:
        """
        Extract plain text.

        Parameters:
        - sort: sort text by reading order

        Returns:
        Plain text string
        """

    def extractHTML(self) -> str:
        """
        Extract text as HTML with formatting.

        Returns:
        HTML formatted text
        """

    def extractJSON(self, cb=None) -> str:
        """
        Extract text as JSON with detailed layout info.

        Parameters:
        - cb: optional callback function

        Returns:
        JSON string with text blocks, lines, spans, and characters
        """

    def extractXHTML(self) -> str:
        """
        Extract text as XHTML.

        Returns:
        XHTML formatted text
        """

    def extractXML(self) -> str:
        """
        Extract text as XML.

        Returns:
        XML formatted text with structure
        """

    def extractDICT(self, cb=None, sort: bool = False) -> dict:
        """
        Extract text as a dictionary with detailed information.

        Parameters:
        - cb: optional callback function
        - sort: sort text by reading order

        Returns:
        Dictionary with blocks, lines, spans, and character details
        """

    def extractBLOCKS(self) -> list:
        """
        Extract text blocks.

        Returns:
        List of text blocks with coordinates and content
        """

    def extractWORDS(self, delimiters: str = None) -> list:
        """
        Extract individual words with positions.

        Parameters:
        - delimiters: optional word delimiter characters

        Returns:
        List of words with bounding boxes
        """

    def search(self, needle: str, hit_max: int = 16, quads: bool = False) -> list:
        """
        Search for text on the page.

        Parameters:
        - needle: text to search for
        - hit_max: maximum number of hits
        - quads: return results as Quad objects instead of Rect

        Returns:
        List of Rect or Quad objects indicating match locations
        """


# Search for text with various options and return location information.
class Page:
    """Page text-search API (signature stub, docstrings only)."""

    def search_for(
        self,
        needle: str,
        hit_max: int = 16,
        quads: bool = False,
        flags: int = 0,
        clip: Rect = None,
    ) -> list:
        """
        Search for text on the page.

        Parameters:
        - needle: text to search for
        - hit_max: maximum number of hits to return
        - quads: return Quad objects instead of Rect objects
        - flags: search flags for case sensitivity, etc.
        - clip: optional rectangle; limit the search to this area

        Returns:
        List of Rect or Quad objects indicating match locations
        """


# Extract embedded images from document pages.
class Page:
    """Page image-related API (signature stub, docstrings only)."""

    def get_images(self, full: bool = False) -> list:
        """
        Get the list of images on the page.

        Parameters:
        - full: include detailed image information

        Returns:
        List of image dictionaries with xref, bbox, transform, etc.
        """

    def get_image_bbox(self, name: str, transform: bool = True) -> Rect:
        """
        Get the bounding box of a named image.

        Parameters:
        - name: image name/reference
        - transform: apply transformation matrix

        Returns:
        Image bounding rectangle
        """

    def get_pixmap(
        self,
        matrix: Matrix = None,
        colorspace: Colorspace = None,
        clip: Rect = None,
        alpha: bool = False,
        annots: bool = True,
    ) -> Pixmap:
        """
        Render the page to a Pixmap for image extraction.

        Parameters:
        - matrix: optional transformation matrix
        - colorspace: optional target color space
        - clip: optional clipping rectangle
        - alpha: include alpha channel
        - annots: include annotations

        Returns:
        Pixmap object with the page image
        """


# Extract interactive elements from pages.
class Page:
    """Page link and annotation API (signature stub, docstrings only)."""

    def get_links(self) -> list:
        """
        Get the list of links on the page.

        Returns:
        List of link dictionaries with kind, from, to, uri, etc.
        """

    def first_link(self) -> Link:
        """
        Get the first link on the page.

        NOTE(review): upstream PyMuPDF may expose this as a property rather
        than a method; confirm before relying on call syntax.

        Returns:
        Link object or None
        """

    def load_links(self) -> None:
        """Load links from the page for iteration."""

    def first_annot(self) -> Annot:
        """
        Get the first annotation on the page.

        NOTE(review): upstream PyMuPDF may expose this as a property rather
        than a method; confirm before relying on call syntax.

        Returns:
        Annot object or None
        """

    def load_annot(self, ident: typing.Union[str, int]) -> Annot:
        """
        Load an annotation by identifier.

        Parameters:
        - ident: annotation identifier (xref number or unique name)

        Returns:
        Annot object
        """

    def annot_names(self) -> list:
        """
        Get the list of annotation names on the page.

        Returns:
        List of annotation names
        """

    def annots(self, types: list = None) -> list:
        """
        Get the annotations on the page.

        Parameters:
        - types: optional list of annotation types to filter by

        Returns:
        List of Annot objects
        """


# Extract vector graphics and drawing information.
class Page:
    """Page vector-graphics API (signature stub, docstrings only)."""

    def get_drawings(self, extended: bool = False) -> list:
        """
        Get vector drawings from the page.

        Parameters:
        - extended: include extended path information

        Returns:
        List of drawing dictionaries with paths, colors, etc.
        """

    def get_cdrawings(self, extended: bool = False) -> list:
        """
        Get drawings in compact format.

        Parameters:
        - extended: include extended information

        Returns:
        List of compact drawing representations
        """


import pymupdf
# Usage example: extract text from the first page in several formats.
doc = pymupdf.open("document.pdf")
page = doc.load_page(0)

# Extract plain text using standalone function
text = pymupdf.get_text(page)
print(text)

# Extract with formatting as HTML
html = pymupdf.get_text(page, "html")
print(html)

# Extract detailed layout information
layout_dict = pymupdf.get_text(page, "dict")
for block in layout_dict["blocks"]:
    if "lines" in block:  # Text block
        for line in block["lines"]:
            for span in line["spans"]:
                print(f"Text: {span['text']}, Font: {span['font']}, Size: {span['size']}")

# Extract text blocks
blocks = pymupdf.get_text_blocks(page)
for block in blocks:
    print(f"Block text: {block[4]}")  # block[4] contains the text

# Extract individual words with coordinates
words = pymupdf.get_text_words(page)
for word in words:
    # Each word carries its bounding box, the word string, and then
    # block/line/word numbers, which this example does not need.
    # A separate name keeps the page-level `text` above intact.
    x0, y0, x1, y1, word_text, *_ = word
    print(f"Word '{word_text}' at ({x0}, {y0}, {x1}, {y1})")
doc.close()

import pymupdf
# Usage example: search every page of a document for a phrase.
doc = pymupdf.open("document.pdf")

# Search across all pages
search_term = "important keyword"
results = []
for page_num in range(doc.page_count):
    page = doc.load_page(page_num)
    # quads=True requests Quad results; each hit's rectangle is read from .rect
    matches = page.search_for(search_term, quads=True)
    for match in matches:
        results.append({
            "page": page_num,
            "text": search_term,
            "quad": match,
            "bbox": match.rect,
        })

print(f"Found {len(results)} matches")
doc.close()

import pymupdf
# Usage example: save every image referenced by the first page as PNG.
doc = pymupdf.open("document.pdf")
page = doc.load_page(0)

# Get image information
images = page.get_images(full=True)
for img_index, img in enumerate(images):
    xref = img[0]  # Image xref number
    pix = pymupdf.Pixmap(doc, xref)  # Extract image
    out_name = f"image_{page.number}_{img_index}.png"
    if pix.n - pix.alpha < 4:  # GRAY or RGB
        pix.save(out_name)
    else:  # CMYK: convert to RGB first
        pix1 = pymupdf.Pixmap(pymupdf.csRGB, pix)
        pix1.save(out_name)
        pix1 = None  # drop the reference to the converted pixmap
    pix = None  # drop the reference before the next iteration
doc.close()

import pymupdf
# Usage example: work with a TextPage directly.
doc = pymupdf.open("document.pdf")
page = doc.load_page(0)

# Create TextPage for detailed analysis
textpage = page.get_textpage()

# Extract words with coordinates
words = textpage.extractWORDS()
for word in words:
    # Bounding box, word string, then block/line/word numbers (unused here).
    x0, y0, x1, y1, text, *_ = word
    print(f"Word: '{text}' at ({x0}, {y0}, {x1}, {y1})")

# Search within TextPage
matches = textpage.search("search term")
print(f"Found {len(matches)} matches")
doc.close()

import pymupdf
# Usage example: list the links on the first page.
doc = pymupdf.open("document.pdf")
page = doc.load_page(0)

# Get all links
links = page.get_links()
for link in links:
    print(f"Link type: {link['kind']}")
    print(f"From: {link['from']}")  # Source rectangle
    if link['kind'] == pymupdf.LINK_URI:
        print(f"URI: {link['uri']}")
    elif link['kind'] == pymupdf.LINK_GOTO:
        print(f"Target page: {link['page']}")
        # The target point is printed only when the link dict has a 'to' entry.
        if 'to' in link:
            print(f"Target point: {link['to']}")
doc.close()

Install with the Tessl CLI:

    npx tessl i tessl/pypi-pymupdf