tessl/pypi-pymupdf

High performance Python library for data extraction, analysis, conversion & manipulation of PDF and other documents.

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Page Content Extraction

Name: tessl/pypi-pymupdf
Author: tessl

Comprehensive text and image extraction from document pages with multiple output formats, search capabilities, and detailed layout analysis. PyMuPDF provides powerful extraction tools that preserve formatting and structural information.

Capabilities

Text Extraction

Extract text in various formats with layout and formatting information.

def get_text(
    page: Page,
    option: str = "text",
    *,
    clip: Rect = None,
    flags: int = None,
    textpage: TextPage = None,
    sort: bool = False,
    delimiters: str = None,
    tolerance=3
) -> typing.Union[str, list, dict]:
    """
    Extract the text of a page in the requested output format (standalone utility function).

    Parameters:
    - page: Page object to extract text from
    - option: output format, one of "text", "html", "dict", "json", "rawdict",
      "xml", "xhtml", "words", "blocks"
    - clip: Rect to limit extraction area
    - flags: text extraction flags (TEXT_PRESERVE_LIGATURES, etc.)
    - textpage: existing TextPage object to reuse
    - sort: sort text by reading order
    - delimiters: characters to use as word delimiters (for words option)
    - tolerance: consider words part of same line if coordinates don't differ more than this

    Returns:
    The extracted content; its type depends on `option`, so it is not always a string:
    - "dict": a dictionary whose "blocks" entries hold "lines", which in turn hold
      "spans" carrying "text", "font" and "size" (see the basic usage example)
    - "words" / "blocks": presumably the same lists that get_text_words and
      get_text_blocks return (TODO confirm)
    - "rawdict": presumably a dictionary like "dict" (TODO confirm)
    - the remaining options are documented here as text output
    """

def get_text_blocks(
    page: Page,
    clip: Rect = None,
    flags: int = None,
    textpage: TextPage = None,
    sort: bool = False
) -> list:
    """
    Return the text blocks found on a page (standalone utility function).

    Parameters:
    - page: Page object to extract blocks from
    - clip: Rect to limit extraction area
    - flags: text extraction flags
    - textpage: existing TextPage object to reuse
    - sort: sort blocks by reading order

    Returns:
    List of text blocks with coordinates and content; the usage example
    reads the text of a block from index 4 of each item
    """

def get_text_words(
    page: Page,
    clip: Rect = None,
    flags: int = None,
    textpage: TextPage = None,
    sort: bool = False,
    delimiters: str = None,
    tolerance=3
) -> list:
    """
    Return the words on a page, each with its bounding box (standalone utility function).

    Parameters:
    - page: Page object to extract words from
    - clip: Rect to limit extraction area
    - flags: text extraction flags
    - textpage: existing TextPage object to reuse
    - sort: sort words by reading order
    - delimiters: characters to use as word delimiters
    - tolerance: consider words part of same line if coordinates don't differ more than this

    Returns:
    List of words; the usage example unpacks each item as
    (x0, y0, x1, y1, text, block_no, line_no, word_no)
    """

def get_textbox(page: Page, rect: Rect, textpage: TextPage = None) -> str:
    """
    Extract the text that lies inside a rectangular area of a page.

    Parameters:
    - page: Page object to extract text from
    - rect: rectangular area to extract text from
    - textpage: existing TextPage object to reuse

    Returns:
    Text content within the specified rectangle
    """

def get_text_selection(
    page: Page,
    p1: Point,
    p2: Point,
    clip: Rect = None,
    textpage: TextPage = None
) -> str:
    """
    Extract the text selected between two points on a page.

    Parameters:
    - page: Page object to extract text from
    - p1: start point for text selection
    - p2: end point for text selection
    - clip: Rect to limit extraction area
    - textpage: existing TextPage object to reuse

    Returns:
    Selected text content
    """

class Page:
    def get_textpage(self, clip: Rect = None, flags: int = 0, matrix: Matrix = None) -> TextPage:
        """
        Create a TextPage object for detailed text analysis of this page.

        Parameters:
        - clip: rectangle to limit text extraction
        - flags: extraction flags for text processing
        - matrix: optional Matrix; its effect is not described on this page,
          presumably a transformation applied when building the TextPage (TODO confirm)

        Returns:
        TextPage object with detailed text information
        """

TextPage Class

Detailed text extraction and analysis with layout information.

class TextPage:
    def extractText(self, sort: bool = False) -> str:
        """
        Extract the page content as plain text.

        Parameters:
        - sort: sort text by reading order

        Returns:
        Plain text string
        """
    
    def extractHTML(self) -> str:
        """
        Extract the page content as HTML, keeping formatting.

        Returns:
        HTML formatted text
        """
    
    def extractJSON(self, cb=None) -> str:
        """
        Extract the page content as a JSON string with detailed layout info.

        Parameters:
        - cb: optional callback function (its purpose is not described on this page)

        Returns:
        JSON string with text blocks, lines, spans, and characters
        """
    
    def extractXHTML(self) -> str:
        """
        Extract the page content as XHTML.

        Returns:
        XHTML formatted text
        """
    
    def extractXML(self) -> str:
        """
        Extract the page content as XML.

        Returns:
        XML formatted text with structure
        """
    
    def extractDICT(self, cb=None, sort: bool = False) -> dict:
        """
        Extract the page content as a dictionary with detailed information.

        Parameters:
        - cb: optional callback function (its purpose is not described on this page)
        - sort: sort text by reading order

        Returns:
        Dictionary with blocks, lines, spans, and character details
        """
    
    def extractBLOCKS(self) -> list:
        """
        Extract the text blocks of the page.

        Returns:
        List of text blocks with coordinates and content
        """
    
    def extractWORDS(self, delimiters: str = None) -> list:
        """
        Extract the individual words of the page together with their positions.

        Parameters:
        - delimiters: word delimiter characters

        Returns:
        List of words with bounding boxes; the usage example unpacks each item as
        (x0, y0, x1, y1, text, block_no, line_no, word_no)
        """
    
    def search(self, needle: str, hit_max: int = 16, quads: bool = False) -> list:
        """
        Search for text within this TextPage.

        Parameters:
        - needle: text to search for
        - hit_max: maximum number of hits
        - quads: return results as Quad objects instead of Rect

        Returns:
        List of Rect or Quad objects indicating match locations
        """

Text Search

Search for text with various options and return location information.

class Page:
    def search_for(self, needle: str, hit_max: int = 16, quads: bool = False, 
                   flags: int = 0, clip: Rect = None) -> list:
        """
        Search for text on this page and report where it occurs.

        Parameters:
        - needle: text to search for
        - hit_max: maximum number of hits to return
        - quads: return Quad objects instead of Rect objects
        - flags: search flags for case sensitivity, etc.
        - clip: limit search to this rectangle

        Returns:
        List of Rect or Quad objects indicating match locations; the usage
        example reads the bounding rectangle of a Quad hit from its `rect` attribute
        """

Image Extraction

Extract embedded images from document pages, or render a whole page to an image.

class Page:
    def get_images(self, full: bool = False) -> list:
        """
        Get the list of images referenced by this page.

        Parameters:
        - full: include detailed image information

        Returns:
        List of image entries; the usage example reads the image xref from
        index 0 of each entry.
        NOTE(review): entries are indexed positionally in the usage example, so
        they are presumably tuples rather than dictionaries (TODO confirm).
        """
    
    def get_image_bbox(self, name: str, transform: bool = True) -> Rect:
        """
        Get the bounding box of a named image on this page.

        Parameters:
        - name: image name/reference
        - transform: apply transformation matrix

        Returns:
        Image bounding rectangle
        """
    
    def get_pixmap(self, matrix: Matrix = None, colorspace: Colorspace = None, 
                   clip: Rect = None, alpha: bool = False, annots: bool = True) -> Pixmap:
        """
        Render this page to a Pixmap.

        Parameters:
        - matrix: transformation matrix
        - colorspace: target color space
        - clip: clipping rectangle
        - alpha: include alpha channel
        - annots: include annotations

        Returns:
        Pixmap object with page image
        """

Links and Annotations

Extract interactive elements from pages.

class Page:
    def get_links(self) -> list:
        """
        Get the list of links on this page.

        Returns:
        List of link dictionaries. The usage example reads the keys "kind" and
        "from" (source rectangle) from every link, "uri" from LINK_URI links,
        and "page" plus an optional "to" (target point) from LINK_GOTO links.
        """
    
    def first_link(self) -> Link:
        """
        Get the first link on this page.

        Returns:
        Link object or None

        NOTE(review): upstream PyMuPDF appears to expose this as a property
        rather than a method (TODO confirm against the Page reference).
        """
    
    def load_links(self) -> None:
        """
        Load links from page for iteration.

        NOTE(review): the None return annotation should be checked against the
        upstream Page reference, which may document a returned Link (TODO confirm).
        """
    
    def first_annot(self) -> Annot:
        """
        Get the first annotation on this page.

        Returns:
        Annot object or None

        NOTE(review): upstream PyMuPDF appears to expose this as a property
        rather than a method (TODO confirm against the Page reference).
        """
    
    def load_annot(self, ident: typing.Union[str, int]) -> Annot:
        """
        Load a single annotation by its identifier.

        Parameters:
        - ident: annotation identifier (xref number or unique name)

        Returns:
        Annot object
        """
    
    def annot_names(self) -> list:
        """
        Get the names of the annotations on this page.

        Returns:
        List of annotation names
        """
    
    def annots(self, types: list = None) -> list:
        """
        Get the annotations on this page, optionally filtered by type.

        Parameters:
        - types: filter by annotation types

        Returns:
        List of Annot objects

        NOTE(review): upstream PyMuPDF may return a generator here instead of
        a list (TODO confirm against the Page reference).
        """

Drawing and Vector Content

Extract vector graphics and drawing information.

class Page:
    def get_drawings(self, extended: bool = False) -> list:
        """
        Get the vector drawings found on this page.

        Parameters:
        - extended: include extended path information

        Returns:
        List of drawing dictionaries with paths, colors, etc.
        """
    
    def get_cdrawings(self, extended: bool = False) -> list:
        """
        Get the vector drawings of this page in compact format.

        Parameters:
        - extended: include extended information

        Returns:  
        List of compact drawing representations
        """

Usage Examples

Basic Text Extraction

import pymupdf

# Open the document in a `with` block so it is closed even if an
# extraction call below raises.
with pymupdf.open("document.pdf") as doc:
    page = doc.load_page(0)

    # Extract plain text using standalone function
    text = pymupdf.get_text(page)
    print(text)

    # Extract with formatting as HTML
    html = pymupdf.get_text(page, "html")
    print(html)

    # Extract detailed layout information
    layout_dict = pymupdf.get_text(page, "dict")
    for block in layout_dict["blocks"]:
        if "lines" not in block:  # Only text blocks carry "lines"
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                print(f"Text: {span['text']}, Font: {span['font']}, Size: {span['size']}")

    # Extract text blocks
    blocks = pymupdf.get_text_blocks(page)
    for block in blocks:
        print(f"Block text: {block[4]}")  # block[4] contains the text

    # Extract individual words with coordinates
    words = pymupdf.get_text_words(page)
    for word in words:
        x0, y0, x1, y1, text, block_no, line_no, word_no = word
        print(f"Word '{text}' at ({x0}, {y0}, {x1}, {y1})")

Advanced Text Search

import pymupdf

# Search across all pages
search_term = "important keyword"
results = []

# The `with` block closes the document even if a page fails to load or search.
with pymupdf.open("document.pdf") as doc:
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        # quads=True yields Quad hits; each hit's bounding box is its `rect`
        results.extend(
            {
                "page": page_num,
                "text": search_term,
                "quad": match,
                "bbox": match.rect,
            }
            for match in page.search_for(search_term, quads=True)
        )

print(f"Found {len(results)} matches")

Image Extraction with Details

import pymupdf

# The `with` block closes the document even if an image fails to extract or save.
with pymupdf.open("document.pdf") as doc:
    page = doc.load_page(0)

    # Get image information
    images = page.get_images(full=True)

    for img_index, img in enumerate(images):
        xref = img[0]  # Image xref number
        pix = pymupdf.Pixmap(doc, xref)  # Extract image

        if pix.n - pix.alpha >= 4:  # CMYK: convert to RGB first
            pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

        # GRAY, RGB and converted CMYK images are all written the same way
        pix.save(f"image_{page.number}_{img_index}.png")

        # Drop the reference so the pixmap can be released before the next image
        pix = None

Working with TextPage Objects

import pymupdf

# The `with` block closes the document even if text analysis raises.
with pymupdf.open("document.pdf") as doc:
    page = doc.load_page(0)

    # Create TextPage for detailed analysis
    textpage = page.get_textpage()

    # Extract words with coordinates
    words = textpage.extractWORDS()
    for word in words:
        x0, y0, x1, y1, text, block_no, line_no, word_no = word
        print(f"Word: '{text}' at ({x0}, {y0}, {x1}, {y1})")

    # Search within TextPage
    matches = textpage.search("search term")
    print(f"Found {len(matches)} matches")

Link Analysis

import pymupdf

# The `with` block closes the document even if reading a link raises.
with pymupdf.open("document.pdf") as doc:
    page = doc.load_page(0)

    # Get all links
    links = page.get_links()

    for link in links:
        kind = link['kind']
        print(f"Link type: {kind}")
        print(f"From: {link['from']}")  # Source rectangle

        if kind == pymupdf.LINK_URI:
            print(f"URI: {link['uri']}")
        elif kind == pymupdf.LINK_GOTO:
            print(f"Target page: {link['page']}")
            if 'to' in link:  # The target point is optional
                print(f"Target point: {link['to']}")

Install with Tessl CLI