tessl/pypi-pdfplumber

Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.

—

Pending

Overview

Eval results

Files

PDF Document Operations

Name: tessl/pypi-pdfplumber
Author: tessl

Core functionality for opening, accessing, and managing PDF documents including metadata extraction, page access, document-level operations, and PDF repair capabilities.

Capabilities

Opening PDF Documents

`pdfplumber.open` is the primary entry point for opening a PDF document. It accepts a file path or an already-open binary file-like object (a `BufferedReader` or `BytesIO`) and takes a range of configuration options.

def open(
    path_or_fp,
    pages=None,
    laparams=None,
    password=None,
    strict_metadata=False,
    unicode_norm=None,
    repair=False,
    gs_path=None,
    repair_setting="default",
    raise_unicode_errors=True,
):
    """
    Open a PDF document and return it as a PDF object.

    Parameters:
    - path_or_fp: the PDF source, given either as a location (str or
      pathlib.Path) or as an open binary object (BufferedReader or BytesIO)
    - pages: optional List[int] or Tuple[int] restricting parsing to the
      listed pages; pdfplumber numbers pages from 1, matching
      Page.page_number
    - laparams: optional Dict[str, Any] of layout analysis parameters
    - password: optional str used to open an encrypted document
    - strict_metadata: bool; when true, malformed metadata raises an error
    - unicode_norm: optional str naming a Unicode normalization form
      ("NFC", "NFKC", "NFD" or "NFKD")
    - repair: bool; when true, a repair via Ghostscript is attempted
    - gs_path: optional str or pathlib.Path pointing at the Ghostscript
      executable
    - repair_setting: str selecting the repair quality ("default",
      "prepress", "printer", "ebook" or "screen")
    - raise_unicode_errors: bool; when true, unicode decoding problems
      raise an error

    Returns:
    A PDF object, which can be used as a context manager
    """

Usage Examples:

# Open from file path
with pdfplumber.open("document.pdf") as pdf:
    print(f"Document has {len(pdf.pages)} pages")

# Open specific pages only. `pages` takes 1-based page numbers (the same
# numbering that `page.page_number` reports), so this parses the first,
# second and sixth pages. A 0 in the list would not match any page.
with pdfplumber.open("large_doc.pdf", pages=[1, 2, 6]) as pdf:
    for page in pdf.pages:
        print(f"Page {page.page_number}: {page.extract_text()[:100]}")

# Open encrypted PDF
with pdfplumber.open("encrypted.pdf", password="secret") as pdf:
    text = pdf.pages[0].extract_text()

# Open with repair for corrupted PDFs (the repair step uses Ghostscript;
# pass gs_path=... if the executable is not found automatically)
with pdfplumber.open("corrupted.pdf", repair=True) as pdf:
    text = pdf.pages[0].extract_text()

PDF Class

The main PDF document class providing access to pages, metadata, and document-level operations.

class PDF:
    """A PDF document: gives access to its pages and its metadata."""

    def __init__(
        self,
        stream,
        stream_is_external=False,
        path=None,
        pages=None,
        laparams=None,
        password=None,
        strict_metadata=False,
        unicode_norm=None,
        raise_unicode_errors=True,
    ):
        """Set up the PDF object on top of the given stream."""

    @property
    def pages(self) -> List[Page]:
        """The document's page objects, as a list."""

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """Objects from every page, collected together and grouped by type."""

    @property
    def annots(self) -> List[Dict[str, Any]]:
        """The annotations found on all pages."""

    @property
    def hyperlinks(self) -> List[Dict[str, Any]]:
        """The hyperlinks found on all pages."""

    @property
    def structure_tree(self) -> List[Dict[str, Any]]:
        """The document's structure tree (used for accessibility)."""

    metadata: Dict
    """Dictionary of PDF metadata; an instance variable."""

    def close(self):
        """Close the PDF and release its resources."""

    def __enter__(self):
        """Enter the context manager."""

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the context manager, performing cleanup."""

Usage Examples:

# Open with a context manager so the PDF is closed even if one of the
# statements below raises; no explicit pdf.close() is needed afterwards.
with pdfplumber.open("document.pdf") as pdf:
    # Access document metadata
    print(f"Title: {pdf.metadata.get('Title', 'No title')}")
    print(f"Author: {pdf.metadata.get('Author', 'Unknown')}")
    print(f"Created: {pdf.metadata.get('CreationDate', 'Unknown')}")

    # Get all character objects from document. `objects` is keyed by the
    # singular object type ("char", "line", "rect", ...); `pdf.chars` is the
    # equivalent shortcut property.
    all_chars = pdf.objects.get('char', [])
    print(f"Document contains {len(all_chars)} character objects")

    # Access document-level annotations. `or` is used instead of a .get()
    # default so the fallback also applies when the key is present but None.
    for annot in pdf.annots:
        print(f"Annotation: {annot.get('contents') or 'No content'}")

PDF Repair

Repair corrupted or malformed PDF documents using Ghostscript with various quality settings.

def repair(
    path_or_fp,
    outfile=None,
    password=None,
    gs_path=None,
    setting="default",
):
    """
    Run a PDF through Ghostscript to repair it.

    Parameters:
    - path_or_fp: the PDF source, given either as a location (str or
      pathlib.Path) or as an open binary object (BufferedReader or BytesIO)
    - outfile: optional str or pathlib.Path naming the output file
    - password: optional str password for the PDF
    - gs_path: optional str or pathlib.Path pointing at the Ghostscript
      executable
    - setting: str selecting the quality ("default", "prepress", "printer",
      "ebook" or "screen")

    Returns:
    A BytesIO holding the repaired PDF data.
    NOTE(review): upstream pdfplumber appears to write to `outfile` and
    return None when `outfile` is given -- confirm before relying on the
    return value in that case.
    """

# Type alias: the quality settings accepted when repairing a PDF
T_repair_setting = Literal[
    "default",
    "prepress",
    "printer",
    "ebook",
    "screen",
]

Usage Examples:

# In-memory repair: the repaired data can be handed straight to open()
repaired_stream = pdfplumber.repair("corrupted.pdf")
with pdfplumber.open(repaired_stream) as pdf:
    first_page = pdf.pages[0]
    text = first_page.extract_text()

# Repair and write the result to a file
pdfplumber.repair("corrupted.pdf", outfile="repaired.pdf")

# Choose a specific quality setting for the repair
pdfplumber.repair(
    "corrupted.pdf",
    outfile="high_quality.pdf",
    setting="prepress",
)

# Repair a PDF that is also encrypted
pdfplumber.repair(
    "encrypted_corrupted.pdf",
    password="secret",
    outfile="repaired.pdf",
)

Container Base Class

Base class providing object property access and serialization methods inherited by PDF and Page classes.

class Container:
    """Base class offering per-type object access and serialization."""

    @property
    def rects(self) -> T_obj_list:
        """The rectangle objects."""

    @property
    def lines(self) -> T_obj_list:
        """The line objects."""

    @property
    def curves(self) -> T_obj_list:
        """The curve objects."""

    @property
    def images(self) -> T_obj_list:
        """The image objects."""

    @property
    def chars(self) -> T_obj_list:
        """The character objects."""

    @property
    def textboxverticals(self) -> T_obj_list:
        """The vertical text box objects."""

    @property
    def textboxhorizontals(self) -> T_obj_list:
        """The horizontal text box objects."""

    @property
    def textlineverticals(self) -> T_obj_list:
        """The vertical text line objects."""

    @property
    def textlinehorizontals(self) -> T_obj_list:
        """The horizontal text line objects."""

    @property
    def rect_edges(self) -> T_obj_list:
        """The edges derived from rectangles."""

    @property
    def curve_edges(self) -> T_obj_list:
        """The edges derived from curves."""

    @property
    def edges(self) -> T_obj_list:
        """Every edge: lines plus rect_edges plus curve_edges."""

    @property
    def horizontal_edges(self) -> T_obj_list:
        """Only the horizontal edges."""

    @property
    def vertical_edges(self) -> T_obj_list:
        """Only the vertical edges."""

    def flush_cache(self, properties=None):
        """Discard cached properties."""

    def to_json(
        self,
        stream=None,
        object_types=None,
        include_attrs=None,
        exclude_attrs=None,
        precision=None,
        indent=None,
    ):
        """Serialize to JSON."""

    def to_csv(
        self,
        stream=None,
        object_types=None,
        precision=None,
        include_attrs=None,
        exclude_attrs=None,
    ):
        """Serialize to CSV."""

    def to_dict(self, object_types=None):
        """Return a dictionary representation."""

Error Handling

# Custom exceptions for PDF operations
class MalformedPDFException(Exception):
    """Signals that a PDF file is malformed."""

class PdfminerException(Exception):
    """Wraps an exception coming from pdfminer."""

Common error scenarios: