Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.
—
Core functionality for opening, accessing, and managing PDF documents including metadata extraction, page access, document-level operations, and PDF repair capabilities.
The primary function for opening PDF documents from file paths, streams, or bytes with comprehensive configuration options.
def open(path_or_fp, pages=None, laparams=None, password=None,
         strict_metadata=False, unicode_norm=None, repair=False,
         gs_path=None, repair_setting="default", raise_unicode_errors=True):
    """
    Open PDF document from file path or stream.

    Parameters:
    - path_or_fp: str, pathlib.Path, BufferedReader, or BytesIO - PDF source
    - pages: List[int] or Tuple[int], optional - Specific pages to parse
    - laparams: Dict[str, Any], optional - Layout analysis parameters
    - password: str, optional - PDF password for encrypted documents
    - strict_metadata: bool - Raise errors for malformed metadata
    - unicode_norm: str, optional - Unicode normalization ("NFC", "NFKC", "NFD", "NFKD")
    - repair: bool - Attempt PDF repair using Ghostscript
    - gs_path: str or pathlib.Path, optional - Path to Ghostscript executable
    - repair_setting: str - Repair quality setting ("default", "prepress", "printer", "ebook", "screen")
    - raise_unicode_errors: bool - Raise errors for unicode decoding issues

    Returns:
    PDF object with context manager support
    """


# Usage Examples:
# Open from file path
with pdfplumber.open("document.pdf") as pdf:
    print(f"Document has {len(pdf.pages)} pages")

# Open specific pages only (page numbers are 1-indexed, matching page.page_number)
with pdfplumber.open("large_doc.pdf", pages=[1, 2, 6]) as pdf:
    for page in pdf.pages:
        print(f"Page {page.page_number}: {page.extract_text()[:100]}")

# Open encrypted PDF
with pdfplumber.open("encrypted.pdf", password="secret") as pdf:
    text = pdf.pages[0].extract_text()

# Open with repair for corrupted PDFs
with pdfplumber.open("corrupted.pdf", repair=True) as pdf:
    text = pdf.pages[0].extract_text()


# The main PDF document class providing access to pages, metadata, and document-level operations.
class PDF:
    """PDF document container with page access and metadata."""

    def __init__(self, stream, stream_is_external=False, path=None,
                 pages=None, laparams=None, password=None,
                 strict_metadata=False, unicode_norm=None,
                 raise_unicode_errors=True):
        """Initialize PDF object from stream."""

    @property
    def pages(self) -> List[Page]:
        """List of page objects in document."""

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """All objects aggregated from all pages by type."""

    @property
    def annots(self) -> List[Dict[str, Any]]:
        """All annotations from all pages."""

    @property
    def hyperlinks(self) -> List[Dict[str, Any]]:
        """All hyperlinks from all pages."""

    @property
    def structure_tree(self) -> List[Dict[str, Any]]:
        """Document structure tree for accessibility."""

    # PDF metadata dictionary (instance variable).
    metadata: Dict

    def close(self):
        """Close PDF and cleanup resources."""

    def __enter__(self):
        """Context manager entry."""

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit with cleanup."""


# Usage Examples:
# Access document metadata
pdf = pdfplumber.open("document.pdf")
print(f"Title: {pdf.metadata.get('Title', 'No title')}")
print(f"Author: {pdf.metadata.get('Author', 'Unknown')}")
print(f"Created: {pdf.metadata.get('CreationDate', 'Unknown')}")

# Get all text objects from document
all_chars = pdf.objects.get('chars', [])
print(f"Document contains {len(all_chars)} character objects")

# Access document-level annotations
for annot in pdf.annots:
    print(f"Annotation: {annot.get('contents', 'No content')}")

# Opened without a `with` block above, so release the file explicitly.
pdf.close()


# Repair corrupted or malformed PDF documents using Ghostscript with various quality settings.
def repair(path_or_fp, outfile=None, password=None, gs_path=None,
           setting="default"):
    """
    Repair PDF using Ghostscript.

    Parameters:
    - path_or_fp: str, pathlib.Path, BufferedReader, or BytesIO - PDF source
    - outfile: str or pathlib.Path, optional - Output file path
    - password: str, optional - PDF password
    - gs_path: str or pathlib.Path, optional - Path to Ghostscript executable
    - setting: str - Quality setting ("default", "prepress", "printer", "ebook", "screen")

    Returns:
    BytesIO containing repaired PDF data
    """
# Repair setting type: the quality presets accepted by `repair(setting=...)`
# and `open(repair_setting=...)`.
T_repair_setting = Literal["default", "prepress", "printer", "ebook", "screen"]


# Usage Examples:
# Repair PDF to memory
repaired_data = pdfplumber.repair("corrupted.pdf")
with pdfplumber.open(repaired_data) as pdf:
    text = pdf.pages[0].extract_text()

# Repair PDF to file
pdfplumber.repair("corrupted.pdf", outfile="repaired.pdf")

# Repair with specific quality setting
pdfplumber.repair("corrupted.pdf", outfile="high_quality.pdf", setting="prepress")

# Repair encrypted PDF
pdfplumber.repair("encrypted_corrupted.pdf", password="secret", outfile="repaired.pdf")


# Base class providing object property access and serialization methods inherited by PDF and Page classes.
class Container:
    """Base container with object access and serialization."""

    @property
    def rects(self) -> T_obj_list:
        """Rectangle objects."""

    @property
    def lines(self) -> T_obj_list:
        """Line objects."""

    @property
    def curves(self) -> T_obj_list:
        """Curve objects."""

    @property
    def images(self) -> T_obj_list:
        """Image objects."""

    @property
    def chars(self) -> T_obj_list:
        """Character objects."""

    @property
    def textboxverticals(self) -> T_obj_list:
        """Vertical text box objects."""

    @property
    def textboxhorizontals(self) -> T_obj_list:
        """Horizontal text box objects."""

    @property
    def textlineverticals(self) -> T_obj_list:
        """Vertical text line objects."""

    @property
    def textlinehorizontals(self) -> T_obj_list:
        """Horizontal text line objects."""

    @property
    def rect_edges(self) -> T_obj_list:
        """Edges derived from rectangles."""

    @property
    def curve_edges(self) -> T_obj_list:
        """Edges derived from curves."""

    @property
    def edges(self) -> T_obj_list:
        """All edges (lines + rect_edges + curve_edges)."""

    @property
    def horizontal_edges(self) -> T_obj_list:
        """Horizontal edges only."""

    @property
    def vertical_edges(self) -> T_obj_list:
        """Vertical edges only."""

    def flush_cache(self, properties=None):
        """Clear cached properties."""

    def to_json(self, stream=None, object_types=None, include_attrs=None,
                exclude_attrs=None, precision=None, indent=None):
        """Export as JSON."""

    def to_csv(self, stream=None, object_types=None, precision=None,
               include_attrs=None, exclude_attrs=None):
        """Export as CSV."""

    def to_dict(self, object_types=None):
        """Convert to dictionary representation."""


# Custom exceptions for PDF operations
class MalformedPDFException(Exception):
    """Raised for malformed PDF files."""
class PdfminerException(Exception):
    """Wrapper for pdfminer exceptions."""


# Common error scenarios:
- MalformedPDFException: raised for malformed PDF files.
- raise_unicode_errors=True: errors are raised for unicode decoding issues.

Install with Tessl CLI
npx tessl i tessl/pypi-pdfplumber