CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-py-pdf2

A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/pdf-reading.md

PDF Reading

Read and parse PDF files, access document structure, extract text and metadata, and handle encrypted (password-protected) documents. The PdfReader class provides the primary interface for reading PDF files.

Capabilities

PdfReader Class

Main class for reading PDF files with comprehensive access to document structure, pages, metadata, and content.

class PdfReader:
    """Reader for a single PDF document: pages, metadata, outline and form data."""

    def __init__(self, stream: Union[str, IO, Path], strict: bool = False, password: Union[None, str, bytes] = None):
        """
        Initialize a PdfReader instance.

        Args:
            stream: PDF file path (``str`` or ``Path``) or file-like object
            strict: Whether to raise exceptions for correctable problems (default: False)
            password: Password for encrypted PDFs (``str`` or ``bytes``); optional

        Raises:
            PdfReadError: If PDF cannot be read
            WrongPasswordError: If the supplied password is incorrect
        """

    @property
    def pages(self) -> List[PageObject]:
        """List of all pages in the PDF document."""

    @property
    def metadata(self) -> DocumentInformation:
        """Document metadata including title, author, subject, etc."""

    @property
    def pdf_header(self) -> str:
        """PDF version string from document header."""

    @property
    def xmp_metadata(self) -> Optional[XmpInformation]:
        """XMP metadata, or None if the document has none."""

    @property
    def is_encrypted(self) -> bool:
        """True if the PDF is encrypted."""

    @property
    def outline(self) -> OutlineType:
        """Document outline/bookmarks structure."""

    @property
    def named_destinations(self) -> Dict[str, Any]:
        """Named destinations in the document."""

    @property
    def page_layout(self) -> Optional[str]:
        """Page layout preference, or None if not set."""

    @property
    def page_mode(self) -> Optional[PagemodeType]:
        """Page mode preference, or None if not set."""

    @property
    def threads(self) -> Optional[ArrayObject]:
        """Article threads, or None if the document has none."""

    @property
    def xfa(self) -> Optional[Dict[str, Any]]:
        """XFA (XML Forms Architecture) data, or None if not present."""

    def get_page(self, page_number: int) -> PageObject:
        """
        Get a specific page by number.

        Args:
            page_number (int): Zero-based page index

        Returns:
            PageObject: The requested page

        Raises:
            IndexError: If page number is out of range
        """

    def get_fields(self, tree: Optional[TreeObject] = None, retval: Optional[Dict[Any, Any]] = None, fileobj: Optional[Any] = None) -> Optional[Dict[str, Any]]:
        """
        Get form fields from the PDF.

        Args:
            tree: Used by the library when it recurses through the field
                tree; callers normally leave this as None
            retval: Accumulator used during that recursion; callers
                normally leave this as None
            fileobj: Optional file object (usually a text file) to which a
                report on the fields found is written

        Returns:
            dict: Form field data, or None if no fields present
        """

    def get_form_text_fields(self) -> Dict[str, Any]:
        """
        Get text form fields and their values.

        Returns:
            dict: Text field names and values
        """

    def get_page_number(self, page: PageObject) -> int:
        """
        Get the page number for a given PageObject.

        Args:
            page (PageObject): Page object to find

        Returns:
            int: Zero-based page number

        Raises:
            ValueError: If page is not in this document
        """

    def get_destination_page_number(self, destination: Destination) -> int:
        """
        Get page number for a destination.

        Args:
            destination (Destination): Destination object

        Returns:
            int: Zero-based page number
        """

    def decrypt(self, password: Union[str, bytes]) -> PasswordType:
        """
        Decrypt an encrypted PDF.

        The outcome is reported through the return value: a password that
        matches neither the user nor the owner password yields
        NOT_DECRYPTED rather than an exception, so callers must check it.

        Args:
            password (str | bytes): Password to try

        Returns:
            PasswordType: Type of password used (USER_PASSWORD, OWNER_PASSWORD, or NOT_DECRYPTED)

        Raises:
            PdfReadError: If the document is not encrypted
        """

    def decode_permissions(self, permissions_code: int) -> Dict[str, bool]:
        """
        Decode permission flags from encryption dictionary.

        Args:
            permissions_code (int): Raw permissions integer

        Returns:
            dict: Human-readable permission flags, keyed by permission
                name (for example 'print' and 'modify')
        """

Document Information

Container for PDF document metadata with standardized fields.

class DocumentInformation(DictionaryObject):
    """PDF document metadata container.

    Each field has two accessors: the plain property returns a processed
    value, and the ``*_raw`` property returns the entry as stored in the
    document.
    """

    @property
    def title(self) -> Optional[str]:
        """Document title."""

    @property
    def title_raw(self) -> Optional[str]:
        """Raw document title (unprocessed)."""

    @property
    def author(self) -> Optional[str]:
        """Document author."""

    @property
    def author_raw(self) -> Optional[str]:
        """Raw document author (unprocessed)."""

    @property
    def subject(self) -> Optional[str]:
        """Document subject."""

    @property
    def subject_raw(self) -> Optional[str]:
        """Raw document subject (unprocessed)."""

    @property
    def creator(self) -> Optional[str]:
        """Application that created the document."""

    @property
    def creator_raw(self) -> Optional[str]:
        """Raw document creator (unprocessed)."""

    @property
    def producer(self) -> Optional[str]:
        """Application that produced the PDF."""

    @property
    def producer_raw(self) -> Optional[str]:
        """Raw document producer (unprocessed)."""

    @property
    def creation_date(self) -> Optional[datetime]:
        """Document creation date, parsed into a ``datetime``."""

    @property
    def creation_date_raw(self) -> Optional[str]:
        """Raw document creation date (unprocessed PDF date string)."""

    @property
    def modification_date(self) -> Optional[datetime]:
        """Document modification date, parsed into a ``datetime``."""

    @property
    def modification_date_raw(self) -> Optional[str]:
        """Raw document modification date (unprocessed PDF date string)."""

XMP Metadata

Extended metadata in XMP format for documents that include it.

class XmpInformation:
    """XMP metadata information handler.

    This is the type returned by ``PdfReader.xmp_metadata`` when the
    document carries XMP metadata.
    """
    
    # Methods for parsing and accessing XMP metadata are not listed on this page.
    # Implementation varies based on XMP content structure

Usage Examples

Basic PDF Reading

from PyPDF2 import PdfReader

# Load the document once; everything below reads from this reader.
reader = PdfReader("document.pdf")

# Report the basic facts about the file.
page_count = len(reader.pages)
print(f"Number of pages: {page_count}")
print(f"PDF version: {reader.pdf_header}")
print(f"Is encrypted: {reader.is_encrypted}")

# Skip the metadata section when the reader reports none.
info = reader.metadata
if info:
    print(f"Title: {info.title}")
    print(f"Author: {info.author}")
    print(f"Subject: {info.subject}")

Working with Encrypted PDFs

from PyPDF2 import PasswordType, PdfReader
from PyPDF2.errors import WrongPasswordError

try:
    # Try to open encrypted PDF
    reader = PdfReader("encrypted.pdf")

    if reader.is_encrypted:
        # decrypt() reports the outcome through its return value, so a
        # wrong password must be detected by checking for NOT_DECRYPTED.
        password_type = reader.decrypt("user_password")

        if password_type == PasswordType.NOT_DECRYPTED:
            print("Incorrect password provided")
        else:
            print(f"Decrypted with: {password_type}")

            # The raw permission bits are the /P entry of the document's
            # encryption dictionary.
            permissions = reader.decode_permissions(reader.trailer["/Encrypt"]["/P"])
            print(f"Can print: {permissions.get('print', False)}")
            print(f"Can modify: {permissions.get('modify', False)}")

except WrongPasswordError:
    # Kept for the case where a password is passed to PdfReader directly.
    print("Incorrect password provided")

Extracting Text from All Pages

from PyPDF2 import PdfReader

reader = PdfReader("document.pdf")

# One section per page, numbered from 1 for human readers. Real "\n"
# newlines put each header on its own line, and the sections are joined
# once instead of growing a string with += inside the loop.
full_text = "".join(
    f"\n--- Page {page_num} ---\n{page.extract_text()}"
    for page_num, page in enumerate(reader.pages, start=1)
)

print(full_text)

Working with Form Fields

from PyPDF2 import PdfReader

reader = PdfReader("form.pdf")

# Get all form fields; get_fields() returns None when the PDF has no form.
fields = reader.get_fields()
if fields:
    for field_name, field_info in fields.items():
        # Field entries are keyed by PDF names; "/V" holds the field's value.
        print(f"Field: {field_name}, Value: {field_info.get('/V', 'N/A')}")

# Get only text fields
text_fields = reader.get_form_text_fields()
for field_name, value in text_fields.items():
    print(f"Text field: {field_name} = {value}")

Deprecated Classes

PdfFileReader (Deprecated)

class PdfFileReader:
    """DEPRECATED: Use PdfReader instead. Will be removed in PyPDF2 3.0.0.

    Do not use this class in new code; its functionality is available
    through PdfReader.
    """

This class is deprecated and should not be used in new code. All functionality has been moved to PdfReader with the same API.

Install with Tessl CLI

npx tessl i tessl/pypi-py-pdf2

docs

errors-and-utilities.md

index.md

page-manipulation.md

pdf-merging.md

pdf-reading.md

pdf-writing.md

types-and-objects.md

tile.json