tessl/pypi-py-pdf2

A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files.

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Types and Objects

Name: tessl/pypi-py-pdf2
Author: tessl

Low-level PDF object types for advanced manipulation, constants, type definitions, and utility functions used throughout the PyPDF2 library. These components provide the foundation for PDF specification-level operations.

Capabilities

Generic PDF Objects

Base classes and data structures that represent PDF objects according to the PDF specification.

class PdfObject:
    """Common ancestor of every PDF object type."""

class NullObject(PdfObject):
    """Represents the PDF null value."""

class BooleanObject(PdfObject):
    """Represents a PDF boolean, i.e. true or false."""

class IndirectObject(PdfObject):
    """Reference to an indirect object of a PDF."""

    @property
    def idnum(self) -> int:
        """ID number of the referenced object."""

    @property
    def generation(self) -> int:
        """Generation number of the referenced object."""

    @property
    def pdf(self):
        """The PDF reader this reference is associated with."""

class FloatObject(float, PdfObject):
    """Represents a PDF floating-point number."""

class NumberObject(int, PdfObject):
    """Represents a PDF integer number."""

class ByteStringObject(bytes, PdfObject):
    """Represents a PDF string held as bytes."""

class TextStringObject(str, PdfObject):
    """Represents a PDF string held as text."""

class NameObject(str, PdfObject):
    """Represents a PDF name; names begin with a slash (/)."""

Data Structure Objects

Collections and containers for PDF data structures.

class ArrayObject(list, PdfObject):
    """PDF array object; behaves like a Python list."""

class DictionaryObject(dict, PdfObject):
    """PDF dictionary object; behaves like a Python dict."""

class TreeObject(DictionaryObject):
    """Dictionary-based PDF object used for hierarchical (tree) data."""

# A stream is a dictionary (the stream parameters) plus binary data, so it
# derives from DictionaryObject rather than directly from PdfObject.
class StreamObject(DictionaryObject):
    """PDF stream object: a dictionary together with binary data."""

class DecodedStreamObject(StreamObject):
    """PDF stream whose data is decoded (uncompressed)."""

class EncodedStreamObject(StreamObject):
    """PDF stream whose data is encoded (compressed)."""

class ContentStream(DecodedStreamObject):
    """Decoded stream holding the content operations of a page."""

class Field(TreeObject):
    """PDF form field object."""

Navigation and Annotation Objects

Objects for document navigation, bookmarks, and annotations.

# Derives from TreeObject (itself a DictionaryObject), so existing
# isinstance(x, DictionaryObject) checks keep working.
class Destination(TreeObject):
    """PDF destination: a navigation target inside a document."""
    
    @property
    def title(self) -> Optional[str]:
        """Title of the destination, if it has one."""
    
    @property
    def page(self):
        """Reference to the page the destination points at."""
    
    @property
    def typ(self) -> str:
        """Destination type, i.e. the fit type (for example "/Fit")."""

# An outline item is a specialised Destination; ``title`` and ``page`` come
# from the base class instead of being declared a second time here.
class OutlineItem(Destination):
    """PDF outline item (bookmark)."""
    
    @property
    def parent(self):
        """Parent outline item."""
    
    @property
    def children(self):
        """Child outline items."""

class Bookmark(OutlineItem):
    """DEPRECATED alias of OutlineItem; use OutlineItem in new code."""

class AnnotationBuilder:
    """Builder used to create PDF annotations."""

    # Offers methods that build the various annotation types;
    # what each one does depends on the annotation type.

Utility Objects and Functions

Helper classes and functions for PDF manipulation.

class PageRange:
    """Page range that can be used like a slice."""

    def __init__(self, arg: Union[slice, "PageRange", str]):
        """
        Build a PageRange.

        Args:
            arg: The range to represent, given as a string (for example
                "1:5"), a slice, or another PageRange
        """

    def to_slice(self) -> slice:
        """Return the range as a Python slice object."""

    def indices(self, n: int) -> Tuple[int, int, int]:
        """
        Resolve the range against a sequence of a given length.

        Args:
            n (int): Total length

        Returns:
            tuple: The (start, stop, step) indices
        """

    @staticmethod
    def valid(input: Any) -> bool:
        """
        Tell whether a value could be used to build a PageRange.

        Args:
            input: The value to check

        Returns:
            bool: True if the value is valid
        """

class PaperSize:
    """Standard paper size constants, each a ``Dimensions`` (width x height)."""
    
    # Sizes are portrait, in points (1/72 inch): mm / 25.4 * 72, rounded.
    # NOTE(review): point values below are derived from the ISO 216 mm sizes;
    # confirm against the library's own table.
    A0: 'Dimensions'  # 2384 x 3370 points (841 x 1189 mm)
    A1: 'Dimensions'  # 1684 x 2384 points (594 x 841 mm)
    A2: 'Dimensions'  # 1191 x 1684 points (420 x 594 mm)
    A3: 'Dimensions'  # 842 x 1191 points (297 x 420 mm)
    A4: 'Dimensions'  # 595 x 842 points (210 x 297 mm)
    A5: 'Dimensions'  # 420 x 595 points (148 x 210 mm)
    A6: 'Dimensions'  # 298 x 420 points (105 x 148 mm)
    A7: 'Dimensions'  # 210 x 298 points (74 x 105 mm)
    A8: 'Dimensions'  # 147 x 210 points (52 x 74 mm)
    C4: 'Dimensions'  # 649 x 918 points (envelope, 229 x 324 mm)

from enum import IntEnum


class PasswordType(IntEnum):
    """
    Enumeration for password validation results.

    This is the kind of value compared against the result of
    ``PdfReader.decrypt``. Being an ``IntEnum``, each member still compares
    equal to its plain integer, so existing ``result == 1`` style checks keep
    working, and ``NOT_DECRYPTED`` is falsy.
    """

    NOT_DECRYPTED = 0   # password rejected
    USER_PASSWORD = 1   # accepted as the user password
    OWNER_PASSWORD = 2  # accepted as the owner password

# Utility functions
def create_string_object(string: Union[str, bytes], forced_encoding=None) -> Union[TextStringObject, ByteStringObject]:
    """
    Create appropriate string object based on content.
    
    Args:
        string (str | bytes): String content. Pass ``bytes`` for raw data;
            a ``str`` is already text, so a byte string result is only
            possible for ``bytes`` input.
        forced_encoding (str, optional): Force specific encoding when
            decoding ``bytes`` input
        
    Returns:
        Union[TextStringObject, ByteStringObject]: Appropriate string object
    """

def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Turn text into bytes using PDF document encoding.

    Args:
        unicode_string (str): The text to encode

    Returns:
        bytes: The encoded form of the text
    """

def decode_pdfdocencoding(byte_string: bytes) -> str:
    """
    Turn bytes in PDF document encoding back into text.

    Args:
        byte_string (bytes): The bytes to decode

    Returns:
        str: The decoded text
    """

def hex_to_rgb(color: str) -> Tuple[float, float, float]:
    """
    Translate a hex color string into its RGB components.

    Args:
        color (str): Color written as hex, for instance "#FF0000"

    Returns:
        tuple: (red, green, blue), each in the range 0.0-1.0
    """

def read_object(stream, pdf) -> PdfObject:
    """
    Parse one PDF object out of a stream.

    Args:
        stream: The stream to read from
        pdf: Reference to the PDF reader

    Returns:
        PdfObject: The PDF object that was parsed
    """

def parse_filename_page_ranges(args: List[Union[str, PageRange, None]]) -> List[Tuple[str, PageRange]]:
    """
    Turn a list of filename and page range arguments into pairs.

    Args:
        args: Arguments in command-line style

    Returns:
        list: (filename, page_range) tuples
    """

Type Definitions

Type aliases and definitions used throughout the library.

# Contents of an annotation's border array
BorderArrayType = List[Union[NameObject, NumberObject, ArrayObject]]

# One outline entry: an outline item or a destination
OutlineItemType = Union[OutlineItem, Destination]

# Fit types a destination may use
FitType = Literal[
    "/Fit", "/XYZ", "/FitH", "/FitV",
    "/FitR", "/FitB", "/FitBH", "/FitBV",
]

# A single zoom argument, and a list of them
ZoomArgType = Union[NumberObject, NullObject, float]
ZoomArgsType = List[ZoomArgType]

# Whole outline: entries, with nested lists for nested levels
OutlineType = List[Union[OutlineItemType, List]]

# Allowed page layout values
LayoutType = Literal[
    "/SinglePage",
    "/OneColumn",
    "/TwoColumnLeft",
    "/TwoColumnRight",
    "/TwoPageLeft",
    "/TwoPageRight",
]

# Allowed page mode values
PagemodeType = Literal[
    "/UseNone",
    "/UseOutlines",
    "/UseThumbs",
    "/FullScreen",
    "/UseOC",
    "/UseAttachments",
]

# Everything accepted as a page range specification
PageRangeSpec = Union[str, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]]

# Dimension type for paper sizes
class Dimensions:
    """Width and height of a sheet of paper."""

    def __init__(self, width: float, height: float):
        """
        Store the two measurements on the instance.

        Args:
            width (float): Paper width, in points
            height (float): Paper height, in points
        """
        self.width, self.height = width, height

Usage Examples

Working with Generic Objects

from PyPDF2 import PdfReader
from PyPDF2.generic import DictionaryObject, ArrayObject, NameObject

reader = PdfReader("document.pdf")

# Access raw PDF objects
for page in reader.pages:
    # Pages are DictionaryObject instances
    if not isinstance(page, DictionaryObject):
        continue

    # Subscript access (obj[key]) resolves indirect references, whereas
    # dict.get() may return an unresolved IndirectObject. So test for the
    # key first and then subscript.
    if "/MediaBox" in page:
        mediabox = page["/MediaBox"]
        if isinstance(mediabox, ArrayObject):
            print(f"MediaBox: {[float(x) for x in mediabox]}")

    # Check for resources
    if "/Resources" in page:
        resources = page["/Resources"]
        if resources:
            fonts = resources["/Font"] if "/Font" in resources else {}
            print(f"Fonts: {list(fonts.keys())}")

Using Page Ranges

from PyPDF2 import PdfMerger, PageRange

merger = PdfMerger()

# Various ways to specify page ranges.
# PageRange strings use Python slice syntax on 0-based page indices.
merger.append("doc1.pdf", pages=PageRange("1:5"))      # Indices 1-4 (2nd through 5th page)
merger.append("doc2.pdf", pages=PageRange("::2"))      # Every other page, starting with the first
merger.append("doc3.pdf", pages=PageRange("10:"))      # Index 10 (the 11th page) to end
# PageRange accepts a string, slice or PageRange - not a list. For a stepped
# selection pass a (start, stop, step) tuple instead.
merger.append("doc4.pdf", pages=(1, 6, 2))             # Indices 1, 3, 5

# Validate page range
if PageRange.valid("1:10"):
    print("Valid page range")

merger.write("output.pdf")
merger.close()

Working with Paper Sizes

# PaperSize is exported from the top-level package, not from PyPDF2.generic
from PyPDF2 import PdfWriter, PaperSize

writer = PdfWriter()

# Create pages with standard sizes
a4_page = writer.add_blank_page(PaperSize.A4.width, PaperSize.A4.height)
letter_page = writer.add_blank_page(612, 792)  # US Letter
a3_page = writer.add_blank_page(PaperSize.A3.width, PaperSize.A3.height)

print(f"A4 size: {PaperSize.A4.width} x {PaperSize.A4.height} points")
print(f"A3 size: {PaperSize.A3.width} x {PaperSize.A3.height} points")

with open("standard_sizes.pdf", "wb") as output_file:
    writer.write(output_file)

Creating Custom PDF Objects

from PyPDF2.generic import (
    DictionaryObject, ArrayObject, NameObject, 
    TextStringObject, NumberObject
)

# Rectangle of the annotation, as an array of four numbers
rect = ArrayObject([
    NumberObject(100), NumberObject(100),
    NumberObject(200), NumberObject(150)
])

# Entries of the custom dictionary, keyed by PDF names
entries = {
    NameObject("/Type"): NameObject("/Annotation"),
    NameObject("/Subtype"): NameObject("/Text"),
    NameObject("/Contents"): TextStringObject("Custom note"),
    NameObject("/Rect"): rect,
}

# Wrap the entries in a PDF dictionary object
custom_dict = DictionaryObject(entries)

print(f"Custom object: {custom_dict}")

String Encoding Utilities

from PyPDF2.generic import (
    create_string_object, encode_pdfdocencoding, 
    decode_pdfdocencoding, hex_to_rgb
)

# Create appropriate string objects
text = create_string_object("Hello, World!")
# Binary content must be passed as bytes (note the b prefix and the single
# backslashes); the forced encoding only matters for bytes input.
binary_text = create_string_object(b"\x00\xff\x42", "latin-1")

# Encoding/decoding
unicode_text = "Héllo, Wørld!"
encoded = encode_pdfdocencoding(unicode_text)
decoded = decode_pdfdocencoding(encoded)

print(f"Original: {unicode_text}")
print(f"Decoded: {decoded}")

# Color conversion
red_rgb = hex_to_rgb("#FF0000")  # (1.0, 0.0, 0.0)
blue_rgb = hex_to_rgb("#0000FF")  # (0.0, 0.0, 1.0)
print(f"Red RGB: {red_rgb}")
print(f"Blue RGB: {blue_rgb}")

Working with Outlines and Destinations

from PyPDF2 import PdfReader
from PyPDF2.generic import OutlineItem, Destination

reader = PdfReader("document.pdf")

# Access document outline
outline = reader.outline
if outline:
    def print_outline(items, level=0):
        """Print outline titles, indenting each nesting level by two spaces."""
        for item in items:
            if isinstance(item, list):
                # A nested list holds the children of the entry before it,
                # so it is printed one level deeper.
                print_outline(item, level + 1)
            elif isinstance(item, (OutlineItem, Destination)):
                # Entries of the outline may be plain Destination objects
                indent = "  " * level
                print(f"{indent}{item.title}")
    
    print_outline(outline)

# Access named destinations
destinations = reader.named_destinations
for name, dest in destinations.items():
    if isinstance(dest, Destination):
        print(f"Destination '{name}' -> Page {dest.page}, Type: {dest.typ}")

Password Type Checking

from PyPDF2 import PdfReader, PasswordType

reader = PdfReader("encrypted.pdf")

if reader.is_encrypted:
    # Attempt decryption and report which kind of password matched
    result = reader.decrypt("user_password")

    # Checked in order; the first outcome equal to the result is reported
    outcomes = (
        (PasswordType.USER_PASSWORD, "Opened with user password - some restrictions may apply"),
        (PasswordType.OWNER_PASSWORD, "Opened with owner password - full access"),
        (PasswordType.NOT_DECRYPTED, "Password incorrect or file corrupted"),
    )
    for outcome, message in outcomes:
        if result == outcome:
            print(message)
            break

Constants and Enumerations

PyPDF2 ships an extensive set of constants taken from the PDF specification, organized in the `PyPDF2.constants` module:

Key Constants

class Core:
    """Core PDF constants."""

    OUTLINES = "/Outlines"
    THREADS = "/Threads"
    PAGE = "/Page"
    PAGES = "/Pages"
    CATALOG = "/Catalog"

class UserAccessPermissions:
    """User access permissions, one bit per permission."""

    PRINT = 0b000100          # bit 2
    MODIFY = 0b001000         # bit 3
    COPY = 0b010000           # bit 4
    ADD_OR_MODIFY = 0b100000  # bit 5

class FilterTypes:
    """PDF filter types."""

    FLATE_DECODE = "/FlateDecode"
    LZW_DECODE = "/LZWDecode"
    ASCII_HEX_DECODE = "/ASCIIHexDecode"
    DCT_DECODE = "/DCTDecode"

These constants ensure compliance with PDF specification requirements and provide standardized access to PDF dictionary keys and values.

Install with Tessl CLI