tessl/pypi-pikepdf

Read and write PDFs with Python, powered by qpdf

—

Pending

Overview

Eval results

Files

PDF Objects and Data Types

Name: tessl/pypi-pikepdf
Author: tessl

These PDF object types and data structures form the foundation of PDF content representation. They provide the building blocks for manipulating PDF data at the object level.

Capabilities

Base Object Class

The fundamental PDF object type that all other PDF objects inherit from, providing common functionality for object manipulation and ownership.

class Object:
    """
    Base type for every PDF data structure exposed by the library.

    Arrays, dictionaries, names and the other PDF object kinds all derive
    from this class, so its methods are available on each of them.
    """

    def is_owned_by(self, possible_owner: Pdf) -> bool:
        """
        Report whether a given PDF owns this object.

        Parameters:
        - possible_owner (Pdf): PDF to test as the owner

        Returns:
        bool: True if this object belongs to possible_owner
        """

    def same_owner_as(self, other: Object) -> bool:
        """
        Report whether this object and another share one owner.

        Parameters:
        - other (Object): Object whose owner is compared with this one's

        Returns:
        bool: True if both objects have the same owner
        """

    def with_same_owner_as(self, other: Object) -> Object:
        """
        Produce a copy of this object that is owned by another object's PDF.

        Parameters:
        - other (Object): Object whose owner the copy should adopt

        Returns:
        Object: Copy of this object with the same owner as other

        Raises:
        ForeignObjectError: If objects cannot be made compatible
        """

    @staticmethod
    def parse(data: str, *, pdf_context: Pdf = None) -> Object:
        """
        Build an Object from the textual form of PDF data.

        NOTE(review): the pikepdf API reference appears to document this
        method as ``parse(stream, description='')`` -- confirm the parameter
        names before relying on ``pdf_context``.

        Parameters:
        - data (str): String containing PDF object data
        - pdf_context (Pdf, optional): PDF context for parsing

        Returns:
        Object: Parsed PDF object

        Raises:
        PdfError: If the data cannot be parsed
        """

    def unparse(self, *, resolved: bool = False) -> bytes:
        """
        Serialize the object back to its PDF representation.

        The result is bytes rather than str because the serialized form of a
        PDF object is binary data and need not be valid text.

        Parameters:
        - resolved (bool): Whether to resolve indirect references

        Returns:
        bytes: Serialized representation of the object
        """

    @property
    def _type_code(self) -> ObjectType:
        """
        Type code identifying what kind of object this is.

        Returns:
        ObjectType: Enumeration value indicating the object type
        """

    @property
    def is_indirect(self) -> bool:
        """
        Flag telling whether the object is indirect.

        Returns:
        bool: True if this is an indirect object reference
        """

    @property
    def objgen(self) -> tuple[int, int]:
        """
        Object number and generation number of an indirect object.

        Returns:
        tuple[int, int]: (object_number, generation_number) or (0, 0) for direct objects
        """

Array Objects

PDF arrays represent ordered collections of PDF objects, similar to Python lists.

class Array(Object):
    """
    Ordered sequence of PDF objects.

    Offers list-like access together with PDF-specific functionality.
    """

    def __init__(self, iterable=None) -> None:
        """
        Build a PDF array.

        Parameters:
        - iterable (optional): Objects used to fill the new array
        """

    def __len__(self) -> int:
        """Count the elements held by the array."""

    def __getitem__(self, index: int) -> Object:
        """Fetch the element stored at index."""

    def __setitem__(self, index: int, value: Object) -> None:
        """Store value at the given index."""

    def append(self, obj: Object) -> None:
        """
        Place an object after the current last element.

        Parameters:
        - obj (Object): Object to add at the end
        """

    def extend(self, iterable) -> None:
        """
        Add every object produced by an iterable to the array.

        Parameters:
        - iterable: Source of the objects to add
        """

    def insert(self, index: int, obj: Object) -> None:
        """
        Put an object into the array at a chosen position.

        Parameters:
        - index (int): Position at which to insert
        - obj (Object): Object to insert
        """

Dictionary Objects

PDF dictionaries represent key-value mappings where keys are Name objects and values are any PDF objects.

class Dictionary(Object):
    """
    Key-value mapping stored as a PDF dictionary.

    Keys must be Name objects; any PDF object may be a value.
    Offers dict-like access together with PDF-specific enhancements.
    """

    def __init__(self, mapping=None, **kwargs) -> None:
        """
        Build a PDF dictionary.

        Parameters:
        - mapping (optional): Key-value pairs to start with
        - **kwargs: Further key-value pairs (keys converted to Names)
        """

    def __getitem__(self, key) -> Object:
        """Look up the value for key (a str or a Name)."""

    def __setitem__(self, key, value: Object) -> None:
        """Store value under key (key converted to Name if needed)."""

    def __contains__(self, key) -> bool:
        """Tell whether key is present in the dictionary."""

    def __len__(self) -> int:
        """Count the key-value pairs."""

    def keys(self):
        """Return the dictionary's keys."""

    def values(self):
        """Return the dictionary's values."""

    def items(self):
        """Return the dictionary's key-value pairs."""

    def get(self, key, default=None) -> Object:
        """
        Look up a value, falling back to a default.

        Parameters:
        - key: Dictionary key (str or Name)
        - default: Value to return when key is not found

        Returns:
        Object: Value associated with key, or default
        """

Name Objects

PDF names are atomic identifiers used as dictionary keys and as various PDF constants.

class Name(Object):
    """
    PDF name object representing an immutable identifier.

    Names are used as dictionary keys and PDF constants.
    Supports both string construction and attribute-style access.
    """

    def __init__(self, name_string: str) -> None:
        """
        Create a PDF name from a string.

        Parameters:
        - name_string (str): Name text written with its leading slash,
          e.g. '/CustomAttribute'
        """

    def __str__(self) -> str:
        """Return the name as a string, including its leading slash (e.g. '/Type')."""

    def __repr__(self) -> str:
        """Return the full representation, which shows the slash-prefixed name."""

    def __eq__(self, other) -> bool:
        """Compare names for equality."""

    def __hash__(self) -> int:
        """Return hash for use as dictionary key."""

# Name constants can be accessed as attributes
# Example: Name.Type, Name.Font, Name.Contents

String Objects

PDF strings can contain text or binary data with proper encoding handling.

class String(Object):
    """
    Text or binary data stored as a PDF string.

    Takes care of PDF string encoding, covering literal and hex strings.
    """

    def __init__(self, str_or_bytes) -> None:
        """
        Build a PDF string out of text or bytes.

        Parameters:
        - str_or_bytes (str | bytes): Content of the string
        """

    def __str__(self) -> str:
        """Give the content back as text."""

    def __bytes__(self) -> bytes:
        """Give the content back as bytes."""

    def __len__(self) -> int:
        """Report how long the string content is."""

    @property
    def for_pdf(self) -> str:
        """
        Form of the string that is suitable for PDF output.

        Returns:
        str: Properly escaped string for PDF files
        """

Stream Objects

PDF streams contain both a dictionary of metadata and binary data content.

class Stream(Object):
    """
    Binary data paired with a dictionary of metadata.

    Page content, images, fonts and other binary data are held in streams.
    """

    def __init__(self, owner: Pdf, data=None, dict=None, **kwargs) -> None:
        """
        Build a PDF stream.

        Parameters:
        - owner (Pdf): PDF that is to own the stream
        - data (bytes, optional): Content of the stream
        - dict (Dictionary, optional): Dictionary for the stream
        - **kwargs: Further dictionary entries
        """

    @property
    def dictionary(self) -> Dictionary:
        """
        Metadata dictionary that belongs to the stream.

        NOTE(review): the pikepdf API reference appears to name this
        property ``stream_dict`` -- confirm before relying on ``dictionary``.

        Returns:
        Dictionary: Stream metadata and parameters
        """

    def read_bytes(self) -> bytes:
        """
        Fetch the stream data in decoded form.

        Returns:
        bytes: Decoded stream data

        Raises:
        DataDecodingError: If stream cannot be decoded
        """

    def read_raw_bytes(self) -> bytes:
        """
        Fetch the stream data in raw (unfiltered) form.

        Returns:
        bytes: Raw stream data without decoding filters
        """

    def write(self, data: bytes, *, filter=None, decode_parms=None) -> None:
        """
        Store data in the stream.

        Parameters:
        - data (bytes): Data to store
        - filter (optional): Compression filter to apply
        - decode_parms (optional): Parameters for the filter
        """

Operator Objects

PDF operators represent the commands in a content stream; each command is written after the operands it acts on.

class Operator(Object):
    """
    Command that appears in a PDF content stream.

    Examples of such commands are 'Tj' (show text) and 'l' (line to).
    """

    def __init__(self, name: str) -> None:
        """
        Build a PDF operator.

        Parameters:
        - name (str): Name of the operator (e.g., 'Tj', 'cm', 'Do')
        """

    def __str__(self) -> str:
        """Give the operator's name."""

    def __repr__(self) -> str:
        """Give the full representation."""

Object Type Enumeration

Enumeration of all possible PDF object types for type checking and identification.

from enum import Enum

class ObjectType(Enum):
    """
    Enumeration of PDF object types.

    Every member has its own integer value. Assigning the same placeholder
    to all of them would make Enum treat each later member as an alias of
    the first, leaving only one real member.
    """
    # NOTE(review): values follow declaration order starting at 0, presumed
    # to mirror the underlying qpdf object type codes -- confirm before
    # relying on the raw integers.
    uninitialized = 0  # Uninitialized object
    reserved = 1  # Reserved type
    null = 2  # Null object
    boolean = 3  # Boolean true/false
    integer = 4  # Integer number
    real = 5  # Real (floating-point) number
    string = 6  # String object
    name_ = 7  # Name object (underscore avoids conflict with 'name')
    array = 8  # Array object
    dictionary = 9  # Dictionary object
    stream = 10  # Stream object
    operator = 11  # Content stream operator
    inlineimage = 12  # Inline image

Usage Examples

Working with Arrays

import pikepdf

# Create an array (no Pdf is needed: nothing below uses one)
arr = pikepdf.Array([1, 2, 3])

# Add elements
arr.append(pikepdf.String("hello"))
arr.extend([pikepdf.Name.Type, pikepdf.Name.Font])

# Access elements
first = arr[0]  # Integer 1
last = arr[-1]  # Name(/Font)

# Use in dictionary
dict_obj = pikepdf.Dictionary({
    '/Contents': arr,
    '/Type': pikepdf.Name.Page
})

Working with Dictionaries

import pikepdf

# Collect the initial entries, then build the dictionary from them
letter_box = pikepdf.Array([0, 0, 612, 792])
initial_entries = {
    '/Type': pikepdf.Name.Page,
    '/MediaBox': letter_box,
    '/Resources': pikepdf.Dictionary(),
}
page_dict = pikepdf.Dictionary(initial_entries)

# Look entries up by key
page_type = page_dict['/Type']  # Name(/Page)
media_box = page_dict['/MediaBox']  # Array

# Store further entries
page_dict['/Rotate'] = 90
empty_contents = pikepdf.Array()
page_dict['/Contents'] = empty_contents

# Test for a key before reading it
has_resources = '/Resources' in page_dict
if has_resources:
    resources = page_dict['/Resources']

Working with Names

import pikepdf

# Names come from attribute access or from a slash-prefixed string
custom_name = pikepdf.Name('/CustomAttribute')
page_name = pikepdf.Name.Page
type_name = pikepdf.Name.Type

# Equal names compare as equal
names_match = type_name == pikepdf.Name.Type
if names_match:
    print("Names are equal")

# Names work as keys of an ordinary mapping
metadata = {}
metadata[type_name] = page_name
metadata[pikepdf.Name.MediaBox] = pikepdf.Array([0, 0, 612, 792])

Working with Strings

import pikepdf

# Build one string from text and one from raw bytes
binary_data = pikepdf.String(b'\x00\x01\x02\x03')
title = pikepdf.String("Document Title")

# Switch between the text and bytes views
byte_content = bytes(binary_data)  # b'\x00\x01\x02\x03'
text_content = str(title)  # "Document Title"

# Store strings in the document info
author = pikepdf.String("Jane Doe")
pdf = pikepdf.new()
pdf.docinfo['/Title'] = title
pdf.docinfo['/Author'] = author

Working with Streams

import pikepdf

pdf = pikepdf.new()

# Create a stream with text content
content_data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET"
content_stream = pikepdf.Stream(pdf, content_data)

# /Length is deliberately not assigned by hand: it has to equal the byte
# count of the stored stream data, so a manual value can only duplicate or
# contradict what is written out with the stream.

# Read stream data
data = content_stream.read_bytes()
raw_data = content_stream.read_raw_bytes()

# Use stream in a page
page = pdf.add_blank_page()
page['/Contents'] = content_stream

Object Copying and Ownership

import pikepdf

# Open two PDFs as context managers so both are closed when the block ends,
# even if one of the steps below raises
with pikepdf.open('source.pdf') as pdf1, pikepdf.new() as pdf2:
    # Copy object from one PDF to another
    source_obj = pdf1.pages[0]['/Resources']
    copied_obj = pdf2.copy_foreign(source_obj)

    # Check ownership
    assert copied_obj.is_owned_by(pdf2)
    assert not copied_obj.is_owned_by(pdf1)

    # Make object indirect (done while both PDFs are still open)
    indirect_obj = pdf2.make_indirect(copied_obj)
    obj_id, generation = indirect_obj.objgen

Install with Tessl CLI