Read and write PDFs with Python, powered by qpdf
—
PDF object types and data structures that form the foundation of PDF content representation. These classes provide the building blocks for manipulating PDF data at the object level.
The fundamental PDF object type that all other PDF objects inherit from, providing common functionality for object manipulation and ownership.
class Object:
"""
Universal PDF object type representing any PDF data structure.
All PDF objects (arrays, dictionaries, names, etc.) derive from this class.
"""
def is_owned_by(self, possible_owner: Pdf) -> bool:
"""
Check if this object is owned by a specific PDF.
Parameters:
- possible_owner (Pdf): PDF to check ownership against
Returns:
bool: True if this object belongs to the specified PDF
"""
def same_owner_as(self, other: Object) -> bool:
"""
Check if this object has the same owner as another object.
Parameters:
- other (Object): Object to compare ownership with
Returns:
bool: True if both objects have the same owner
"""
def with_same_owner_as(self, other: Object) -> Object:
"""
Return a copy of this object owned by the same PDF as another object.
Parameters:
- other (Object): Object whose owner should be used
Returns:
Object: Copy of this object with the same owner as other
Raises:
ForeignObjectError: If objects cannot be made compatible
"""
@staticmethod
def parse(data: str, *, pdf_context: Pdf = None) -> Object:
"""
Parse a string representation of PDF data into an Object.
Parameters:
- data (str): String containing PDF object data
- pdf_context (Pdf, optional): PDF context for parsing
Returns:
Object: Parsed PDF object
Raises:
PdfError: If the data cannot be parsed
"""
def unparse(self, *, resolved: bool = False) -> str:
"""
Convert the object back to its string representation.
Parameters:
- resolved (bool): Whether to resolve indirect references
Returns:
str: String representation of the object
"""
@property
def _type_code(self) -> ObjectType:
"""
The object's type code.
Returns:
ObjectType: Enumeration value indicating the object type
"""
@property
def is_indirect(self) -> bool:
"""
Whether this is an indirect object.
Returns:
bool: True if this is an indirect object reference
"""
@property
def objgen(self) -> tuple[int, int]:
"""
Object and generation numbers for indirect objects.
Returns:
tuple[int, int]: (object_number, generation_number) or (0, 0) for direct objects
"""

PDF arrays represent ordered collections of PDF objects, similar to Python lists.
class Array(Object):
"""
PDF array object representing an ordered list of PDF objects.
Behaves like a Python list with additional PDF-specific functionality.
"""
def __init__(self, iterable=None) -> None:
"""
Create a new PDF array.
Parameters:
- iterable (optional): Initial objects to populate the array
"""
def __len__(self) -> int:
"""Return the number of elements in the array."""
def __getitem__(self, index: int) -> Object:
"""Get an element by index."""
def __setitem__(self, index: int, value: Object) -> None:
"""Set an element at the given index."""
def append(self, obj: Object) -> None:
"""
Add an object to the end of the array.
Parameters:
- obj (Object): Object to append
"""
def extend(self, iterable) -> None:
"""
Extend the array with objects from an iterable.
Parameters:
- iterable: Objects to add to the array
"""
def insert(self, index: int, obj: Object) -> None:
"""
Insert an object at the specified index.
Parameters:
- index (int): Position to insert at
- obj (Object): Object to insert
"""

PDF dictionaries represent key-value mappings where keys are Name objects and values are any PDF objects.
class Dictionary(Object):
"""
PDF dictionary object representing key-value mappings.
Keys must be Name objects, values can be any PDF objects.
Behaves like a Python dictionary with PDF-specific enhancements.
"""
def __init__(self, mapping=None, **kwargs) -> None:
"""
Create a new PDF dictionary.
Parameters:
- mapping (optional): Initial key-value pairs
- **kwargs: Additional key-value pairs (keys converted to Names)
"""
def __getitem__(self, key) -> Object:
"""Get a value by key (key can be str or Name)."""
def __setitem__(self, key, value: Object) -> None:
"""Set a key-value pair (key converted to Name if needed)."""
def __contains__(self, key) -> bool:
"""Check if key exists in dictionary."""
def __len__(self) -> int:
"""Return number of key-value pairs."""
def keys(self):
"""Return dictionary keys as Name objects."""
def values(self):
"""Return dictionary values."""
def items(self):
"""Return key-value pairs."""
def get(self, key, default=None) -> Object:
"""
Get a value with optional default.
Parameters:
- key: Dictionary key (str or Name)
- default: Default value if key not found
Returns:
Object: Value associated with key, or default
"""

PDF names are atomic identifiers used as dictionary keys and various PDF constants.
class Name(Object):
"""
PDF name object representing an immutable identifier.
Names are used as dictionary keys and PDF constants.
Supports both string construction and attribute-style access.
"""
def __init__(self, name_string: str) -> None:
"""
Create a PDF name from a string.
Parameters:
- name_string (str): String representation of the name
"""
def __str__(self) -> str:
"""Return string representation without leading slash."""
def __repr__(self) -> str:
"""Return full representation including leading slash."""
def __eq__(self, other) -> bool:
"""Compare names for equality."""
def __hash__(self) -> int:
"""Return hash for use as dictionary key."""
# Name constants can be accessed as attributes
# Example: Name.Type, Name.Font, Name.Contents

PDF strings can contain text or binary data with proper encoding handling.
class String(Object):
"""
PDF string object for text or binary data.
Handles PDF string encoding including literal strings and hex strings.
"""
def __init__(self, str_or_bytes) -> None:
"""
Create a PDF string from text or bytes.
Parameters:
- str_or_bytes (str | bytes): String content
"""
def __str__(self) -> str:
"""Return string content as text."""
def __bytes__(self) -> bytes:
"""Return string content as bytes."""
def __len__(self) -> int:
"""Return length of string content."""
@property
def for_pdf(self) -> str:
"""
String representation suitable for PDF output.
Returns:
str: Properly escaped string for PDF files
"""

PDF streams contain both a dictionary of metadata and binary data content.
class Stream(Object):
"""
PDF stream object containing dictionary metadata and binary data.
Streams are used for page content, images, fonts, and other binary data.
"""
def __init__(self, owner: Pdf, data=None, dict=None, **kwargs) -> None:
"""
Create a new PDF stream.
Parameters:
- owner (Pdf): PDF that will own this stream
- data (bytes, optional): Stream data content
- dict (Dictionary, optional): Stream dictionary
- **kwargs: Additional dictionary entries
"""
@property
def dictionary(self) -> Dictionary:
"""
The stream's dictionary containing metadata.
Returns:
Dictionary: Stream metadata and parameters
"""
def read_bytes(self) -> bytes:
"""
Read the stream's data as bytes.
Returns:
bytes: Decoded stream data
Raises:
DataDecodingError: If stream cannot be decoded
"""
def read_raw_bytes(self) -> bytes:
"""
Read the stream's raw (unfiltered) data.
Returns:
bytes: Raw stream data without decoding filters
"""
def write(self, data: bytes, *, filter=None, decode_parms=None) -> None:
"""
Write data to the stream.
Parameters:
- data (bytes): Data to write
- filter (optional): Compression filter to apply
- decode_parms (optional): Filter parameters
"""

PDF operators represent content stream commands and their operands.
class Operator(Object):
"""
PDF content stream operator.
Represents commands in PDF content streams like 'Tj' (show text) or 'l' (line to).
"""
def __init__(self, name: str) -> None:
"""
Create a PDF operator.
Parameters:
- name (str): Operator name (e.g., 'Tj', 'cm', 'Do')
"""
def __str__(self) -> str:
"""Return operator name."""
def __repr__(self) -> str:
"""Return full representation."""

Enumeration of all possible PDF object types for type checking and identification.
from enum import Enum
class ObjectType(Enum):
"""Enumeration of PDF object types."""
uninitialized = ... # Uninitialized object
reserved = ... # Reserved type
null = ... # Null object
boolean = ... # Boolean true/false
integer = ... # Integer number
real = ... # Real (floating-point) number
string = ... # String object
name_ = ... # Name object (trailing underscore avoids a clash with the Enum member's built-in `name` attribute)
array = ... # Array object
dictionary = ... # Dictionary object
stream = ... # Stream object
operator = ... # Content stream operator
inlineimage = ... # Inline image

import pikepdf
pdf = pikepdf.new()
# Create an array
arr = pikepdf.Array([1, 2, 3])
# Add elements
arr.append(pikepdf.String("hello"))
arr.extend([pikepdf.Name.Type, pikepdf.Name.Font])
# Access elements
first = arr[0] # Integer 1
last = arr[-1] # Name(/Font)
# Use in dictionary
dict_obj = pikepdf.Dictionary({
'/Contents': arr,
'/Type': pikepdf.Name.Page
})

import pikepdf
# Create a dictionary
page_dict = pikepdf.Dictionary({
'/Type': pikepdf.Name.Page,
'/MediaBox': pikepdf.Array([0, 0, 612, 792]),
'/Resources': pikepdf.Dictionary()
})
# Access values
page_type = page_dict['/Type'] # Name(/Page)
media_box = page_dict['/MediaBox'] # Array
# Add new entries
page_dict['/Rotate'] = 90
page_dict['/Contents'] = pikepdf.Array()
# Check for keys
if '/Resources' in page_dict:
    resources = page_dict['/Resources']

import pikepdf
# Create names
type_name = pikepdf.Name.Type
page_name = pikepdf.Name.Page
custom_name = pikepdf.Name('/CustomAttribute')
# Names can be compared
if type_name == pikepdf.Name.Type:
    print("Names are equal")
# Use in dictionaries
metadata = {
type_name: page_name,
pikepdf.Name.MediaBox: pikepdf.Array([0, 0, 612, 792])
}

import pikepdf
# Create strings
title = pikepdf.String("Document Title")
binary_data = pikepdf.String(b'\x00\x01\x02\x03')
# Convert between representations
text_content = str(title) # "Document Title"
byte_content = bytes(binary_data) # b'\x00\x01\x02\x03'
# Use in document info
pdf = pikepdf.new()
pdf.docinfo['/Title'] = title
pdf.docinfo['/Author'] = pikepdf.String("Jane Doe")

import pikepdf
pdf = pikepdf.new()
# Create a stream with text content
content_data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET"
content_stream = pikepdf.Stream(pdf, content_data)
# Set stream properties
content_stream.dictionary['/Length'] = len(content_data)
# Read stream data
data = content_stream.read_bytes()
raw_data = content_stream.read_raw_bytes()
# Use stream in a page
page = pdf.add_blank_page()
page['/Contents'] = content_stream

import pikepdf
# Open two PDFs
pdf1 = pikepdf.open('source.pdf')
pdf2 = pikepdf.new()
# Copy object from one PDF to another
source_obj = pdf1.pages[0]['/Resources']
copied_obj = pdf2.copy_foreign(source_obj)
# Check ownership
assert copied_obj.is_owned_by(pdf2)
assert not copied_obj.is_owned_by(pdf1)
# Make object indirect
indirect_obj = pdf2.make_indirect(copied_obj)
obj_id, generation = indirect_obj.objgen

Install with Tessl CLI
npx tessl i tessl/pypi-pikepdf