A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Low-level PDF object types for advanced manipulation, constants, type definitions, and utility functions used throughout the PyPDF2 library. These components provide the foundation for PDF specification-level operations.
Base classes and data structures that represent PDF objects according to the PDF specification.
class PdfObject:
    """Base class for all PDF objects."""


class NullObject(PdfObject):
    """PDF null object representation."""


class BooleanObject(PdfObject):
    """PDF boolean object (true/false)."""


class IndirectObject(PdfObject):
    """PDF indirect object reference."""

    @property
    def idnum(self) -> int:
        """Object ID number."""

    @property
    def generation(self) -> int:
        """Object generation number."""

    @property
    def pdf(self):
        """Associated PDF reader."""


class FloatObject(float, PdfObject):
    """PDF floating-point number object."""


class NumberObject(int, PdfObject):
    """PDF integer number object."""


class ByteStringObject(bytes, PdfObject):
    """PDF byte string object."""


class TextStringObject(str, PdfObject):
    """PDF text string object."""
class NameObject(str, PdfObject):
"""PDF name object (starts with /)."""Collections and containers for PDF data structures.
class ArrayObject(list, PdfObject):
    """PDF array object (list-like)."""


class DictionaryObject(dict, PdfObject):
    """PDF dictionary object (dict-like)."""


class TreeObject(DictionaryObject):
    """PDF tree structure for hierarchical data."""


class StreamObject(PdfObject):
    """PDF stream object containing binary data."""


class DecodedStreamObject(StreamObject):
    """Decoded (uncompressed) PDF stream."""


class EncodedStreamObject(StreamObject):
    """Encoded (compressed) PDF stream."""


class ContentStream(DecodedStreamObject):
    """PDF content stream with page content operations."""
class Field(TreeObject):
"""PDF form field object."""Objects for document navigation, bookmarks, and annotations.
class Destination(DictionaryObject):
    """PDF destination for navigation."""

    @property
    def title(self) -> Optional[str]:
        """Destination title."""

    @property
    def page(self):
        """Target page reference."""

    @property
    def typ(self) -> str:
        """Destination type (fit type)."""
class OutlineItem(DictionaryObject):
    """PDF outline item (bookmark)."""

    @property
    def title(self) -> Optional[str]:
        """Bookmark title."""

    @property
    def page(self):
        """Target page reference."""

    @property
    def parent(self):
        """Parent outline item."""

    @property
    def children(self):
        """Child outline items."""
class Bookmark(OutlineItem):
    """DEPRECATED: Use OutlineItem instead."""
class AnnotationBuilder:
    """Builder for creating PDF annotations."""

    # Methods for building various annotation types
    # Implementation depends on annotation type

Helper classes and functions for PDF manipulation.
class PageRange:
    """Slice-like representation of page ranges."""

    def __init__(self, arg: Union[slice, "PageRange", str]):
        """
        Create a PageRange from various input types.

        Args:
            arg: Range specification (string, slice, or PageRange)
        """

    def to_slice(self) -> slice:
        """Convert to Python slice object."""

    def indices(self, n: int) -> Tuple[int, int, int]:
        """
        Get slice indices for given length.

        Args:
            n (int): Total length

        Returns:
            tuple: (start, stop, step) indices
        """

    @staticmethod
    def valid(input: Any) -> bool:
        """
        Check if input is valid for PageRange.

        Args:
            input: Input to validate

        Returns:
            bool: True if valid
        """
class PaperSize:
    """Standard paper size constants."""

    # Portrait (width x height) in points, i.e. 1/72 inch, rounded to integers.
    A0: 'Dimensions'  # 2384 x 3370 points
    A1: 'Dimensions'  # 1684 x 2384 points
    A2: 'Dimensions'  # 1191 x 1684 points
    A3: 'Dimensions'  # 842 x 1191 points
    A4: 'Dimensions'  # 595 x 842 points
    A5: 'Dimensions'  # 420 x 595 points
    A6: 'Dimensions'  # 298 x 420 points
    A7: 'Dimensions'  # 210 x 298 points
    A8: 'Dimensions'  # 147 x 210 points
    C4: 'Dimensions'  # 649 x 918 points (envelope)
class PasswordType:
    """Enumeration for password validation results."""

    NOT_DECRYPTED: int = 0
    USER_PASSWORD: int = 1
    OWNER_PASSWORD: int = 2
# Utility functions
def create_string_object(string: Union[str, bytes], forced_encoding=None) -> Union[TextStringObject, ByteStringObject]:
    """
    Create appropriate string object based on content.

    Args:
        string (str or bytes): String content
        forced_encoding (str, optional): Force specific encoding

    Returns:
        Union[TextStringObject, ByteStringObject]: Appropriate string object
    """
def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode string using PDF document encoding.

    Args:
        unicode_string (str): Unicode string to encode

    Returns:
        bytes: Encoded bytes
    """
def decode_pdfdocencoding(byte_string: bytes) -> str:
    """
    Decode bytes using PDF document encoding.

    Args:
        byte_string (bytes): Bytes to decode

    Returns:
        str: Decoded string
    """
def hex_to_rgb(color: str) -> Tuple[float, float, float]:
    """
    Convert hex color to RGB tuple.

    Args:
        color (str): Hex color string (e.g., "#FF0000")

    Returns:
        tuple: (red, green, blue) values 0.0-1.0
    """
def read_object(stream, pdf) -> PdfObject:
    """
    Read a PDF object from stream.

    Args:
        stream: Input stream
        pdf: PDF reader reference

    Returns:
        PdfObject: Parsed PDF object
    """
def parse_filename_page_ranges(args: List[Union[str, PageRange, None]]) -> List[Tuple[str, PageRange]]:
"""
Parse filename and page range arguments.
Args:
args: Command-line style arguments
Returns:
list: List of (filename, page_range) tuples
"""Type aliases and definitions used throughout the library.
# Border array for annotations: a list whose items are name, number, or array objects
BorderArrayType = List[Union[NameObject, NumberObject, ArrayObject]]
# Outline item types: either an OutlineItem or a Destination
OutlineItemType = Union[OutlineItem, Destination]
# PDF fit types for destinations (the allowed fit-type name strings)
FitType = Literal["/Fit", "/XYZ", "/FitH", "/FitV", "/FitR", "/FitB", "/FitBH", "/FitBV"]
# Zoom argument types: a single zoom argument, and a list of such arguments
ZoomArgType = Union[NumberObject, NullObject, float]
ZoomArgsType = List[ZoomArgType]
# Complex outline structure type: a list mixing outline items and nested lists
OutlineType = List[Union[OutlineItemType, List]]
# Page layout types ("/NoLayout" means the layout is explicitly not specified)
LayoutType = Literal[
    "/NoLayout",
    "/SinglePage",
    "/OneColumn",
    "/TwoColumnLeft",
    "/TwoColumnRight",
    "/TwoPageLeft",
    "/TwoPageRight",
]
# Allowed page mode names
PagemodeType = Literal[
    "/UseNone",
    "/UseOutlines",
    "/UseThumbs",
    "/FullScreen",
    "/UseOC",
    "/UseAttachments",
]
# Page range specification types: a string, a PageRange, a tuple of two or
# three ints, or a list of ints
PageRangeSpec = Union[str, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]]
# Dimension type for paper sizes
class Dimensions:
    """Represents paper dimensions."""

    def __init__(self, width: float, height: float):
        """
        Create dimensions.

        Args:
            width (float): Width in points
            height (float): Height in points
        """
        self.width = width
        self.height = height


from PyPDF2 import PdfReader
from PyPDF2.generic import DictionaryObject, ArrayObject, NameObject

reader = PdfReader("document.pdf")

# Access raw PDF objects
for page in reader.pages:
    # Pages are DictionaryObject instances
    if isinstance(page, DictionaryObject):
        # Access dictionary entries. .get() may hand back an indirect
        # reference, so resolve it with get_object() before inspecting it.
        mediabox = page.get("/MediaBox")
        if mediabox is not None:
            mediabox = mediabox.get_object()
        if isinstance(mediabox, ArrayObject):
            print(f"MediaBox: {[float(x) for x in mediabox]}")

        # Check for resources
        resources = page.get("/Resources")
        if resources is not None:
            resources = resources.get_object()
        if resources:
            fonts = resources.get("/Font", DictionaryObject()).get_object()
            print(f"Fonts: {list(fonts.keys())}")


from PyPDF2 import PdfMerger, PageRange
merger = PdfMerger()

# Various ways to specify page ranges (zero-based, Python slice semantics)
merger.append("doc1.pdf", pages=PageRange("1:5"))  # Indices 1-4 (2nd to 5th page)
merger.append("doc2.pdf", pages=PageRange("::2"))  # Every other page
merger.append("doc3.pdf", pages=PageRange("10:"))  # Index 10 (11th page) to end
# PageRange only accepts a string, slice, or PageRange; pass a
# (start, stop, step) tuple to pick indices 1, 3, 5
merger.append("doc4.pdf", pages=(1, 6, 2))

# Validate page range
if PageRange.valid("1:10"):
    print("Valid page range")

merger.write("output.pdf")
merger.close()


from PyPDF2 import PdfWriter
# PaperSize lives in the papersizes module, not in PyPDF2.generic
from PyPDF2.papersizes import PaperSize

writer = PdfWriter()

# Create pages with standard sizes
a4_page = writer.add_blank_page(PaperSize.A4.width, PaperSize.A4.height)
letter_page = writer.add_blank_page(612, 792)  # US Letter
a3_page = writer.add_blank_page(PaperSize.A3.width, PaperSize.A3.height)

print(f"A4 size: {PaperSize.A4.width} x {PaperSize.A4.height} points")
print(f"A3 size: {PaperSize.A3.width} x {PaperSize.A3.height} points")

with open("standard_sizes.pdf", "wb") as output_file:
    writer.write(output_file)


from PyPDF2.generic import (
    DictionaryObject, ArrayObject, NameObject,
    TextStringObject, NumberObject
)
# Create a custom dictionary object
custom_dict = DictionaryObject({
    NameObject("/Type"): NameObject("/Annotation"),
    NameObject("/Subtype"): NameObject("/Text"),
    NameObject("/Contents"): TextStringObject("Custom note"),
    NameObject("/Rect"): ArrayObject([
        NumberObject(100), NumberObject(100),
        NumberObject(200), NumberObject(150)
    ])
})

print(f"Custom object: {custom_dict}")


from PyPDF2.generic import (
    create_string_object, encode_pdfdocencoding,
    decode_pdfdocencoding, hex_to_rgb
)
# Create appropriate string objects
text = create_string_object("Hello, World!")
# Raw bytes are decoded with the forced encoding
binary_text = create_string_object(b"\x00\xff\x42", "latin-1")

# Encoding/decoding
unicode_text = "Héllo, Wørld!"
encoded = encode_pdfdocencoding(unicode_text)
decoded = decode_pdfdocencoding(encoded)
print(f"Original: {unicode_text}")
print(f"Decoded: {decoded}")

# Color conversion
red_rgb = hex_to_rgb("#FF0000")  # (1.0, 0.0, 0.0)
blue_rgb = hex_to_rgb("#0000FF")  # (0.0, 0.0, 1.0)
print(f"Red RGB: {red_rgb}")
print(f"Blue RGB: {blue_rgb}")


from PyPDF2 import PdfReader
from PyPDF2.generic import OutlineItem, Destination

reader = PdfReader("document.pdf")

# Access document outline
outline = reader.outline
if outline:
    def print_outline(items, level=0):
        """Print outline titles, indenting one step per nesting level."""
        for item in items:
            if isinstance(item, list):
                # A nested list holds the children of the preceding item
                print_outline(item, level + 1)
            elif isinstance(item, Destination):
                # The reader yields Destination objects for outline entries
                indent = " " * level
                print(f"{indent}{item.title}")

    print_outline(outline)

# Access named destinations
destinations = reader.named_destinations
for name, dest in destinations.items():
    if isinstance(dest, Destination):
        print(f"Destination '{name}' -> Page {dest.page}, Type: {dest.typ}")


from PyPDF2 import PdfReader, PasswordType
reader = PdfReader("encrypted.pdf")
if reader.is_encrypted:
# Try different password types
result = reader.decrypt("user_password")
if result == PasswordType.USER_PASSWORD:
print("Opened with user password - some restrictions may apply")
elif result == PasswordType.OWNER_PASSWORD:
print("Opened with owner password - full access")
elif result == PasswordType.NOT_DECRYPTED:
print("Password incorrect or file corrupted")PyPDF2 includes extensive constants from the PDF specification organized in the constants module:
# Core PDF constants
class Core:
    """Names of core PDF document structure entries."""

    OUTLINES = "/Outlines"
    THREADS = "/Threads"
    PAGE = "/Page"
    PAGES = "/Pages"
    CATALOG = "/Catalog"
# User access permissions
class UserAccessPermissions:
    """Bit flags for user access permissions; each constant is a single bit."""

    PRINT = 1 << 2
    MODIFY = 1 << 3
    COPY = 1 << 4
    ADD_OR_MODIFY = 1 << 5
# PDF filter types
class FilterTypes:
    """Names of PDF stream filters."""

    FLATE_DECODE = "/FlateDecode"
    LZW_DECODE = "/LZWDecode"
    ASCII_HEX_DECODE = "/ASCIIHexDecode"
    DCT_DECODE = "/DCTDecode"

These constants ensure compliance with PDF specification requirements and provide standardized access to PDF dictionary keys and values.
Install with Tessl CLI
npx tessl i tessl/pypi-py-pdf2