A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files
npx @tessl/cli install tessl/pypi-pypdf@6.0.0

A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files. pypdf can also add custom data, viewing options, and passwords to PDF files, while providing comprehensive text and metadata extraction capabilities.
pip install pypdf

For AES encryption/decryption, install the crypto extra:

pip install pypdf[crypto]

from pypdf import PdfReader, PdfWriter

For page operations:

from pypdf import PdfReader, PdfWriter, PageObject, Transformation

For working with metadata and annotations:

from pypdf import DocumentInformation, PageRange, PaperSize

from pypdf import PdfReader, PdfWriter
# Reading a PDF: open the file, count its pages, and pull the text of page 1.
reader = PdfReader("example.pdf")
number_of_pages = len(reader.pages)
page = reader.pages[0]
text = page.extract_text()

# Writing a PDF: copy the page read above into a new document.
writer = PdfWriter()
writer.add_page(page)
# Binary mode is required because the writer emits raw PDF bytes.
with open("output.pdf", "wb") as output_file:
    writer.write(output_file)

# Merging PDFs: append every page of both inputs, in order, to a fresh writer.
reader1 = PdfReader("document1.pdf")
reader2 = PdfReader("document2.pdf")
writer = PdfWriter()
for page in reader1.pages:
    writer.add_page(page)
for page in reader2.pages:
    writer.add_page(page)
with open("merged.pdf", "wb") as output_file:
    writer.write(output_file)

pypdf is built around two core classes and a rich ecosystem of supporting components:
Core functionality for opening, reading, creating, and saving PDF documents. Includes support for encrypted PDFs, incremental updates, and context manager usage patterns.
class PdfReader:
    """Reader half of the core document I/O API (signature stub; bodies elided)."""

    # `stream` is untyped here; the usage example on this page passes a file
    # path string. `strict` defaults to False and `password` to None.
    def __init__(self, stream, strict: bool = False, password: str | None = None): ...

    # Returns a PasswordType member (NOT_DECRYPTED / USER_PASSWORD / OWNER_PASSWORD).
    def decrypt(self, password: str) -> PasswordType: ...

    def close(self) -> None: ...
class PdfWriter:
def __init__(self, clone_from=None, incremental: bool = False): ...
def add_page(self, page: PageObject) -> None: ...
def write(self, stream) -> None: ...
def encrypt(self, user_password: str, owner_password: str | None = None, **kwargs) -> None: ...

Comprehensive page manipulation including transformations (scaling, rotation, translation), page merging, cropping, and geometric operations. Support for blank page creation and advanced transformation matrices.
class PageObject:
    """A single PDF page: text extraction, transformation and merging (signature stub)."""

    # The default mode is "plain", in line with the full extract_text
    # signature documented in the text-extraction section of this page.
    def extract_text(self, extraction_mode: str = "plain", **kwargs) -> str: ...

    # Geometric operations; both are annotated as returning a PageObject.
    def scale(self, sx: float, sy: float) -> PageObject: ...
    def rotate(self, angle: int) -> PageObject: ...

    # Merging operations take another page (`page2`) and return None.
    def merge_page(self, page2: PageObject) -> None: ...
    def merge_transformed_page(self, page2: PageObject, ctm, expand: bool = False) -> None: ...
class Transformation:
def __init__(self, ctm=(1, 0, 0, 1, 0, 0)): ...
def translate(self, tx: float = 0, ty: float = 0) -> Transformation: ...
def scale(self, sx: float = 1, sy: float | None = None) -> Transformation: ...
def rotate(self, rotation: float) -> Transformation: ...

Advanced text extraction capabilities with multiple extraction modes, layout preservation, and customizable text processing options.
def extract_text(
self,
orientations: tuple | int = (0, 90, 180, 270),
space_width: float = 200.0,
visitor_operand_before=None,
visitor_operand_after=None,
visitor_text=None,
extraction_mode: str = "plain"
) -> str: ...

Access and manipulation of PDF metadata, document properties, XMP information, and custom document attributes.
class DocumentInformation:
@property
def title(self) -> str | None: ...
@property
def author(self) -> str | None: ...
@property
def subject(self) -> str | None: ...
@property
def creator(self) -> str | None: ...
@property
def producer(self) -> str | None: ...
@property
def creation_date(self) -> datetime | None: ...
@property
def modification_date(self) -> datetime | None: ...

Complete annotation system supporting markup annotations (highlights, text annotations, shapes) and interactive elements (links, popups) with full customization capabilities.
class AnnotationDictionary: ...
class Highlight: ...
class Text: ...
class Link: ...
class FreeText: ...

Supporting utilities including page ranges, standard paper sizes, constants, error handling, and type definitions for enhanced developer experience.
class PageRange:
    """A page-range value object (signature stub; bodies elided)."""

    # `arg` is untyped in this summary; the accepted forms are not shown here.
    def __init__(self, arg): ...

    # Returns three ints for a document of `n` pages -- presumably
    # (start, stop, step) in the manner of slice.indices; confirm upstream.
    def indices(self, n: int) -> tuple[int, int, int]: ...
class PaperSize:
    """Namespace of standard paper sizes (only two of them are listed here)."""

    # Each size is annotated as a pair of floats -- presumably (width, height);
    # the unit is not stated in this summary.
    A4: tuple[float, float]
    A3: tuple[float, float]
    # ... other standard sizes
def parse_filename_page_ranges(fnprs: list[str]) -> tuple[list[str], list[PageRange]]: ...

Comprehensive form field manipulation including reading field values, updating form data, setting field appearance properties, and managing interactive PDF forms.
def update_page_form_field_values(
    self,
    page: PageObject | list[PageObject] | None,
    fields: dict[str, str | list[str] | tuple[str, str, float]],
    flags: int = 0,
    auto_regenerate: bool = True,
    flatten: bool = False,
) -> None:
    """Update form field values on one page, several pages, or no specific page.

    Signature stub only -- the implementation is not shown on this page.

    Args:
        page: A single page, a list of pages, or None.
        fields: Mapping of field name to its new value; a value may be a
            string, a list of strings, or a (str, str, float) tuple.
        flags: Integer flags, 0 by default.
        auto_regenerate: Defaults to True.
        flatten: Defaults to False.
    """
def set_need_appearances_writer(self, state: bool = True) -> None:
    """Set the writer's "need appearances" state (signature stub; body elided).

    Args:
        state: Desired state; defaults to True.
    """
def reattach_fields(self, page: PageObject | None = None) -> list[DictionaryObject]: ...

from enum import IntEnum, IntFlag
class PasswordType(IntEnum):
    """Result codes for a decryption attempt (the return type of PdfReader.decrypt)."""

    NOT_DECRYPTED = 0
    USER_PASSWORD = 1
    OWNER_PASSWORD = 2
class ImageType(IntFlag):
    """Bit flags naming categories of images; combine members with ``|``."""

    NONE = 0
    XOBJECT_IMAGES = 1
    INLINE_IMAGES = 2
    DRAWING_IMAGES = 4
    ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
    # IMAGES is an alias of ALL so that it covers the same three categories
    # as ObjectDeletionFlag.IMAGES (which includes DRAWING_IMAGES).
    IMAGES = ALL
class ObjectDeletionFlag(IntFlag):
    """Bit flags naming categories of PDF objects; combine members with ``|``.

    Judging by the name, these select what to delete -- the consuming API is
    not shown on this page.
    """

    NONE = 0
    TEXT = 1
    LINKS = 2
    ATTACHMENTS = 4
    OBJECTS_3D = 8
    ALL_ANNOTATIONS = 16
    XOBJECT_IMAGES = 32
    INLINE_IMAGES = 64
    DRAWING_IMAGES = 128
    # Convenience mask covering all three image categories.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES