A pure-Python PDF library capable of splitting, merging, cropping, and transforming PDF files.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Transform, scale, rotate, crop, and merge individual PDF pages with precise control over page geometry and content. The PageObject class and Transformation utilities provide comprehensive page manipulation capabilities.
The `PageObject` class represents a single PDF page and provides methods for content extraction, geometric transformations, and page merging.
class PageObject(DictionaryObject):
"""PDF page object with transformation and content capabilities."""
@property
def mediabox(self) -> RectangleObject:
"""The page's media box (full page size)."""
@property
def cropbox(self) -> RectangleObject:
"""The page's crop box (visible area)."""
@property
def bleedbox(self) -> RectangleObject:
"""The page's bleed box (printing area)."""
@property
def trimbox(self) -> RectangleObject:
"""The page's trim box (finished page size)."""
@property
def artbox(self) -> RectangleObject:
"""The page's art box (meaningful content area)."""
@property
def annotations(self) -> Optional[ArrayObject]:
"""Page annotations if present."""
def extract_text(self, visitor_text=None) -> str:
"""
Extract text content from the page.
Args:
visitor_text (callable, optional): Custom text visitor function
Returns:
str: Extracted text content
"""
def scale(self, sx: float, sy: float) -> None:
"""
Scale the page by given factors.
Args:
sx (float): Horizontal scale factor
sy (float): Vertical scale factor
"""
def scale_by(self, factor: float) -> None:
"""
Scale the page uniformly.
Args:
factor (float): Scale factor for both dimensions
"""
def scale_to(self, width: float, height: float) -> None:
"""
Scale the page to specific dimensions.
Args:
width (float): Target width in points
height (float): Target height in points
"""
def rotate(self, angle: int) -> 'PageObject':
"""
Rotate the page by the given angle.
Args:
angle (int): Rotation angle in degrees (must be multiple of 90)
Returns:
PageObject: Self for method chaining
"""
def rotate_clockwise(self, angle: int) -> 'PageObject':
"""
DEPRECATED: Use rotate() instead.
Rotate the page clockwise.
Args:
angle (int): Rotation angle in degrees
Returns:
PageObject: Self for method chaining
"""
def rotate_counter_clockwise(self, angle: int) -> 'PageObject':
"""
DEPRECATED: Use rotate() instead.
Rotate the page counter-clockwise.
Args:
angle (int): Rotation angle in degrees
Returns:
PageObject: Self for method chaining
"""
def merge_page(self, page2: 'PageObject') -> None:
"""
Merge another page's content onto this page.
Args:
page2 (PageObject): Page to merge onto this page
"""
def merge_scaled_page(self, page2: 'PageObject', scale: float, expand: bool = False) -> None:
"""
Merge a scaled page onto this page.
Args:
page2 (PageObject): Page to merge
scale (float): Scale factor for the merged page
expand (bool): Whether to expand page size if needed
"""
def merge_rotated_page(self, page2: 'PageObject', rotation: int, expand: bool = False) -> None:
"""
Merge a rotated page onto this page.
Args:
page2 (PageObject): Page to merge
rotation (int): Rotation angle in degrees
expand (bool): Whether to expand page size if needed
"""
def merge_scaled_translated_page(
self,
page2: 'PageObject',
scale: float,
tx: float,
ty: float,
expand: bool = False
) -> None:
"""
Merge a scaled and translated page onto this page.
Args:
page2 (PageObject): Page to merge
scale (float): Scale factor
tx (float): X translation in points
ty (float): Y translation in points
expand (bool): Whether to expand page size if needed
"""
def merge_rotated_scaled_page(
self,
page2: 'PageObject',
rotation: int,
scale: float,
expand: bool = False
) -> None:
"""
Merge a rotated and scaled page onto this page.
Args:
page2 (PageObject): Page to merge
rotation (int): Rotation angle in degrees
scale (float): Scale factor
expand (bool): Whether to expand page size if needed
"""
def merge_rotated_scaled_translated_page(
self,
page2: 'PageObject',
rotation: int,
scale: float,
tx: float,
ty: float,
expand: bool = False
) -> None:
"""
Merge a page with full transformation onto this page.
Args:
page2 (PageObject): Page to merge
rotation (int): Rotation angle in degrees
scale (float): Scale factor
tx (float): X translation in points
ty (float): Y translation in points
expand (bool): Whether to expand page size if needed
"""
def merge_transformed_page(
self,
page2: 'PageObject',
ctm: Transformation,
expand: bool = False
) -> None:
"""
Merge a page with custom transformation matrix.
Args:
page2 (PageObject): Page to merge
ctm (Transformation): Current transformation matrix
expand (bool): Whether to expand page size if needed
"""
def add_transformation(self, ctm: Transformation) -> None:
"""
Add a transformation to the page.
Args:
ctm (Transformation): Transformation matrix to apply
"""
def get_fonts(self) -> Tuple[Set[str], Set[str]]:
"""
Get fonts used on the page.
Returns:
tuple: (font_names, font_subsets) sets
"""
def get_images(self) -> Dict[str, Any]:
"""
Get images embedded in the page.
Returns:
dict: Image information by name
"""
@staticmethod
def create_blank_page(pdf=None, width: float = 612, height: float = 792) -> 'PageObject':
"""
Create a blank page.
Args:
pdf: Optional PDF reader reference
width (float): Page width in points (default: 612 - 8.5")
height (float): Page height in points (default: 792 - 11")
Returns:
PageObject: New blank page
"""2D coordinate transformation operations for precise page geometry control.
class Transformation:
"""2D transformation matrix for page operations."""
def __init__(self, ctm: Tuple[float, float, float, float, float, float] = (1, 0, 0, 1, 0, 0)):
"""
Initialize transformation matrix.
Args:
ctm: 6-element transformation matrix (a, b, c, d, e, f)
"""
def translate(self, tx: float = 0, ty: float = 0) -> 'Transformation':
"""
Add translation to the transformation.
Args:
tx (float): X translation in points
ty (float): Y translation in points
Returns:
Transformation: New transformation with translation applied
"""
def scale(self, sx: Optional[float] = None, sy: Optional[float] = None) -> 'Transformation':
"""
Add scaling to the transformation.
Args:
sx: X scale factor (default: 1.0)
sy: Y scale factor (default: same as sx)
Returns:
Transformation: New transformation with scaling applied
"""
def rotate(self, rotation: float) -> 'Transformation':
"""
Add rotation to the transformation.
Args:
rotation (float): Rotation angle in degrees
Returns:
Transformation: New transformation with rotation applied
"""Geometric rectangle representation for page boundaries and regions.
class RectangleObject(ArrayObject):
    """PDF rectangle object for geometric regions.

    API reference stub: only signatures and documentation are listed here,
    the method bodies are intentionally omitted.
    """

    @property
    def left(self) -> float:
        """Left coordinate."""

    @property
    def bottom(self) -> float:
        """Bottom coordinate."""

    @property
    def right(self) -> float:
        """Right coordinate."""

    @property
    def top(self) -> float:
        """Top coordinate."""

    @property
    def width(self) -> float:
        """Rectangle width."""

    @property
    def height(self) -> float:
        """Rectangle height."""

    def scale(self, sx: float, sy: float) -> "RectangleObject":
        """Scale the rectangle.

        Args:
            sx: X scale factor.
            sy: Y scale factor.

        Returns:
            New scaled rectangle.
        """

    def normalize(self) -> "RectangleObject":
        """Normalize the rectangle coordinates.

        Returns:
            Normalized rectangle.
        """

    def intersect(self, other: "RectangleObject") -> "RectangleObject":
        """Calculate the intersection with another rectangle.

        Args:
            other: Rectangle to intersect with.

        Returns:
            Intersection rectangle.
        """

    def union(self, other: "RectangleObject") -> "RectangleObject":
        """Calculate the union with another rectangle.

        Args:
            other: Rectangle to union with.

        Returns:
            Union rectangle.
        """

from PyPDF2 import PdfReader, PdfWriter
# Example: scale and rotate the first page of a PDF, then save the result.

# Read source PDF
reader = PdfReader("source.pdf")
writer = PdfWriter()

# Get first page
page = reader.pages[0]

# Scale the page to 50% size
page.scale(0.5, 0.5)

# Rotate 90 degrees clockwise
page.rotate(90)

# Add to writer
writer.add_page(page)

# Save result
with open("transformed.pdf", "wb") as output_file:
    writer.write(output_file)

from PyPDF2 import PdfReader, PdfWriter
# Example: place a scaled-down overlay page onto a background page.

# Read source files
reader1 = PdfReader("background.pdf")
reader2 = PdfReader("overlay.pdf")
writer = PdfWriter()

# Get pages
background = reader1.pages[0]
overlay = reader2.pages[0]

# Scale overlay to fit in corner
# NOTE(review): this in-place scale presumably compounds with scale=0.5 in
# the merge call below (net factor 0.15) -- confirm that is intended.
overlay.scale(0.3, 0.3)

# Merge overlay onto background
background.merge_scaled_translated_page(
    overlay,
    scale=0.5,
    tx=400,  # Position in bottom-right
    ty=100,
    expand=False,
)

writer.add_page(background)

with open("merged_pages.pdf", "wb") as output_file:
    writer.write(output_file)

from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import RectangleObject

# Example: report each page's size and crop it to a centered area.
reader = PdfReader("document.pdf")
writer = PdfWriter()

for page in reader.pages:
    # Get current dimensions
    mediabox = page.mediabox
    print(f"Page size: {mediabox.width} x {mediabox.height} points")

    # Convert to inches (72 points = 1 inch)
    width_inches = float(mediabox.width) / 72
    height_inches = float(mediabox.height) / 72
    print(f"Page size: {width_inches:.1f}\" x {height_inches:.1f}\"")

    # Crop page to center area
    crop_margin = 50  # 50 points margin
    page.cropbox = RectangleObject([
        float(mediabox.left) + crop_margin,
        float(mediabox.bottom) + crop_margin,
        float(mediabox.right) - crop_margin,
        float(mediabox.top) - crop_margin,
    ])

    # Each cropped page is added inside the loop so every page is written.
    writer.add_page(page)

with open("cropped.pdf", "wb") as output_file:
    writer.write(output_file)

from PyPDF2 import PdfReader, PdfWriter, Transformation
# Example: apply a chained rotate/scale/translate transformation to a page.
reader = PdfReader("source.pdf")
writer = PdfWriter()
page = reader.pages[0]

# Create complex transformation
transform = (
    Transformation()
    .rotate(45)  # Rotate 45 degrees
    .scale(0.8, 1.2)  # Scale differently in X and Y
    .translate(100, 50)  # Move to new position
)

# Apply transformation
page.add_transformation(transform)
writer.add_page(page)

with open("custom_transform.pdf", "wb") as output_file:
    writer.write(output_file)

from PyPDF2 import PdfReader
# Example: print a text preview, the fonts, and the images of every page.
reader = PdfReader("document.pdf")

# Count pages from 1 so the number can be printed directly.
for page_num, page in enumerate(reader.pages, start=1):
    # Extract text
    text = page.extract_text()
    print(f"Page {page_num} text:")
    # Show at most the first 200 characters, marking truncation with "...".
    print((text[:200] + "...") if len(text) > 200 else text)

    # Get font information
    font_names, font_subsets = page.get_fonts()
    print(f"Fonts used: {font_names}")

    # Get images
    images = page.get_images()
    print(f"Images found: {len(images)}")
    for img_name, img_info in images.items():
        print(f" - {img_name}: {img_info}")

from PyPDF2 import PdfWriter, PageObject
from PyPDF2.generic import RectangleObject
writer = PdfWriter()
# Create custom sized blank page (A4: 595 x 842 points)
blank_page = PageObject.create_blank_page(width=595, height=842)
# You can then add content or merge other pages onto it
writer.add_page(blank_page)
# Create US Letter sized page (8.5" x 11" = 612 x 792 points)
letter_page = PageObject.create_blank_page(width=612, height=792)
writer.add_page(letter_page)
with open("blank_pages.pdf", "wb") as output_file:
writer.write(output_file)def set_custom_rtl(_min: int, _max: int, specials: List[int]) -> Tuple[int, int, List[int]]:
"""
Configure right-to-left text parameters for text extraction.
Args:
_min (int): Minimum character code for RTL
_max (int): Maximum character code for RTL
specials (list): Special character codes to handle as RTL
Returns:
tuple: Configuration tuple with min, max, and specials
"""This function helps configure text extraction for right-to-left languages and custom character sets.
Install with Tessl CLI
npx tessl i tessl/pypi-py-pdf2