Read and write PDFs with Python, powered by qpdf
—
Specialized operations including matrix transformations, coordinate systems, job interface, and tree structures for advanced PDF manipulation. These capabilities enable sophisticated PDF processing and analysis workflows.
2D transformation matrix for coordinate transformations and geometric operations.
class Matrix:
    """
    PDF transformation matrix for geometric operations.

    Represents a 2D transformation matrix with 6 elements in the form:
    [a b c d e f] which corresponds to the transformation:

        x' = a*x + c*y + e
        y' = b*x + d*y + f

    Used for scaling, rotation, translation, and skewing operations.
    """

    def __init__(self, a: float = 1, b: float = 0, c: float = 0,
                 d: float = 1, e: float = 0, f: float = 0) -> None:
        """
        Create a transformation matrix with specified elements.

        Parameters:
        - a (float): X-scaling component
        - b (float): Y-skewing component
        - c (float): X-skewing component
        - d (float): Y-scaling component
        - e (float): X-translation component
        - f (float): Y-translation component

        Default creates an identity matrix (no transformation).
        """

    @staticmethod
    def identity() -> Matrix:
        """
        Create an identity matrix that performs no transformation.

        Returns:
        Matrix: Identity matrix [1 0 0 1 0 0]
        """

    def translated(self, dx: float, dy: float) -> Matrix:
        """
        Create a new matrix with translation applied.

        Parameters:
        - dx (float): Translation distance in X direction
        - dy (float): Translation distance in Y direction

        Returns:
        Matrix: New matrix with translation transformation
        """

    def scaled(self, sx: float, sy: float | None = None) -> Matrix:
        """
        Create a new matrix with scaling applied.

        Parameters:
        - sx (float): Scale factor in X direction
        - sy (float, optional): Scale factor in Y direction (defaults to sx for uniform scaling)

        Returns:
        Matrix: New matrix with scaling transformation

        NOTE(review): confirm against the installed pikepdf version that
        ``sy`` may really be omitted; every example on this page passes both.
        """

    def rotated(self, angle_degrees: float) -> Matrix:
        """
        Create a new matrix with rotation applied.

        Parameters:
        - angle_degrees (float): Rotation angle in degrees (positive = counterclockwise)

        Returns:
        Matrix: New matrix with rotation transformation
        """

    def inverse(self) -> Matrix:
        """
        Calculate the inverse of this transformation matrix.

        Returns:
        Matrix: Inverse transformation matrix

        Raises:
        ValueError: If matrix is not invertible (determinant is zero)
        """

    def transform(self, point: tuple[float, float]) -> tuple[float, float]:
        """
        Transform a point using this matrix.

        Parameters:
        - point (tuple[float, float]): Point coordinates (x, y)

        Returns:
        tuple[float, float]: Transformed point coordinates (x', y')
        """

    def __mul__(self, other: Matrix) -> Matrix:
        """
        Matrix multiplication (composition of transformations).

        Parameters:
        - other (Matrix): Matrix to multiply with

        Returns:
        Matrix: Result of matrix multiplication

        NOTE(review): presumably recent pikepdf releases expose matrix
        composition through the ``@`` operator (``__matmul__``) rather than
        ``*`` - confirm which operator the installed version supports.
        """

    @property
    def a(self) -> float:
        """X-scaling component of the transformation."""

    @property
    def b(self) -> float:
        """Y-skewing component of the transformation."""

    @property
    def c(self) -> float:
        """X-skewing component of the transformation."""

    @property
    def d(self) -> float:
        """Y-scaling component of the transformation."""

    @property
    def e(self) -> float:
        """X-translation component of the transformation."""

    @property
    def f(self) -> float:
        """Y-translation component of the transformation."""

Command-line job interface providing access to qpdf functionality.
class Job:
    """
    Command-line job interface for advanced PDF operations.

    Provides access to qpdf's command-line functionality through
    a programmatic interface, enabling complex PDF processing workflows.

    NOTE(review): no constructor is documented in this section, so how a
    job receives its configuration is not shown here - confirm against the
    pikepdf API reference before relying on ``Job()`` with no arguments.
    """

    def run(self) -> int:
        """
        Execute the configured job.

        Returns:
        int: Exit code (0 for success, non-zero for failure)
        """

    def check_configuration(self) -> bool:
        """
        Validate the job configuration without executing.

        Returns:
        bool: True if configuration is valid

        Raises:
        JobUsageError: If configuration has errors
        """

    def create_pdf(self) -> Pdf:
        """
        Create a PDF object from the job configuration.

        Returns:
        Pdf: PDF object created by the job

        Raises:
        JobUsageError: If job doesn't create a PDF
        """

    def write_pdf(self, pdf: Pdf) -> None:
        """
        Write a PDF using the job's output configuration.

        Parameters:
        - pdf (Pdf): PDF to write using job settings
        """

    @property
    def creates_output(self) -> bool:
        """
        Whether this job creates output files.

        Returns:
        bool: True if job will create output
        """

    @property
    def has_warnings(self) -> bool:
        """
        Whether the job execution produced warnings.

        Returns:
        bool: True if warnings were generated
        """

    @property
    def exit_code(self) -> int:
        """
        Exit code from the last job execution.

        Returns:
        int: Exit code (0 = success)
        """

    @staticmethod
    def json_out_schema() -> dict:
        """
        Get the JSON schema for job output format.

        Returns:
        dict: JSON schema describing output structure
        """

    @staticmethod
    def job_json_schema() -> dict:
        """
        Get the JSON schema for job configuration format.

        Returns:
        dict: JSON schema describing job configuration structure
        """

Specialized tree data structures for PDF name trees and number trees.
class NameTree:
    """
    PDF name tree structure for sorted key-value storage.

    Name trees provide efficient storage and retrieval of key-value pairs
    where keys are byte strings sorted in lexical order.

    Implements MutableMapping[bytes, Object] interface.

    NOTE(review): confirm whether the installed pikepdf version yields keys
    as ``bytes`` or ``str`` when iterating; the examples on this page call
    ``key.decode()`` and so assume ``bytes``.
    """

    @staticmethod
    def new(pdf: Pdf) -> NameTree:
        """
        Create a new empty name tree.

        Parameters:
        - pdf (Pdf): PDF document to create the tree in

        Returns:
        NameTree: New empty name tree
        """

    def __len__(self) -> int:
        """Number of entries in the name tree."""

    def __iter__(self) -> Iterator[bytes]:
        """Iterate over keys in the name tree."""

    def __getitem__(self, key: bytes) -> Object:
        """
        Get value by key.

        Parameters:
        - key (bytes): Key to look up

        Returns:
        Object: Value associated with the key

        Raises:
        KeyError: If key is not found
        """

    def __setitem__(self, key: bytes, value: Object) -> None:
        """
        Set key-value pair.

        Parameters:
        - key (bytes): Key for the entry
        - value (Object): Value to store
        """

    def __delitem__(self, key: bytes) -> None:
        """
        Delete entry by key.

        Parameters:
        - key (bytes): Key to delete

        Raises:
        KeyError: If key is not found
        """

    def __contains__(self, key: bytes) -> bool:
        """Check if key exists in the tree."""
class NumberTree:
    """
    PDF number tree structure for sorted numeric key-value storage.

    Number trees provide efficient storage and retrieval of key-value pairs
    where keys are integers sorted in numeric order.

    Implements MutableMapping[int, Object] interface.
    """

    @staticmethod
    def new(pdf: Pdf) -> NumberTree:
        """
        Create a new empty number tree.

        Parameters:
        - pdf (Pdf): PDF document to create the tree in

        Returns:
        NumberTree: New empty number tree
        """

    def __len__(self) -> int:
        """Number of entries in the number tree."""

    def __iter__(self) -> Iterator[int]:
        """Iterate over keys in the number tree."""

    def __getitem__(self, key: int) -> Object:
        """
        Get value by numeric key.

        Parameters:
        - key (int): Numeric key to look up

        Returns:
        Object: Value associated with the key

        Raises:
        KeyError: If key is not found
        """

    def __setitem__(self, key: int, value: Object) -> None:
        """
        Set key-value pair.

        Parameters:
        - key (int): Numeric key for the entry
        - value (Object): Value to store
        """

    def __delitem__(self, key: int) -> None:
        """
        Delete entry by numeric key.

        Parameters:
        - key (int): Key to delete

        Raises:
        KeyError: If key is not found
        """

    def __contains__(self, key: int) -> bool:
        """Check if numeric key exists in the tree."""

Helper functions for working with coordinate systems and transformations.
def get_objects_with_ctm(pdf: Pdf) -> list[tuple[Object, Matrix]]:
    """
    Find objects with coordinate transformation matrices (CTM).

    Scans the PDF for objects that have associated transformation
    matrices, useful for analyzing coordinate system changes.

    Parameters:
    - pdf (Pdf): PDF document to analyze

    Returns:
    list[tuple[Object, Matrix]]: List of (object, transformation_matrix) pairs

    NOTE(review): confirm the import location and the accepted argument
    (whole document vs. a single page or content stream) against the
    installed pikepdf version.
    """

Global pikepdf configuration functions for controlling behavior.
def get_decimal_precision() -> int:
    """
    Get the current decimal precision for floating-point output.

    Controls how many decimal places are used when writing
    floating-point numbers to PDF files.

    Returns:
    int: Current precision (number of decimal places)
    """
def set_decimal_precision(precision: int) -> None:
    """
    Set the decimal precision for floating-point output.

    Parameters:
    - precision (int): Number of decimal places (typically 2-6)

    Raises:
    ValueError: If precision is out of valid range
    """
def set_flate_compression_level(level: int) -> None:
    """
    Set the compression level for Flate (deflate) streams.

    Controls the trade-off between compression speed and compression ratio
    when compressing PDF streams using Flate encoding.

    Parameters:
    - level (int): Compression level (0-9, where 0=no compression, 9=maximum compression)

    Raises:
    ValueError: If level is out of valid range (0-9)
    """

Utility classes for advanced PDF object manipulation.
class ObjectHelper:
    """
    Helper class for PDF object operations.

    Provides utility methods for advanced object manipulation
    and analysis that don't fit into the main object classes.
    """
    # Note: Specific methods would be documented based on actual implementation
# This class provides low-level object utilities
import pikepdf
import math
# --- Basic matrices: one per elementary operation, printed as created ---

# Identity: leaves every point unchanged
identity = pikepdf.Matrix.identity()
print(
    f"Identity matrix: [{identity.a}, {identity.b}, {identity.c}, {identity.d}, {identity.e}, {identity.f}]"
)

# Translation by (100, 50)
translate = pikepdf.Matrix().translated(100, 50)
print(
    f"Translation (100, 50): [{translate.a}, {translate.b}, {translate.c}, {translate.d}, {translate.e}, {translate.f}]"
)

# Non-uniform scaling: 2x width, 1.5x height
scale = pikepdf.Matrix().scaled(2.0, 1.5)
print(
    f"Scaling (2.0, 1.5): [{scale.a}, {scale.b}, {scale.c}, {scale.d}, {scale.e}, {scale.f}]"
)

# Rotation by 45 degrees
rotate = pikepdf.Matrix().rotated(45)
print(
    f"Rotation 45°: [{rotate.a:.3f}, {rotate.b:.3f}, {rotate.c:.3f}, {rotate.d:.3f}, {rotate.e}, {rotate.f}]"
)

# --- Combined transformation: scale, then rotate, then translate ---
combined = pikepdf.Matrix()
combined = combined.scaled(1.5, 1.5)
combined = combined.rotated(30)
combined = combined.translated(100, 200)
print(
    f"Combined transform: [{combined.a:.3f}, {combined.b:.3f}, {combined.c:.3f}, {combined.d:.3f}, {combined.e:.1f}, {combined.f:.1f}]"
)

# --- Mapping a point through the combined matrix ---
original_point = (10, 20)
transformed_point = combined.transform(original_point)
print(f"Point {original_point} -> {transformed_point}")

# --- Composition by multiplication ---
# NOTE(review): presumably recent pikepdf versions compose matrices with `@`
# rather than `*`; `*` is kept to match the Matrix.__mul__ stub documented
# above - confirm against the installed version.
m1 = pikepdf.Matrix().scaled(2, 2)
m2 = pikepdf.Matrix().rotated(90)
m3 = m1 * m2  # Apply m1 first, then m2
print(
    f"Matrix multiplication result: [{m3.a:.3f}, {m3.b:.3f}, {m3.c:.3f}, {m3.d:.3f}, {m3.e}, {m3.f}]"
)

# --- Inverse: undo a scale-then-translate transformation ---
original_matrix = pikepdf.Matrix()
original_matrix = original_matrix.scaled(2, 3)
original_matrix = original_matrix.translated(10, 15)
inverse_matrix = original_matrix.inverse()

# Round-trip check: the inverse should map the transformed point back
point = (5, 7)
transformed = original_matrix.transform(point)
back_to_original = inverse_matrix.transform(transformed)
print(f"Original: {point}, Transformed: {transformed}, Back: {back_to_original}")
import pikepdf
def apply_transformation_to_page(page, matrix):
    """Apply a transformation matrix to all content on a page.

    The existing content is wrapped between ``q <matrix> cm`` and ``Q`` so
    the transformation is scoped to the page's own content.

    Parameters:
    - page: Page (or page dictionary) whose ``/Contents`` is rewritten.
      Pages without ``/Contents`` are left untouched.
    - matrix: Object exposing numeric ``a``..``f`` attributes.

    Returns:
    None. The page is modified in place.
    """
    if '/Contents' not in page:
        return

    def _pdf_number(value):
        # PDF number syntax has no exponent form, so values such as 6.1e-17
        # (common after rotations) must be written fixed-point.
        text = f"{float(value):.6f}".rstrip("0").rstrip(".")
        return "0" if text in ("", "-0") else text

    existing_content = page['/Contents']
    operands = " ".join(
        _pdf_number(component)
        for component in (matrix.a, matrix.b, matrix.c,
                          matrix.d, matrix.e, matrix.f)
    )
    # Save graphics state and set the matrix before the content ...
    transform_commands = f"\nq\n{operands} cm\n"
    # ... and restore the graphics state afterwards.
    restore_commands = "\nQ"

    if isinstance(existing_content, pikepdf.Array):
        # Multiple content streams: bracket them with two small new streams.
        transform_stream = pikepdf.Stream(page.owner, transform_commands.encode())
        restore_stream = pikepdf.Stream(page.owner, restore_commands.encode())
        new_contents = pikepdf.Array([transform_stream])
        new_contents.extend(existing_content)
        new_contents.append(restore_stream)
        page['/Contents'] = new_contents
    else:
        # Single content stream: rebuild it with the wrapper inlined.
        new_content = (
            transform_commands.encode()
            + existing_content.read_bytes()
            + restore_commands.encode()
        )
        page['/Contents'] = pikepdf.Stream(page.owner, new_content)
# Apply transformation to a PDF page

# Create a transformation matrix (rotate 15 degrees and scale 90%)
transform_matrix = pikepdf.Matrix().rotated(15).scaled(0.9, 0.9)

# The context manager closes the file even if the transformation or the
# save raises, which the explicit open()/close() pair did not guarantee.
with pikepdf.open('document.pdf') as pdf:
    page = pdf.pages[0]
    # Apply transformation
    apply_transformation_to_page(page, transform_matrix)
    pdf.save('transformed_document.pdf')

print("Applied transformation to page content")
import pikepdf
# Create a PDF with name tree.
# The context manager closes the document even if a step below raises.
with pikepdf.new() as pdf:
    # A newly created PDF has no pages, so `pdf.pages[0]` would raise
    # IndexError; add a blank page to give the destinations a target.
    first_page = pdf.add_blank_page().obj

    # Create a name tree for storing named destinations
    name_tree = pikepdf.NameTree.new(pdf)

    # Add entries to the name tree
    destinations = {
        b'chapter1': pikepdf.Array([first_page, pikepdf.Name.Fit]),
        b'section1.1': pikepdf.Array([first_page, pikepdf.Name.FitH, 700]),
        b'appendix': pikepdf.Array([first_page, pikepdf.Name.FitV, 100]),
    }
    for name, destination in destinations.items():
        name_tree[name] = destination
        print(f"Added destination: {name.decode()} -> {destination}")
    print(f"Name tree contains {len(name_tree)} entries")

    # Iterate through name tree
    # NOTE(review): `key.decode()` assumes iteration yields bytes keys, as
    # the NameTree stub above documents - confirm for the installed version.
    print("All entries in name tree:")
    for key in name_tree:
        value = name_tree[key]
        print(f" {key.decode()}: {value}")

    # Create a number tree for page labels
    number_tree = pikepdf.NumberTree.new(pdf)

    # Add page labels (page number -> label format)
    page_labels = {
        0: pikepdf.Dictionary({'/S': pikepdf.Name.r}),  # Roman numerals
        5: pikepdf.Dictionary({'/S': pikepdf.Name.D, '/P': pikepdf.String('Page ')}),  # Decimal with prefix
        10: pikepdf.Dictionary({'/S': pikepdf.Name.a}),  # Lowercase letters
    }
    for page_num, label_dict in page_labels.items():
        number_tree[page_num] = label_dict
        print(f"Added page label: Page {page_num} -> {label_dict}")
    print(f"Number tree contains {len(number_tree)} entries")

    # Save PDF with trees
    # NOTE(review): neither tree is referenced from the document catalog
    # here (e.g. via /Names or /PageLabels), so presumably they are not
    # reachable in the saved file - attach them if they must persist.
    pdf.save('document_with_trees.pdf')
import pikepdf
import json
def process_pdf_with_job_interface(input_pdf, output_pdf, operations):
    """Use job interface for complex PDF processing.

    Parameters:
    - input_pdf: Path of the PDF to read.
    - output_pdf: Path of the PDF to write.
    - operations: Operation descriptions stored under the ``'operations'``
      key of the job configuration.

    Returns:
    bool: True if the job ran and reported exit code 0, False if the
    configuration was rejected, the job failed, or a JobUsageError occurred.
    """
    # Create a job configuration
    job_config = {
        'inputFile': input_pdf,
        'outputFile': output_pdf,
        'staticId': True,  # Reproducible output
        'deterministicId': True,
        'operations': operations
    }
    try:
        # Hand the configuration to the job; previously the dictionary was
        # built and then discarded, so the job ran unconfigured.
        # NOTE(review): this assumes Job accepts a configuration mapping and
        # that these keys match qpdf's job JSON schema (see
        # Job.job_json_schema()) - confirm; 'operations' in particular looks
        # illustrative rather than a real qpdf key.
        job = pikepdf.Job(job_config)

        # Validate configuration. Only an explicit False is treated as
        # invalid, so an implementation that signals problems purely by
        # raising JobUsageError (and returns None) also works.
        if job.check_configuration() is False:
            print("Job configuration is invalid")
            return False
        print("Job configuration is valid")

        # Execute the job, then read the documented exit_code property
        # rather than depending on run()'s return value.
        job.run()
        exit_code = job.exit_code
        if exit_code != 0:
            print(f"Job failed with exit code: {exit_code}")
            return False

        print(f"Job completed successfully: {input_pdf} -> {output_pdf}")
        if job.has_warnings:
            print("Job completed with warnings")
        return True
    except pikepdf.JobUsageError as e:
        print(f"Job usage error: {e}")
        return False
# Example job operations, applied in the order listed.
# NOTE(review): the 'operation'/'parameters' layout appears illustrative;
# confirm it against qpdf's job JSON schema before real use.
operations = [
    {
        'operation': 'qdf',  # Convert to QDF format for inspection
        'parameters': {},
    },
    {
        'operation': 'optimize',
        'parameters': {'compress-streams': True},
    },
    {
        'operation': 'linearize',  # Linearize for fast web view
        'parameters': {},
    },
]
# Process PDF with job interface
# success = process_pdf_with_job_interface('input.pdf', 'output.pdf', operations)
import pikepdf
def configure_pikepdf_settings():
    """Configure pikepdf global settings for optimal performance.

    Prints the decimal precision before and after the change, sets the
    precision to 3 and the Flate compression level to 6. Returns None.
    """
    settings = pikepdf.settings

    # Report the precision in effect before changing anything
    print(f"Current decimal precision: {settings.get_decimal_precision()}")

    # Fewer decimal places give cleaner output
    settings.set_decimal_precision(3)
    print("Set decimal precision to 3 places")

    # Level 6 balances compression speed against output size
    settings.set_flate_compression_level(6)
    print("Set Flate compression level to 6 (medium)")

    # Read the precision back to confirm the change took effect
    print(f"New decimal precision: {settings.get_decimal_precision()}")
def create_optimized_pdf():
    """Create a PDF with optimized settings.

    Applies the global settings via configure_pikepdf_settings(), builds a
    one-page document containing a short text content stream, and saves it
    to 'optimized_output.pdf' with stream compression and content
    normalization enabled. Returns None.
    """
    # Configure settings for clean, compact output
    configure_pikepdf_settings()

    # Content with floating-point coordinates.
    # NOTE(review): presumably the decimal-precision setting governs how
    # pikepdf writes numeric objects, not numbers already embedded in raw
    # content-stream bytes like these - confirm.
    content = (
        "\n"
        "BT\n"
        "/F1 12 Tf\n"
        "100.123456789 700.987654321 Td\n"
        "(Optimized PDF with controlled precision) Tj\n"
        "ET\n"
    )

    # The context manager closes the document even if saving fails, which
    # the explicit close() at the end did not guarantee.
    with pikepdf.new() as pdf:
        page = pdf.add_blank_page()
        # NOTE(review): /F1 is referenced by the content stream, but no font
        # resource is added to the page here - viewers may not render the
        # text until one is defined.
        page['/Contents'] = pikepdf.Stream(pdf, content.encode())

        # Save with compression and optimization
        pdf.save('optimized_output.pdf',
                 compress_streams=True,
                 normalize_content=True)

    print("Created optimized PDF with controlled precision and compression")
# Configure and create optimized PDF
# create_optimized_pdf()
import pikepdf
def analyze_object_relationships(pdf_path):
    """Analyze complex object relationships in a PDF.

    Parameters:
    - pdf_path: Path of the PDF to open.

    Returns:
    dict: Analysis with the keys 'total_objects', 'object_types' (type code
    string -> count), 'indirect_objects', 'shared_objects' (keyed by
    "id/gen") and 'complex_structures'. A summary is also printed.
    """
    # The context manager closes the file even if the analysis raises.
    with pikepdf.open(pdf_path) as pdf:
        analysis = {
            'total_objects': len(pdf.objects),
            'object_types': {},
            'indirect_objects': 0,
            'shared_objects': {},
            'complex_structures': {}
        }
        type_counts = analysis['object_types']

        # Analyze all objects. The object table is iterated directly and
        # each object's (id, generation) pair is read from the object.
        # NOTE(review): relies on `Object.objgen`; confirm for the installed
        # pikepdf version.
        for obj in pdf.objects:
            obj_id, gen = obj.objgen

            # Count object types
            # NOTE(review): `_type_code` is a private attribute - confirm
            # it is stable enough for this use.
            obj_type = str(obj._type_code)
            type_counts[obj_type] = type_counts.get(obj_type, 0) + 1

            if obj.is_indirect:
                analysis['indirect_objects'] += 1
                # Record every indirect object as a sharing candidate.
                # (Simplified: counting actual references would need a full
                # PDF traversal.)
                analysis['shared_objects'][f"{obj_id}/{gen}"] = {
                    'type': obj_type,
                    'size': len(str(obj))
                }

        # Find coordinate transformation matrices. Kept best-effort: any
        # failure is reported and the rest of the analysis is still returned.
        try:
            ctm_objects = pikepdf.get_objects_with_ctm(pdf)
            analysis['complex_structures']['objects_with_ctm'] = len(ctm_objects)
            print(f"Found {len(ctm_objects)} objects with coordinate transformations:")
            for ctm_obj, matrix in ctm_objects[:5]:  # Show first 5
                print(f" Object {ctm_obj}: Matrix [{matrix.a:.2f}, {matrix.b:.2f}, {matrix.c:.2f}, {matrix.d:.2f}, {matrix.e:.2f}, {matrix.f:.2f}]")
        except Exception as e:
            print(f"Could not analyze CTM objects: {e}")

    print(f"\nPDF Object Analysis for {pdf_path}:")
    print(f"Total objects: {analysis['total_objects']}")
    print(f"Indirect objects: {analysis['indirect_objects']}")
    print(f"\nObject types:")
    for obj_type, count in sorted(analysis['object_types'].items()):
        print(f" {obj_type}: {count}")
    return analysis
# Analyze object relationships
# analysis = analyze_object_relationships('complex_document.pdf')
import pikepdf
import time
from pathlib import Path
def benchmark_pdf_operations(pdf_path):
    """Benchmark various PDF operations for performance analysis.

    Times opening the file, page access, content-stream parsing of the
    first page, reading the object count, and saving to a temporary
    '<name>.benchmark.pdf' file next to the input, then prints a report.

    Parameters:
    - pdf_path: Path of the PDF to benchmark.

    Returns:
    dict: Elapsed seconds under 'open', 'page_access', 'object_iteration'
    and 'save', plus 'object_count'. When the document has pages it also
    holds 'content_parsing' (seconds, or a "Failed: ..." string) and, on
    success, 'instruction_count'.
    """
    operations = {}
    # perf_counter is monotonic and high-resolution; wall-clock time.time()
    # can jump and is too coarse for short intervals.
    clock = time.perf_counter
    # NOTE(review): an existing file at this path is overwritten and then
    # deleted - confirm that is acceptable for callers.
    output_path = Path(pdf_path).with_suffix('.benchmark.pdf')

    # Time PDF opening; the context manager closes the file even on failure
    start_time = clock()
    with pikepdf.open(pdf_path) as pdf:
        operations['open'] = clock() - start_time

        # Time page access
        start_time = clock()
        page_count = len(pdf.pages)
        first_page = pdf.pages[0] if page_count > 0 else None
        operations['page_access'] = clock() - start_time

        # Time content parsing (explicit None check rather than relying on
        # the truthiness of a page object)
        if first_page is not None:
            start_time = clock()
            try:
                instructions = pikepdf.parse_content_stream(first_page)
                operations['content_parsing'] = clock() - start_time
                operations['instruction_count'] = len(instructions)
            except Exception as e:
                # Best-effort: record the failure and keep benchmarking.
                operations['content_parsing'] = f"Failed: {e}"

        # Time object iteration
        start_time = clock()
        object_count = len(pdf.objects)
        operations['object_iteration'] = clock() - start_time
        operations['object_count'] = object_count

        # Time save operation; always clean up the benchmark file, even if
        # the save fails part-way.
        start_time = clock()
        try:
            pdf.save(str(output_path))
            operations['save'] = clock() - start_time
        finally:
            output_path.unlink(missing_ok=True)

    print(f"Performance Benchmark for {pdf_path}:")
    print(f" Open: {operations['open']:.3f}s")
    print(f" Page access ({page_count} pages): {operations['page_access']:.3f}s")
    if 'content_parsing' in operations:
        if isinstance(operations['content_parsing'], str):
            print(f" Content parsing: {operations['content_parsing']}")
        else:
            print(f" Content parsing ({operations.get('instruction_count', 0)} instructions): {operations['content_parsing']:.3f}s")
    print(f" Object iteration ({object_count} objects): {operations['object_iteration']:.3f}s")
    print(f" Save: {operations['save']:.3f}s")
    return operations
def optimize_pdf_processing():
    """Demonstrate techniques for optimizing PDF processing performance.

    Lowers the decimal precision to 2 and the Flate compression level to 1,
    prints the resulting configuration, then prints a list of performance
    tips. Returns None.
    """
    settings = pikepdf.settings

    # Favor speed: fewer decimal places and the fastest compression level
    settings.set_decimal_precision(2)
    settings.set_flate_compression_level(1)

    print("Configured pikepdf for performance:")
    print(f" Decimal precision: {settings.get_decimal_precision()}")
    print(" Compression level: 1 (fast)")

    # Tips are printed in order, one per line, after a header
    tips = (
        "1. Use access_mode=pikepdf.AccessMode.mmap for large files",
        "2. Set suppress_warnings=True to reduce overhead",
        "3. Use static_id=True for reproducible output without timestamp overhead",
        "4. Consider stream_decode_level for controlling decoding complexity",
        "5. Process pages in batches for large documents",
        "6. Cache parsed content streams if reusing",
        "7. Use pikepdf.new() instead of opening/clearing for new documents",
    )
    print("\nPerformance optimization tips:")
    for tip in tips:
        print(tip)
# Run performance analysis
# if Path('document.pdf').exists():
# benchmark_pdf_operations('document.pdf')
optimize_pdf_processing()

Install with Tessl CLI

npx tessl i tessl/pypi-pikepdf