tessl/pypi-pikepdf

Read and write PDFs with Python, powered by qpdf

—

Pending

Overview

Eval results

Files

Advanced Operations

Name: tessl/pypi-pikepdf
Author: tessl

Specialized operations including matrix transformations, coordinate systems, job interface, and tree structures for advanced PDF manipulation. These capabilities enable sophisticated PDF processing and analysis workflows.

Capabilities

Matrix Class

2D transformation matrix for coordinate transformations and geometric operations.

class Matrix:
    """
    PDF transformation matrix for 2D geometric operations.

    Holds the six elements [a b c d e f] of an affine transformation that
    maps a point (x, y) to (x', y'):
    x' = a*x + c*y + e
    y' = b*x + d*y + f

    Used for scaling, rotation, translation, and skewing operations.
    The transformation methods below are documented as returning a new
    Matrix rather than modifying the one they are called on.
    """

    def __init__(self, a: float = 1, b: float = 0, c: float = 0, 
                 d: float = 1, e: float = 0, f: float = 0) -> None:
        """
        Create a transformation matrix with specified elements.

        Parameters:
        - a (float): X-scaling component (coefficient of x in x')
        - b (float): Y-skewing component (coefficient of x in y')
        - c (float): X-skewing component (coefficient of y in x')
        - d (float): Y-scaling component (coefficient of y in y')
        - e (float): X-translation component (constant term of x')
        - f (float): Y-translation component (constant term of y')

        With the default arguments (1, 0, 0, 1, 0, 0) the result is the
        identity matrix, i.e. no transformation.
        """

    @staticmethod
    def identity() -> Matrix:
        """
        Create an identity matrix that performs no transformation.

        Returns:
        Matrix: Identity matrix [1 0 0 1 0 0]
        """

    def translated(self, dx: float, dy: float) -> Matrix:
        """
        Create a new matrix with translation applied.

        Parameters:
        - dx (float): Translation distance in X direction
        - dy (float): Translation distance in Y direction

        Returns:
        Matrix: New matrix with translation transformation
        """

    def scaled(self, sx: float, sy: float | None = None) -> Matrix:
        """
        Create a new matrix with scaling applied.

        Parameters:
        - sx (float): Scale factor in X direction
        - sy (float, optional): Scale factor in Y direction; when omitted
          (None), sx is used for both axes (uniform scaling)

        Returns:
        Matrix: New matrix with scaling transformation
        """

    def rotated(self, angle_degrees: float) -> Matrix:
        """
        Create a new matrix with rotation applied.

        Parameters:
        - angle_degrees (float): Rotation angle in degrees (positive = counterclockwise)

        Returns:
        Matrix: New matrix with rotation transformation
        """

    def inverse(self) -> Matrix:
        """
        Calculate the inverse of this transformation matrix.

        Applying the inverse to a transformed point yields the original point.

        Returns:
        Matrix: Inverse transformation matrix

        Raises:
        ValueError: If matrix is not invertible (determinant is zero)
        """

    def transform(self, point: tuple[float, float]) -> tuple[float, float]:
        """
        Transform a point using this matrix.

        Parameters:
        - point (tuple[float, float]): Point coordinates (x, y)

        Returns:
        tuple[float, float]: Transformed point coordinates (x', y'), computed
        with the formulas given in the class description
        """

    def __mul__(self, other: Matrix) -> Matrix:
        """
        Matrix multiplication (composition of transformations).

        NOTE(review): upstream pikepdf documents matrix composition through
        the ``@`` operator (``__matmul__``); confirm that ``*`` is actually
        supported before relying on it.

        Parameters:
        - other (Matrix): Matrix to multiply with

        Returns:
        Matrix: Result of matrix multiplication
        """

    @property
    def a(self) -> float:
        """X-scaling component: coefficient of x in x' = a*x + c*y + e."""

    @property  
    def b(self) -> float:
        """Y-skewing component: coefficient of x in y' = b*x + d*y + f."""

    @property
    def c(self) -> float:
        """X-skewing component: coefficient of y in x' = a*x + c*y + e."""

    @property
    def d(self) -> float:
        """Y-scaling component: coefficient of y in y' = b*x + d*y + f."""

    @property
    def e(self) -> float:
        """X-translation component: constant term of x' = a*x + c*y + e."""

    @property
    def f(self) -> float:
        """Y-translation component: constant term of y' = b*x + d*y + f."""

Job Interface

Programmatic access to qpdf's command-line functionality: a Job bundles a qpdf job configuration that can be validated, executed, and inspected from Python.

class Job:
    """
    Programmatic wrapper around qpdf's command-line job functionality.

    Provides access to qpdf's command-line functionality through
    a programmatic interface, enabling complex PDF processing workflows.
    A job is configured first, may then be validated with
    check_configuration(), and is executed with run().
    """

    def run(self) -> int:
        """
        Execute the configured job.

        Returns:
        int: Exit code (0 for success, non-zero for failure)
        """

    def check_configuration(self) -> bool:
        """
        Validate the job configuration without executing.

        NOTE(review): the contract below both returns a bool and raises on
        errors; confirm whether an invalid configuration ever yields False
        instead of raising.

        Returns:
        bool: True if configuration is valid

        Raises:
        JobUsageError: If configuration has errors
        """

    def create_pdf(self) -> Pdf:
        """
        Create a PDF object from the job configuration.

        Returns:
        Pdf: PDF object created by the job

        Raises:
        JobUsageError: If job doesn't create a PDF
        """

    def write_pdf(self, pdf: Pdf) -> None:
        """
        Write a PDF using the job's output configuration.

        Parameters:
        - pdf (Pdf): PDF to write using job settings
        """

    @property
    def creates_output(self) -> bool:
        """
        Whether this job creates output files.

        Returns:
        bool: True if job will create output
        """

    @property
    def has_warnings(self) -> bool:
        """
        Whether the job execution produced warnings.

        Returns:
        bool: True if warnings were generated
        """

    @property
    def exit_code(self) -> int:
        """
        Exit code from the last job execution.

        Returns:
        int: Exit code (0 = success)
        """

    @staticmethod
    def json_out_schema() -> dict:
        """
        Get the JSON schema for job output format.

        Returns:
        dict: JSON schema describing output structure
        """

    @staticmethod
    def job_json_schema() -> dict:
        """
        Get the JSON schema for job configuration format.

        Returns:
        dict: JSON schema describing job configuration structure
        """

Tree Structures

Specialized tree data structures for PDF name trees and number trees.

class NameTree:
    """
    PDF name tree structure for sorted key-value storage.

    Name trees provide efficient storage and retrieval of key-value pairs
    where keys are byte strings sorted in lexical order.

    Implements MutableMapping[bytes, Object] interface, so entries are read,
    written and deleted with the usual ``tree[key]`` mapping syntax.
    """

    @staticmethod
    def new(pdf: Pdf) -> NameTree:
        """
        Create a new empty name tree.

        Parameters:
        - pdf (Pdf): PDF document to create the tree in

        Returns:
        NameTree: New empty name tree
        """

    def __len__(self) -> int:
        """Return the number of entries in the name tree."""

    def __iter__(self) -> Iterator[bytes]:
        """Iterate over the keys (byte strings) in the name tree."""

    def __getitem__(self, key: bytes) -> Object:
        """
        Get value by key.

        Parameters:
        - key (bytes): Key to look up

        Returns:
        Object: Value associated with the key

        Raises:
        KeyError: If key is not found
        """

    def __setitem__(self, key: bytes, value: Object) -> None:
        """
        Set key-value pair.

        Parameters:
        - key (bytes): Key for the entry
        - value (Object): Value to store
        """

    def __delitem__(self, key: bytes) -> None:
        """
        Delete entry by key.

        Parameters:
        - key (bytes): Key to delete

        Raises:
        KeyError: If key is not found
        """

    def __contains__(self, key: bytes) -> bool:
        """
        Check if key exists in the tree.

        Parameters:
        - key (bytes): Key to test for

        Returns:
        bool: True if the tree has an entry for the key
        """

class NumberTree:
    """
    PDF number tree structure for sorted numeric key-value storage.

    Number trees provide efficient storage and retrieval of key-value pairs
    where keys are integers sorted in numeric order.

    Implements MutableMapping[int, Object] interface, so entries are read,
    written and deleted with the usual ``tree[key]`` mapping syntax.
    """

    @staticmethod
    def new(pdf: Pdf) -> NumberTree:
        """
        Create a new empty number tree.

        Parameters:
        - pdf (Pdf): PDF document to create the tree in

        Returns:
        NumberTree: New empty number tree
        """

    def __len__(self) -> int:
        """Return the number of entries in the number tree."""

    def __iter__(self) -> Iterator[int]:
        """Iterate over the integer keys in the number tree."""

    def __getitem__(self, key: int) -> Object:
        """
        Get value by numeric key.

        Parameters:
        - key (int): Numeric key to look up

        Returns:
        Object: Value associated with the key

        Raises:
        KeyError: If key is not found
        """

    def __setitem__(self, key: int, value: Object) -> None:
        """
        Set key-value pair.

        Parameters:
        - key (int): Numeric key for the entry
        - value (Object): Value to store
        """

    def __delitem__(self, key: int) -> None:
        """
        Delete entry by numeric key.

        Parameters:
        - key (int): Key to delete

        Raises:
        KeyError: If key is not found
        """

    def __contains__(self, key: int) -> bool:
        """
        Check if numeric key exists in the tree.

        Parameters:
        - key (int): Numeric key to test for

        Returns:
        bool: True if the tree has an entry for the key
        """

Coordinate Transformation Utilities

Helper functions for working with coordinate systems and transformations.

def get_objects_with_ctm(pdf: Pdf) -> list[tuple[Object, Matrix]]:
    """
    Find objects together with their current transformation matrix (CTM).

    Scans the PDF for objects that have associated transformation
    matrices, useful for analyzing coordinate system changes.

    NOTE(review): upstream pikepdf appears to define this helper per page
    rather than per document - TODO confirm the expected argument type.

    Parameters:
    - pdf (Pdf): PDF document to analyze

    Returns:
    list[tuple[Object, Matrix]]: List of (object, transformation_matrix) pairs
    """

Settings and Configuration

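Global pikepdf configuration functions that control how floating-point numbers are written and how Flate streams are compressed. The usage examples below call them through the `pikepdf.settings` module.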

def get_decimal_precision() -> int:
    """
    Get the current decimal precision for floating-point output.

    The precision controls how many decimal places are used when writing
    floating-point numbers to PDF files. It is a global setting, changed
    with set_decimal_precision().

    Returns:
    int: Current precision (number of decimal places)
    """

def set_decimal_precision(precision: int) -> None:
    """
    Set the decimal precision for floating-point output.

    This is the global counterpart of get_decimal_precision(): it sets how
    many decimal places are used when floating-point numbers are written.

    Parameters:
    - precision (int): Number of decimal places (typically 2-6)

    Raises:
    ValueError: If precision is out of valid range
    """

def set_flate_compression_level(level: int) -> None:
    """
    Set the compression level for Flate (deflate) streams.

    Controls the trade-off between compression speed and compression ratio
    when compressing PDF streams using Flate encoding: lower levels are
    faster, higher levels produce smaller output.

    Parameters:
    - level (int): Compression level (0-9, where 0=no compression, 9=maximum compression)

    Raises:
    ValueError: If level is out of valid range (0-9)
    """

Helper Classes

Utility classes for advanced PDF object manipulation.

class ObjectHelper:
    """
    Helper class for PDF object operations.

    Provides utility methods for advanced object manipulation
    and analysis that don't fit into the main object classes.
    No methods are listed on this page.
    """

    # Note: Specific methods would be documented based on actual implementation
    # This class provides low-level object utilities
    # NOTE(review): the method list is intentionally absent here; consult the
    # upstream pikepdf documentation before relying on any particular member.

Usage Examples

Matrix Transformations

import pikepdf
import math

# Create various transformation matrices
identity = pikepdf.Matrix.identity()
print(f"Identity matrix: [{identity.a}, {identity.b}, {identity.c}, {identity.d}, {identity.e}, {identity.f}]")

# Translation
translate = pikepdf.Matrix().translated(100, 50)
print(f"Translation (100, 50): [{translate.a}, {translate.b}, {translate.c}, {translate.d}, {translate.e}, {translate.f}]")

# Scaling
scale = pikepdf.Matrix().scaled(2.0, 1.5)  # 2x width, 1.5x height
print(f"Scaling (2.0, 1.5): [{scale.a}, {scale.b}, {scale.c}, {scale.d}, {scale.e}, {scale.f}]")

# Rotation (45 degrees)
rotate = pikepdf.Matrix().rotated(45)
print(f"Rotation 45°: [{rotate.a:.3f}, {rotate.b:.3f}, {rotate.c:.3f}, {rotate.d:.3f}, {rotate.e}, {rotate.f}]")

# Combined transformation built by chaining scaled -> rotated -> translated.
# Per pikepdf's Matrix documentation each chained call pre-multiplies the
# existing matrix, so a point passed through `combined` is effectively
# translated first, then rotated, then scaled. Reverse the call order if the
# opposite effect is wanted.
combined = pikepdf.Matrix().scaled(1.5, 1.5).rotated(30).translated(100, 200)
print(f"Combined transform: [{combined.a:.3f}, {combined.b:.3f}, {combined.c:.3f}, {combined.d:.3f}, {combined.e:.1f}, {combined.f:.1f}]")

# Transform points
original_point = (10, 20)
transformed_point = combined.transform(original_point)
print(f"Point {original_point} -> {transformed_point}")

# Matrix multiplication (composition). pikepdf composes matrices with the
# `@` (matmul) operator rather than `*`.
m1 = pikepdf.Matrix().scaled(2, 2)
m2 = pikepdf.Matrix().rotated(90)
m3 = m1 @ m2  # Apply m1 first, then m2
print(f"Matrix multiplication result: [{m3.a:.3f}, {m3.b:.3f}, {m3.c:.3f}, {m3.d:.3f}, {m3.e}, {m3.f}]")

# Inverse transformation
original_matrix = pikepdf.Matrix().scaled(2, 3).translated(10, 15)
inverse_matrix = original_matrix.inverse()

# Verify inverse (should return original point, up to floating-point error)
point = (5, 7)
transformed = original_matrix.transform(point)
back_to_original = inverse_matrix.transform(transformed)
print(f"Original: {point}, Transformed: {transformed}, Back: {back_to_original}")

Applying Transformations to PDF Content

import pikepdf

def apply_transformation_to_page(page, matrix):
    """Apply a transformation matrix to all content on a page.

    The page's existing content is wrapped as ``q <a b c d e f> cm ... Q`` so
    that the matrix applies to everything already drawn and the graphics
    state is restored afterwards. Pages without a ``/Contents`` entry are
    left untouched.

    Parameters:
    - page: Page whose ``/Contents`` entry is rewritten in place; its
      ``owner`` is used to create the new streams.
    - matrix: Object exposing the six matrix elements as attributes
      ``a`` .. ``f`` (e.g. ``pikepdf.Matrix``).

    Returns:
    None
    """
    if '/Contents' not in page:
        return

    existing_content = page['/Contents']

    # PDF content streams only accept plain decimal numbers. Python's default
    # float formatting can emit exponent notation (e.g. 6.12e-17 for the
    # cosine of 90 degrees), which PDF parsers reject, so the operands are
    # written in fixed-point form.
    operands = " ".join(
        f"{value:.6f}"
        for value in (matrix.a, matrix.b, matrix.c, matrix.d, matrix.e, matrix.f)
    )
    transform_commands = f"q\n{operands} cm\n".encode()
    restore_commands = b"\nQ"

    if isinstance(existing_content, pikepdf.Array):
        # Multiple content streams: add one stream in front and one behind
        # instead of touching the existing streams.
        new_contents = pikepdf.Array([pikepdf.Stream(page.owner, transform_commands)])
        new_contents.extend(existing_content)
        new_contents.append(pikepdf.Stream(page.owner, restore_commands))
        page['/Contents'] = new_contents
    else:
        # Single content stream: rebuild it with the wrapper around its data.
        new_content = transform_commands + existing_content.read_bytes() + restore_commands
        page['/Contents'] = pikepdf.Stream(page.owner, new_content)

# Apply transformation to a PDF page. The context manager closes the file
# even if transforming or saving raises.
with pikepdf.open('document.pdf') as pdf:
    page = pdf.pages[0]

    # Create a transformation matrix (rotate 15 degrees and scale 90%)
    transform_matrix = pikepdf.Matrix().rotated(15).scaled(0.9, 0.9)

    # Apply transformation
    apply_transformation_to_page(page, transform_matrix)

    pdf.save('transformed_document.pdf')

print("Applied transformation to page content")

Working with Name and Number Trees

import pikepdf

# Create a PDF with name tree. A freshly created Pdf has no pages, so add one
# for the destinations to point at (indexing pdf.pages[0] on an empty document
# raises IndexError).
pdf = pikepdf.new()
target_page = pdf.add_blank_page()

# Create a name tree for storing named destinations
name_tree = pikepdf.NameTree.new(pdf)

# Add entries to the name tree. Destinations reference the underlying page
# object (Page.obj), followed by the fit mode and its arguments.
destinations = {
    b'chapter1': pikepdf.Array([target_page.obj, pikepdf.Name.Fit]),
    b'section1.1': pikepdf.Array([target_page.obj, pikepdf.Name.FitH, 700]),
    b'appendix': pikepdf.Array([target_page.obj, pikepdf.Name.FitV, 100]),
}

for name, destination in destinations.items():
    name_tree[name] = destination
    print(f"Added destination: {name.decode()} -> {destination}")

print(f"Name tree contains {len(name_tree)} entries")

# Iterate through name tree
print("All entries in name tree:")
for key in name_tree:
    value = name_tree[key]
    print(f"  {key.decode()}: {value}")

# Create a number tree for page labels
number_tree = pikepdf.NumberTree.new(pdf)

# Add page labels (page number -> label format)
page_labels = {
    0: pikepdf.Dictionary({'/S': pikepdf.Name.r}),  # Roman numerals
    5: pikepdf.Dictionary({'/S': pikepdf.Name.D, '/P': pikepdf.String('Page ')}),  # Decimal with prefix
    10: pikepdf.Dictionary({'/S': pikepdf.Name.a}),  # Lowercase letters
}

for page_num, label_dict in page_labels.items():
    number_tree[page_num] = label_dict
    print(f"Added page label: Page {page_num} -> {label_dict}")

print(f"Number tree contains {len(number_tree)} entries")

# Hook both trees into the document catalog. Objects that are not reachable
# from the catalog are not part of the document, so without this step the
# saved file would not contain the trees.
pdf.Root.Names = pikepdf.Dictionary(Dests=name_tree.obj)
pdf.Root.PageLabels = number_tree.obj

# Save PDF with trees
pdf.save('document_with_trees.pdf')
pdf.close()

Advanced Job Interface Usage

import pikepdf
import json

def process_pdf_with_job_interface(input_pdf, output_pdf, operations):
    """Validate and run a pikepdf Job, reporting progress on stdout.

    Parameters:
    - input_pdf: Path of the PDF to read (used in the configuration and in
      the status messages).
    - output_pdf: Path of the PDF to write (used likewise).
    - operations: Operation descriptions stored under the ``'operations'``
      key of the configuration.

    Returns:
    bool: True when the job validates and exits with code 0; False when the
    configuration is invalid, the exit code is non-zero, or a
    ``pikepdf.JobUsageError`` is raised.
    """
    try:
        # NOTE(review): this configuration is assembled but never handed to
        # the Job below; confirm how it is meant to be supplied.
        job_config = dict(
            inputFile=input_pdf,
            outputFile=output_pdf,
            staticId=True,  # Reproducible output
            deterministicId=True,
            operations=operations,
        )

        # Configure job (this is simplified - actual API may differ)
        # In practice, you'd use specific job configuration methods
        job = pikepdf.Job()

        # Bail out early on an invalid configuration.
        if not job.check_configuration():
            print("Job configuration is invalid")
            return False
        print("Job configuration is valid")

        # Execute and bail out on a failing exit code.
        exit_code = job.run()
        if exit_code != 0:
            print(f"Job failed with exit code: {exit_code}")
            return False

        print(f"Job completed successfully: {input_pdf} -> {output_pdf}")
        if job.has_warnings:
            print("Job completed with warnings")
        return True

    except pikepdf.JobUsageError as e:
        print(f"Job usage error: {e}")
        return False

# Example job operations: each entry names an operation and carries a
# (possibly empty) dictionary of parameters for it.
operations = [
    {'operation': 'qdf', 'parameters': {}},  # Convert to QDF format for inspection
    {'operation': 'optimize', 'parameters': {'compress-streams': True}},
    {'operation': 'linearize', 'parameters': {}}  # Linearize for fast web view
]

# Process PDF with job interface (the call is commented out; uncomment it to
# run against real files)
# success = process_pdf_with_job_interface('input.pdf', 'output.pdf', operations)

Configuration and Settings Management

import pikepdf

def configure_pikepdf_settings():
    """Set pikepdf's global decimal precision to 3 and Flate level to 6.

    Prints the precision before and after the change so the effect is
    visible. The settings are global, so they remain in force after this
    function returns.
    """
    settings = pikepdf.settings

    # Report the precision in effect before anything is changed.
    print(f"Current decimal precision: {settings.get_decimal_precision()}")

    # Fewer decimal places give cleaner output.
    settings.set_decimal_precision(3)
    print("Set decimal precision to 3 places")

    # Level 6 balances compression speed against output size.
    settings.set_flate_compression_level(6)  # Medium compression
    print("Set Flate compression level to 6 (medium)")

    # Read the precision back to confirm the change.
    print(f"New decimal precision: {settings.get_decimal_precision()}")

def create_optimized_pdf():
    """Create 'optimized_output.pdf' using the configured global settings.

    Applies configure_pikepdf_settings(), builds a one-page document whose
    content stream contains high-precision coordinates, and saves it with
    stream compression and content normalization enabled. The Pdf is managed
    by a context manager so it is closed even if building or saving fails.
    """
    # Configure settings for clean, compact output
    configure_pikepdf_settings()

    # Content with deliberately long floating-point coordinates
    content = """
    BT
    /F1 12 Tf
    100.123456789 700.987654321 Td
    (Optimized PDF with controlled precision) Tj
    ET
    """

    with pikepdf.new() as pdf:
        page = pdf.add_blank_page()
        page['/Contents'] = pikepdf.Stream(pdf, content.encode())

        # Save with compression and optimization
        pdf.save(
            'optimized_output.pdf',
            compress_streams=True,
            normalize_content=True,
        )

    print("Created optimized PDF with controlled precision and compression")

# Configure and create optimized PDF
# create_optimized_pdf()

Advanced Object Analysis

import pikepdf

def analyze_object_relationships(pdf_path):
    """Collect and print summary statistics about the objects in a PDF.

    Parameters:
    - pdf_path: Path of the PDF to open.

    Returns:
    dict: Analysis with the keys ``'total_objects'``, ``'object_types'``
    (type code -> count), ``'indirect_objects'``, ``'shared_objects'``
    (``"id/gen"`` -> type and string length) and ``'complex_structures'``.
    """
    # The context manager closes the file even if the analysis raises.
    with pikepdf.open(pdf_path) as pdf:
        analysis = {
            'total_objects': len(pdf.objects),
            'object_types': {},
            'indirect_objects': 0,
            'shared_objects': {},
            'complex_structures': {}
        }
        type_counts = analysis['object_types']

        # pdf.objects is a sequence of objects (not a mapping); each object
        # carries its own (object id, generation) pair in ``objgen``.
        for obj in pdf.objects:
            obj_type = str(obj._type_code)
            type_counts[obj_type] = type_counts.get(obj_type, 0) + 1

            if obj.is_indirect:
                analysis['indirect_objects'] += 1

                # Record every indirect object as a sharing candidate
                # (simplified - counting actual references would need a full
                # PDF traversal).
                obj_id, gen = obj.objgen
                analysis['shared_objects'][f"{obj_id}/{gen}"] = {
                    'type': obj_type,
                    'size': len(str(obj))
                }

        # Find coordinate transformation matrices. This step is best-effort:
        # a failure is reported but does not abort the rest of the analysis.
        try:
            ctm_objects = pikepdf.get_objects_with_ctm(pdf)
            analysis['complex_structures']['objects_with_ctm'] = len(ctm_objects)

            print(f"Found {len(ctm_objects)} objects with coordinate transformations:")
            for obj, matrix in ctm_objects[:5]:  # Show first 5
                print(f"  Object {obj}: Matrix [{matrix.a:.2f}, {matrix.b:.2f}, {matrix.c:.2f}, {matrix.d:.2f}, {matrix.e:.2f}, {matrix.f:.2f}]")

        except Exception as e:
            print(f"Could not analyze CTM objects: {e}")

    print(f"\nPDF Object Analysis for {pdf_path}:")
    print(f"Total objects: {analysis['total_objects']}")
    print(f"Indirect objects: {analysis['indirect_objects']}")

    print("\nObject types:")
    for obj_type, count in sorted(analysis['object_types'].items()):
        print(f"  {obj_type}: {count}")

    return analysis

# Analyze object relationships
# analysis = analyze_object_relationships('complex_document.pdf')

Performance Optimization Techniques

import pikepdf
import time
from pathlib import Path

def benchmark_pdf_operations(pdf_path):
    """Benchmark various PDF operations for performance analysis.

    Times opening the file, accessing the first page, parsing that page's
    content stream, counting the objects, and saving a copy. The copy is
    written next to the input with the suffix ``.benchmark.pdf`` and is
    removed again, and the Pdf is closed, even if a step raises.

    Parameters:
    - pdf_path: Path of the PDF to benchmark.

    Returns:
    dict: Timings in seconds under ``'open'``, ``'page_access'``,
    ``'object_iteration'`` and ``'save'``, plus ``'object_count'``. When the
    document has a page, ``'content_parsing'`` holds either the timing or a
    ``"Failed: ..."`` message, and ``'instruction_count'`` is set on success.
    """
    operations = {}
    output_path = Path(pdf_path).with_suffix('.benchmark.pdf')

    # perf_counter() is monotonic and high-resolution, which makes it the
    # appropriate clock for measuring short intervals; time.time() can jump
    # when the system clock is adjusted.
    clock = time.perf_counter

    # Time PDF opening
    start_time = clock()
    pdf = pikepdf.open(pdf_path)
    operations['open'] = clock() - start_time

    try:
        with pdf:
            # Time page access
            start_time = clock()
            page_count = len(pdf.pages)
            first_page = pdf.pages[0] if page_count > 0 else None
            operations['page_access'] = clock() - start_time

            # Time content parsing. Compare against None explicitly rather
            # than relying on the truthiness of a page object.
            if first_page is not None:
                start_time = clock()
                try:
                    instructions = pikepdf.parse_content_stream(first_page)
                except Exception as e:
                    # Best-effort: record the failure and keep benchmarking.
                    operations['content_parsing'] = f"Failed: {e}"
                else:
                    operations['content_parsing'] = clock() - start_time
                    operations['instruction_count'] = len(instructions)

            # Time object iteration
            start_time = clock()
            object_count = len(pdf.objects)
            operations['object_iteration'] = clock() - start_time
            operations['object_count'] = object_count

            # Time save operation
            start_time = clock()
            pdf.save(str(output_path))
            operations['save'] = clock() - start_time
    finally:
        # Clean up benchmark file, whether or not the save succeeded
        output_path.unlink(missing_ok=True)

    print(f"Performance Benchmark for {pdf_path}:")
    print(f"  Open: {operations['open']:.3f}s")
    print(f"  Page access ({page_count} pages): {operations['page_access']:.3f}s")
    if 'content_parsing' in operations:
        if isinstance(operations['content_parsing'], str):
            print(f"  Content parsing: {operations['content_parsing']}")
        else:
            print(f"  Content parsing ({operations.get('instruction_count', 0)} instructions): {operations['content_parsing']:.3f}s")
    print(f"  Object iteration ({object_count} objects): {operations['object_iteration']:.3f}s")
    print(f"  Save: {operations['save']:.3f}s")

    return operations

def optimize_pdf_processing():
    """Apply speed-oriented global settings and print a list of tips.

    Sets the decimal precision to 2 and the Flate compression level to 1,
    reports the resulting configuration, then prints seven numbered
    performance tips.
    """
    settings = pikepdf.settings

    # Configure for optimal performance
    settings.set_decimal_precision(2)  # Reduce precision for speed
    settings.set_flate_compression_level(1)  # Fast compression

    print("Configured pikepdf for performance:")
    print(f"  Decimal precision: {settings.get_decimal_precision()}")
    print("  Compression level: 1 (fast)")

    # Performance tips, printed one per line in this order.
    tips = (
        "1. Use access_mode=pikepdf.AccessMode.mmap for large files",
        "2. Set suppress_warnings=True to reduce overhead",
        "3. Use static_id=True for reproducible output without timestamp overhead",
        "4. Consider stream_decode_level for controlling decoding complexity",
        "5. Process pages in batches for large documents",
        "6. Cache parsed content streams if reusing",
        "7. Use pikepdf.new() instead of opening/clearing for new documents",
    )
    print("\nPerformance optimization tips:")
    for tip in tips:
        print(tip)

# Run performance analysis (commented out; it needs a local 'document.pdf')
# if Path('document.pdf').exists():
#     benchmark_pdf_operations('document.pdf')

# Apply the speed-oriented settings and print the tips.
optimize_pdf_processing()

Install with Tessl CLI