Read and write PDFs with Python, powered by qpdf.

This section covers low-level content stream parsing, token filtering, and PDF operator manipulation for advanced content processing. These capabilities enable fine-grained control over PDF content rendering and modification, together with high-level functions for parsing and reconstructing PDF content streams.
def parse_content_stream(page_or_stream) -> list[ContentStreamInstruction]:
    """Split a PDF content stream into structured instructions.

    Decodes the binary stream format and produces one instruction object
    per operator, pairing each operator with its operands.

    Parameters:
    - page_or_stream: Page object or Stream object whose content is parsed

    Returns:
        list[ContentStreamInstruction]: The decoded instructions, in order

    Raises:
        PdfParsingError: When the stream contains syntax that cannot be parsed
    """
def unparse_content_stream(instructions: list[ContentStreamInstruction]) -> bytes:
    """Serialize instruction objects back into binary content-stream form.

    The inverse of parse_content_stream: each instruction's operands and
    operator are written out so the result is suitable for PDF storage.

    Parameters:
    - instructions (list[ContentStreamInstruction]): Instructions to serialize

    Returns:
        bytes: The reconstructed binary content stream

    Raises:
        ValueError: When an instruction holds invalid data or operators
    """


# Individual content stream instructions containing operators and operands.
class ContentStreamInstruction:
    """One parsed step of a content stream: an operator plus its operands.

    A page's visual appearance — text, graphics, and images — is defined
    by a sequence of these instructions.
    """

    def __init__(self, operands: list[Object], operator: Operator) -> None:
        """Build an instruction from its operands and operator.

        Parameters:
        - operands (list[Object]): Operand objects for the instruction
        - operator (Operator): PDF operator for the instruction
        """

    @property
    def operands(self) -> list[Object]:
        """Operand objects consumed by this instruction.

        How many operands there are, and of what type, is dictated by
        the specific operator.

        Returns:
            list[Object]: PDF objects serving as operands
        """

    @property
    def operator(self) -> Operator:
        """The instruction's PDF operator.

        Returns:
            Operator: Operator object (e.g. 'Tj' shows text, 'cm' applies
            a transformation matrix)
        """

    def __str__(self) -> str:
        """Render the operands and operator in human-readable form.

        Returns:
            str: Readable summary of the instruction
        """

    def __repr__(self) -> str:
        """Render a detailed debugging representation.

        Returns:
            str: Complete representation including object types
        """


# Special instruction type for inline images embedded in content streams.
class ContentStreamInlineImage(ContentStreamInstruction):
    """An inline image found within a content stream.

    Covers images written directly into the stream with the BI...ID...EI
    inline image operators, as opposed to images referenced as external
    XObject images.
    """

    @property
    def iimage(self) -> PdfInlineImage:
        """The embedded image carried by this instruction.

        Returns:
            PdfInlineImage: Inline image that can be processed or extracted
        """

    @property
    def operands(self) -> list[Object]:
        """Operands attached to the inline image.

        Returns:
            list[Object]: Image operands and parameters
        """

    @property
    def operator(self) -> Operator:
        """Operator associated with this inline image.

        Returns:
            Operator: Usually 'EI', the end-inline-image operator
        """


# Low-level token filtering and stream processing for advanced manipulation.
class Token:
    """A single lexical token from a content stream.

    Tokens are the lowest level of content stream parsing: the stream is
    broken into tokens first, and the tokens are then assembled into
    instructions.
    """

    @property
    def type_(self) -> TokenType:
        """Kind of this token.

        Returns:
            TokenType: Enumerated token kind (operator, operand, etc.)
        """

    @property
    def raw_value(self) -> bytes:
        """The token's bytes exactly as they appear in the stream.

        Returns:
            bytes: Original, unparsed token data
        """

    @property
    def value(self) -> Object:
        """The token parsed into a PDF object.

        Returns:
            Object: PDF object representation of the token's value
        """

    @property
    def error_msg(self) -> str:
        """Explanation of a tokenization failure, if any.

        Returns:
            str: Error description, or an empty string when parsing succeeded
        """
class TokenFilter:
    """Base class for token-level content stream filtering.

    Subclass and override handle_token to transform or analyze a content
    stream one token at a time — a framework for sophisticated content
    processing.
    """

    def handle_token(self, token: Token) -> None:
        """Hook invoked once for each token in the content stream.

        Override this method to implement custom token processing logic.

        Parameters:
        - token (Token): The token currently being processed
        """
class TokenType(Enum):
    """Kinds of tokens produced when lexing a content stream."""

    bad = ...           # invalid or unrecognized token
    array_close = ...   # ']' array closing
    array_open = ...    # '[' array opening
    brace_close = ...   # '}' (not used in content streams)
    brace_open = ...    # '{' (not used in content streams)
    dict_close = ...    # '>>' dictionary closing
    dict_open = ...     # '<<' dictionary opening
    integer = ...       # integer number
    name = ...          # name object (starting with '/')
    operator = ...      # PDF operator
    real = ...          # real (floating-point) number
    string = ...        # string literal
    inline_image = ...  # inline image data
    space = ...         # whitespace
    comment = ...       # comment text


# Specialized exceptions for content stream operations.
class PdfParsingError(Exception):
    """Signals that a content stream could not be parsed.

    Typical causes:
    - syntax errors in the content stream
    - corrupted or incomplete stream data
    - content stream features that are not supported
    """
class UnparseableContentStreamInstructions(Exception):
    """Signals that instructions could not be serialized back to a stream.

    Raised when instruction objects carry invalid or inconsistent data
    that has no valid PDF representation.
    """


import pikepdf
# Walk a page's content stream and describe each instruction.
document = pikepdf.open('document.pdf')
first_page = document.pages[0]

# Decode the page's content stream into instruction objects
parsed = pikepdf.parse_content_stream(first_page)
print(f"Page has {len(parsed)} content instructions")

for index, step in enumerate(parsed, start=1):
    op = step.operator
    args = step.operands
    op_name = str(op)
    print(f"Instruction {index}: {op}")
    if op_name == 'Tj':  # show text
        shown = args[0] if args else "No text"
        print(f" Text: {shown}")
    elif op_name == 'TJ':  # show text with individual glyph positioning
        glyphs = args[0] if args else []
        print(f" Text array with {len(glyphs)} elements")
    elif op_name == 'cm':  # concatenate transformation matrix
        if len(args) >= 6:
            ctm = [float(value) for value in args]
            print(f" Transform matrix: {ctm}")
    elif op_name == 'gs':  # set graphics state
        state_name = args[0] if args else "Unknown"
        print(f" Graphics state: {state_name}")
    elif op_name == 'Do':  # invoke XObject (often an image)
        xobject = args[0] if args else "Unknown"
        print(f" XObject: {xobject}")

document.close()

import pikepdf
def extract_text_from_content_stream(page):
    """Pull text runs, with font information, out of a page's content stream."""
    runs = []
    font_name = None
    font_size = 12

    def record(raw):
        # Capture one text run together with the font state active at the time
        runs.append({
            'text': str(raw),
            'font': font_name,
            'font_size': font_size
        })

    for step in pikepdf.parse_content_stream(page):
        op = str(step.operator)
        args = step.operands
        if op == 'Tf' and len(args) >= 2:
            # 'Tf' selects the font and size used by subsequent text operators
            font_name = args[0]
            font_size = float(args[1])
        elif op == 'Tj' and args:
            # Plain show-text operator
            record(args[0])
        elif op == 'TJ' and args:
            # 'TJ' interleaves strings with positioning numbers; keep strings only
            for piece in args[0]:
                if hasattr(piece, '_type_code') and piece._type_code == pikepdf.ObjectType.string:
                    record(piece)
    return runs
# Extract text along with its formatting information
document = pikepdf.open('document.pdf')
first_page = document.pages[0]
text_elements = extract_text_from_content_stream(first_page)
print("Extracted text with formatting:")
for element in text_elements:
    print(f"Font {element['font']}, Size {element['font_size']}: '{element['text']}'")
document.close()

import pikepdf
def add_watermark_to_content(page, watermark_text):
    """Add a text watermark to a page by rewriting its content stream.

    The watermark drawing instructions are prepended to the existing
    content, so the watermark renders underneath the page's original
    content.

    Parameters:
    - page: pikepdf Page object, modified in place
    - watermark_text (str): Text drawn at the center of the page
    """
    # Parse existing content
    instructions = pikepdf.parse_content_stream(page)
    # Save graphics state so the watermark's changes do not leak into
    # the original page content
    save_gs = pikepdf.ContentStreamInstruction([], pikepdf.Operator('q'))
    # NOTE(review): the original example also built a transparency
    # instruction -- Operator('gs') with a String('0.3') operand -- but
    # never added it to the output, and the 'gs' operator actually takes
    # a Name referencing an ExtGState entry in the page's /Resources.
    # It is omitted here; to get real transparency, register an ExtGState
    # with a /ca value in the page resources and emit "/Name gs".
    # Position for watermark (center of page)
    mediabox = page.mediabox
    center_x = (mediabox.lower_left[0] + mediabox.upper_right[0]) / 2
    center_y = (mediabox.lower_left[1] + mediabox.upper_right[1]) / 2
    # Begin text object
    begin_text = pikepdf.ContentStreamInstruction([], pikepdf.Operator('BT'))
    # Set font (assumes /F1 exists in the page's resources)
    set_font = pikepdf.ContentStreamInstruction(
        [pikepdf.Name.F1, 24],
        pikepdf.Operator('Tf')
    )
    # Position text
    set_position = pikepdf.ContentStreamInstruction(
        [center_x, center_y],
        pikepdf.Operator('Td')
    )
    # Show watermark text
    show_text = pikepdf.ContentStreamInstruction(
        [pikepdf.String(watermark_text)],
        pikepdf.Operator('Tj')
    )
    # End text object
    end_text = pikepdf.ContentStreamInstruction([], pikepdf.Operator('ET'))
    # Restore graphics state
    restore_gs = pikepdf.ContentStreamInstruction([], pikepdf.Operator('Q'))
    # Watermark first, then the original content
    watermark_instructions = [
        save_gs, begin_text, set_font, set_position,
        show_text, end_text, restore_gs
    ]
    all_instructions = watermark_instructions + instructions
    # Serialize and replace the page's content stream
    new_content = pikepdf.unparse_content_stream(all_instructions)
    page['/Contents'] = pikepdf.Stream(page.owner, new_content)
# Apply the watermark to every page of the document
document = pikepdf.open('document.pdf')
for current_page in document.pages:
    add_watermark_to_content(current_page, "CONFIDENTIAL")
document.save('watermarked_document.pdf')
document.close()
print("Added watermark to all pages")

import pikepdf
from collections import defaultdict


def analyze_content_usage(pdf_path):
    """Collect content stream operator usage statistics across a PDF.

    Walks every page's content stream and tallies operator frequency,
    font usage, XObject references, graphics states, color operations,
    and transformation matrices.
    """
    # Operators that set fill/stroke color or color space
    color_setting_ops = ('rg', 'RG', 'g', 'G', 'k', 'K', 'cs', 'CS', 'sc', 'SC')
    pdf = pikepdf.open(pdf_path)
    analysis = {
        'operator_counts': defaultdict(int),
        'font_usage': defaultdict(int),
        'image_references': set(),
        'graphics_states': set(),
        'color_operations': [],
        'transform_operations': []
    }
    for page_num, page in enumerate(pdf.pages):
        try:
            for step in pikepdf.parse_content_stream(page):
                op = str(step.operator)
                args = step.operands
                # Every operator counts toward the frequency table
                analysis['operator_counts'][op] += 1
                if op == 'Tf' and len(args) >= 2:
                    # Record "/Font @ size" pairs
                    analysis['font_usage'][f"{str(args[0])} @ {float(args[1])}pt"] += 1
                elif op == 'Do' and args:
                    # XObject invocation (images and forms)
                    analysis['image_references'].add(str(args[0]))
                elif op == 'gs' and args:
                    analysis['graphics_states'].add(str(args[0]))
                elif op in color_setting_ops:
                    analysis['color_operations'].append({
                        'page': page_num,
                        'operator': op,
                        'values': [float(a) if hasattr(a, '__float__') else str(a) for a in args]
                    })
                elif op == 'cm' and len(args) == 6:
                    analysis['transform_operations'].append({
                        'page': page_num,
                        'matrix': [float(a) for a in args]
                    })
        except Exception as e:
            # Best effort: report the failing page and keep going
            print(f"Error analyzing page {page_num}: {e}")
    pdf.close()
    return analysis
def print_content_analysis(analysis):
    """Print a formatted report of the statistics gathered by analyze_content_usage."""
    print("PDF Content Stream Analysis")
    print("=" * 50)

    # Most common operators, highest count first
    print("\nTop 10 Most Used Operators:")
    by_count = sorted(analysis['operator_counts'].items(), key=lambda item: item[1], reverse=True)
    for op_name, uses in by_count[:10]:
        print(f" {op_name}: {uses} times")

    # Font usage
    font_stats = analysis['font_usage']
    if font_stats:
        print(f"\nFont Usage ({len(font_stats)} different fonts):")
        for font, uses in sorted(font_stats.items(), key=lambda item: item[1], reverse=True):
            print(f" {font}: {uses} times")

    # Image references
    image_refs = analysis['image_references']
    if image_refs:
        print(f"\nImage References ({len(image_refs)} images):")
        for img in sorted(image_refs):
            print(f" {img}")

    # Graphics states
    states = analysis['graphics_states']
    if states:
        print(f"\nGraphics States ({len(states)} states):")
        for gs in sorted(states):
            print(f" {gs}")

    # Color usage summary, broken down per operator
    color_ops = len(analysis['color_operations'])
    if color_ops > 0:
        print(f"\nColor Operations: {color_ops} total")
        per_operator = defaultdict(int)
        for entry in analysis['color_operations']:
            per_operator[entry['operator']] += 1
        for color_op, count in sorted(per_operator.items()):
            print(f" {color_op}: {count} times")

    # Transformation summary
    transform_count = len(analysis['transform_operations'])
    if transform_count > 0:
        print(f"\nTransformation Matrices: {transform_count} total")
# Gather and report content usage statistics for the sample document
usage_report = analyze_content_usage('document.pdf')
print_content_analysis(usage_report)

import pikepdf
class TextExtractionFilter(pikepdf.TokenFilter):
    """Custom token filter for extracting text while preserving structure.

    Strings and plausible font-size numbers are buffered as they arrive;
    when the matching operator token shows up, the buffered values are
    consumed.
    """

    def __init__(self):
        super().__init__()
        self.extracted_text = []     # accumulated {'text', 'font_size'} records
        self.current_font_size = 12  # last font size confirmed by a 'Tf'
        self.in_text_object = False  # True between 'BT' and 'ET'

    def handle_token(self, token):
        """Process each token in the content stream.

        Parameters:
        - token: Token whose type_, value, and raw_value are inspected
        """
        if token.type_ == pikepdf.TokenType.operator:
            operator = str(token.value)
            # Track text object boundaries
            if operator == 'BT':
                self.in_text_object = True
            elif operator == 'ET':
                self.in_text_object = False
            # Track font size changes (size operand was buffered earlier)
            elif operator == 'Tf' and hasattr(self, '_pending_font_size'):
                self.current_font_size = self._pending_font_size
                delattr(self, '_pending_font_size')
            # Extract text shown inside a text object
            elif operator in ['Tj', 'TJ'] and self.in_text_object:
                if hasattr(self, '_pending_text'):
                    self.extracted_text.append({
                        'text': self._pending_text,
                        'font_size': self.current_font_size
                    })
                    delattr(self, '_pending_text')
        elif token.type_ == pikepdf.TokenType.string:
            # Store text for the next operator
            self._pending_text = str(token.value)
        elif token.type_ == pikepdf.TokenType.real or token.type_ == pikepdf.TokenType.integer:
            # Heuristic: a number in a plausible font-size range may be the
            # size operand of an upcoming 'Tf' (simplified for the example)
            try:
                value = float(token.raw_value)
                if 6 <= value <= 72:  # reasonable font size range
                    self._pending_font_size = value
            # BUGFIX: was a bare `except:` that swallowed every exception
            # (including KeyboardInterrupt); only conversion failures are
            # expected here, so catch exactly those.
            except (TypeError, ValueError):
                pass
def extract_text_with_filter(page):
    """Extract text using a custom token filter.

    Note: this is a conceptual example. The actual pikepdf API attaches a
    TokenFilter to a stream; here we synthesize Token-like objects from
    parsed instructions instead.
    """
    # Create and use custom filter
    text_filter = TextExtractionFilter()
    instructions = pikepdf.parse_content_stream(page)
    # Simulate token filtering (in practice this would use the real token stream)
    for instruction in instructions:
        # BUGFIX: PDF content streams are postfix — operands appear BEFORE
        # their operator. The original fed the operator token first, so the
        # filter's pending text was always consumed one instruction late.
        # Feed operand tokens first, then the operator token.
        for operand in instruction.operands:
            if operand._type_code == pikepdf.ObjectType.string:
                string_token = type('Token', (), {
                    'type_': pikepdf.TokenType.string,
                    'value': operand,
                    'raw_value': str(operand).encode()
                })()
                text_filter.handle_token(string_token)
        op_token = type('Token', (), {
            'type_': pikepdf.TokenType.operator,
            'value': instruction.operator,
            'raw_value': str(instruction.operator).encode()
        })()
        text_filter.handle_token(op_token)
    return text_filter.extracted_text
# Run the custom token filter over the first page
document = pikepdf.open('document.pdf')
first_page = document.pages[0]
filtered_text = extract_text_with_filter(first_page)
print("Text extracted with custom filter:")
for text_item in filtered_text:
    print(f"Size {text_item['font_size']}: '{text_item['text']}'")
document.close()

import pikepdf
from collections import defaultdict
def optimize_content_streams(pdf_path, output_path):
    """Optimize content streams by removing redundant operations.

    Drops 'Tf' (font) and 'gs' (graphics state) instructions that merely
    re-set the value that is already active, then rewrites any page whose
    stream shrank.

    Parameters:
    - pdf_path (str): Path of the PDF to optimize
    - output_path (str): Path for the optimized output PDF

    Returns:
        dict: Counters describing pages processed and instructions removed
    """
    pdf = pikepdf.open(pdf_path)
    optimization_stats = {
        'pages_processed': 0,
        'instructions_removed': 0,
        'redundant_font_sets': 0,
        'redundant_graphics_states': 0
    }
    for page in pdf.pages:
        try:
            instructions = pikepdf.parse_content_stream(page)
            original_count = len(instructions)
            optimized_instructions = []
            # Currently-active font/size and ExtGState, as far as we know
            current_font = None
            current_font_size = None
            current_gs = None
            for instruction in instructions:
                operator = str(instruction.operator)
                operands = instruction.operands
                # BUGFIX: 'Q' restores a previously saved graphics state,
                # which can change the active font and ExtGState behind our
                # back. Forget the tracked values so a legitimate re-set
                # after 'Q' is not wrongly dropped as "redundant".
                if operator == 'Q':
                    current_font = None
                    current_font_size = None
                    current_gs = None
                # Remove redundant font settings
                elif operator == 'Tf' and len(operands) >= 2:
                    font = operands[0]
                    size = operands[1]
                    if font == current_font and size == current_font_size:
                        # Skip redundant font setting
                        optimization_stats['redundant_font_sets'] += 1
                        continue
                    current_font = font
                    current_font_size = size
                # Remove redundant graphics state settings
                elif operator == 'gs' and operands:
                    gs_name = operands[0]
                    if gs_name == current_gs:
                        # Skip redundant graphics state
                        optimization_stats['redundant_graphics_states'] += 1
                        continue
                    current_gs = gs_name
                # Keep instruction
                optimized_instructions.append(instruction)
            # Update page only if optimizations were made
            if len(optimized_instructions) < original_count:
                new_content = pikepdf.unparse_content_stream(optimized_instructions)
                page['/Contents'] = pikepdf.Stream(pdf, new_content)
                optimization_stats['instructions_removed'] += (original_count - len(optimized_instructions))
            optimization_stats['pages_processed'] += 1
        except Exception as e:
            # Best effort: leave the page unchanged if it cannot be parsed
            print(f"Error optimizing page: {e}")
    # Save optimized PDF
    pdf.save(output_path)
    pdf.close()
    print("Content Stream Optimization Results:")
    print(f" Pages processed: {optimization_stats['pages_processed']}")
    print(f" Instructions removed: {optimization_stats['instructions_removed']}")
    print(f" Redundant font settings: {optimization_stats['redundant_font_sets']}")
    print(f" Redundant graphics states: {optimization_stats['redundant_graphics_states']}")
    return optimization_stats
# Example usage (uncomment to run):
# optimize_content_streams('document.pdf', 'optimized_document.pdf')

Install with the Tessl CLI:
npx tessl i tessl/pypi-pikepdf