Read and write PDFs with Python, powered by qpdf.

This section covers low-level content stream parsing, token filtering, and PDF operator manipulation for advanced content processing. These capabilities enable fine-grained control over PDF content rendering and modification, together with high-level functions for parsing and reconstructing PDF content streams.
def parse_content_stream(page_or_stream) -> list[ContentStreamInstruction]:
    """Split a PDF content stream into structured instructions.

    Decodes the binary stream format and produces one instruction object
    per operator, pairing each operator with its operands.

    Parameters:
    - page_or_stream: Page object or Stream object whose content is parsed

    Returns:
        list[ContentStreamInstruction]: The decoded instructions, in order

    Raises:
        PdfParsingError: When the stream contains syntax that cannot be parsed
    """
def unparse_content_stream(instructions: list[ContentStreamInstruction]) -> bytes:
    """Serialize instruction objects back into binary content-stream form.

    The inverse of parse_content_stream: each instruction's operands and
    operator are written out so the result is suitable for PDF storage.

    Parameters:
    - instructions (list[ContentStreamInstruction]): Instructions to serialize

    Returns:
        bytes: The reconstructed binary content stream

    Raises:
        ValueError: When an instruction holds invalid data or operators
    """


# Individual content stream instructions containing operators and operands.
class ContentStreamInstruction:
    """One parsed step of a content stream: an operator plus its operands.

    A page's visual appearance — text, graphics, and images — is defined
    by a sequence of these instructions.
    """

    def __init__(self, operands: list[Object], operator: Operator) -> None:
        """Build an instruction from its operands and operator.

        Parameters:
        - operands (list[Object]): Operand objects for the instruction
        - operator (Operator): PDF operator for the instruction
        """

    @property
    def operands(self) -> list[Object]:
        """Operand objects consumed by this instruction.

        How many operands there are, and of what type, is dictated by
        the specific operator.

        Returns:
            list[Object]: PDF objects serving as operands
        """

    @property
    def operator(self) -> Operator:
        """The instruction's PDF operator.

        Returns:
            Operator: Operator object (e.g. 'Tj' shows text, 'cm' applies
            a transformation matrix)
        """

    def __str__(self) -> str:
        """Render the operands and operator in human-readable form.

        Returns:
            str: Readable summary of the instruction
        """

    def __repr__(self) -> str:
        """Render a detailed debugging representation.

        Returns:
            str: Complete representation including object types
        """


# Special instruction type for inline images embedded in content streams.
class ContentStreamInlineImage(ContentStreamInstruction):
    """An inline image found within a content stream.

    Covers images written directly into the stream with the BI...ID...EI
    inline image operators, as opposed to images referenced as external
    XObject images.
    """

    @property
    def iimage(self) -> PdfInlineImage:
        """The embedded image carried by this instruction.

        Returns:
            PdfInlineImage: Inline image that can be processed or extracted
        """

    @property
    def operands(self) -> list[Object]:
        """Operands attached to the inline image.

        Returns:
            list[Object]: Image operands and parameters
        """

    @property
    def operator(self) -> Operator:
        """Operator associated with this inline image.

        Returns:
            Operator: Usually 'EI', the end-inline-image operator
        """


# Low-level token filtering and stream processing for advanced manipulation.
class Token:
    """A single lexical token from a content stream.

    Tokens are the lowest level of content stream parsing: the stream is
    broken into tokens first, and the tokens are then assembled into
    instructions.
    """

    @property
    def type_(self) -> TokenType:
        """Kind of this token.

        Returns:
            TokenType: Enumerated token kind (operator, operand, etc.)
        """

    @property
    def raw_value(self) -> bytes:
        """The token's bytes exactly as they appear in the stream.

        Returns:
            bytes: Original, unparsed token data
        """

    @property
    def value(self) -> Object:
        """The token parsed into a PDF object.

        Returns:
            Object: PDF object representation of the token's value
        """

    @property
    def error_msg(self) -> str:
        """Explanation of a tokenization failure, if any.

        Returns:
            str: Error description, or an empty string when parsing succeeded
        """
class TokenFilter:
    """Base class for token-level content stream filtering.

    Subclass and override handle_token to transform or analyze a content
    stream one token at a time — a framework for sophisticated content
    processing.
    """

    def handle_token(self, token: Token) -> None:
        """Hook invoked once for each token in the content stream.

        Override this method to implement custom token processing logic.

        Parameters:
        - token (Token): The token currently being processed
        """
class TokenType(Enum):
    """Kinds of tokens produced when lexing a content stream."""

    bad = ...           # invalid or unrecognized token
    array_close = ...   # ']' array closing
    array_open = ...    # '[' array opening
    brace_close = ...   # '}' (not used in content streams)
    brace_open = ...    # '{' (not used in content streams)
    dict_close = ...    # '>>' dictionary closing
    dict_open = ...     # '<<' dictionary opening
    integer = ...       # integer number
    name = ...          # name object (starting with '/')
    operator = ...      # PDF operator
    real = ...          # real (floating-point) number
    string = ...        # string literal
    inline_image = ...  # inline image data
    space = ...         # whitespace
    comment = ...       # comment text


# Specialized exceptions for content stream operations.
class PdfParsingError(Exception):
    """Signals that a content stream could not be parsed.

    Typical causes:
    - syntax errors in the content stream
    - corrupted or incomplete stream data
    - content stream features that are not supported
    """
class UnparseableContentStreamInstructions(Exception):
    """Signals that instructions could not be serialized back to a stream.

    Raised when instruction objects carry invalid or inconsistent data
    that has no valid PDF representation.
    """


import pikepdf
# Walk a page's content stream and describe each instruction.
document = pikepdf.open('document.pdf')
first_page = document.pages[0]

# Decode the page's content stream into instruction objects
parsed = pikepdf.parse_content_stream(first_page)
print(f"Page has {len(parsed)} content instructions")

for index, step in enumerate(parsed, start=1):
    op = step.operator
    args = step.operands
    op_name = str(op)
    print(f"Instruction {index}: {op}")
    if op_name == 'Tj':  # show text
        shown = args[0] if args else "No text"
        print(f" Text: {shown}")
    elif op_name == 'TJ':  # show text with individual glyph positioning
        glyphs = args[0] if args else []
        print(f" Text array with {len(glyphs)} elements")
    elif op_name == 'cm':  # concatenate transformation matrix
        if len(args) >= 6:
            ctm = [float(value) for value in args]
            print(f" Transform matrix: {ctm}")
    elif op_name == 'gs':  # set graphics state
        state_name = args[0] if args else "Unknown"
        print(f" Graphics state: {state_name}")
    elif op_name == 'Do':  # invoke XObject (often an image)
        xobject = args[0] if args else "Unknown"
        print(f" XObject: {xobject}")

document.close()

import pikepdf
def extract_text_from_content_stream(page):
    """Pull text runs, with font information, out of a page's content stream."""
    runs = []
    font_name = None
    font_size = 12

    def record(raw):
        # Capture one text run together with the font state active at the time
        runs.append({
            'text': str(raw),
            'font': font_name,
            'font_size': font_size
        })

    for step in pikepdf.parse_content_stream(page):
        op = str(step.operator)
        args = step.operands
        if op == 'Tf' and len(args) >= 2:
            # 'Tf' selects the font and size used by subsequent text operators
            font_name = args[0]
            font_size = float(args[1])
        elif op == 'Tj' and args:
            # Plain show-text operator
            record(args[0])
        elif op == 'TJ' and args:
            # 'TJ' interleaves strings with positioning numbers; keep strings only
            for piece in args[0]:
                if hasattr(piece, '_type_code') and piece._type_code == pikepdf.ObjectType.string:
                    record(piece)
    return runs
# Extract text along with its formatting information
document = pikepdf.open('document.pdf')
first_page = document.pages[0]
text_elements = extract_text_from_content_stream(first_page)
print("Extracted text with formatting:")
for element in text_elements:
    print(f"Font {element['font']}, Size {element['font_size']}: '{element['text']}'")
document.close()

import pikepdf
def add_watermark_to_content(page, watermark_text):
    """Add a text watermark to a page by rewriting its content stream.

    The watermark drawing instructions are prepended to the existing
    content, so the watermark renders underneath the page's original
    content.

    Parameters:
    - page: pikepdf Page object, modified in place
    - watermark_text (str): Text drawn at the center of the page
    """
    # Parse existing content
    instructions = pikepdf.parse_content_stream(page)
    # Save graphics state so the watermark's changes do not leak into
    # the original page content
    save_gs = pikepdf.ContentStreamInstruction([], pikepdf.Operator('q'))
    # NOTE(review): the original example also built a transparency
    # instruction -- Operator('gs') with a String('0.3') operand -- but
    # never added it to the output, and the 'gs' operator actually takes
    # a Name referencing an ExtGState entry in the page's /Resources.
    # It is omitted here; to get real transparency, register an ExtGState
    # with a /ca value in the page resources and emit "/Name gs".
    # Position for watermark (center of page)
    mediabox = page.mediabox
    center_x = (mediabox.lower_left[0] + mediabox.upper_right[0]) / 2
    center_y = (mediabox.lower_left[1] + mediabox.upper_right[1]) / 2
    # Begin text object
    begin_text = pikepdf.ContentStreamInstruction([], pikepdf.Operator('BT'))
    # Set font (assumes /F1 exists in the page's resources)
    set_font = pikepdf.ContentStreamInstruction(
        [pikepdf.Name.F1, 24],
        pikepdf.Operator('Tf')
    )
    # Position text
    set_position = pikepdf.ContentStreamInstruction(
        [center_x, center_y],
        pikepdf.Operator('Td')
    )
    # Show watermark text
    show_text = pikepdf.ContentStreamInstruction(
        [pikepdf.String(watermark_text)],
        pikepdf.Operator('Tj')
    )
    # End text object
    end_text = pikepdf.ContentStreamInstruction([], pikepdf.Operator('ET'))
    # Restore graphics state
    restore_gs = pikepdf.ContentStreamInstruction([], pikepdf.Operator('Q'))
    # Watermark first, then the original content
    watermark_instructions = [
        save_gs, begin_text, set_font, set_position,
        show_text, end_text, restore_gs
    ]
    all_instructions = watermark_instructions + instructions
    # Serialize and replace the page's content stream
    new_content = pikepdf.unparse_content_stream(all_instructions)
    page['/Contents'] = pikepdf.Stream(page.owner, new_content)
# Apply the watermark to every page of the document
document = pikepdf.open('document.pdf')
for current_page in document.pages:
    add_watermark_to_content(current_page, "CONFIDENTIAL")
document.save('watermarked_document.pdf')
document.close()
print("Added watermark to all pages")

import pikepdf
from collections import defaultdict


def analyze_content_usage(pdf_path):
    """Collect content stream operator usage statistics across a PDF.

    Walks every page's content stream and tallies operator frequency,
    font usage, XObject references, graphics states, color operations,
    and transformation matrices.
    """
    # Operators that set fill/stroke color or color space
    color_setting_ops = ('rg', 'RG', 'g', 'G', 'k', 'K', 'cs', 'CS', 'sc', 'SC')
    pdf = pikepdf.open(pdf_path)
    analysis = {
        'operator_counts': defaultdict(int),
        'font_usage': defaultdict(int),
        'image_references': set(),
        'graphics_states': set(),
        'color_operations': [],
        'transform_operations': []
    }
    for page_num, page in enumerate(pdf.pages):
        try:
            for step in pikepdf.parse_content_stream(page):
                op = str(step.operator)
                args = step.operands
                # Every operator counts toward the frequency table
                analysis['operator_counts'][op] += 1
                if op == 'Tf' and len(args) >= 2:
                    # Record "/Font @ size" pairs
                    analysis['font_usage'][f"{str(args[0])} @ {float(args[1])}pt"] += 1
                elif op == 'Do' and args:
                    # XObject invocation (images and forms)
                    analysis['image_references'].add(str(args[0]))
                elif op == 'gs' and args:
                    analysis['graphics_states'].add(str(args[0]))
                elif op in color_setting_ops:
                    analysis['color_operations'].append({
                        'page': page_num,
                        'operator': op,
                        'values': [float(a) if hasattr(a, '__float__') else str(a) for a in args]
                    })
                elif op == 'cm' and len(args) == 6:
                    analysis['transform_operations'].append({
                        'page': page_num,
                        'matrix': [float(a) for a in args]
                    })
        except Exception as e:
            # Best effort: report the failing page and keep going
            print(f"Error analyzing page {page_num}: {e}")
    pdf.close()
    return analysis
def print_content_analysis(analysis):
    """Print a formatted report of the statistics gathered by analyze_content_usage."""
    print("PDF Content Stream Analysis")
    print("=" * 50)

    # Most common operators, highest count first
    print("\nTop 10 Most Used Operators:")
    by_count = sorted(analysis['operator_counts'].items(), key=lambda item: item[1], reverse=True)
    for op_name, uses in by_count[:10]:
        print(f" {op_name}: {uses} times")

    # Font usage
    font_stats = analysis['font_usage']
    if font_stats:
        print(f"\nFont Usage ({len(font_stats)} different fonts):")
        for font, uses in sorted(font_stats.items(), key=lambda item: item[1], reverse=True):
            print(f" {font}: {uses} times")

    # Image references
    image_refs = analysis['image_references']
    if image_refs:
        print(f"\nImage References ({len(image_refs)} images):")
        for img in sorted(image_refs):
            print(f" {img}")

    # Graphics states
    states = analysis['graphics_states']
    if states:
        print(f"\nGraphics States ({len(states)} states):")
        for gs in sorted(states):
            print(f" {gs}")

    # Color usage summary, broken down per operator
    color_ops = len(analysis['color_operations'])
    if color_ops > 0:
        print(f"\nColor Operations: {color_ops} total")
        per_operator = defaultdict(int)
        for entry in analysis['color_operations']:
            per_operator[entry['operator']] += 1
        for color_op, count in sorted(per_operator.items()):
            print(f" {color_op}: {count} times")

    # Transformation summary
    transform_count = len(analysis['transform_operations'])
    if transform_count > 0:
        print(f"\nTransformation Matrices: {transform_count} total")
# Gather and report content usage statistics for the sample document
usage_report = analyze_content_usage('document.pdf')
print_content_analysis(usage_report)

import pikepdf
class TextExtractionFilter(pikepdf.TokenFilter):
    """Custom token filter for extracting text while preserving structure.

    Strings and plausible font-size numbers are buffered as they arrive;
    when the matching operator token shows up, the buffered values are
    consumed.
    """

    def __init__(self):
        super().__init__()
        self.extracted_text = []     # accumulated {'text', 'font_size'} records
        self.current_font_size = 12  # last font size confirmed by a 'Tf'
        self.in_text_object = False  # True between 'BT' and 'ET'

    def handle_token(self, token):
        """Process each token in the content stream.

        Parameters:
        - token: Token whose type_, value, and raw_value are inspected
        """
        if token.type_ == pikepdf.TokenType.operator:
            operator = str(token.value)
            # Track text object boundaries
            if operator == 'BT':
                self.in_text_object = True
            elif operator == 'ET':
                self.in_text_object = False
            # Track font size changes (size operand was buffered earlier)
            elif operator == 'Tf' and hasattr(self, '_pending_font_size'):
                self.current_font_size = self._pending_font_size
                delattr(self, '_pending_font_size')
            # Extract text shown inside a text object
            elif operator in ['Tj', 'TJ'] and self.in_text_object:
                if hasattr(self, '_pending_text'):
                    self.extracted_text.append({
                        'text': self._pending_text,
                        'font_size': self.current_font_size
                    })
                    delattr(self, '_pending_text')
        elif token.type_ == pikepdf.TokenType.string:
            # Store text for the next operator
            self._pending_text = str(token.value)
        elif token.type_ == pikepdf.TokenType.real or token.type_ == pikepdf.TokenType.integer:
            # Heuristic: a number in a plausible font-size range may be the
            # size operand of an upcoming 'Tf' (simplified for the example)
            try:
                value = float(token.raw_value)
                if 6 <= value <= 72:  # reasonable font size range
                    self._pending_font_size = value
            # BUGFIX: was a bare `except:` that swallowed every exception
            # (including KeyboardInterrupt); only conversion failures are
            # expected here, so catch exactly those.
            except (TypeError, ValueError):
                pass
def extract_text_with_filter(page):
    """Extract text using a custom token filter.

    Note: this is a conceptual example. The actual pikepdf API attaches a
    TokenFilter to a stream; here we synthesize Token-like objects from
    parsed instructions instead.
    """
    # Create and use custom filter
    text_filter = TextExtractionFilter()
    instructions = pikepdf.parse_content_stream(page)
    # Simulate token filtering (in practice this would use the real token stream)
    for instruction in instructions:
        # BUGFIX: PDF content streams are postfix — operands appear BEFORE
        # their operator. The original fed the operator token first, so the
        # filter's pending text was always consumed one instruction late.
        # Feed operand tokens first, then the operator token.
        for operand in instruction.operands:
            if operand._type_code == pikepdf.ObjectType.string:
                string_token = type('Token', (), {
                    'type_': pikepdf.TokenType.string,
                    'value': operand,
                    'raw_value': str(operand).encode()
                })()
                text_filter.handle_token(string_token)
        op_token = type('Token', (), {
            'type_': pikepdf.TokenType.operator,
            'value': instruction.operator,
            'raw_value': str(instruction.operator).encode()
        })()
        text_filter.handle_token(op_token)
    return text_filter.extracted_text
# Run the custom token filter over the first page
document = pikepdf.open('document.pdf')
first_page = document.pages[0]
filtered_text = extract_text_with_filter(first_page)
print("Text extracted with custom filter:")
for text_item in filtered_text:
    print(f"Size {text_item['font_size']}: '{text_item['text']}'")
document.close()

import pikepdf
from collections import defaultdict
def optimize_content_streams(pdf_path, output_path):
    """Optimize content streams by removing redundant operations.

    Drops 'Tf' (font) and 'gs' (graphics state) instructions that merely
    re-set the value that is already active, then rewrites any page whose
    stream shrank.

    Parameters:
    - pdf_path (str): Path of the PDF to optimize
    - output_path (str): Path for the optimized output PDF

    Returns:
        dict: Counters describing pages processed and instructions removed
    """
    pdf = pikepdf.open(pdf_path)
    optimization_stats = {
        'pages_processed': 0,
        'instructions_removed': 0,
        'redundant_font_sets': 0,
        'redundant_graphics_states': 0
    }
    for page in pdf.pages:
        try:
            instructions = pikepdf.parse_content_stream(page)
            original_count = len(instructions)
            optimized_instructions = []
            # Currently-active font/size and ExtGState, as far as we know
            current_font = None
            current_font_size = None
            current_gs = None
            for instruction in instructions:
                operator = str(instruction.operator)
                operands = instruction.operands
                # BUGFIX: 'Q' restores a previously saved graphics state,
                # which can change the active font and ExtGState behind our
                # back. Forget the tracked values so a legitimate re-set
                # after 'Q' is not wrongly dropped as "redundant".
                if operator == 'Q':
                    current_font = None
                    current_font_size = None
                    current_gs = None
                # Remove redundant font settings
                elif operator == 'Tf' and len(operands) >= 2:
                    font = operands[0]
                    size = operands[1]
                    if font == current_font and size == current_font_size:
                        # Skip redundant font setting
                        optimization_stats['redundant_font_sets'] += 1
                        continue
                    current_font = font
                    current_font_size = size
                # Remove redundant graphics state settings
                elif operator == 'gs' and operands:
                    gs_name = operands[0]
                    if gs_name == current_gs:
                        # Skip redundant graphics state
                        optimization_stats['redundant_graphics_states'] += 1
                        continue
                    current_gs = gs_name
                # Keep instruction
                optimized_instructions.append(instruction)
            # Update page only if optimizations were made
            if len(optimized_instructions) < original_count:
                new_content = pikepdf.unparse_content_stream(optimized_instructions)
                page['/Contents'] = pikepdf.Stream(pdf, new_content)
                optimization_stats['instructions_removed'] += (original_count - len(optimized_instructions))
            optimization_stats['pages_processed'] += 1
        except Exception as e:
            # Best effort: leave the page unchanged if it cannot be parsed
            print(f"Error optimizing page: {e}")
    # Save optimized PDF
    pdf.save(output_path)
    pdf.close()
    print("Content Stream Optimization Results:")
    print(f" Pages processed: {optimization_stats['pages_processed']}")
    print(f" Instructions removed: {optimization_stats['instructions_removed']}")
    print(f" Redundant font settings: {optimization_stats['redundant_font_sets']}")
    print(f" Redundant graphics states: {optimization_stats['redundant_graphics_states']}")
    return optimization_stats
# Example usage (uncomment to run):
# optimize_content_streams('document.pdf', 'optimized_document.pdf')

Install with the Tessl CLI:
npx tessl i tessl/pypi-pikepdf