# Content Stream Processing

Low-level content stream parsing, token filtering, and PDF operator manipulation for advanced content processing. These capabilities enable fine-grained control over PDF content rendering and modification.

## Capabilities

### Content Stream Parsing Functions

High-level functions for parsing and reconstructing PDF content streams.

```python { .api }
def parse_content_stream(page_or_stream) -> list[ContentStreamInstruction]:
    """
    Parse a PDF content stream into individual instructions.

    Converts the binary content stream format into a list of structured
    instruction objects containing operators and their operands.

    Parameters:
    - page_or_stream: Page object or Stream object containing content data

    Returns:
        list[ContentStreamInstruction]: Parsed content stream instructions

    Raises:
        PdfParsingError: If content stream cannot be parsed due to syntax errors
    """

def unparse_content_stream(instructions: list[ContentStreamInstruction]) -> bytes:
    """
    Convert content stream instructions back to binary stream format.

    Takes a list of instruction objects and reconstructs the binary
    content stream data suitable for PDF storage.

    Parameters:
    - instructions (list[ContentStreamInstruction]): Instructions to convert

    Returns:
        bytes: Binary content stream data

    Raises:
        UnparseableContentStreamInstructions: If instructions contain invalid
            data or operators
    """
```

### ContentStreamInstruction Class

Individual content stream instructions containing operators and operands.

```python { .api }
class ContentStreamInstruction:
    """
    Parsed content stream instruction representing an operator and its operands.

    Content streams contain sequences of these instructions that define
    the visual appearance of PDF pages including text, graphics, and images.
    """

    @property
    def operands(self) -> list[Object]:
        """
        List of operand objects for this instruction.

        Operands are the data values that the operator acts upon.
        The number and type of operands depends on the specific operator.

        Returns:
            list[Object]: PDF objects serving as operands
        """

    @property
    def operator(self) -> Operator:
        """
        The PDF operator for this instruction.

        Returns:
            Operator: PDF operator object (e.g., 'Tj' for show text, 'cm' for transform matrix)
        """

    def __init__(self, operands: list[Object], operator: Operator) -> None:
        """
        Create a content stream instruction.

        Parameters:
        - operands (list[Object]): Operand objects for the instruction
        - operator (Operator): PDF operator for the instruction
        """

    def __str__(self) -> str:
        """
        String representation of the instruction.

        Returns:
            str: Human-readable format showing operands and operator
        """

    def __repr__(self) -> str:
        """
        Detailed string representation for debugging.

        Returns:
            str: Complete representation including object types
        """
```

### ContentStreamInlineImage Class

Special instruction type for inline images embedded in content streams.

```python { .api }
class ContentStreamInlineImage(ContentStreamInstruction):
    """
    Inline image found within a content stream.

    Represents images that are embedded directly in the content stream
    using the BI...ID...EI inline image operators, rather than being
    referenced as external XObject images.
    """

    @property
    def iimage(self) -> PdfInlineImage:
        """
        The inline image object contained in this instruction.

        Returns:
            PdfInlineImage: Inline image that can be processed or extracted
        """

    @property
    def operands(self) -> list[Object]:
        """
        Operands associated with the inline image.

        Returns:
            list[Object]: Image operands and parameters
        """

    @property
    def operator(self) -> Operator:
        """
        The operator associated with this inline image.

        Returns:
            Operator: Usually the 'EI' (end inline image) operator
        """
```

### Token Processing Classes

Low-level token filtering and stream processing for advanced manipulation.

```python { .api }
class Token:
    """
    Individual token from a content stream.

    Represents the lowest level of content stream parsing,
    where the stream is broken into individual tokens before
    being assembled into instructions.
    """

    @property
    def type_(self) -> TokenType:
        """
        Type of this token.

        Returns:
            TokenType: Enumeration indicating token type (operator, operand, etc.)
        """

    @property
    def raw_value(self) -> bytes:
        """
        Raw binary value of the token as it appears in the stream.

        Returns:
            bytes: Original token data from content stream
        """

    @property
    def value(self) -> Object:
        """
        Parsed value of the token as a PDF object.

        Returns:
            Object: PDF object representation of token value
        """

    @property
    def error_msg(self) -> str:
        """
        Error message if token parsing failed.

        Returns:
            str: Error description, or empty string if no error
        """

class TokenFilter:
    """
    Base class for content stream token filtering.

    Provides a framework for processing content streams at the token level,
    allowing for sophisticated content transformation and analysis.
    """

    def handle_token(self, token: Token) -> None:
        """
        Process an individual token from the content stream.

        Override this method to implement custom token processing logic.
        This method is called for each token in the content stream.

        Parameters:
        - token (Token): Token to process
        """

class TokenType(Enum):
    """Enumeration of content stream token types."""

    bad = ...           # Invalid or unrecognized token
    array_close = ...   # ']' array closing
    array_open = ...    # '[' array opening
    brace_close = ...   # '}' (not used in content streams)
    brace_open = ...    # '{' (not used in content streams)
    dict_close = ...    # '>>' dictionary closing
    dict_open = ...     # '<<' dictionary opening
    integer = ...       # Integer number
    name = ...          # Name object (starting with '/')
    operator = ...      # PDF operator
    real = ...          # Real (floating-point) number
    string = ...        # String literal
    inline_image = ...  # Inline image data
    space = ...         # Whitespace
    comment = ...       # Comment text
```

### Content Stream Exception Classes

Specialized exceptions for content stream operations.

```python { .api }
class PdfParsingError(Exception):
    """
    Raised when content stream parsing fails.

    This can occur with:
    - Syntax errors in content streams
    - Corrupted or incomplete stream data
    - Unsupported content stream features
    """

class UnparseableContentStreamInstructions(Exception):
    """
    Raised when instructions cannot be converted back to stream format.

    This occurs when instruction objects contain invalid or
    inconsistent data that cannot be serialized to PDF format.
    """
```

## Usage Examples

### Basic Content Stream Parsing

```python
import pikepdf

# Open PDF and get a page
pdf = pikepdf.open('document.pdf')
page = pdf.pages[0]

# Parse the page's content stream
instructions = pikepdf.parse_content_stream(page)

print(f"Page has {len(instructions)} content instructions")

# Analyze each instruction
for i, instruction in enumerate(instructions):
    operator = instruction.operator
    operands = instruction.operands

    print(f"Instruction {i+1}: {operator}")

    # Show text operations
    if str(operator) == 'Tj':  # Show text
        text_string = operands[0] if operands else "No text"
        print(f"  Text: {text_string}")

    elif str(operator) == 'TJ':  # Show text with individual glyph positioning
        text_array = operands[0] if operands else []
        print(f"  Text array with {len(text_array)} elements")

    # Show graphics state changes
    elif str(operator) == 'cm':  # Concatenate matrix
        if len(operands) >= 6:
            matrix = [float(op) for op in operands]
            print(f"  Transform matrix: {matrix}")

    elif str(operator) == 'gs':  # Set graphics state
        gs_name = operands[0] if operands else "Unknown"
        print(f"  Graphics state: {gs_name}")

    # Show image operations
    elif str(operator) == 'Do':  # Invoke XObject
        xobject_name = operands[0] if operands else "Unknown"
        print(f"  XObject: {xobject_name}")

pdf.close()
```

### Text Extraction from Content Streams

```python
import pikepdf

def extract_text_from_content_stream(page):
    """Extract text from a page's content stream."""

    instructions = pikepdf.parse_content_stream(page)

    extracted_text = []
    current_font = None
    current_font_size = 12

    for instruction in instructions:
        operator = str(instruction.operator)
        operands = instruction.operands

        # Track font changes
        if operator == 'Tf' and len(operands) >= 2:  # Set font and size
            current_font = operands[0]
            current_font_size = float(operands[1])

        # Extract text
        elif operator == 'Tj' and operands:  # Show text
            text = str(operands[0])
            extracted_text.append({
                'text': text,
                'font': current_font,
                'font_size': current_font_size
            })

        elif operator == 'TJ' and operands:  # Show text with positioning
            text_array = operands[0]
            for element in text_array:
                if hasattr(element, '_type_code') and element._type_code == pikepdf.ObjectType.string:
                    text = str(element)
                    extracted_text.append({
                        'text': text,
                        'font': current_font,
                        'font_size': current_font_size
                    })

    return extracted_text

# Extract text with formatting information
pdf = pikepdf.open('document.pdf')
page = pdf.pages[0]

text_elements = extract_text_from_content_stream(page)

print("Extracted text with formatting:")
for element in text_elements:
    print(f"Font {element['font']}, Size {element['font_size']}: '{element['text']}'")

pdf.close()
```

### Modifying Content Streams

```python
import pikepdf

def add_watermark_to_content(page, watermark_text):
    """Add a watermark to a page by modifying its content stream."""

    # Parse existing content
    instructions = pikepdf.parse_content_stream(page)

    # Create watermark instructions
    # Save graphics state
    save_gs = pikepdf.ContentStreamInstruction([], pikepdf.Operator('q'))

    # Set transparency
    set_alpha = pikepdf.ContentStreamInstruction(
        [pikepdf.String('0.3')],
        pikepdf.Operator('gs')  # This would reference a graphics state with alpha
    )

    # Position for watermark (center of page)
    mediabox = page.mediabox
    center_x = (mediabox.lower_left[0] + mediabox.upper_right[0]) / 2
    center_y = (mediabox.lower_left[1] + mediabox.upper_right[1]) / 2

    # Begin text object
    begin_text = pikepdf.ContentStreamInstruction([], pikepdf.Operator('BT'))

    # Set font (assuming /F1 exists)
    set_font = pikepdf.ContentStreamInstruction(
        [pikepdf.Name.F1, 24],
        pikepdf.Operator('Tf')
    )

    # Position text
    set_position = pikepdf.ContentStreamInstruction(
        [center_x, center_y],
        pikepdf.Operator('Td')
    )

    # Show watermark text
    show_text = pikepdf.ContentStreamInstruction(
        [pikepdf.String(watermark_text)],
        pikepdf.Operator('Tj')
    )

    # End text object
    end_text = pikepdf.ContentStreamInstruction([], pikepdf.Operator('ET'))

    # Restore graphics state
    restore_gs = pikepdf.ContentStreamInstruction([], pikepdf.Operator('Q'))

    # Combine: original content + watermark
    watermark_instructions = [
        save_gs, begin_text, set_font, set_position,
        show_text, end_text, restore_gs
    ]

    # Add watermark instructions to the beginning
    all_instructions = watermark_instructions + instructions

    # Convert back to content stream
    new_content = pikepdf.unparse_content_stream(all_instructions)

    # Update page content
    page['/Contents'] = pikepdf.Stream(page.owner, new_content)

# Add watermark to all pages
pdf = pikepdf.open('document.pdf')

for page in pdf.pages:
    add_watermark_to_content(page, "CONFIDENTIAL")

pdf.save('watermarked_document.pdf')
pdf.close()
print("Added watermark to all pages")
```

### Advanced Content Analysis

```python
import pikepdf
from collections import defaultdict

def analyze_content_usage(pdf_path):
    """Analyze content stream operator usage across a PDF."""

    pdf = pikepdf.open(pdf_path)

    analysis = {
        'operator_counts': defaultdict(int),
        'font_usage': defaultdict(int),
        'image_references': set(),
        'graphics_states': set(),
        'color_operations': [],
        'transform_operations': []
    }

    for page_num, page in enumerate(pdf.pages):
        try:
            instructions = pikepdf.parse_content_stream(page)

            for instruction in instructions:
                operator = str(instruction.operator)
                operands = instruction.operands

                # Count operator usage
                analysis['operator_counts'][operator] += 1

                # Track font usage
                if operator == 'Tf' and len(operands) >= 2:
                    font_name = str(operands[0])
                    font_size = float(operands[1])
                    analysis['font_usage'][f"{font_name} @ {font_size}pt"] += 1

                # Track image references
                elif operator == 'Do' and operands:
                    xobject_name = str(operands[0])
                    analysis['image_references'].add(xobject_name)

                # Track graphics state usage
                elif operator == 'gs' and operands:
                    gs_name = str(operands[0])
                    analysis['graphics_states'].add(gs_name)

                # Track color operations
                elif operator in ['rg', 'RG', 'g', 'G', 'k', 'K', 'cs', 'CS', 'sc', 'SC']:
                    color_info = {
                        'page': page_num,
                        'operator': operator,
                        'values': [float(op) if hasattr(op, '__float__') else str(op) for op in operands]
                    }
                    analysis['color_operations'].append(color_info)

                # Track transformation matrices
                elif operator == 'cm' and len(operands) == 6:
                    matrix = [float(op) for op in operands]
                    analysis['transform_operations'].append({
                        'page': page_num,
                        'matrix': matrix
                    })

        except Exception as e:
            print(f"Error analyzing page {page_num}: {e}")

    pdf.close()
    return analysis

def print_content_analysis(analysis):
    """Print a formatted content analysis report."""

    print("PDF Content Stream Analysis")
    print("=" * 50)

    # Most common operators
    print("\nTop 10 Most Used Operators:")
    sorted_ops = sorted(analysis['operator_counts'].items(), key=lambda x: x[1], reverse=True)
    for op, count in sorted_ops[:10]:
        print(f"  {op}: {count} times")

    # Font usage
    if analysis['font_usage']:
        print(f"\nFont Usage ({len(analysis['font_usage'])} different fonts):")
        for font, count in sorted(analysis['font_usage'].items(), key=lambda x: x[1], reverse=True):
            print(f"  {font}: {count} times")

    # Image references
    if analysis['image_references']:
        print(f"\nImage References ({len(analysis['image_references'])} images):")
        for img in sorted(analysis['image_references']):
            print(f"  {img}")

    # Graphics states
    if analysis['graphics_states']:
        print(f"\nGraphics States ({len(analysis['graphics_states'])} states):")
        for gs in sorted(analysis['graphics_states']):
            print(f"  {gs}")

    # Color usage summary
    color_ops = len(analysis['color_operations'])
    if color_ops > 0:
        print(f"\nColor Operations: {color_ops} total")
        color_types = defaultdict(int)
        for op_info in analysis['color_operations']:
            color_types[op_info['operator']] += 1
        for color_op, count in sorted(color_types.items()):
            print(f"  {color_op}: {count} times")

    # Transformation summary
    transform_count = len(analysis['transform_operations'])
    if transform_count > 0:
        print(f"\nTransformation Matrices: {transform_count} total")

# Analyze content usage
analysis = analyze_content_usage('document.pdf')
print_content_analysis(analysis)
```

### Custom Token Filter Implementation

```python
import pikepdf

class TextExtractionFilter(pikepdf.TokenFilter):
    """Custom token filter for extracting text while preserving structure."""

    def __init__(self):
        super().__init__()
        self.extracted_text = []
        self.current_font_size = 12
        self.in_text_object = False

    def handle_token(self, token):
        """Process each token in the content stream."""

        if token.type_ == pikepdf.TokenType.operator:
            operator = str(token.value)

            # Track text object boundaries
            if operator == 'BT':
                self.in_text_object = True
            elif operator == 'ET':
                self.in_text_object = False

            # Track font size changes
            elif operator == 'Tf' and hasattr(self, '_pending_font_size'):
                self.current_font_size = self._pending_font_size
                delattr(self, '_pending_font_size')

            # Extract text
            elif operator in ['Tj', 'TJ'] and self.in_text_object:
                if hasattr(self, '_pending_text'):
                    self.extracted_text.append({
                        'text': self._pending_text,
                        'font_size': self.current_font_size
                    })
                    delattr(self, '_pending_text')

        elif token.type_ == pikepdf.TokenType.string:
            # Store text for next operator
            self._pending_text = str(token.value)

        elif token.type_ == pikepdf.TokenType.real or token.type_ == pikepdf.TokenType.integer:
            # Might be font size (this is simplified - real implementation would be more sophisticated)
            try:
                value = float(token.raw_value)
                if 6 <= value <= 72:  # Reasonable font size range
                    self._pending_font_size = value
            except:
                pass

def extract_text_with_filter(page):
    """Extract text using custom token filter."""

    # Create and use custom filter
    text_filter = TextExtractionFilter()

    # Note: This is a conceptual example. The actual pikepdf API for token filtering
    # may differ. The real implementation would need to process the content stream
    # at the token level using the appropriate pikepdf mechanisms.

    instructions = pikepdf.parse_content_stream(page)

    # Simulate token filtering (in practice, this would use the actual token stream)
    for instruction in instructions:
        # Process operator token
        op_token = type('Token', (), {
            'type_': pikepdf.TokenType.operator,
            'value': instruction.operator,
            'raw_value': str(instruction.operator).encode()
        })()
        text_filter.handle_token(op_token)

        # Process operand tokens
        for operand in instruction.operands:
            if operand._type_code == pikepdf.ObjectType.string:
                string_token = type('Token', (), {
                    'type_': pikepdf.TokenType.string,
                    'value': operand,
                    'raw_value': str(operand).encode()
                })()
                text_filter.handle_token(string_token)

    return text_filter.extracted_text

# Use custom token filter
pdf = pikepdf.open('document.pdf')
page = pdf.pages[0]

extracted_text = extract_text_with_filter(page)

print("Text extracted with custom filter:")
for text_item in extracted_text:
    print(f"Size {text_item['font_size']}: '{text_item['text']}'")

pdf.close()
```

### Content Stream Optimization

```python
import pikepdf
from collections import defaultdict

def optimize_content_streams(pdf_path, output_path):
    """Optimize content streams by removing redundant operations."""

    pdf = pikepdf.open(pdf_path)

    optimization_stats = {
        'pages_processed': 0,
        'instructions_removed': 0,
        'redundant_font_sets': 0,
        'redundant_graphics_states': 0
    }

    for page in pdf.pages:
        try:
            instructions = pikepdf.parse_content_stream(page)
            original_count = len(instructions)

            optimized_instructions = []
            current_font = None
            current_font_size = None
            current_gs = None

            for instruction in instructions:
                operator = str(instruction.operator)
                operands = instruction.operands

                # Remove redundant font settings
                if operator == 'Tf' and len(operands) >= 2:
                    font = operands[0]
                    size = operands[1]

                    if font == current_font and size == current_font_size:
                        # Skip redundant font setting
                        optimization_stats['redundant_font_sets'] += 1
                        continue
                    else:
                        current_font = font
                        current_font_size = size

                # Remove redundant graphics state settings
                elif operator == 'gs' and operands:
                    gs_name = operands[0]

                    if gs_name == current_gs:
                        # Skip redundant graphics state
                        optimization_stats['redundant_graphics_states'] += 1
                        continue
                    else:
                        current_gs = gs_name

                # Keep instruction
                optimized_instructions.append(instruction)

            # Update page if optimizations were made
            if len(optimized_instructions) < original_count:
                new_content = pikepdf.unparse_content_stream(optimized_instructions)
                page['/Contents'] = pikepdf.Stream(pdf, new_content)

                optimization_stats['instructions_removed'] += (original_count - len(optimized_instructions))

            optimization_stats['pages_processed'] += 1

        except Exception as e:
            print(f"Error optimizing page: {e}")

    # Save optimized PDF
    pdf.save(output_path)
    pdf.close()

    print("Content Stream Optimization Results:")
    print(f"  Pages processed: {optimization_stats['pages_processed']}")
    print(f"  Instructions removed: {optimization_stats['instructions_removed']}")
    print(f"  Redundant font settings: {optimization_stats['redundant_font_sets']}")
    print(f"  Redundant graphics states: {optimization_stats['redundant_graphics_states']}")

    return optimization_stats

# Optimize content streams
# optimize_content_streams('document.pdf', 'optimized_document.pdf')
```