0
# Advanced Operations
1
2
Specialized operations including matrix transformations, coordinate systems, job interface, and tree structures for advanced PDF manipulation. These capabilities enable sophisticated PDF processing and analysis workflows.
3
4
## Capabilities
5
6
### Matrix Class
7
8
2D transformation matrix for coordinate transformations and geometric operations.
9
10
```python { .api }
11
class Matrix:
    """
    PDF transformation matrix for geometric operations.

    Represents a 2D transformation matrix with 6 elements in the form:
    [a b c d e f] which corresponds to the transformation:
        x' = a*x + c*y + e
        y' = b*x + d*y + f

    Used for scaling, rotation, translation, and skewing operations.
    """

    def __init__(self, a: float = 1, b: float = 0, c: float = 0,
                 d: float = 1, e: float = 0, f: float = 0) -> None:
        """
        Create a transformation matrix with specified elements.

        Parameters:
        - a (float): X-scaling component
        - b (float): Y-skewing component
        - c (float): X-skewing component
        - d (float): Y-scaling component
        - e (float): X-translation component
        - f (float): Y-translation component

        Default creates an identity matrix (no transformation).
        """

    @staticmethod
    def identity() -> Matrix:
        """
        Create an identity matrix that performs no transformation.

        Returns:
            Matrix: Identity matrix [1 0 0 1 0 0]
        """

    def translated(self, dx: float, dy: float) -> Matrix:
        """
        Create a new matrix with translation applied.

        Parameters:
        - dx (float): Translation distance in X direction
        - dy (float): Translation distance in Y direction

        Returns:
            Matrix: New matrix with translation transformation
        """

    def scaled(self, sx: float, sy: float | None = None) -> Matrix:
        """
        Create a new matrix with scaling applied.

        Parameters:
        - sx (float): Scale factor in X direction
        - sy (float, optional): Scale factor in Y direction (defaults to sx for uniform scaling)

        Returns:
            Matrix: New matrix with scaling transformation
        """

    def rotated(self, angle_degrees: float) -> Matrix:
        """
        Create a new matrix with rotation applied.

        Parameters:
        - angle_degrees (float): Rotation angle in degrees (positive = counterclockwise)

        Returns:
            Matrix: New matrix with rotation transformation
        """

    def inverse(self) -> Matrix:
        """
        Calculate the inverse of this transformation matrix.

        Returns:
            Matrix: Inverse transformation matrix

        Raises:
            ValueError: If matrix is not invertible (determinant is zero)
        """

    def transform(self, point: tuple[float, float]) -> tuple[float, float]:
        """
        Transform a point using this matrix.

        Parameters:
        - point (tuple[float, float]): Point coordinates (x, y)

        Returns:
            tuple[float, float]: Transformed point coordinates (x', y')
        """

    def __mul__(self, other: Matrix) -> Matrix:
        """
        Matrix multiplication (composition of transformations).

        NOTE(review): pikepdf's own reference describes composition with the
        ``@`` operator (``__matmul__``); confirm that ``*`` is supported
        before relying on it.

        Parameters:
        - other (Matrix): Matrix to multiply with

        Returns:
            Matrix: Result of matrix multiplication
        """

    @property
    def a(self) -> float:
        """X-scaling component of the transformation."""

    @property
    def b(self) -> float:
        """Y-skewing component of the transformation."""

    @property
    def c(self) -> float:
        """X-skewing component of the transformation."""

    @property
    def d(self) -> float:
        """Y-scaling component of the transformation."""

    @property
    def e(self) -> float:
        """X-translation component of the transformation."""

    @property
    def f(self) -> float:
        """Y-translation component of the transformation."""
139
```
140
141
### Job Interface
142
143
Command-line job interface providing access to qpdf functionality.
144
145
```python { .api }
146
class Job:
    """
    Command-line job interface for advanced PDF operations.

    Provides access to qpdf's command-line functionality through
    a programmatic interface, enabling complex PDF processing workflows.
    """

    def run(self) -> int:
        """
        Execute the configured job.

        Returns:
            int: Exit code (0 for success, non-zero for failure)
        """

    def check_configuration(self) -> bool:
        """
        Validate the job configuration without executing.

        Returns:
            bool: True if configuration is valid

        Raises:
            JobUsageError: If configuration has errors
        """

    def create_pdf(self) -> Pdf:
        """
        Create a PDF object from the job configuration.

        Returns:
            Pdf: PDF object created by the job

        Raises:
            JobUsageError: If job doesn't create a PDF
        """

    def write_pdf(self, pdf: Pdf) -> None:
        """
        Write a PDF using the job's output configuration.

        Parameters:
        - pdf (Pdf): PDF to write using job settings
        """

    @property
    def creates_output(self) -> bool:
        """
        Whether this job creates output files.

        Returns:
            bool: True if job will create output
        """

    @property
    def has_warnings(self) -> bool:
        """
        Whether the job execution produced warnings.

        Returns:
            bool: True if warnings were generated
        """

    @property
    def exit_code(self) -> int:
        """
        Exit code from the last job execution.

        Returns:
            int: Exit code (0 = success)
        """

    @staticmethod
    def json_out_schema() -> dict:
        """
        Get the JSON schema for job output format.

        Returns:
            dict: JSON schema describing output structure
        """

    @staticmethod
    def job_json_schema() -> dict:
        """
        Get the JSON schema for job configuration format.

        Returns:
            dict: JSON schema describing job configuration structure
        """
236
```
237
238
### Tree Structures
239
240
Specialized tree data structures for PDF name trees and number trees.
241
242
```python { .api }
243
class NameTree:
    """
    PDF name tree structure for sorted key-value storage.

    Name trees provide efficient storage and retrieval of key-value pairs
    where keys are byte strings sorted in lexical order.

    Implements MutableMapping[bytes, Object] interface.
    """

    @staticmethod
    def new(pdf: Pdf) -> NameTree:
        """
        Create a new empty name tree.

        Parameters:
        - pdf (Pdf): PDF document to create the tree in

        Returns:
            NameTree: New empty name tree
        """

    def __len__(self) -> int:
        """Number of entries in the name tree."""

    def __iter__(self) -> Iterator[bytes]:
        """Iterate over keys in the name tree."""

    def __getitem__(self, key: bytes) -> Object:
        """
        Get value by key.

        Parameters:
        - key (bytes): Key to look up

        Returns:
            Object: Value associated with the key

        Raises:
            KeyError: If key is not found
        """

    def __setitem__(self, key: bytes, value: Object) -> None:
        """
        Set key-value pair.

        Parameters:
        - key (bytes): Key for the entry
        - value (Object): Value to store
        """

    def __delitem__(self, key: bytes) -> None:
        """
        Delete entry by key.

        Parameters:
        - key (bytes): Key to delete

        Raises:
            KeyError: If key is not found
        """

    def __contains__(self, key: bytes) -> bool:
        """Check if key exists in the tree."""
307
308
class NumberTree:
    """
    PDF number tree structure for sorted numeric key-value storage.

    Number trees provide efficient storage and retrieval of key-value pairs
    where keys are integers sorted in numeric order.

    Implements MutableMapping[int, Object] interface.
    """

    @staticmethod
    def new(pdf: Pdf) -> NumberTree:
        """
        Create a new empty number tree.

        Parameters:
        - pdf (Pdf): PDF document to create the tree in

        Returns:
            NumberTree: New empty number tree
        """

    def __len__(self) -> int:
        """Number of entries in the number tree."""

    def __iter__(self) -> Iterator[int]:
        """Iterate over keys in the number tree."""

    def __getitem__(self, key: int) -> Object:
        """
        Get value by numeric key.

        Parameters:
        - key (int): Numeric key to look up

        Returns:
            Object: Value associated with the key

        Raises:
            KeyError: If key is not found
        """

    def __setitem__(self, key: int, value: Object) -> None:
        """
        Set key-value pair.

        Parameters:
        - key (int): Numeric key for the entry
        - value (Object): Value to store
        """

    def __delitem__(self, key: int) -> None:
        """
        Delete entry by numeric key.

        Parameters:
        - key (int): Key to delete

        Raises:
            KeyError: If key is not found
        """

    def __contains__(self, key: int) -> bool:
        """Check if numeric key exists in the tree."""
372
```
373
374
### Coordinate Transformation Utilities
375
376
Helper functions for working with coordinate systems and transformations.
377
378
```python { .api }
379
def get_objects_with_ctm(pdf: Pdf) -> list[tuple[Object, Matrix]]:
    """
    Find objects with coordinate transformation matrices (CTM).

    Scans the PDF for objects that have associated transformation
    matrices, useful for analyzing coordinate system changes.

    NOTE(review): upstream pikepdf appears to expose this helper in
    ``pikepdf.models.ctm`` and to take a single page rather than a whole
    document; confirm the argument type against the installed version.

    Parameters:
    - pdf (Pdf): PDF document to analyze

    Returns:
        list[tuple[Object, Matrix]]: List of (object, transformation_matrix) pairs
    """
392
```
393
394
### Settings and Configuration
395
396
Global pikepdf configuration functions for controlling behavior.
397
398
```python { .api }
399
def get_decimal_precision() -> int:
    """
    Get the current decimal precision for floating-point output.

    Controls how many decimal places are used when writing
    floating-point numbers to PDF files.

    Returns:
        int: Current precision (number of decimal places)
    """
409
410
def set_decimal_precision(precision: int) -> None:
    """
    Set the decimal precision for floating-point output.

    Parameters:
    - precision (int): Number of decimal places (typically 2-6)

    Raises:
        ValueError: If precision is out of valid range
    """
420
421
def set_flate_compression_level(level: int) -> None:
    """
    Set the compression level for Flate (deflate) streams.

    Controls the trade-off between compression speed and compression ratio
    when compressing PDF streams using Flate encoding.

    Parameters:
    - level (int): Compression level (0-9, where 0=no compression, 9=maximum compression)

    Raises:
        ValueError: If level is out of valid range (0-9)
    """
434
```
435
436
### Helper Classes
437
438
Utility classes for advanced PDF object manipulation.
439
440
```python { .api }
441
class ObjectHelper:
    """
    Helper class for PDF object operations.

    Provides utility methods for advanced object manipulation
    and analysis that don't fit into the main object classes.
    """

    # Note: Specific methods would be documented based on actual implementation
    # This class provides low-level object utilities
451
```
452
453
## Usage Examples
454
455
### Matrix Transformations
456
457
```python
458
import pikepdf
459
import math
460
461
# Create various transformation matrices
identity = pikepdf.Matrix.identity()
print(f"Identity matrix: [{identity.a}, {identity.b}, {identity.c}, {identity.d}, {identity.e}, {identity.f}]")

# Translation
translate = pikepdf.Matrix().translated(100, 50)
print(f"Translation (100, 50): [{translate.a}, {translate.b}, {translate.c}, {translate.d}, {translate.e}, {translate.f}]")

# Scaling
scale = pikepdf.Matrix().scaled(2.0, 1.5)  # 2x width, 1.5x height
print(f"Scaling (2.0, 1.5): [{scale.a}, {scale.b}, {scale.c}, {scale.d}, {scale.e}, {scale.f}]")

# Rotation (45 degrees)
rotate = pikepdf.Matrix().rotated(45)
print(f"Rotation 45°: [{rotate.a:.3f}, {rotate.b:.3f}, {rotate.c:.3f}, {rotate.d:.3f}, {rotate.e}, {rotate.f}]")

# Combined transformation: scale, then rotate, then translate
combined = pikepdf.Matrix().scaled(1.5, 1.5).rotated(30).translated(100, 200)
print(f"Combined transform: [{combined.a:.3f}, {combined.b:.3f}, {combined.c:.3f}, {combined.d:.3f}, {combined.e:.1f}, {combined.f:.1f}]")

# Transform points
original_point = (10, 20)
transformed_point = combined.transform(original_point)
print(f"Point {original_point} -> {transformed_point}")

# Matrix multiplication (composition)
# NOTE(review): pikepdf's reference describes composition with `@`;
# confirm that `*` is supported by the installed version.
m1 = pikepdf.Matrix().scaled(2, 2)
m2 = pikepdf.Matrix().rotated(90)
m3 = m1 * m2  # Apply m1 first, then m2
print(f"Matrix multiplication result: [{m3.a:.3f}, {m3.b:.3f}, {m3.c:.3f}, {m3.d:.3f}, {m3.e}, {m3.f}]")

# Inverse transformation
original_matrix = pikepdf.Matrix().scaled(2, 3).translated(10, 15)
inverse_matrix = original_matrix.inverse()

# Verify inverse (should return original point)
point = (5, 7)
transformed = original_matrix.transform(point)
back_to_original = inverse_matrix.transform(transformed)
print(f"Original: {point}, Transformed: {transformed}, Back: {back_to_original}")
501
```
502
503
### Applying Transformations to PDF Content
504
505
```python
506
import pikepdf
507
508
def apply_transformation_to_page(page, matrix):
    """Apply a transformation matrix to all content on a page.

    The existing content is bracketed by ``q <a b c d e f> cm`` and ``Q`` so
    the transformation applies to it and the graphics state is restored
    afterwards. A page without ``/Contents`` is left untouched.

    Parameters:
    - page: Page whose ``/Contents`` entry is rewritten in place
    - matrix: Object exposing the six components as ``a`` .. ``f``
    """
    # Nothing to wrap on a page that has no content
    if '/Contents' not in page:
        return

    existing_content = page['/Contents']

    # PDF real numbers cannot use exponent notation, which str(float) emits
    # for very small values (e.g. the cosine of a 90 degree rotation), so
    # the operands are written in fixed-point form.
    operands = " ".join(
        f"{component:.6f}"
        for component in (matrix.a, matrix.b, matrix.c, matrix.d, matrix.e, matrix.f)
    )
    transform_commands = f"q\n{operands} cm\n".encode()
    restore_commands = b"\nQ"

    if isinstance(existing_content, pikepdf.Array):
        # Multiple content streams: add one stream before and one after
        new_contents = pikepdf.Array([pikepdf.Stream(page.owner, transform_commands)])
        new_contents.extend(existing_content)
        new_contents.append(pikepdf.Stream(page.owner, restore_commands))
        page['/Contents'] = new_contents
    else:
        # Single content stream: splice the operators around its data
        new_content = transform_commands + existing_content.read_bytes() + restore_commands
        page['/Contents'] = pikepdf.Stream(page.owner, new_content)
538
539
# Apply transformation to a PDF page
# The context manager closes the file even if the transformation fails
with pikepdf.open('document.pdf') as pdf:
    page = pdf.pages[0]

    # Create a transformation matrix (rotate 15 degrees and scale 90%)
    transform_matrix = pikepdf.Matrix().rotated(15).scaled(0.9, 0.9)

    # Apply transformation
    apply_transformation_to_page(page, transform_matrix)

    pdf.save('transformed_document.pdf')

print("Applied transformation to page content")
552
```
553
554
### Working with Name and Number Trees
555
556
```python
557
import pikepdf
558
559
# Create a PDF with name tree
with pikepdf.new() as pdf:
    # A freshly created PDF has no pages; add one so the destinations
    # below have a page to point at.
    first_page = pdf.add_blank_page().obj

    # Create a name tree for storing named destinations
    name_tree = pikepdf.NameTree.new(pdf)

    # Add entries to the name tree
    destinations = {
        b'chapter1': pikepdf.Array([first_page, pikepdf.Name.Fit]),
        b'section1.1': pikepdf.Array([first_page, pikepdf.Name.FitH, 700]),
        b'appendix': pikepdf.Array([first_page, pikepdf.Name.FitV, 100]),
    }

    for name, destination in destinations.items():
        name_tree[name] = destination
        print(f"Added destination: {name.decode()} -> {destination}")

    print(f"Name tree contains {len(name_tree)} entries")

    # Iterate through name tree
    print("All entries in name tree:")
    for key in name_tree:
        value = name_tree[key]
        print(f" {key.decode()}: {value}")

    # Create a number tree for page labels
    number_tree = pikepdf.NumberTree.new(pdf)

    # Add page labels (page number -> label format)
    page_labels = {
        0: pikepdf.Dictionary({'/S': pikepdf.Name.r}),  # Roman numerals
        5: pikepdf.Dictionary({'/S': pikepdf.Name.D, '/P': pikepdf.String('Page ')}),  # Decimal with prefix
        10: pikepdf.Dictionary({'/S': pikepdf.Name.a}),  # Lowercase letters
    }

    for page_num, label_dict in page_labels.items():
        number_tree[page_num] = label_dict
        print(f"Added page label: Page {page_num} -> {label_dict}")

    print(f"Number tree contains {len(number_tree)} entries")

    # Attach both trees to the document catalog; objects that nothing
    # references are not written when the file is saved.
    pdf.Root.Names = pikepdf.Dictionary(Dests=name_tree.obj)
    pdf.Root.PageLabels = number_tree.obj

    # Save PDF with trees
    pdf.save('document_with_trees.pdf')
603
```
604
605
### Advanced Job Interface Usage
606
607
```python
608
import pikepdf
609
import json
610
611
def process_pdf_with_job_interface(input_pdf, output_pdf, operations):
    """Use job interface for complex PDF processing.

    Parameters:
    - input_pdf: Path of the PDF to read
    - output_pdf: Path of the PDF to write
    - operations: Operation descriptions stored under the 'operations' key
      of the job configuration

    Returns:
        bool: True if the job ran with exit code 0; False if the
        configuration was invalid, the exit code was non-zero, or
        pikepdf.JobUsageError was raised
    """
    try:
        # Create a job configuration
        job_config = {
            'inputFile': input_pdf,
            'outputFile': output_pdf,
            'staticId': True,  # Reproducible output
            'deterministicId': True,
            'operations': operations,
        }

        # Create job from configuration
        # NOTE(review): job_config is built but never passed to the Job
        # below; confirm the constructor arguments the installed pikepdf
        # expects.
        job = pikepdf.Job()

        # Configure job (this is simplified - actual API may differ)
        # In practice, you'd use specific job configuration methods

        # Validate configuration
        if not job.check_configuration():
            print("Job configuration is invalid")
            return False
        print("Job configuration is valid")

        # Execute the job
        exit_code = job.run()
        if exit_code != 0:
            print(f"Job failed with exit code: {exit_code}")
            return False

        print(f"Job completed successfully: {input_pdf} -> {output_pdf}")
        if job.has_warnings:
            print("Job completed with warnings")
        return True

    except pikepdf.JobUsageError as e:
        print(f"Job usage error: {e}")
        return False
654
655
# Example job operations
operations = [
    {'operation': 'qdf', 'parameters': {}},  # Convert to QDF format for inspection
    {'operation': 'optimize', 'parameters': {'compress-streams': True}},
    {'operation': 'linearize', 'parameters': {}},  # Linearize for fast web view
]
661
662
# Process PDF with job interface
663
# success = process_pdf_with_job_interface('input.pdf', 'output.pdf', operations)
664
```
665
666
### Configuration and Settings Management
667
668
```python
669
import pikepdf
670
671
def configure_pikepdf_settings():
    """Configure pikepdf global settings for optimal performance.

    Sets the decimal precision to 3 and the Flate compression level to 6,
    printing the precision before and after the change.
    """
    # Get current settings
    current_precision = pikepdf.settings.get_decimal_precision()
    print(f"Current decimal precision: {current_precision}")

    # Set precision for clean output (fewer decimal places)
    pikepdf.settings.set_decimal_precision(3)
    print("Set decimal precision to 3 places")

    # Set compression level for optimal balance of speed and size
    pikepdf.settings.set_flate_compression_level(6)  # Medium compression
    print("Set Flate compression level to 6 (medium)")

    # Verify settings
    new_precision = pikepdf.settings.get_decimal_precision()
    print(f"New decimal precision: {new_precision}")
689
690
def create_optimized_pdf():
    """Create a PDF with optimized settings.

    Applies the global settings, builds a one-page document with a short
    text content stream, and saves it as 'optimized_output.pdf'.
    """
    # Configure settings for clean, compact output
    configure_pikepdf_settings()

    # Create PDF; the context manager closes it even if saving fails
    with pikepdf.new() as pdf:
        page = pdf.add_blank_page()

        # Add content with floating-point coordinates
        # (/F1 is not added to the page resources in this example)
        content = (
            "BT\n"
            "/F1 12 Tf\n"
            "100.123456789 700.987654321 Td\n"
            "(Optimized PDF with controlled precision) Tj\n"
            "ET\n"
        )

        content_stream = pikepdf.Stream(pdf, content.encode())
        page['/Contents'] = content_stream

        # Save with compression and optimization
        pdf.save('optimized_output.pdf',
                 compress_streams=True,
                 normalize_content=True)

    print("Created optimized PDF with controlled precision and compression")
719
720
# Configure and create optimized PDF
721
# create_optimized_pdf()
722
```
723
724
### Advanced Object Analysis
725
726
```python
727
import pikepdf
728
729
def analyze_object_relationships(pdf_path):
    """Analyze complex object relationships in a PDF.

    Parameters:
    - pdf_path: Path of the PDF to open

    Returns:
        dict: Summary with the keys 'total_objects', 'object_types',
        'indirect_objects', 'shared_objects' and 'complex_structures'
    """
    # The context manager closes the file even if the analysis raises
    with pikepdf.open(pdf_path) as pdf:
        analysis = {
            'total_objects': len(pdf.objects),
            'object_types': {},
            'indirect_objects': 0,
            'shared_objects': {},
            'complex_structures': {},
        }

        # Analyze all objects. pdf.objects is a sequence rather than a
        # mapping, so each object's (id, generation) comes from objgen.
        for obj in pdf.objects:
            # Count object types
            obj_type = str(obj._type_code)
            analysis['object_types'][obj_type] = analysis['object_types'].get(obj_type, 0) + 1

            if not obj.is_indirect:
                continue
            analysis['indirect_objects'] += 1

            # Record indirect objects as sharing candidates
            # (this is simplified - counting references would need a full PDF traversal)
            obj_id, gen = obj.objgen
            analysis['shared_objects'][f"{obj_id}/{gen}"] = {
                'type': obj_type,
                'size': len(str(obj)),
            }

        # Find coordinate transformation matrices (best effort)
        try:
            ctm_objects = pikepdf.get_objects_with_ctm(pdf)
            analysis['complex_structures']['objects_with_ctm'] = len(ctm_objects)

            print(f"Found {len(ctm_objects)} objects with coordinate transformations:")
            for obj, matrix in ctm_objects[:5]:  # Show first 5
                print(f" Object {obj}: Matrix [{matrix.a:.2f}, {matrix.b:.2f}, {matrix.c:.2f}, {matrix.d:.2f}, {matrix.e:.2f}, {matrix.f:.2f}]")

        except Exception as e:
            print(f"Could not analyze CTM objects: {e}")

    print(f"\nPDF Object Analysis for {pdf_path}:")
    print(f"Total objects: {analysis['total_objects']}")
    print(f"Indirect objects: {analysis['indirect_objects']}")

    print("\nObject types:")
    for obj_type, count in sorted(analysis['object_types'].items()):
        print(f" {obj_type}: {count}")

    return analysis
782
783
# Analyze object relationships
784
# analysis = analyze_object_relationships('complex_document.pdf')
785
```
786
787
### Performance Optimization Techniques
788
789
```python
790
import pikepdf
791
import time
792
from pathlib import Path
793
794
def benchmark_pdf_operations(pdf_path):
    """Benchmark various PDF operations for performance analysis.

    Times opening the file, page access, content-stream parsing of the
    first page, counting objects, and saving to a temporary sibling file,
    then prints a summary.

    Parameters:
    - pdf_path: Path of the PDF to benchmark

    Returns:
        dict: Timings in seconds keyed by operation name, plus
        'instruction_count' and 'object_count'; 'content_parsing' holds a
        "Failed: ..." string if parsing raised
    """
    operations = {}
    output_path = Path(pdf_path).with_suffix('.benchmark.pdf')

    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    # Time PDF opening
    start_time = time.perf_counter()
    pdf = pikepdf.open(pdf_path)
    operations['open'] = time.perf_counter() - start_time

    try:
        # Time page access
        start_time = time.perf_counter()
        page_count = len(pdf.pages)
        first_page = pdf.pages[0] if page_count > 0 else None
        operations['page_access'] = time.perf_counter() - start_time

        # Time content parsing
        if first_page is not None:
            start_time = time.perf_counter()
            try:
                instructions = pikepdf.parse_content_stream(first_page)
                operations['content_parsing'] = time.perf_counter() - start_time
                operations['instruction_count'] = len(instructions)
            except Exception as e:
                operations['content_parsing'] = f"Failed: {e}"

        # Time object iteration
        start_time = time.perf_counter()
        object_count = len(pdf.objects)
        operations['object_iteration'] = time.perf_counter() - start_time
        operations['object_count'] = object_count

        # Time save operation
        start_time = time.perf_counter()
        pdf.save(str(output_path))
        operations['save'] = time.perf_counter() - start_time
    finally:
        pdf.close()
        # Clean up benchmark file, including after a failed save
        output_path.unlink(missing_ok=True)

    print(f"Performance Benchmark for {pdf_path}:")
    print(f" Open: {operations['open']:.3f}s")
    print(f" Page access ({page_count} pages): {operations['page_access']:.3f}s")
    if 'content_parsing' in operations:
        if isinstance(operations['content_parsing'], str):
            print(f" Content parsing: {operations['content_parsing']}")
        else:
            print(f" Content parsing ({operations.get('instruction_count', 0)} instructions): {operations['content_parsing']:.3f}s")
    print(f" Object iteration ({object_count} objects): {operations['object_iteration']:.3f}s")
    print(f" Save: {operations['save']:.3f}s")

    return operations
850
851
def optimize_pdf_processing():
    """Demonstrate techniques for optimizing PDF processing performance.

    Sets the decimal precision to 2 and the Flate compression level to 1,
    then prints the resulting settings and a list of general tips.
    """
    # Configure for optimal performance
    pikepdf.settings.set_decimal_precision(2)  # Reduce precision for speed
    pikepdf.settings.set_flate_compression_level(1)  # Fast compression

    print("Configured pikepdf for performance:")
    print(f" Decimal precision: {pikepdf.settings.get_decimal_precision()}")
    print(" Compression level: 1 (fast)")

    # Performance tips:
    print("\nPerformance optimization tips:")
    tips = (
        "1. Use access_mode=pikepdf.AccessMode.mmap for large files",
        "2. Set suppress_warnings=True to reduce overhead",
        "3. Use static_id=True for reproducible output without timestamp overhead",
        "4. Consider stream_decode_level for controlling decoding complexity",
        "5. Process pages in batches for large documents",
        "6. Cache parsed content streams if reusing",
        "7. Use pikepdf.new() instead of opening/clearing for new documents",
    )
    for tip in tips:
        print(tip)
871
872
# Run performance analysis
# if Path('document.pdf').exists():
#     benchmark_pdf_operations('document.pdf')

optimize_pdf_processing()
877
```