# Errors and Utilities

Exception classes for comprehensive error handling, utility functions for specialized operations, and helper classes that support PyPDF2's core functionality.

## Capabilities

### Exception Classes

Comprehensive exception hierarchy for handling various PDF processing errors.

```python { .api }
class PyPdfError(Exception):
    """Base exception class for PyPDF2 errors.

    Note that ParseError and DependencyError below derive from Exception
    directly, so ``except PyPdfError`` does not catch them.
    """


class PdfReadError(PyPdfError):
    """Raised when there's an error reading a PDF file."""


class PdfStreamError(PdfReadError):
    """Raised when there's an error processing PDF streams."""


class PageSizeNotDefinedError(PyPdfError):
    """Raised when page size cannot be determined."""


class ParseError(Exception):
    """Raised when there's an error parsing PDF content.

    Derives from Exception, not PyPdfError.
    """


class FileNotDecryptedError(PdfReadError):
    """Raised when attempting to access encrypted content without decryption."""


class WrongPasswordError(PdfReadError):
    """Raised when an incorrect password is provided for an encrypted PDF."""


class EmptyFileError(PdfReadError):
    """Raised when attempting to read an empty or corrupt PDF file."""


class DependencyError(Exception):
    """Raised when a required dependency is missing.

    Derives from Exception, not PyPdfError.
    """
```

### Warning Classes

Warning classes for non-fatal issues during PDF processing.

```python { .api }
class PdfReadWarning(UserWarning):
    """Warning issued during PDF reading for recoverable issues."""
```

### Paper Size Utilities

Standard paper size definitions and utilities.

```python { .api }
class PaperSize:
    """Standard paper size constants with dimensions in points."""

    # ISO A-series paper sizes
    A0: 'Dimensions'  # 2384 x 3371 points (33.1" x 46.8")
    A1: 'Dimensions'  # 1685 x 2384 points (23.4" x 33.1")
    A2: 'Dimensions'  # 1190 x 1685 points (16.5" x 23.4")
    A3: 'Dimensions'  # 842 x 1190 points (11.7" x 16.5")
    A4: 'Dimensions'  # 595 x 842 points (8.3" x 11.7")
    A5: 'Dimensions'  # 420 x 595 points (5.8" x 8.3")
    A6: 'Dimensions'  # 298 x 420 points (4.1" x 5.8")
    A7: 'Dimensions'  # 210 x 298 points (2.9" x 4.1")
    A8: 'Dimensions'  # 147 x 210 points (2.0" x 2.9")

    # Envelope sizes
    C4: 'Dimensions'  # 649 x 918 points (9.0" x 12.8")


class Dimensions:
    """Represents paper dimensions in points."""

    def __init__(self, width: float, height: float):
        """
        Initialize dimensions.

        Args:
            width (float): Width in points (72 points = 1 inch)
            height (float): Height in points (72 points = 1 inch)
        """
        self.width = width
        self.height = height

    @property
    def width_inches(self) -> float:
        """Width in inches."""
        return self.width / 72.0

    @property
    def height_inches(self) -> float:
        """Height in inches."""
        return self.height / 72.0

    @property
    def width_mm(self) -> float:
        """Width in millimeters."""
        # 25.4 mm per inch; reuses the inch conversion so the factor 72 lives in one place
        return self.width_inches * 25.4

    @property
    def height_mm(self) -> float:
        """Height in millimeters."""
        return self.height_inches * 25.4
```

### PDF Filters

Compression and encoding filters for PDF content streams.

```python { .api }
class FlateDecode:
    """Flate/ZIP compression filter (most common)."""

    @staticmethod
    def decode(data: bytes, decode_parms: dict = None) -> bytes:
        """
        Decode Flate-compressed data.

        Args:
            data (bytes): Compressed data
            decode_parms (dict, optional): Decode parameters

        Returns:
            bytes: Decompressed data
        """

    @staticmethod
    def encode(data: bytes) -> bytes:
        """
        Encode data with Flate compression.

        Args:
            data (bytes): Data to compress

        Returns:
            bytes: Compressed data
        """


class ASCIIHexDecode:
    """ASCII hexadecimal encoding filter."""

    @staticmethod
    def decode(data: bytes, decode_parms: dict = None) -> bytes:
        """
        Decode ASCII hex encoded data.

        Args:
            data (bytes): Hex-encoded data
            decode_parms (dict, optional): Decode parameters

        Returns:
            bytes: Decoded data
        """


class LZWDecode:
    """LZW compression filter."""

    @staticmethod
    def decode(data: bytes, decode_parms: dict = None) -> bytes:
        """
        Decode LZW compressed data.

        Args:
            data (bytes): LZW compressed data
            decode_parms (dict, optional): Decode parameters

        Returns:
            bytes: Decompressed data
        """


class DCTDecode:
    """JPEG compression filter."""

    @staticmethod
    def decode(data: bytes, decode_parms: dict = None) -> bytes:
        """
        Decode JPEG compressed data.

        Args:
            data (bytes): JPEG data
            decode_parms (dict, optional): Decode parameters

        Returns:
            bytes: Image data
        """


class JPXDecode:
    """JPEG 2000 compression filter."""

    @staticmethod
    def decode(data: bytes, decode_parms: dict = None) -> bytes:
        """
        Decode JPEG 2000 compressed data.

        Args:
            data (bytes): JPEG 2000 data
            decode_parms (dict, optional): Decode parameters

        Returns:
            bytes: Image data
        """


class CCITTFaxDecode:
    """CCITT fax compression filter."""

    @staticmethod
    def decode(data: bytes, decode_parms: dict = None) -> bytes:
        """
        Decode CCITT fax compressed data.

        Args:
            data (bytes): CCITT compressed data
            decode_parms (dict, optional): Decode parameters with Width, Height, etc.

        Returns:
            bytes: Decompressed image data
        """
```

### XMP Metadata Support

Extended metadata support for documents that include XMP information.

```python { .api }
class XmpInformation:
    """Handler for XMP (Extensible Metadata Platform) information."""

    def __init__(self, stream):
        """
        Initialize XMP information from stream.

        Args:
            stream: XMP metadata stream
        """

    # Methods for accessing XMP metadata
    # Implementation varies based on XMP schema and content
    # Provides access to Dublin Core, PDF, and custom metadata
```

### Version Information

```python { .api }
__version__: str  # PyPDF2 version string; "2.12.1" for the release documented here
```

### Utility Functions

General utility functions used throughout the library.

```python { .api }
def parse_filename_page_ranges(args: List[Union[str, PageRange, None]]) -> List[Tuple[str, PageRange]]:
    """
    Parse command-line style filename and page range arguments.

    Args:
        args: Arguments to parse (e.g., ["file1.pdf", "1:5", "file2.pdf", "::2"])

    Returns:
        list: List of (filename, page_range) tuples
    """
```

## Usage Examples

### Error Handling

```python
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.errors import (
    PdfReadError, WrongPasswordError, FileNotDecryptedError,
    EmptyFileError, DependencyError
)


def safe_pdf_operation(filename, password="password"):
    """Copy every page of a PDF into a new PdfWriter, reporting failures.

    Args:
        filename: Path of the PDF to read.
        password: Password to try when the document is encrypted.

    Returns:
        The populated PdfWriter, or None if any step failed (the reason
        is printed).
    """
    try:
        reader = PdfReader(filename)

        if reader.is_encrypted:
            # Try to decrypt. decrypt() signals a rejected password through a
            # falsy return value rather than an exception (see the PyPDF2
            # PdfReader.decrypt reference), so raise explicitly to reach the
            # WrongPasswordError handler below.
            if not reader.decrypt(password):
                raise WrongPasswordError("Wrong password")

        # Perform operations
        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)

        return writer

    except EmptyFileError:
        print(f"Error: {filename} is empty or corrupted")
    except WrongPasswordError:
        print(f"Error: Incorrect password for {filename}")
    except FileNotDecryptedError:
        print(f"Error: {filename} is encrypted and needs a password")
    except PdfReadError as e:
        # Must come after its more specific subclasses above
        print(f"Error reading {filename}: {e}")
    except DependencyError as e:
        print(f"Missing dependency: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

    return None


# Usage
result = safe_pdf_operation("document.pdf")
if result is not None:
    with open("processed.pdf", "wb") as output_file:
        result.write(output_file)
```

### Working with Paper Sizes

```python
from PyPDF2 import PdfWriter
from PyPDF2.papersizes import PaperSize

writer = PdfWriter()

# Create pages with different standard sizes
sizes_to_create = [
    ("A4", PaperSize.A4),
    ("A3", PaperSize.A3),
    ("A5", PaperSize.A5),
    ("C4 Envelope", PaperSize.C4),
]

for name, size in sizes_to_create:
    # The returned page object is not needed here; the writer keeps the page
    writer.add_blank_page(size.width, size.height)
    print(f"{name}: {size.width} x {size.height} points")
    print(f" {size.width_inches:.1f}\" x {size.height_inches:.1f}\"")
    print(f" {size.width_mm:.0f}mm x {size.height_mm:.0f}mm")

with open("standard_sizes.pdf", "wb") as output_file:
    writer.write(output_file)
```

### Custom Paper Size Calculations

```python
from PyPDF2.papersizes import Dimensions

# Create custom paper sizes (72 points = 1 inch)
us_letter = Dimensions(612, 792)   # 8.5" x 11"
us_legal = Dimensions(612, 1008)   # 8.5" x 14"
tabloid = Dimensions(792, 1224)    # 11" x 17"

custom_sizes = [
    ("US Letter", us_letter),
    ("US Legal", us_legal),
    ("Tabloid", tabloid),
]

for name, size in custom_sizes:
    print(f"{name}:")
    print(f" Points: {size.width} x {size.height}")
    print(f" Inches: {size.width_inches:.1f}\" x {size.height_inches:.1f}\"")
    print(f" mm: {size.width_mm:.0f} x {size.height_mm:.0f}")
```

### Filter Usage (Advanced)

```python
from PyPDF2.filters import FlateDecode
import zlib

# Example of manual filter usage (rarely needed)
original_data = b"Hello, World! This is test data for compression."

# Compress data
compressed = FlateDecode.encode(original_data)
print(f"Original size: {len(original_data)} bytes")
print(f"Compressed size: {len(compressed)} bytes")
print(f"Compression ratio: {len(compressed)/len(original_data):.2%}")

# Decompress data and confirm the round trip is lossless
decompressed = FlateDecode.decode(compressed)
print(f"Decompressed: {decompressed.decode('utf-8')}")
print(f"Data integrity: {original_data == decompressed}")
```

### Version Checking

```python
from PyPDF2 import __version__
from packaging import version

print(f"PyPDF2 version: {__version__}")

# Check if version meets requirements.
# version.parse compares release components numerically, so "2.9.0" < "2.10.0"
# (a plain string comparison would get this wrong).
required_version = "2.10.0"
if version.parse(__version__) >= version.parse(required_version):
    print(f"PyPDF2 version {__version__} meets requirement >= {required_version}")
else:
    print(f"PyPDF2 version {__version__} is below requirement >= {required_version}")
    print("Consider upgrading with: pip install --upgrade PyPDF2")
```

### Parsing Command Line Arguments

```python
from PyPDF2 import parse_filename_page_ranges, PdfMerger
import sys


def merge_from_args(args):
    """Merge PDFs based on command line arguments.

    Args:
        args: Filenames, each optionally followed by a page range, e.g.
            ["file1.pdf", "1:5", "file2.pdf", "::2", "file3.pdf"].

    The result is written to "merged_output.pdf".
    """
    # Parse arguments like: ["file1.pdf", "1:5", "file2.pdf", "::2", "file3.pdf"]
    file_ranges = parse_filename_page_ranges(args)

    merger = PdfMerger()
    try:
        for filename, page_range in file_ranges:
            print(f"Adding {filename} with pages {page_range}")
            merger.append(filename, pages=page_range)

        merger.write("merged_output.pdf")
    finally:
        # Release the merger's resources even if an input fails to load or
        # the output cannot be written
        merger.close()
    print("Merge completed: merged_output.pdf")


# Example usage
if __name__ == "__main__":
    if len(sys.argv) > 1:
        merge_from_args(sys.argv[1:])
    else:
        print("Usage: python script.py file1.pdf 1:5 file2.pdf ::2 file3.pdf")
```

### XMP Metadata Access

```python
from PyPDF2 import PdfReader

reader = PdfReader("document.pdf")

# Check for XMP metadata
xmp = reader.xmp_metadata
if xmp:
    print("XMP metadata found:")

    # XMP access depends on the specific XMP schema and content
    # Common patterns:
    try:
        print(f"Dublin Core title: {xmp.dc_title}")
        print(f"Dublin Core creator: {xmp.dc_creator}")
        print(f"Dublin Core subject: {xmp.dc_subject}")
    except AttributeError:
        print("Standard Dublin Core fields not available")

    # Raw XMP data
    print("Raw XMP metadata available for custom parsing")
else:
    print("No XMP metadata found")

# Standard metadata is exposed through reader.metadata; check it before use,
# since it can be empty when the document carries no such information
metadata = reader.metadata
if metadata:
    print(f"Standard metadata title: {metadata.title}")
```

## Error Recovery Strategies

### Handling Corrupted PDFs

```python
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.errors import PdfReadError, PdfStreamError
import warnings


def repair_pdf_attempt(filename):
    """Attempt to repair/recover a corrupted PDF.

    Args:
        filename: Path of the PDF to open.

    Returns:
        A (result, status) tuple. result is a PdfReader when the whole
        document is usable, a PdfWriter holding only the readable pages when
        some had to be dropped, or None when nothing could be recovered.
        status is a human-readable summary.
    """
    try:
        # Try strict mode first
        return PdfReader(filename, strict=True), "No repair needed"
    except PdfReadError:
        pass  # Fall through to the lenient attempt below

    try:
        # Try non-strict mode for recovery
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            reader = PdfReader(filename, strict=False)
    except PdfReadError as e:
        # Re-opening with the same arguments would only fail the same way,
        # so there is nothing left to salvage
        return None, f"Recovery failed: {e}"

    # The file opened in non-strict mode: try to extract what we can
    try:
        writer = PdfWriter()
        pages_recovered = 0
        pages_skipped = 0
        for i, page in enumerate(reader.pages):
            try:
                # Test if page is readable
                page.extract_text()
                writer.add_page(page)
                pages_recovered += 1
            except Exception:
                print(f"Skipping corrupted page {i+1}")
                pages_skipped += 1
    except Exception as e:
        return None, f"Recovery failed: {e}"

    if pages_skipped == 0:
        # Every page is readable, so the reader itself is usable as-is
        return reader, "Recovered in non-strict mode"
    return writer, f"Partially recovered {pages_recovered} pages"


# Usage
recovered, status = repair_pdf_attempt("corrupted.pdf")
print(f"Recovery status: {status}")

if recovered is not None:
    if isinstance(recovered, PdfWriter):  # It's a writer
        with open("repaired.pdf", "wb") as output_file:
            recovered.write(output_file)
    else:  # It's a reader
        print(f"Successfully opened PDF with {len(recovered.pages)} pages")
```