or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

errors-and-utilities.md index.md page-manipulation.md pdf-merging.md pdf-reading.md pdf-writing.md types-and-objects.md

docs/errors-and-utilities.md

0

# Errors and Utilities

1

2

Exception classes for comprehensive error handling, utility functions for specialized operations, and helper classes that support PyPDF2's core functionality.

3

4

## Capabilities

5

6

### Exception Classes

7

8

Comprehensive exception hierarchy for handling various PDF processing errors.

9

10

```python { .api }

11

class PyPdfError(Exception):

12

"""Base exception class for all PyPDF2 errors."""

13

14

class PdfReadError(PyPdfError):

15

"""Raised when there's an error reading a PDF file."""

16

17

class PdfStreamError(PdfReadError):

18

"""Raised when there's an error processing PDF streams."""

19

20

class PageSizeNotDefinedError(PyPdfError):

21

"""Raised when page size cannot be determined."""

22

23

class ParseError(Exception):

24

"""Raised when there's an error parsing PDF content."""

25

26

class FileNotDecryptedError(PdfReadError):

27

"""Raised when attempting to access encrypted content without decryption."""

28

29

class WrongPasswordError(PdfReadError):

30

"""Raised when an incorrect password is provided for an encrypted PDF."""

31

32

class EmptyFileError(PdfReadError):

33

"""Raised when attempting to read an empty or corrupt PDF file."""

34

35

class DependencyError(Exception):

36

"""Raised when a required dependency is missing."""

37

```

38

39

### Warning Classes

40

41

Warning classes for non-fatal issues during PDF processing.

42

43

```python { .api }

44

class PdfReadWarning(UserWarning):

45

"""Warning issued during PDF reading for recoverable issues."""

46

```

47

48

### Paper Size Utilities

49

50

Standard paper size definitions and utilities.

51

52

```python { .api }

53

class PaperSize:

54

"""Standard paper size constants with dimensions in points."""

55

56

# ISO A-series paper sizes

57

A0: 'Dimensions' # 2384 x 3371 points (33.1" x 46.8")

58

A1: 'Dimensions' # 1685 x 2384 points (23.4" x 33.1")

59

A2: 'Dimensions' # 1190 x 1685 points (16.5" x 23.4")

60

A3: 'Dimensions' # 842 x 1190 points (11.7" x 16.5")

61

A4: 'Dimensions' # 595 x 842 points (8.3" x 11.7")

62

A5: 'Dimensions' # 420 x 595 points (5.8" x 8.3")

63

A6: 'Dimensions' # 298 x 420 points (4.1" x 5.8")

64

A7: 'Dimensions' # 210 x 298 points (2.9" x 4.1")

65

A8: 'Dimensions' # 147 x 210 points (2.0" x 2.9")

66

67

# Envelope sizes

68

C4: 'Dimensions' # 649 x 918 points (9.0" x 12.8")

69

70

class Dimensions:

71

"""Represents paper dimensions in points."""

72

73

def __init__(self, width: float, height: float):

74

"""

75

Initialize dimensions.

76

77

Args:

78

width (float): Width in points (72 points = 1 inch)

79

height (float): Height in points (72 points = 1 inch)

80

"""

81

self.width = width

82

self.height = height

83

84

@property

85

def width_inches(self) -> float:

86

"""Width in inches."""

87

return self.width / 72.0

88

89

@property

90

def height_inches(self) -> float:

91

"""Height in inches."""

92

return self.height / 72.0

93

94

@property

95

def width_mm(self) -> float:

96

"""Width in millimeters."""

97

return self.width / 72.0 * 25.4

98

99

@property

100

def height_mm(self) -> float:

101

"""Height in millimeters."""

102

return self.height / 72.0 * 25.4

103

```

104

105

### PDF Filters

106

107

Compression and encoding filters for PDF content streams.

108

109

```python { .api }

110

class FlateDecode:

111

"""Flate/ZIP compression filter (most common)."""

112

113

@staticmethod

114

def decode(data: bytes, decode_parms: dict = None) -> bytes:

115

"""

116

Decode Flate-compressed data.

117

118

Args:

119

data (bytes): Compressed data

120

decode_parms (dict, optional): Decode parameters

121

122

Returns:

123

bytes: Decompressed data

124

"""

125

126

@staticmethod

127

def encode(data: bytes) -> bytes:

128

"""

129

Encode data with Flate compression.

130

131

Args:

132

data (bytes): Data to compress

133

134

Returns:

135

bytes: Compressed data

136

"""

137

138

class ASCIIHexDecode:

139

"""ASCII hexadecimal encoding filter."""

140

141

@staticmethod

142

def decode(data: bytes, decode_parms: dict = None) -> bytes:

143

"""

144

Decode ASCII hex encoded data.

145

146

Args:

147

data (bytes): Hex-encoded data

148

decode_parms (dict, optional): Decode parameters

149

150

Returns:

151

bytes: Decoded data

152

"""

153

154

class LZWDecode:

155

"""LZW compression filter."""

156

157

@staticmethod

158

def decode(data: bytes, decode_parms: dict = None) -> bytes:

159

"""

160

Decode LZW compressed data.

161

162

Args:

163

data (bytes): LZW compressed data

164

decode_parms (dict, optional): Decode parameters

165

166

Returns:

167

bytes: Decompressed data

168

"""

169

170

class DCTDecode:

171

"""JPEG compression filter."""

172

173

@staticmethod

174

def decode(data: bytes, decode_parms: dict = None) -> bytes:

175

"""

176

Decode JPEG compressed data.

177

178

Args:

179

data (bytes): JPEG data

180

decode_parms (dict, optional): Decode parameters

181

182

Returns:

183

bytes: Image data

184

"""

185

186

class JPXDecode:

187

"""JPEG 2000 compression filter."""

188

189

@staticmethod

190

def decode(data: bytes, decode_parms: dict = None) -> bytes:

191

"""

192

Decode JPEG 2000 compressed data.

193

194

Args:

195

data (bytes): JPEG 2000 data

196

decode_parms (dict, optional): Decode parameters

197

198

Returns:

199

bytes: Image data

200

"""

201

202

class CCITTFaxDecode:

203

"""CCITT fax compression filter."""

204

205

@staticmethod

206

def decode(data: bytes, decode_parms: dict = None) -> bytes:

207

"""

208

Decode CCITT fax compressed data.

209

210

Args:

211

data (bytes): CCITT compressed data

212

decode_parms (dict, optional): Decode parameters with Width, Height, etc.

213

214

Returns:

215

bytes: Decompressed image data

216

"""

217

```

218

219

### XMP Metadata Support

220

221

Extended metadata support for documents that include XMP information.

222

223

```python { .api }

224

class XmpInformation:

225

"""Handler for XMP (Extensible Metadata Platform) information."""

226

227

def __init__(self, stream):

228

"""

229

Initialize XMP information from stream.

230

231

Args:

232

stream: XMP metadata stream

233

"""

234

235

# Methods for accessing XMP metadata

236

# Implementation varies based on XMP schema and content

237

# Provides access to Dublin Core, PDF, and custom metadata

238

```

239

240

### Version Information

241

242

```python { .api }

243

__version__: str # Current PyPDF2 version string, e.g. "2.12.1"

244

```

245

246

### Utility Functions

247

248

General utility functions used throughout the library.

249

250

```python { .api }

251

def parse_filename_page_ranges(args: List[Union[str, PageRange, None]]) -> List[Tuple[str, PageRange]]:

252

"""

253

Parse command-line style filename and page range arguments.

254

255

Args:

256

args: Arguments to parse (e.g., ["file1.pdf", "1:5", "file2.pdf", "::2"])

257

258

Returns:

259

list: List of (filename, page_range) tuples

260

"""

261

```

262

263

## Usage Examples

264

265

### Error Handling

266

267

```python

268

from PyPDF2 import PdfReader, PdfWriter

269

from PyPDF2.errors import (

270

PdfReadError, WrongPasswordError, FileNotDecryptedError,

271

EmptyFileError, DependencyError

272

)

273

274

def safe_pdf_operation(filename):

275

try:

276

reader = PdfReader(filename)

277

278

if reader.is_encrypted:

279

# Try to decrypt

280

reader.decrypt("password")

281

282

# Perform operations

283

writer = PdfWriter()

284

for page in reader.pages:

285

writer.add_page(page)

286

287

return writer

288

289

except EmptyFileError:

290

print(f"Error: {filename} is empty or corrupted")

291

except WrongPasswordError:

292

print(f"Error: Incorrect password for {filename}")

293

except FileNotDecryptedError:

294

print(f"Error: {filename} is encrypted and needs a password")

295

except PdfReadError as e:

296

print(f"Error reading {filename}: {e}")

297

except DependencyError as e:

298

print(f"Missing dependency: {e}")

299

except Exception as e:

300

print(f"Unexpected error: {e}")

301

302

return None

303

304

# Usage

305

result = safe_pdf_operation("document.pdf")

306

if result:

307

with open("processed.pdf", "wb") as output_file:

308

result.write(output_file)

309

```

310

311

### Working with Paper Sizes

312

313

```python

314

from PyPDF2 import PdfWriter

315

from PyPDF2.papersizes import PaperSize

316

317

writer = PdfWriter()

318

319

# Create pages with different standard sizes

320

sizes_to_create = [

321

("A4", PaperSize.A4),

322

("A3", PaperSize.A3),

323

("A5", PaperSize.A5),

324

("C4 Envelope", PaperSize.C4)

325

]

326

327

for name, size in sizes_to_create:

328

page = writer.add_blank_page(size.width, size.height)

329

print(f"{name}: {size.width} x {size.height} points")

330

print(f" {size.width_inches:.1f}\" x {size.height_inches:.1f}\"")

331

print(f" {size.width_mm:.0f}mm x {size.height_mm:.0f}mm")

332

333

with open("standard_sizes.pdf", "wb") as output_file:

334

writer.write(output_file)

335

```

336

337

### Custom Paper Size Calculations

338

339

```python

340

from PyPDF2.papersizes import Dimensions

341

342

# Create custom paper sizes

343

us_letter = Dimensions(612, 792) # 8.5" x 11"

344

us_legal = Dimensions(612, 1008) # 8.5" x 14"

345

tabloid = Dimensions(792, 1224) # 11" x 17"

346

347

custom_sizes = [

348

("US Letter", us_letter),

349

("US Legal", us_legal),

350

("Tabloid", tabloid)

351

]

352

353

for name, size in custom_sizes:

354

print(f"{name}:")

355

print(f" Points: {size.width} x {size.height}")

356

print(f" Inches: {size.width_inches:.1f}\" x {size.height_inches:.1f}\"")

357

print(f" mm: {size.width_mm:.0f} x {size.height_mm:.0f}")

358

```

359

360

### Filter Usage (Advanced)

361

362

```python

363

from PyPDF2.filters import FlateDecode

364

import zlib

365

366

# Example of manual filter usage (rarely needed)

367

original_data = b"Hello, World! This is test data for compression."

368

369

# Compress data

370

compressed = FlateDecode.encode(original_data)

371

print(f"Original size: {len(original_data)} bytes")

372

print(f"Compressed size: {len(compressed)} bytes")

373

print(f"Compression ratio: {len(compressed)/len(original_data):.2%}")

374

375

# Decompress data

376

decompressed = FlateDecode.decode(compressed)

377

print(f"Decompressed: {decompressed.decode('utf-8')}")

378

print(f"Data integrity: {original_data == decompressed}")

379

```

380

381

### Version Checking

382

383

```python

384

from PyPDF2 import __version__

385

from packaging import version

386

387

print(f"PyPDF2 version: {__version__}")

388

389

# Check if version meets requirements

390

required_version = "2.10.0"

391

if version.parse(__version__) >= version.parse(required_version):

392

print(f"PyPDF2 version {__version__} meets requirement >= {required_version}")

393

else:

394

print(f"PyPDF2 version {__version__} is below requirement >= {required_version}")

395

print("Consider upgrading with: pip install --upgrade PyPDF2")

396

```

397

398

### Parsing Command Line Arguments

399

400

```python

401

from PyPDF2 import parse_filename_page_ranges, PdfMerger

402

import sys

403

404

def merge_from_args(args):

405

"""Merge PDFs based on command line arguments."""

406

# Parse arguments like: ["file1.pdf", "1:5", "file2.pdf", "::2", "file3.pdf"]

407

file_ranges = parse_filename_page_ranges(args)

408

409

merger = PdfMerger()

410

411

for filename, page_range in file_ranges:

412

print(f"Adding {filename} with pages {page_range}")

413

merger.append(filename, pages=page_range)

414

415

merger.write("merged_output.pdf")

416

merger.close()

417

print("Merge completed: merged_output.pdf")

418

419

# Example usage

420

if __name__ == "__main__":

421

if len(sys.argv) > 1:

422

merge_from_args(sys.argv[1:])

423

else:

424

print("Usage: python script.py file1.pdf 1:5 file2.pdf ::2 file3.pdf")

425

```

426

427

### XMP Metadata Access

428

429

```python

430

from PyPDF2 import PdfReader

431

432

reader = PdfReader("document.pdf")

433

434

# Check for XMP metadata

435

if reader.xmp_metadata:

436

xmp = reader.xmp_metadata

437

print("XMP metadata found:")

438

439

# XMP access depends on the specific XMP schema and content

440

# Common patterns:

441

try:

442

print(f"Dublin Core title: {xmp.dc_title}")

443

print(f"Dublin Core creator: {xmp.dc_creator}")

444

print(f"Dublin Core subject: {xmp.dc_subject}")

445

except AttributeError:

446

print("Standard Dublin Core fields not available")

447

448

# Raw XMP data

449

print("Raw XMP metadata available for custom parsing")

450

else:

451

print("No XMP metadata found")

452

453

# Standard (document information) metadata is exposed through reader.metadata; it may be missing, hence the check

454

if reader.metadata:

455

print(f"Standard metadata title: {reader.metadata.title}")

456

```

457

458

## Error Recovery Strategies

459

460

### Handling Corrupted PDFs

461

462

```python

463

from PyPDF2 import PdfReader, PdfWriter

464

from PyPDF2.errors import PdfReadError, PdfStreamError

465

import warnings

466

467

def repair_pdf_attempt(filename):

468

"""Attempt to repair/recover a corrupted PDF."""

469

try:

470

# Try strict mode first

471

reader = PdfReader(filename, strict=True)

472

return reader, "No repair needed"

473

474

except PdfReadError:

475

try:

476

# Try non-strict mode for recovery

477

with warnings.catch_warnings():

478

warnings.simplefilter("ignore")

479

reader = PdfReader(filename, strict=False)

480

return reader, "Recovered in non-strict mode"

481

482

except PdfReadError:

483

# Try to extract what we can

484

try:

485

reader = PdfReader(filename, strict=False)

486

writer = PdfWriter()

487

488

pages_recovered = 0

489

for i, page in enumerate(reader.pages):

490

try:

491

# Test if page is readable

492

_ = page.extract_text()

493

writer.add_page(page)

494

pages_recovered += 1

495

except Exception:

496

print(f"Skipping corrupted page {i+1}")

497

continue

498

499

return writer, f"Partially recovered {pages_recovered} pages"

500

501

except Exception as e:

502

return None, f"Recovery failed: {e}"

503

504

# Usage

505

pdf_reader, status = repair_pdf_attempt("corrupted.pdf")

506

print(f"Recovery status: {status}")

507

508

if pdf_reader:

509

if hasattr(pdf_reader, 'write'): # It's a writer

510

with open("repaired.pdf", "wb") as output_file:

511

pdf_reader.write(output_file)

512

else: # It's a reader

513

print(f"Successfully opened PDF with {len(pdf_reader.pages)} pages")

514

```