or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

annotations.md · form-fields.md · index.md · metadata.md · page-operations.md · reading-writing.md · text-extraction.md · utilities.md

docs/utilities.md

0

# Utilities

1

2

Supporting utilities including page ranges, standard paper sizes, constants, error handling, and type definitions for enhanced developer experience. These utilities provide convenient functionality for common PDF operations.

3

4

## Capabilities

5

6

### Page Ranges

7

8

The PageRange class provides flexible page selection and range specification for PDF operations.

9

10

```python { .api }

11

class PageRange:

12

def __init__(self, arg):

13

"""

14

Initialize a page range from various input formats.

15

16

Args:

17

arg: Range specification - can be:

18

- slice object (e.g., slice(0, 10, 2))

19

- PageRange object (copy constructor)

20

- string in slice syntax (e.g., ":", "0:5", "5:", "::2", "-1")

21

- integer (single page)

22

"""

23

24

@staticmethod

25

def valid(input) -> bool:

26

"""

27

Check if input is a valid page range specification.

28

29

Args:

30

input: Input to validate

31

32

Returns:

33

True if input is valid for PageRange

34

"""

35

36

def to_slice(self) -> slice:

37

"""

38

Convert page range to a slice object.

39

40

Returns:

41

Equivalent slice object

42

"""

43

44

def indices(self, n: int) -> tuple[int, int, int]:

45

"""

46

Get slice indices for a given length.

47

48

Args:

49

n: Total number of items

50

51

Returns:

52

Tuple of (start, stop, step) indices

53

"""

54

55

def __str__(self) -> str:

56

"""String representation of the page range."""

57

58

def __repr__(self) -> str:

59

"""Developer representation of the page range."""

60

61

def __eq__(self, other) -> bool:

62

"""Check equality with another PageRange."""

63

64

def __hash__(self) -> int:

65

"""Hash function for use in sets and dictionaries."""

66

67

def __add__(self, other):

68

"""Add two page ranges together."""

69

```

70

71

### Page Range Parsing

72

73

Utility function for parsing filename and page range combinations.

74

75

```python { .api }

76

def parse_filename_page_ranges(fnprs: list[str]) -> tuple[list[str], list[PageRange]]:

77

"""

78

Parse filename and page range strings.

79

80

Args:

81

fnprs: List of strings in format "filename[pages]" or just "filename"

82

Examples: ["doc.pdf[1-5]", "other.pdf", "file.pdf[2,4,6-8]"]

83

84

Returns:

85

Tuple of (filenames, page_ranges):

86

- filenames: List of extracted filenames

87

- page_ranges: List of corresponding PageRange objects

88

"""

89

```

90

91

### Paper Sizes

92

93

Standard paper size definitions for creating properly sized documents.

94

95

```python { .api }

96

class PaperSize:

97

"""Standard paper size definitions in points (72 points = 1 inch)."""

98

99

# ISO A series (most common internationally)

100

A0: tuple[float, float] = (2384, 3370) # 841 × 1189 mm

101

A1: tuple[float, float] = (1684, 2384) # 594 × 841 mm

102

A2: tuple[float, float] = (1191, 1684) # 420 × 594 mm

103

A3: tuple[float, float] = (842, 1191) # 297 × 420 mm

104

A4: tuple[float, float] = (595, 842) # 210 × 297 mm

105

A5: tuple[float, float] = (420, 595) # 148 × 210 mm

106

A6: tuple[float, float] = (298, 420) # 105 × 148 mm

107

A7: tuple[float, float] = (210, 298) # 74 × 105 mm

108

A8: tuple[float, float] = (147, 210) # 52 × 74 mm

109

110

# Envelope sizes

111

C4: tuple[float, float] = (649, 918) # 229 × 324 mm envelope

112

```

113

114

### Constants and Enums

115

116

PDF-specific constants, enums, and flags for various operations.

117

118

```python { .api }

119

from enum import IntEnum, IntFlag

120

121

class PasswordType(IntEnum):

122

"""Types of PDF passwords."""

123

NOT_DECRYPTED = 0

124

USER_PASSWORD = 1

125

OWNER_PASSWORD = 2

126

127

class ImageType(IntFlag):

128

"""Types of images that can be extracted or processed."""

129

NONE = 0

130

XOBJECT_IMAGES = 1 # Form XObject images

131

INLINE_IMAGES = 2 # Inline images in content streams

132

DRAWING_IMAGES = 4 # Images created by drawing operations

133

IMAGES = XOBJECT_IMAGES | INLINE_IMAGES # Standard image types

134

ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES # All image types

135

136

class ObjectDeletionFlag(IntFlag):

137

"""Flags for controlling object deletion in PDFs."""

138

NONE = 0

139

TEXT = 1 # Text objects

140

LINKS = 2 # Link annotations

141

ATTACHMENTS = 4 # File attachments

142

OBJECTS_3D = 8 # 3D objects

143

ALL_ANNOTATIONS = 16 # All annotation types

144

XOBJECT_IMAGES = 32 # Form XObject images

145

INLINE_IMAGES = 64 # Inline images

146

DRAWING_IMAGES = 128 # Drawing-based images

147

IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES # All images

148

```

149

150

### Error Handling

151

152

Comprehensive exception hierarchy for proper error handling in PDF operations.

153

154

```python { .api }

155

class PyPdfError(Exception):

156

"""Base exception for all pypdf errors."""

157

158

class DeprecationError(Exception):

159

"""Raised when deprecated functionality is used."""

160

161

class DependencyError(Exception):

162

"""Raised when required dependencies are missing."""

163

164

class PdfReadError(PyPdfError):

165

"""Raised when PDF reading fails."""

166

167

class PdfStreamError(PdfReadError):

168

"""Raised when PDF stream processing fails."""

169

170

class FileNotDecryptedError(PdfReadError):

171

"""Raised when trying to access encrypted content without decryption."""

172

173

class WrongPasswordError(FileNotDecryptedError):

174

"""Raised when incorrect password is provided for encrypted PDF."""

175

176

class EmptyFileError(PdfReadError):

177

"""Raised when PDF file is empty or invalid."""

178

179

class ParseError(PyPdfError):

180

"""Raised when PDF parsing fails."""

181

182

class PageSizeNotDefinedError(PyPdfError):

183

"""Raised when page size cannot be determined."""

184

185

class EmptyImageDataError(PyPdfError):

186

"""Raised when image data is empty or invalid."""

187

188

class LimitReachedError(PyPdfError):

189

"""Raised when processing limits are exceeded."""

190

191

class PdfReadWarning(UserWarning):

192

"""Warning for non-fatal PDF reading issues."""

193

```

194

195

## Usage Examples

196

197

### Working with Page Ranges

198

199

```python

200

from pypdf import PdfReader, PdfWriter, PageRange

201

202

reader = PdfReader("document.pdf")

203

writer = PdfWriter()

204

205

# Create page ranges in different ways

206

range1 = PageRange("0:5") # First five pages (indices 0 through 4)

207

range2 = PageRange("1:6:2") # Pages 2, 4, and 6 (indices 1, 3, 5)

208

range3 = PageRange(slice(0, 10, 2)) # Every other page: indices 0, 2, 4, 6, 8

209

210

# Use page range to select pages

211

for page_num in range(len(reader.pages)):

212

if page_num in range(*range1.indices(len(reader.pages))):

213

writer.add_page(reader.pages[page_num])

214

215

with open("selected_pages.pdf", "wb") as output:

216

writer.write(output)

217

```

218

219

### Page Range Validation and Conversion

220

221

```python

222

from pypdf import PageRange

223

224

# Validate page range inputs

225

inputs = ["1-10", "2,4,6", "invalid", slice(0, 5)]

226

227

for inp in inputs:

228

if PageRange.valid(inp):

229

pr = PageRange(inp)

230

print(f"Valid range: {inp} -> {pr}")

231

print(f" As slice: {pr.to_slice()}")

232

print(f" Indices for 20 pages: {pr.indices(20)}")

233

else:

234

print(f"Invalid range: {inp}")

235

```

236

237

### Parsing Filename and Page Ranges

238

239

```python

240

from pypdf import parse_filename_page_ranges

241

242

# Parse combined filename and page specifications

243

file_specs = [

244

"document.pdf[1-10]",

245

"report.pdf[2,4,6-8]",

246

"book.pdf", # No page range specified

247

"chapter1.pdf[5-]" # From page 5 to end

248

]

249

250

filenames, page_ranges = parse_filename_page_ranges(file_specs)

251

252

for filename, page_range in zip(filenames, page_ranges):

253

print(f"File: {filename}")

254

if page_range:

255

print(f" Pages: {page_range}")

256

else:

257

print(f" Pages: All")

258

```

259

260

### Using Standard Paper Sizes

261

262

```python

263

from pypdf import PdfWriter, PageObject, PaperSize

264

265

writer = PdfWriter()

266

267

# Create pages with standard sizes

268

sizes_to_create = [

269

("Letter", (612, 792)), # US Letter

270

("A4", PaperSize.A4), # ISO A4

271

("A3", PaperSize.A3), # ISO A3

272

("Legal", (612, 1008)) # US Legal

273

]

274

275

for name, (width, height) in sizes_to_create:

276

page = PageObject.create_blank_page(width, height)

277

writer.add_page(page)

278

print(f"Created {name} page: {width} x {height} points")

279

280

with open("standard_sizes.pdf", "wb") as output:

281

writer.write(output)

282

```

283

284

### Error Handling Best Practices

285

286

```python

287

from pypdf import PdfReader, PdfWriter, PageObject

288

from pypdf.errors import (

289

PdfReadError, FileNotDecryptedError, WrongPasswordError,

290

EmptyFileError, ParseError

291

)

292

293

def safe_pdf_operation(pdf_path: str, password: str = None):

294

"""Safely perform PDF operations with comprehensive error handling."""

295

296

try:

297

reader = PdfReader(pdf_path, password=password)

298

299

if reader.is_encrypted and not password:

300

raise FileNotDecryptedError("PDF is encrypted but no password provided")

301

302

writer = PdfWriter()

303

304

# Process each page safely

305

for page_num, page in enumerate(reader.pages):

306

try:

307

# Attempt to extract text to verify page is readable

308

text = page.extract_text()

309

writer.add_page(page)

310

print(f"Processed page {page_num + 1}: {len(text)} characters")

311

312

except ParseError as e:

313

print(f"Warning: Could not process page {page_num + 1}: {e}")

314

# Skip problematic page or add blank page

315

blank_page = PageObject.create_blank_page(612, 792)

316

writer.add_page(blank_page)

317

318

# Save result

319

output_path = pdf_path.replace('.pdf', '_processed.pdf')

320

with open(output_path, "wb") as output:

321

writer.write(output)

322

323

print(f"Successfully processed {pdf_path}")

324

return True

325

326

except WrongPasswordError:  # must precede FileNotDecryptedError, its base class

327

print(f"Error: Incorrect password for {pdf_path}")

328

return False

329

330

except FileNotDecryptedError:

331

print(f"Error: {pdf_path} is encrypted. Please provide password.")

332

return False

333

334

except EmptyFileError:

335

print(f"Error: {pdf_path} is empty or corrupted")

336

return False

337

338

except PdfReadError as e:

339

print(f"Error reading {pdf_path}: {e}")

340

return False

341

342

except Exception as e:

343

print(f"Unexpected error processing {pdf_path}: {e}")

344

return False

345

346

# Use the safe operation

347

success = safe_pdf_operation("document.pdf")

348

if not success:

349

success = safe_pdf_operation("document.pdf", password="secret")

350

```

351

352

### Working with Image Types

353

354

```python

355

from pypdf import PdfReader, ImageType

356

357

reader = PdfReader("document_with_images.pdf")

358

359

for page_num, page in enumerate(reader.pages):

360

print(f"Page {page_num + 1}:")

361

362

# Extract different types of images

363

try:

364

# All images

365

all_images = page.images

366

print(f" Total images: {len(all_images)}")

367

368

# You can specify image types when working with image extraction

369

# (This would be used in specific image extraction methods)

370

print(f" Image types available: {list(ImageType)}")

371

372

except Exception as e:

373

print(f" Error accessing images: {e}")

374

```

375

376

### Utility Functions for Common Operations

377

378

```python

379

from pypdf import PdfReader, PdfWriter, PageObject, PageRange, PaperSize

380

from pypdf.errors import PyPdfError

381

382

def extract_page_range(input_pdf: str, output_pdf: str, page_range_str: str):

383

"""Extract specific pages to new PDF."""

384

try:

385

reader = PdfReader(input_pdf)

386

writer = PdfWriter()

387

388

# Parse page range

389

page_range = PageRange(page_range_str)

390

start, stop, step = page_range.indices(len(reader.pages))

391

392

# Extract pages

393

for i in range(start, stop, step):

394

if i < len(reader.pages):

395

writer.add_page(reader.pages[i])

396

397

with open(output_pdf, "wb") as output:

398

writer.write(output)

399

400

print(f"Extracted pages {page_range_str} to {output_pdf}")

401

402

except PyPdfError as e:

403

print(f"PDF Error: {e}")

404

except Exception as e:

405

print(f"Error: {e}")

406

407

def create_blank_document(output_pdf: str, page_count: int = 1, size: str = "A4"):

408

"""Create a blank PDF document."""

409

writer = PdfWriter()

410

411

# Get paper size

412

if hasattr(PaperSize, size):

413

width, height = getattr(PaperSize, size)

414

else:

415

# Default to A4 if size not found

416

width, height = PaperSize.A4

417

print(f"Unknown size '{size}', using A4")

418

419

# Create blank pages

420

for _ in range(page_count):

421

page = PageObject.create_blank_page(width, height)

422

writer.add_page(page)

423

424

with open(output_pdf, "wb") as output:

425

writer.write(output)

426

427

print(f"Created {page_count} blank {size} pages in {output_pdf}")

428

429

def get_pdf_info(pdf_path: str) -> dict:

430

"""Get comprehensive PDF information."""

431

try:

432

reader = PdfReader(pdf_path)

433

434

info = {

435

"filename": pdf_path,

436

"page_count": len(reader.pages),

437

"is_encrypted": reader.is_encrypted,

438

"pdf_version": reader.pdf_header,

439

"metadata": {},

440

"page_sizes": []

441

}

442

443

# Get metadata

444

if reader.metadata:

445

info["metadata"] = {

446

"title": reader.metadata.title,

447

"author": reader.metadata.author,

448

"subject": reader.metadata.subject,

449

"creator": reader.metadata.creator,

450

"producer": reader.metadata.producer

451

}

452

453

# Get page sizes

454

for i, page in enumerate(reader.pages):

455

try:

456

width = float(page.mediabox.width)

457

height = float(page.mediabox.height)

458

info["page_sizes"].append({

459

"page": i + 1,

460

"width": width,

461

"height": height,

462

"size_points": f"{width} x {height}"

463

})

464

except Exception:

465

info["page_sizes"].append({

466

"page": i + 1,

467

"error": "Could not determine size"

468

})

469

470

return info

471

472

except Exception as e:

473

return {

474

"filename": pdf_path,

475

"error": str(e)

476

}

477

478

# Use utility functions

479

extract_page_range("document.pdf", "pages_1_to_5.pdf", "0:5")

480

create_blank_document("blank.pdf", 10, "A4")

481

info = get_pdf_info("document.pdf")

482

print(f"PDF Info: {info}")

483

```

484

485

## Error Classes and Exception Handling

486

487

### Exception Hierarchy

488

489

pypdf provides a comprehensive exception hierarchy for different types of PDF processing errors.

490

491

```python { .api }

492

# Base exception classes

493

class PyPdfError(Exception):

494

"""Base class for all exceptions raised by pypdf."""

495

496

class PdfReadError(PyPdfError):

497

"""Raised when there is an issue reading a PDF file."""

498

499

class PdfStreamError(PdfReadError):

500

"""Raised when there is an issue reading the stream of data in a PDF file."""

501

502

class ParseError(PyPdfError):

503

"""Raised when there is an issue parsing a PDF file."""

504

505

# File access and decryption errors

506

class FileNotDecryptedError(PdfReadError):

507

"""Raised when an encrypted PDF has not been successfully decrypted."""

508

509

class WrongPasswordError(FileNotDecryptedError):

510

"""Raised when the wrong password is used to decrypt an encrypted PDF."""

511

512

class EmptyFileError(PdfReadError):

513

"""Raised when a PDF file is empty or has no content."""

514

515

# Specific operation errors

516

class PageSizeNotDefinedError(PyPdfError):

517

"""Raised when the page size of a PDF document is not defined."""

518

519

class EmptyImageDataError(PyPdfError):

520

"""Raised when trying to process an image that has no data."""

521

522

class LimitReachedError(PyPdfError):

523

"""Raised when a limit is reached."""

524

525

# Dependency and deprecation errors

526

class DependencyError(Exception):

527

"""Raised when a required dependency is not available."""

528

529

class DeprecationError(Exception):

530

"""Raised when a deprecated feature is used."""

531

532

# Warnings

533

class PdfReadWarning(UserWarning):

534

"""Issued when there is a potential issue reading a PDF file, but it can still be read."""

535

```

536

537

### User Access Permission Constants

538

539

```python { .api }

540

from pypdf.constants import UserAccessPermissions

541

542

class UserAccessPermissions(IntFlag):

543

"""PDF user access permissions for encryption."""

544

545

PRINT = 4 # Allow printing

546

MODIFY = 8 # Allow document modification

547

EXTRACT = 16 # Allow text/graphics extraction

548

ADD_OR_MODIFY = 32 # Allow annotations/form fields

549

FILL_FORM_FIELDS = 256 # Allow form field filling

550

EXTRACT_TEXT_AND_GRAPHICS = 512 # Allow accessibility extraction

551

ASSEMBLE_DOC = 1024 # Allow document assembly

552

PRINT_TO_REPRESENTATION = 2048 # Allow high-quality printing

553

554

@classmethod

555

def all(cls) -> "UserAccessPermissions":

556

"""Get all permissions enabled."""

557

558

def to_dict(self) -> dict[str, bool]:

559

"""Convert permissions to dictionary format."""

560

561

@classmethod

562

def from_dict(cls, value: dict[str, bool]) -> "UserAccessPermissions":

563

"""Create permissions from dictionary format."""

564

```

565

566

### Stream and Parsing Constants

567

568

```python { .api }

569

# Stream processing constants

570

STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"

571

572

# Core PDF structure constants

573

class Core:

574

OUTLINES = "/Outlines"

575

THREADS = "/Threads"

576

PAGE = "/Page"

577

PAGES = "/Pages"

578

CATALOG = "/Catalog"

579

580

class TrailerKeys:

581

SIZE = "/Size"

582

PREV = "/Prev"

583

ROOT = "/Root"

584

ENCRYPT = "/Encrypt"

585

INFO = "/Info"

586

ID = "/ID"

587

```