or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

annotations.md · form-fields.md · index.md · metadata.md · page-operations.md · reading-writing.md · text-extraction.md · utilities.md

docs/reading-writing.md

0

# PDF Reading and Writing

1

2

Core functionality for opening, reading, creating, and saving PDF documents. This module provides the fundamental classes for all PDF operations in pypdf.

3

4

## Capabilities

5

6

### PDF Reading

7

8

The PdfReader class provides comprehensive PDF file reading capabilities with support for encrypted documents, strict parsing modes, and complete document analysis.

9

10

```python { .api }

11

class PdfReader:

12

def __init__(self, stream, strict: bool = False, password: str | None = None):

13

"""

14

Initialize a PDF reader.

15

16

Args:

17

stream: Path to a PDF file, or a binary file-like object supporting read() and seek() (wrap in-memory bytes in io.BytesIO)

18

strict: Enable strict parsing mode (default: False)

19

password: Password for encrypted PDFs (default: None)

20

"""

21

22

def decrypt(self, password: str) -> PasswordType:

23

"""

24

Decrypt an encrypted PDF.

25

26

Args:

27

password: Password to decrypt the PDF

28

29

Returns:

30

PasswordType indicating the type of password used

31

"""

32

33

def get_object(self, indirect_reference):

34

"""

35

Retrieve a PDF object by its indirect reference.

36

37

Args:

38

indirect_reference: Indirect object reference

39

40

Returns:

41

The requested PDF object

42

"""

43

44

def close(self) -> None:

45

"""Close the PDF file and free resources."""

46

47

def __enter__(self):

48

"""Context manager entry."""

49

50

def __exit__(self, exc_type, exc_val, exc_tb):

51

"""Context manager exit."""

52

53

@property

54

def is_encrypted(self) -> bool:

55

"""Check if the PDF is encrypted."""

56

57

@property

58

def metadata(self) -> DocumentInformation | None:

59

"""Get document metadata."""

60

61

@property

62

def pages(self):

63

"""Access to PDF pages collection."""

64

65

@property

66

def root_object(self):

67

"""Get the PDF catalog (root) object."""

68

69

@property

70

def pdf_header(self) -> str:

71

"""Get the PDF version header."""

72

73

@property

74

def xmp_metadata(self):

75

"""Get XMP metadata if present."""

76

```

77

78

### PDF Writing

79

80

The PdfWriter class enables PDF creation, modification, and output generation with support for encryption, incremental updates, and comprehensive page management.

81

82

```python { .api }

83

class PdfWriter:

84

def __init__(self, clone_from=None, incremental: bool = False):

85

"""

86

Initialize a PDF writer.

87

88

Args:

89

clone_from: PdfReader to clone structure from (optional)

90

incremental: Enable incremental updates (default: False)

91

"""

92

93

def add_page(self, page: PageObject) -> None:

94

"""

95

Add a page to the document.

96

97

Args:

98

page: PageObject to add

99

"""

100

101

def insert_page(self, page: PageObject, index: int) -> None:

102

"""

103

Insert a page at a specific position.

104

105

Args:

106

page: PageObject to insert

107

index: Position to insert at

108

"""

109

110

def add_blank_page(self, width: float, height: float) -> PageObject:

111

"""

112

Add a blank page with specified dimensions.

113

114

Args:

115

width: Page width in points

116

height: Page height in points

117

118

Returns:

119

The created PageObject

120

"""

121

122

def insert_blank_page(self, width: float, height: float, index: int) -> PageObject:

123

"""

124

Insert a blank page at a specific position.

125

126

Args:

127

width: Page width in points

128

height: Page height in points

129

index: Position to insert at

130

131

Returns:

132

The created PageObject

133

"""

134

135

def append_pages_from_reader(self, reader: PdfReader, after_page_append=None) -> None:

136

"""

137

Append all pages from another PDF reader.

138

139

Args:

140

reader: PdfReader to copy pages from

141

after_page_append: Optional callback function called after each page

142

"""

143

144

def write(self, stream) -> None:

145

"""

146

Write the PDF to a stream.

147

148

Args:

149

stream: Output stream (file-like object)

150

"""

151

152

def write_stream(self, stream) -> None:

153

"""

154

Alias for write() method.

155

156

Args:

157

stream: Output stream (file-like object)

158

"""

159

160

def encrypt(

161

self,

162

user_password: str,

163

owner_password: str | None = None,

164

use_128bit: bool = True,

165

permissions_flag: int = -1,

166

user_access_permissions: int | None = None

167

) -> None:

168

"""

169

Encrypt the PDF with password protection.

170

171

Args:

172

user_password: Password for opening the PDF

173

owner_password: Password for full access (defaults to user_password)

174

use_128bit: Use 128-bit encryption (default: True)

175

permissions_flag: Permissions bit flags

176

user_access_permissions: User access permissions

177

"""

178

179

def add_js(self, javascript: str) -> None:

180

"""

181

Add JavaScript to the PDF.

182

183

Args:

184

javascript: JavaScript code to add

185

"""

186

187

def add_attachment(self, filename: str, data: bytes) -> None:

188

"""

189

Add a file attachment to the PDF.

190

191

Args:

192

filename: Name of the attached file

193

data: File data as bytes

194

"""

195

196

def set_need_appearances_writer(self, state: bool = True) -> None:

197

"""

198

Set the needAppearances flag for form fields.

199

200

Args:

201

state: Whether to enable automatic appearance generation

202

"""

203

204

def clone_reader_document_root(self, reader: PdfReader) -> None:

205

"""

206

Clone the document structure from another PDF reader.

207

208

Args:

209

reader: PdfReader to clone from

210

"""

211

212

def clone_document_from_reader(self, reader: PdfReader, after_page_append=None) -> None:

213

"""

214

Clone an entire document from a reader.

215

216

Args:

217

reader: PdfReader to clone from

218

after_page_append: Optional callback after each page

219

"""

220

221

def compress_identical_objects(self, remove_duplicate_page_inheritable_objects: bool = True) -> None:

222

"""

223

Compress identical objects to reduce file size.

224

225

Args:

226

remove_duplicate_page_inheritable_objects: Remove duplicate inheritable objects

227

"""

228

229

def generate_file_identifiers(self) -> None:

230

"""Generate unique file identifiers for the PDF."""

231

232

def add_metadata(self, infos: dict[str, Any]) -> None:

233

"""

234

Add metadata dictionary to the PDF.

235

236

Args:

237

infos: Dictionary of metadata key-value pairs

238

"""

239

240

def get_reference(self, obj: PdfObject) -> IndirectObject:

241

"""

242

Get indirect reference for a PDF object.

243

244

Args:

245

obj: PDF object to get reference for

246

247

Returns:

248

Indirect object reference

249

"""

250

251

def update_page_form_field_values(

252

self,

253

page: PageObject,

254

fields: dict,

255

flags: int = 0

256

) -> None:

257

"""

258

Update form field values on a page.

259

260

Args:

261

page: PageObject containing the form

262

fields: Dictionary mapping field names to values

263

flags: Form field flags

264

"""

265

266

def __enter__(self):

267

"""Context manager entry."""

268

269

def __exit__(self, exc_type, exc_val, exc_tb):

270

"""Context manager exit."""

271

272

@property

273

def is_encrypted(self) -> bool:

274

"""Check if the writer will produce an encrypted PDF."""

275

276

@property

277

def root_object(self):

278

"""Get the PDF catalog (root) object."""

279

280

@property

281

def pdf_header(self) -> str:

282

"""Get the PDF version header."""

283

284

@property

285

def xmp_metadata(self):

286

"""Get XMP metadata if present."""

287

288

@property

289

def metadata(self) -> DocumentInformation | None:

290

"""Get document metadata."""

291

292

@property

293

def page_layout(self):

294

"""Get or set the page layout mode."""

295

296

@property

297

def page_mode(self):

298

"""Get or set the page viewing mode."""

299

```

300

301

## Usage Examples

302

303

### Basic Reading

304

305

```python

306

from pypdf import PdfReader

307

308

# Read from file path

309

reader = PdfReader("document.pdf")

310

print(f"Number of pages: {len(reader.pages)}")

311

312

# Read encrypted PDF

313

reader = PdfReader("encrypted.pdf", password="secret")

314

315

# Context manager usage

316

with PdfReader("document.pdf") as reader:

317

for page in reader.pages:

318

text = page.extract_text()

319

print(text)

320

```

321

322

### Basic Writing

323

324

```python

325

from pypdf import PdfWriter, PdfReader

326

327

# Create new PDF

328

writer = PdfWriter()

329

writer.add_blank_page(612, 792) # Letter size

330

with open("blank.pdf", "wb") as output:

331

writer.write(output)

332

333

# Copy pages from existing PDF

334

reader = PdfReader("source.pdf")

335

writer = PdfWriter()

336

writer.append_pages_from_reader(reader)

337

with open("copy.pdf", "wb") as output:

338

writer.write(output)

339

340

# Encrypt PDF (call encrypt() before write(); files already written above are not affected)

341

writer.encrypt("user_password", "owner_password")

342

```

343

344

### Document Merging

345

346

```python

347

from pypdf import PdfReader, PdfWriter

348

349

def merge_pdfs(input_files: list[str], output_file: str):

350

writer = PdfWriter()

351

352

for filename in input_files:

353

reader = PdfReader(filename)

354

writer.append_pages_from_reader(reader)

355

356

with open(output_file, "wb") as output:

357

writer.write(output)

358

359

merge_pdfs(["doc1.pdf", "doc2.pdf", "doc3.pdf"], "merged.pdf")

360

```

361

362

### Incremental Updates

363

364

```python

365

from pypdf import PdfReader, PdfWriter

366

367

# Open existing PDF for incremental update

368

reader = PdfReader("existing.pdf")

369

writer = PdfWriter(clone_from=reader, incremental=True)

370

371

# Make modifications

372

writer.add_blank_page(612, 792)

373

374

# Save with incremental update

375

with open("existing.pdf", "wb") as output:

376

writer.write(output)

377

```