0
# PDF Reading and Writing
1
2
Core functionality for opening, reading, creating, and saving PDF documents. This module provides the fundamental classes for all PDF operations in pypdf.
3
4
## Capabilities
5
6
### PDF Reading
7
8
The PdfReader class reads PDF files, with support for encrypted documents, an optional strict parsing mode, and access to the document's pages, metadata, and catalog.
9
10
```python { .api }
11
class PdfReader:
12
def __init__(self, stream, strict: bool = False, password: str | None = None):
13
"""
14
Initialize a PDF reader.
15
16
Args:
17
stream: Path to a PDF file (str or pathlib.Path) or a binary file-like object; wrap raw bytes in io.BytesIO
18
strict: Enable strict parsing mode (default: False)
19
password: Password for encrypted PDFs (default: None)
20
"""
21
22
def decrypt(self, password: str) -> PasswordType:
23
"""
24
Decrypt an encrypted PDF.
25
26
Args:
27
password: Password to decrypt the PDF
28
29
Returns:
30
PasswordType indicating whether the password matched as the user password, the owner password, or not at all
31
"""
32
33
def get_object(self, indirect_reference):
34
"""
35
Retrieve a PDF object by its indirect reference.
36
37
Args:
38
indirect_reference: Indirect object reference
39
40
Returns:
41
The requested PDF object
42
"""
43
44
def close(self) -> None:
45
"""Close the PDF file and free resources."""
46
47
def __enter__(self):
48
"""Context manager entry."""
49
50
def __exit__(self, exc_type, exc_val, exc_tb):
51
"""Context manager exit."""
52
53
@property
54
def is_encrypted(self) -> bool:
55
"""Check if the PDF is encrypted."""
56
57
@property
58
def metadata(self) -> DocumentInformation | None:
59
"""Get document metadata."""
60
61
@property
62
def pages(self):
63
"""Access to PDF pages collection."""
64
65
@property
66
def root_object(self):
67
"""Get the PDF catalog (root) object."""
68
69
@property
70
def pdf_header(self) -> str:
71
"""Get the PDF version header."""
72
73
@property
74
def xmp_metadata(self):
75
"""Get XMP metadata if present."""
76
```
77
78
### PDF Writing
79
80
The PdfWriter class creates and modifies PDF documents and writes them out, with support for encryption, incremental updates, and adding or inserting pages.
81
82
```python { .api }
83
class PdfWriter:
84
def __init__(self, clone_from=None, incremental: bool = False):
85
"""
86
Initialize a PDF writer.
87
88
Args:
89
clone_from: PdfReader to clone structure from (optional)
90
incremental: Enable incremental updates (default: False)
91
"""
92
93
def add_page(self, page: PageObject) -> None:
94
"""
95
Add a page to the document.
96
97
Args:
98
page: PageObject to add
99
"""
100
101
def insert_page(self, page: PageObject, index: int) -> None:
102
"""
103
Insert a page at a specific position.
104
105
Args:
106
page: PageObject to insert
107
index: Position to insert at
108
"""
109
110
def add_blank_page(self, width: float, height: float) -> PageObject:
111
"""
112
Add a blank page with specified dimensions.
113
114
Args:
115
width: Page width in points
116
height: Page height in points
117
118
Returns:
119
The created PageObject
120
"""
121
122
def insert_blank_page(self, width: float, height: float, index: int) -> PageObject:
123
"""
124
Insert a blank page at a specific position.
125
126
Args:
127
width: Page width in points
128
height: Page height in points
129
index: Position to insert at
130
131
Returns:
132
The created PageObject
133
"""
134
135
def append_pages_from_reader(self, reader: PdfReader, after_page_append=None) -> None:
136
"""
137
Append all pages from another PDF reader.
138
139
Args:
140
reader: PdfReader to copy pages from
141
after_page_append: Optional callback function called after each page
142
"""
143
144
def write(self, stream) -> None:
145
"""
146
Write the PDF to a stream.
147
148
Args:
149
stream: Output stream (file-like object)
150
"""
151
152
def write_stream(self, stream) -> None:
153
"""
154
Alias for write() method.
155
156
Args:
157
stream: Output stream (file-like object)
158
"""
159
160
def encrypt(
161
self,
162
user_password: str,
163
owner_password: str | None = None,
164
use_128bit: bool = True,
165
permissions_flag: int = -1,
166
user_access_permissions: int | None = None
167
) -> None:
168
"""
169
Encrypt the PDF with password protection.
170
171
Args:
172
user_password: Password for opening the PDF
173
owner_password: Password for full access (defaults to user_password)
174
use_128bit: Use 128-bit encryption (default: True)
175
permissions_flag: Permissions bit flags
176
user_access_permissions: User access permissions
177
"""
178
179
def add_js(self, javascript: str) -> None:
180
"""
181
Add JavaScript to the PDF.
182
183
Args:
184
javascript: JavaScript code to add
185
"""
186
187
def add_attachment(self, filename: str, data: bytes) -> None:
188
"""
189
Add a file attachment to the PDF.
190
191
Args:
192
filename: Name of the attached file
193
data: File data as bytes
194
"""
195
196
def set_need_appearances_writer(self, state: bool = True) -> None:
197
"""
198
Set the needAppearances flag for form fields.
199
200
Args:
201
state: Whether to enable automatic appearance generation
202
"""
203
204
def clone_reader_document_root(self, reader: PdfReader) -> None:
205
"""
206
Clone the document structure from another PDF reader.
207
208
Args:
209
reader: PdfReader to clone from
210
"""
211
212
def clone_document_from_reader(self, reader: PdfReader, after_page_append=None) -> None:
213
"""
214
Clone an entire document from a reader.
215
216
Args:
217
reader: PdfReader to clone from
218
after_page_append: Optional callback after each page
219
"""
220
221
def compress_identical_objects(self, remove_duplicate_page_inheritable_objects: bool = True) -> None:
222
"""
223
Compress identical objects to reduce file size.
224
225
Args:
226
remove_duplicate_page_inheritable_objects: Remove duplicate inheritable objects
227
"""
228
229
def generate_file_identifiers(self) -> None:
230
"""Generate unique file identifiers for the PDF."""
231
232
def add_metadata(self, infos: dict[str, Any]) -> None:
233
"""
234
Add metadata dictionary to the PDF.
235
236
Args:
237
infos: Dictionary of metadata key-value pairs
238
"""
239
240
def get_reference(self, obj: PdfObject) -> IndirectObject:
241
"""
242
Get indirect reference for a PDF object.
243
244
Args:
245
obj: PDF object to get reference for
246
247
Returns:
248
Indirect object reference
249
"""
250
251
def update_page_form_field_values(
252
self,
253
page: PageObject,
254
fields: dict,
255
flags: int = 0
256
) -> None:
257
"""
258
Update form field values on a page.
259
260
Args:
261
page: PageObject containing the form
262
fields: Dictionary mapping field names to values
263
flags: Form field flags
264
"""
265
266
def __enter__(self):
267
"""Context manager entry."""
268
269
def __exit__(self, exc_type, exc_val, exc_tb):
270
"""Context manager exit."""
271
272
@property
273
def is_encrypted(self) -> bool:
274
"""Check if the writer will produce an encrypted PDF."""
275
276
@property
277
def root_object(self):
278
"""Get the PDF catalog (root) object."""
279
280
@property
281
def pdf_header(self) -> str:
282
"""Get the PDF version header."""
283
284
@property
285
def xmp_metadata(self):
286
"""Get XMP metadata if present."""
287
288
@property
289
def metadata(self) -> DocumentInformation | None:
290
"""Get document metadata."""
291
292
@property
293
def page_layout(self):
294
"""Get or set the page layout mode."""
295
296
@property
297
def page_mode(self):
298
"""Get or set the page viewing mode."""
299
```
300
301
## Usage Examples
302
303
### Basic Reading
304
305
```python
306
from pypdf import PdfReader
307
308
# Read from file path
309
reader = PdfReader("document.pdf")
310
print(f"Number of pages: {len(reader.pages)}")
311
312
# Read encrypted PDF
313
reader = PdfReader("encrypted.pdf", password="secret")
314
315
# Context manager usage
316
with PdfReader("document.pdf") as reader:
317
for page in reader.pages:
318
text = page.extract_text()
319
print(text)
320
```
321
322
### Basic Writing
323
324
```python
325
from pypdf import PdfWriter, PdfReader
326
327
# Create new PDF
328
writer = PdfWriter()
329
writer.add_blank_page(612, 792) # Letter size
330
with open("blank.pdf", "wb") as output:
331
writer.write(output)
332
333
# Copy pages from existing PDF
334
reader = PdfReader("source.pdf")
335
writer = PdfWriter()
336
writer.append_pages_from_reader(reader)
337
with open("copy.pdf", "wb") as output:
338
writer.write(output)
339
340
# Encrypt PDF (call encrypt() before write(); only output written afterwards is encrypted)
341
writer.encrypt("user_password", "owner_password")
342
```
343
344
### Document Merging
345
346
```python
347
from pypdf import PdfReader, PdfWriter
348
349
def merge_pdfs(input_files: list[str], output_file: str):
350
writer = PdfWriter()
351
352
for filename in input_files:
353
reader = PdfReader(filename)
354
writer.append_pages_from_reader(reader)
355
356
with open(output_file, "wb") as output:
357
writer.write(output)
358
359
merge_pdfs(["doc1.pdf", "doc2.pdf", "doc3.pdf"], "merged.pdf")
360
```
361
362
### Incremental Updates
363
364
```python
365
from pypdf import PdfReader, PdfWriter
366
367
# Open existing PDF for incremental update
368
reader = PdfReader("existing.pdf")
369
writer = PdfWriter(clone_from=reader, incremental=True)
370
371
# Make modifications
372
writer.add_blank_page(612, 792)
373
374
# Save with incremental update
375
with open("existing.pdf", "wb") as output:
376
writer.write(output)
377
```