0
# PDF Reading
1
2
Read and parse PDF files: access document structure, extract text and metadata, and open password-protected (encrypted) documents. The `PdfReader` class provides the primary interface for reading PDF files.
3
4
## Capabilities
5
6
### PdfReader Class
7
8
Main class for reading PDF files with comprehensive access to document structure, pages, metadata, and content.
9
10
```python { .api }
11
class PdfReader:
12
def __init__(self, stream: Union[str, IO, Path], strict: bool = False, password: Union[None, str, bytes] = None):
13
"""
14
Initialize a PdfReader instance.
15
16
Args:
17
stream: PDF file path or file-like object
18
strict: Whether to raise exceptions for correctable problems (default: False)
19
password: Password for encrypted PDFs
20
21
Raises:
22
PdfReadError: If PDF cannot be read
23
WrongPasswordError: If password is incorrect
24
"""
25
26
@property
27
def pages(self) -> List[PageObject]:
28
"""List of all pages in the PDF document."""
29
30
@property
31
def metadata(self) -> DocumentInformation:
32
"""Document metadata including title, author, subject, etc."""
33
34
@property
35
def pdf_header(self) -> str:
36
"""PDF version string from document header."""
37
38
@property
39
def xmp_metadata(self) -> Optional[XmpInformation]:
40
"""XMP metadata if present in the document."""
41
42
@property
43
def is_encrypted(self) -> bool:
44
"""True if the PDF is encrypted."""
45
46
@property
47
def outline(self) -> OutlineType:
48
"""Document outline/bookmarks structure."""
49
50
@property
51
def named_destinations(self) -> Dict[str, Any]:
52
"""Named destinations in the document."""
53
54
@property
55
def page_layout(self) -> Optional[str]:
56
"""Page layout preference."""
57
58
@property
59
def page_mode(self) -> Optional[PagemodeType]:
60
"""Page mode preference."""
61
62
@property
63
def threads(self) -> Optional[ArrayObject]:
64
"""Article threads if present."""
65
66
@property
67
def xfa(self) -> Optional[Dict[str, Any]]:
68
"""XFA (XML Forms Architecture) data if present."""
69
70
def get_page(self, page_number: int) -> PageObject:
71
"""
72
Get a specific page by number.
73
74
Args:
75
page_number (int): Zero-based page index
76
77
Returns:
78
PageObject: The requested page
79
80
Raises:
81
IndexError: If page number is out of range
82
"""
83
84
def get_fields(self, tree: Optional[TreeObject] = None, retval: Optional[Dict[Any, Any]] = None, fileobj: Optional[Any] = None) -> Optional[Dict[str, Any]]:
85
"""
86
Get form fields from the PDF.
87
88
Returns:
89
dict: Form field data, or None if no fields present
90
"""
91
92
def get_form_text_fields(self) -> Dict[str, Any]:
93
"""
94
Get text form fields and their values.
95
96
Returns:
97
dict: Text field names and values
98
"""
99
100
def get_page_number(self, page: PageObject) -> int:
101
"""
102
Get the page number for a given PageObject.
103
104
Args:
105
page (PageObject): Page object to find
106
107
Returns:
108
int: Zero-based page number
109
110
Raises:
111
ValueError: If page is not in this document
112
"""
113
114
def get_destination_page_number(self, destination: Destination) -> int:
115
"""
116
Get page number for a destination.
117
118
Args:
119
destination (Destination): Destination object
120
121
Returns:
122
int: Zero-based page number
123
"""
124
125
def decrypt(self, password: Union[str, bytes]) -> PasswordType:
126
"""
127
Decrypt an encrypted PDF.
128
129
Args:
130
password (str or bytes): Password to try
131
132
Returns:
133
PasswordType: Type of password used (USER_PASSWORD, OWNER_PASSWORD, or NOT_DECRYPTED)
134
135
Raises:
136
PdfReadError: If the document is not encrypted
137
"""
138
139
def decode_permissions(self, permissions_code: int) -> Dict[str, bool]:
140
"""
141
Decode permission flags from encryption dictionary.
142
143
Args:
144
permissions_code (int): Raw permissions integer
145
146
Returns:
147
dict: Human-readable permission flags
148
"""
149
```
150
151
### Document Information
152
153
Container for PDF document metadata with standardized fields.
154
155
```python { .api }
156
class DocumentInformation(DictionaryObject):
157
"""PDF document metadata container."""
158
159
@property
160
def title(self) -> Optional[str]:
161
"""Document title."""
162
163
@property
164
def title_raw(self) -> Optional[str]:
165
"""Raw document title (unprocessed)."""
166
167
@property
168
def author(self) -> Optional[str]:
169
"""Document author."""
170
171
@property
172
def author_raw(self) -> Optional[str]:
173
"""Raw document author (unprocessed)."""
174
175
@property
176
def subject(self) -> Optional[str]:
177
"""Document subject."""
178
179
@property
180
def subject_raw(self) -> Optional[str]:
181
"""Raw document subject (unprocessed)."""
182
183
@property
184
def creator(self) -> Optional[str]:
185
"""Application that created the document."""
186
187
@property
188
def creator_raw(self) -> Optional[str]:
189
"""Raw document creator (unprocessed)."""
190
191
@property
192
def producer(self) -> Optional[str]:
193
"""Application that produced the PDF."""
194
195
@property
196
def producer_raw(self) -> Optional[str]:
197
"""Raw document producer (unprocessed)."""
198
199
@property
200
def creation_date(self) -> Optional[datetime]:
201
"""Document creation date."""
202
203
@property
204
def creation_date_raw(self) -> Optional[str]:
205
"""Raw document creation date (unprocessed)."""
206
207
@property
208
def modification_date(self) -> Optional[datetime]:
209
"""Document modification date."""
210
211
@property
212
def modification_date_raw(self) -> Optional[str]:
213
"""Raw document modification date (unprocessed)."""
214
```
215
216
### XMP Metadata
217
218
Extended metadata in XMP format for documents that include it.
219
220
```python { .api }
221
class XmpInformation:
222
"""XMP metadata information handler."""
223
224
# Methods for parsing and accessing XMP metadata
225
# Implementation varies based on XMP content structure
226
```
227
228
## Usage Examples
229
230
### Basic PDF Reading
231
232
```python
233
from PyPDF2 import PdfReader
234
235
# Open and read a PDF file
236
reader = PdfReader("document.pdf")
237
238
# Access basic information
239
print(f"Number of pages: {len(reader.pages)}")
240
print(f"PDF version: {reader.pdf_header}")
241
print(f"Is encrypted: {reader.is_encrypted}")
242
243
# Access metadata
244
if reader.metadata:
245
print(f"Title: {reader.metadata.title}")
246
print(f"Author: {reader.metadata.author}")
247
print(f"Subject: {reader.metadata.subject}")
248
```
249
250
### Working with Encrypted PDFs
251
252
```python
253
from PyPDF2 import PdfReader
from PyPDF2.errors import WrongPasswordError
254
255
try:
256
# Try to open encrypted PDF
257
reader = PdfReader("encrypted.pdf")
258
259
if reader.is_encrypted:
260
# Decrypt with password
261
password_type = reader.decrypt("user_password")
262
print(f"Decrypted with: {password_type}")
263
264
# Check permissions
265
permissions = reader.decode_permissions(reader.trailer["/Encrypt"]["/P"])
266
print(f"Can print: {permissions.get('print', False)}")
267
print(f"Can modify: {permissions.get('modify', False)}")
268
269
except WrongPasswordError:
270
print("Incorrect password provided")
271
```
272
273
### Extracting Text from All Pages
274
275
```python
276
from PyPDF2 import PdfReader
277
278
reader = PdfReader("document.pdf")
279
full_text = ""
280
281
for page_num, page in enumerate(reader.pages):
282
text = page.extract_text()
283
full_text += f"\n--- Page {page_num + 1} ---\n{text}"
284
285
print(full_text)
286
```
287
288
### Working with Form Fields
289
290
```python
291
from PyPDF2 import PdfReader
292
293
reader = PdfReader("form.pdf")
294
295
# Get all form fields
296
fields = reader.get_fields()
297
if fields:
298
for field_name, field_info in fields.items():
299
print(f"Field: {field_name}, Value: {field_info.get('/V', 'N/A')}")
300
301
# Get only text fields
302
text_fields = reader.get_form_text_fields()
303
for field_name, value in text_fields.items():
304
print(f"Text field: {field_name} = {value}")
305
```
306
307
## Deprecated Classes
308
309
### PdfFileReader (Deprecated)
310
311
```python { .api }
312
class PdfFileReader:
313
"""DEPRECATED: Use PdfReader instead. Will be removed in PyPDF2 3.0.0."""
314
```
315
316
This class is deprecated and should not be used in new code. All functionality has been moved to `PdfReader` with the same API.