Tessl Tile for pypi/pypdf2@2.12.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

errors-and-utilities.md index.md page-manipulation.md pdf-merging.md pdf-reading.md pdf-writing.md types-and-objects.md

pdf-reading.mddocs/

0
# PDF Reading
1

2
Read and parse PDF files, access document structure, extract text and metadata, and handle encrypted (password-protected) documents. The `PdfReader` class provides the primary interface for reading PDF files.
3

4
## Capabilities
5

6
### PdfReader Class
7

8
Main class for reading PDF files with comprehensive access to document structure, pages, metadata, and content.
9

10
```python { .api }
11
class PdfReader:
12
    def __init__(self, stream: Union[str, bytes, Path], strict: bool = False, password: Union[None, str, bytes] = None):
13
        """
14
        Initialize a PdfReader instance.
15

16
        Args:
17
            stream: PDF file path or file-like object
18
            strict: Whether to raise exceptions for correctable problems (default: False)
19
            password: Password for encrypted PDFs
20

21
        Raises:
22
            PdfReadError: If PDF cannot be read
23
            WrongPasswordError: If password is incorrect
24
        """
25

26
    @property
27
    def pages(self) -> List[PageObject]:
28
        """List of all pages in the PDF document."""
29

30
    @property  
31
    def metadata(self) -> DocumentInformation:
32
        """Document metadata including title, author, subject, etc."""
33

34
    @property
35
    def pdf_header(self) -> str:
36
        """PDF version string from document header."""
37

38
    @property
39
    def xmp_metadata(self) -> Optional[XmpInformation]:
40
        """XMP metadata if present in the document."""
41

42
    @property
43
    def is_encrypted(self) -> bool:
44
        """True if the PDF is encrypted."""
45

46
    @property
47
    def outline(self) -> OutlineType:
48
        """Document outline/bookmarks structure."""
49

50
    @property
51
    def named_destinations(self) -> Dict[str, Any]:
52
        """Named destinations in the document."""
53

54
    @property
55
    def page_layout(self) -> Optional[str]:
56
        """Page layout preference."""
57

58
    @property
59
    def page_mode(self) -> Optional[PagemodeType]:
60
        """Page mode preference."""
61

62
    @property
63
    def threads(self) -> Optional[ArrayObject]:
64
        """Article threads if present."""
65

66
    @property
67
    def xfa(self) -> Optional[Dict[str, Any]]:
68
        """XFA (XML Forms Architecture) data if present."""
69

70
    def get_page(self, page_number: int) -> PageObject:
71
        """
72
        Get a specific page by number.
73

74
        Args:
75
            page_number (int): Zero-based page index
76

77
        Returns:
78
            PageObject: The requested page
79

80
        Raises:
81
            IndexError: If page number is out of range
82
        """
83

84
    def get_fields(self, tree: Optional[TreeObject] = None, retval: Optional[Dict[Any, Any]] = None, fileobj: Optional[Any] = None) -> Optional[Dict[str, Any]]:
85
        """
86
        Get form fields from the PDF.
87

88
        Returns:
89
            dict: Form field data, or None if no fields present
90
        """
91

92
    def get_form_text_fields(self) -> Dict[str, Any]:
93
        """
94
        Get text form fields and their values.
95

96
        Returns:
97
            dict: Text field names and values
98
        """
99

100
    def get_page_number(self, page: PageObject) -> int:
101
        """
102
        Get the page number for a given PageObject.
103

104
        Args:
105
            page (PageObject): Page object to find
106

107
        Returns:
108
            int: Zero-based page number
109

110
        Raises:
111
            ValueError: If page is not in this document
112
        """
113

114
    def get_destination_page_number(self, destination: Destination) -> int:
115
        """
116
        Get page number for a destination.
117

118
        Args:
119
            destination (Destination): Destination object
120

121
        Returns:
122
            int: Zero-based page number
123
        """
124

125
    def decrypt(self, password: Union[str, bytes]) -> PasswordType:
126
        """
127
        Decrypt an encrypted PDF.
128

129
        Args:
130
            password (str or bytes): Password to try
131

132
        Returns:
133
            PasswordType: Type of password used (USER_PASSWORD, OWNER_PASSWORD, or NOT_DECRYPTED)
134

135
        Raises:
136
            WrongPasswordError: If password is incorrect
137
        """
138

139
    def decode_permissions(self, permissions_code: int) -> Dict[str, bool]:
140
        """
141
        Decode permission flags from encryption dictionary.
142

143
        Args:
144
            permissions_code (int): Raw permissions integer
145

146
        Returns:
147
            dict: Human-readable permission flags
148
        """
149
```
150

151
### Document Information
152

153
Container for PDF document metadata with standardized fields.
154

155
```python { .api }
156
class DocumentInformation(DictionaryObject):
157
    """PDF document metadata container."""
158

159
    @property
160
    def title(self) -> Optional[str]:
161
        """Document title."""
162

163
    @property
164
    def title_raw(self) -> Optional[str]:
165
        """Raw document title (unprocessed)."""
166

167
    @property
168
    def author(self) -> Optional[str]:
169
        """Document author."""
170

171
    @property
172
    def author_raw(self) -> Optional[str]:
173
        """Raw document author (unprocessed)."""
174

175
    @property
176
    def subject(self) -> Optional[str]:
177
        """Document subject."""
178

179
    @property
180
    def subject_raw(self) -> Optional[str]:
181
        """Raw document subject (unprocessed)."""
182

183
    @property
184
    def creator(self) -> Optional[str]:
185
        """Application that created the document."""
186

187
    @property
188
    def creator_raw(self) -> Optional[str]:
189
        """Raw document creator (unprocessed)."""
190

191
    @property
192
    def producer(self) -> Optional[str]:
193
        """Application that produced the PDF."""
194

195
    @property
196
    def producer_raw(self) -> Optional[str]:
197
        """Raw document producer (unprocessed)."""
198

199
    @property
200
    def creation_date(self) -> Optional[datetime]:
201
        """Document creation date."""
202

203
    @property
204
    def creation_date_raw(self) -> Optional[str]:
205
        """Raw document creation date (unprocessed)."""
206

207
    @property
208
    def modification_date(self) -> Optional[datetime]:
209
        """Document modification date."""
210

211
    @property
212
    def modification_date_raw(self) -> Optional[str]:
213
        """Raw document modification date (unprocessed)."""
214
```
215

216
### XMP Metadata
217

218
Extended metadata in XMP format for documents that include it.
219

220
```python { .api }  
221
class XmpInformation:
222
    """XMP metadata information handler."""
223
    
224
    # Methods for parsing and accessing XMP metadata
225
    # Implementation varies based on XMP content structure
226
```
227

228
## Usage Examples
229

230
### Basic PDF Reading
231

232
```python
233
from PyPDF2 import PdfReader
234

235
# Open and read a PDF file
236
reader = PdfReader("document.pdf")
237

238
# Access basic information
239
print(f"Number of pages: {len(reader.pages)}")
240
print(f"PDF version: {reader.pdf_header}")
241
print(f"Is encrypted: {reader.is_encrypted}")
242

243
# Access metadata
244
if reader.metadata:
245
    print(f"Title: {reader.metadata.title}")
246
    print(f"Author: {reader.metadata.author}")
247
    print(f"Subject: {reader.metadata.subject}")
248
```
249

250
### Working with Encrypted PDFs
251

252
```python
253
from PyPDF2 import PdfReader
from PyPDF2.errors import WrongPasswordError
254

255
try:
256
    # Try to open encrypted PDF
257
    reader = PdfReader("encrypted.pdf")
258
    
259
    if reader.is_encrypted:
260
        # Decrypt with password
261
        password_type = reader.decrypt("user_password")
262
        print(f"Decrypted with: {password_type}")
263
        
264
        # Check permissions
265
        permissions = reader.decode_permissions(reader.encryption.permissions_flag)
266
        print(f"Can print: {permissions.get('print', False)}")
267
        print(f"Can modify: {permissions.get('modify', False)}")
268
        
269
except WrongPasswordError:
270
    print("Incorrect password provided")
271
```
272

273
### Extracting Text from All Pages
274

275
```python
276
from PyPDF2 import PdfReader
277

278
reader = PdfReader("document.pdf")
279
full_text = ""
280

281
for page_num, page in enumerate(reader.pages):
282
    text = page.extract_text()
283
    full_text += f"\n--- Page {page_num + 1} ---\n{text}"
284

285
print(full_text)
286
```
287

288
### Working with Form Fields
289

290
```python
291
from PyPDF2 import PdfReader
292

293
reader = PdfReader("form.pdf")
294

295
# Get all form fields
296
fields = reader.get_fields()
297
if fields:
298
    for field_name, field_info in fields.items():
299
        print(f"Field: {field_name}, Value: {field_info.get('value', 'N/A')}")
300

301
# Get only text fields
302
text_fields = reader.get_form_text_fields()
303
for field_name, value in text_fields.items():
304
    print(f"Text field: {field_name} = {value}")
305
```
306

307
## Deprecated Classes
308

309
### PdfFileReader (Deprecated)
310

311
```python { .api }
312
class PdfFileReader:
313
    """DEPRECATED: Use PdfReader instead. Will be removed in PyPDF2 3.0.0."""
314
```
315

316
This class is deprecated and should not be used in new code. All functionality has been moved to `PdfReader` with the same API.

Version

Tile

Files

pdf-reading.md docs/

pdf-reading.mddocs/