or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

errors-and-utilities.md index.md page-manipulation.md pdf-merging.md pdf-reading.md pdf-writing.md types-and-objects.md

pdf-reading.md docs/

0

# PDF Reading

1

2

Read and parse PDF files, access document structure, extract text and metadata, and handle encrypted documents with password protection. The PdfReader class provides the primary interface for reading PDF files.

3

4

## Capabilities

5

6

### PdfReader Class

7

8

Main class for reading PDF files with comprehensive access to document structure, pages, metadata, and content.

9

10

```python { .api }

11

class PdfReader:

12

def __init__(self, stream: Union[str, bytes, Path], strict: bool = False, password: Union[None, str, bytes] = None):

13

"""

14

Initialize a PdfReader instance.

15

16

Args:

17

stream: PDF file path or file-like object

18

strict: Whether to raise exceptions for correctable problems (default: False)

19

password: Password for encrypted PDFs

20

21

Raises:

22

PdfReadError: If PDF cannot be read

23

WrongPasswordError: If password is incorrect

24

"""

25

26

@property

27

def pages(self) -> List[PageObject]:

28

"""List of all pages in the PDF document."""

29

30

@property

31

def metadata(self) -> DocumentInformation:

32

"""Document metadata including title, author, subject, etc."""

33

34

@property

35

def pdf_header(self) -> str:

36

"""PDF version string from document header."""

37

38

@property

39

def xmp_metadata(self) -> Optional[XmpInformation]:

40

"""XMP metadata if present in the document."""

41

42

@property

43

def is_encrypted(self) -> bool:

44

"""True if the PDF is encrypted."""

45

46

@property

47

def outline(self) -> OutlineType:

48

"""Document outline/bookmarks structure."""

49

50

@property

51

def named_destinations(self) -> Dict[str, Any]:

52

"""Named destinations in the document."""

53

54

@property

55

def page_layout(self) -> Optional[str]:

56

"""Page layout preference."""

57

58

@property

59

def page_mode(self) -> Optional[PagemodeType]:

60

"""Page mode preference."""

61

62

@property

63

def threads(self) -> Optional[ArrayObject]:

64

"""Article threads if present."""

65

66

@property

67

def xfa(self) -> Optional[Dict[str, Any]]:

68

"""XFA (XML Forms Architecture) data if present."""

69

70

def get_page(self, page_number: int) -> PageObject:

71

"""

72

Get a specific page by number.

73

74

Args:

75

page_number (int): Zero-based page index

76

77

Returns:

78

PageObject: The requested page

79

80

Raises:

81

IndexError: If page number is out of range

82

"""

83

84

def get_fields(self, tree: Optional[TreeObject] = None, retval: Optional[Dict[Any, Any]] = None, fileobj: Optional[Any] = None) -> Optional[Dict[str, Any]]:

85

"""

86

Get form fields from the PDF.

87

88

Returns:

89

dict: Form field data, or None if no fields present

90

"""

91

92

def get_form_text_fields(self) -> Dict[str, Any]:

93

"""

94

Get text form fields and their values.

95

96

Returns:

97

dict: Text field names and values

98

"""

99

100

def get_page_number(self, page: PageObject) -> int:

101

"""

102

Get the page number for a given PageObject.

103

104

Args:

105

page (PageObject): Page object to find

106

107

Returns:

108

int: Zero-based page number

109

110

Raises:

111

ValueError: If page is not in this document

112

"""

113

114

def get_destination_page_number(self, destination: Destination) -> int:

115

"""

116

Get page number for a destination.

117

118

Args:

119

destination (Destination): Destination object

120

121

Returns:

122

int: Zero-based page number

123

"""

124

125

def decrypt(self, password: Union[str, bytes]) -> PasswordType:

126

"""

127

Decrypt an encrypted PDF.

128

129

Args:

130

password (str): Password to try

131

132

Returns:

133

PasswordType: Type of password used (USER_PASSWORD, OWNER_PASSWORD, or NOT_DECRYPTED)

134

135

Raises:

136

WrongPasswordError: If password is incorrect

137

"""

138

139

def decode_permissions(self, permissions_code: int) -> Dict[str, bool]:

140

"""

141

Decode permission flags from encryption dictionary.

142

143

Args:

144

permissions_code (int): Raw permissions integer

145

146

Returns:

147

dict: Human-readable permission flags

148

"""

149

```

150

151

### Document Information

152

153

Container for PDF document metadata with standardized fields.

154

155

```python { .api }

156

class DocumentInformation(DictionaryObject):

157

"""PDF document metadata container."""

158

159

@property

160

def title(self) -> Optional[str]:

161

"""Document title."""

162

163

@property

164

def title_raw(self) -> Optional[str]:

165

"""Raw document title (unprocessed)."""

166

167

@property

168

def author(self) -> Optional[str]:

169

"""Document author."""

170

171

@property

172

def author_raw(self) -> Optional[str]:

173

"""Raw document author (unprocessed)."""

174

175

@property

176

def subject(self) -> Optional[str]:

177

"""Document subject."""

178

179

@property

180

def subject_raw(self) -> Optional[str]:

181

"""Raw document subject (unprocessed)."""

182

183

@property

184

def creator(self) -> Optional[str]:

185

"""Application that created the document."""

186

187

@property

188

def creator_raw(self) -> Optional[str]:

189

"""Raw document creator (unprocessed)."""

190

191

@property

192

def producer(self) -> Optional[str]:

193

"""Application that produced the PDF."""

194

195

@property

196

def producer_raw(self) -> Optional[str]:

197

"""Raw document producer (unprocessed)."""

198

199

@property

200

def creation_date(self) -> Optional[str]:

201

"""Document creation date."""

202

203

@property

204

def creation_date_raw(self) -> Optional[str]:

205

"""Raw document creation date (unprocessed)."""

206

207

@property

208

def modification_date(self) -> Optional[str]:

209

"""Document modification date."""

210

211

@property

212

def modification_date_raw(self) -> Optional[str]:

213

"""Raw document modification date (unprocessed)."""

214

```

215

216

### XMP Metadata

217

218

Extended metadata in XMP format for documents that include it.

219

220

```python { .api }

221

class XmpInformation:

222

"""XMP metadata information handler."""

223

224

# Methods for parsing and accessing XMP metadata

225

# Implementation varies based on XMP content structure

226

```

227

228

## Usage Examples

229

230

### Basic PDF Reading

231

232

```python

233

from PyPDF2 import PdfReader

234

235

# Open and read a PDF file

236

reader = PdfReader("document.pdf")

237

238

# Access basic information

239

print(f"Number of pages: {len(reader.pages)}")

240

print(f"PDF version: {reader.pdf_header}")

241

print(f"Is encrypted: {reader.is_encrypted}")

242

243

# Access metadata

244

if reader.metadata:

245

print(f"Title: {reader.metadata.title}")

246

print(f"Author: {reader.metadata.author}")

247

print(f"Subject: {reader.metadata.subject}")

248

```

249

250

### Working with Encrypted PDFs

251

252

```python

253

from PyPDF2 import PdfReader, WrongPasswordError

254

255

try:

256

# Try to open encrypted PDF

257

reader = PdfReader("encrypted.pdf")

258

259

if reader.is_encrypted:

260

# Decrypt with password

261

password_type = reader.decrypt("user_password")

262

print(f"Decrypted with: {password_type}")

263

264

# Check permissions

265

permissions = reader.decode_permissions(reader.encryption.permissions_flag)

266

print(f"Can print: {permissions.get('print', False)}")

267

print(f"Can modify: {permissions.get('modify', False)}")

268

269

except WrongPasswordError:

270

print("Incorrect password provided")

271

```

272

273

### Extracting Text from All Pages

274

275

```python

276

from PyPDF2 import PdfReader

277

278

reader = PdfReader("document.pdf")

279

full_text = ""

280

281

for page_num, page in enumerate(reader.pages):

282

text = page.extract_text()

283

full_text += f"\\n--- Page {page_num + 1} ---\\n{text}"

284

285

print(full_text)

286

```

287

288

### Working with Form Fields

289

290

```python

291

from PyPDF2 import PdfReader

292

293

reader = PdfReader("form.pdf")

294

295

# Get all form fields

296

fields = reader.get_fields()

297

if fields:

298

for field_name, field_info in fields.items():

299

print(f"Field: {field_name}, Value: {field_info.get('value', 'N/A')}")

300

301

# Get only text fields

302

text_fields = reader.get_form_text_fields()

303

for field_name, value in text_fields.items():

304

print(f"Text field: {field_name} = {value}")

305

```

306

307

## Deprecated Classes

308

309

### PdfFileReader (Deprecated)

310

311

```python { .api }

312

class PdfFileReader:

313

"""DEPRECATED: Use PdfReader instead. Will be removed in PyPDF2 3.0.0."""

314

```

315

316

This class is deprecated and should not be used in new code. All functionality has been moved to `PdfReader` with the same API.