0
# PDF Document Operations
1
2
Core functionality for opening, accessing, and managing PDF documents including metadata extraction, page access, document-level operations, and PDF repair capabilities.
3
4
## Capabilities
5
6
### Opening PDF Documents
7
8
`open` is the primary function for opening PDF documents from file paths, open binary file objects, or in-memory byte streams (`BytesIO`), with comprehensive configuration options.
9
10
```python { .api }
11
def open(path_or_fp, pages=None, laparams=None, password=None,
12
strict_metadata=False, unicode_norm=None, repair=False,
13
gs_path=None, repair_setting="default", raise_unicode_errors=True):
14
"""
15
Open PDF document from file path or stream.
16
17
Parameters:
18
- path_or_fp: str, pathlib.Path, BufferedReader, or BytesIO - PDF source
19
- pages: List[int] or Tuple[int], optional - Specific pages to parse, given as 1-indexed page numbers (the first page is 1)
20
- laparams: Dict[str, Any], optional - Layout analysis parameters
21
- password: str, optional - PDF password for encrypted documents
22
- strict_metadata: bool - Raise errors for malformed metadata
23
- unicode_norm: str, optional - Unicode normalization ("NFC", "NFKC", "NFD", "NFKD")
24
- repair: bool - Attempt PDF repair using Ghostscript
25
- gs_path: str or pathlib.Path, optional - Path to Ghostscript executable
26
- repair_setting: str - Repair quality setting ("default", "prepress", "printer", "ebook", "screen")
27
- raise_unicode_errors: bool - Raise errors for unicode decoding issues
28
29
Returns:
30
PDF object with context manager support
31
"""
32
```
33
34
**Usage Examples:**
35
36
```python
37
# Open from file path
38
with pdfplumber.open("document.pdf") as pdf:
39
print(f"Document has {len(pdf.pages)} pages")
40
41
# Open specific pages only
42
with pdfplumber.open("large_doc.pdf", pages=[1, 2, 6]) as pdf:
43
for page in pdf.pages:
44
print(f"Page {page.page_number}: {page.extract_text()[:100]}")
45
46
# Open encrypted PDF
47
with pdfplumber.open("encrypted.pdf", password="secret") as pdf:
48
text = pdf.pages[0].extract_text()
49
50
# Open with repair for corrupted PDFs
51
with pdfplumber.open("corrupted.pdf", repair=True) as pdf:
52
text = pdf.pages[0].extract_text()
53
```
54
55
### PDF Class
56
57
The main PDF document class providing access to pages, metadata, and document-level operations.
58
59
```python { .api }
60
class PDF:
61
"""PDF document container with page access and metadata."""
62
63
def __init__(self, stream, stream_is_external=False, path=None,
64
pages=None, laparams=None, password=None,
65
strict_metadata=False, unicode_norm=None,
66
raise_unicode_errors=True):
67
"""Initialize PDF object from stream."""
68
69
@property
70
def pages(self) -> List[Page]:
71
"""List of page objects in document."""
72
73
@property
74
def objects(self) -> Dict[str, T_obj_list]:
75
"""All objects aggregated from all pages by type."""
76
77
@property
78
def annots(self) -> List[Dict[str, Any]]:
79
"""All annotations from all pages."""
80
81
@property
82
def hyperlinks(self) -> List[Dict[str, Any]]:
83
"""All hyperlinks from all pages."""
84
85
@property
86
def structure_tree(self) -> List[Dict[str, Any]]:
87
"""Document structure tree for accessibility."""
88
89
metadata: Dict
90
"""PDF metadata dictionary (instance variable)."""
91
92
def close(self):
93
"""Close PDF and cleanup resources."""
94
95
def __enter__(self):
96
"""Context manager entry."""
97
98
def __exit__(self, exc_type, exc_val, exc_tb):
99
"""Context manager exit with cleanup."""
100
```
101
102
**Usage Examples:**
103
104
```python
105
# Access document metadata
106
pdf = pdfplumber.open("document.pdf")
107
print(f"Title: {pdf.metadata.get('Title', 'No title')}")
108
print(f"Author: {pdf.metadata.get('Author', 'Unknown')}")
109
print(f"Created: {pdf.metadata.get('CreationDate', 'Unknown')}")
110
111
# Get all text objects from document
112
all_chars = pdf.objects.get('chars', [])
113
print(f"Document contains {len(all_chars)} character objects")
114
115
# Access document-level annotations
116
for annot in pdf.annots:
117
print(f"Annotation: {annot.get('contents', 'No content')}")
118
119
pdf.close()
120
```
121
122
### PDF Repair
123
124
Repair corrupted or malformed PDF documents using Ghostscript with various quality settings.
125
126
```python { .api }
127
def repair(path_or_fp, outfile=None, password=None, gs_path=None,
128
setting="default"):
129
"""
130
Repair PDF using Ghostscript.
131
132
Parameters:
133
- path_or_fp: str, pathlib.Path, BufferedReader, or BytesIO - PDF source
134
- outfile: str or pathlib.Path, optional - Output file path
135
- password: str, optional - PDF password
136
- gs_path: str or pathlib.Path, optional - Path to Ghostscript executable
137
- setting: str - Quality setting ("default", "prepress", "printer", "ebook", "screen")
138
139
Returns:
140
BytesIO containing repaired PDF data, or None when outfile is given (the repaired PDF is written to that path instead)
141
"""
142
143
# Repair setting type
144
T_repair_setting = Literal["default", "prepress", "printer", "ebook", "screen"]
145
```
146
147
**Usage Examples:**
148
149
```python
150
# Repair PDF to memory
151
repaired_data = pdfplumber.repair("corrupted.pdf")
152
with pdfplumber.open(repaired_data) as pdf:
153
text = pdf.pages[0].extract_text()
154
155
# Repair PDF to file
156
pdfplumber.repair("corrupted.pdf", outfile="repaired.pdf")
157
158
# Repair with specific quality setting
159
pdfplumber.repair("corrupted.pdf", outfile="high_quality.pdf", setting="prepress")
160
161
# Repair encrypted PDF
162
pdfplumber.repair("encrypted_corrupted.pdf", password="secret", outfile="repaired.pdf")
163
```
164
165
### Container Base Class
166
167
Base class providing object property access and serialization methods inherited by PDF and Page classes.
168
169
```python { .api }
170
class Container:
171
"""Base container with object access and serialization."""
172
173
@property
174
def rects(self) -> T_obj_list:
175
"""Rectangle objects."""
176
177
@property
178
def lines(self) -> T_obj_list:
179
"""Line objects."""
180
181
@property
182
def curves(self) -> T_obj_list:
183
"""Curve objects."""
184
185
@property
186
def images(self) -> T_obj_list:
187
"""Image objects."""
188
189
@property
190
def chars(self) -> T_obj_list:
191
"""Character objects."""
192
193
@property
194
def textboxverticals(self) -> T_obj_list:
195
"""Vertical text box objects."""
196
197
@property
198
def textboxhorizontals(self) -> T_obj_list:
199
"""Horizontal text box objects."""
200
201
@property
202
def textlineverticals(self) -> T_obj_list:
203
"""Vertical text line objects."""
204
205
@property
206
def textlinehorizontals(self) -> T_obj_list:
207
"""Horizontal text line objects."""
208
209
@property
210
def rect_edges(self) -> T_obj_list:
211
"""Edges derived from rectangles."""
212
213
@property
214
def curve_edges(self) -> T_obj_list:
215
"""Edges derived from curves."""
216
217
@property
218
def edges(self) -> T_obj_list:
219
"""All edges (lines + rect_edges + curve_edges)."""
220
221
@property
222
def horizontal_edges(self) -> T_obj_list:
223
"""Horizontal edges only."""
224
225
@property
226
def vertical_edges(self) -> T_obj_list:
227
"""Vertical edges only."""
228
229
def flush_cache(self, properties=None):
230
"""Clear cached properties."""
231
232
def to_json(self, stream=None, object_types=None, include_attrs=None,
233
exclude_attrs=None, precision=None, indent=None):
234
"""Export as JSON."""
235
236
def to_csv(self, stream=None, object_types=None, precision=None,
237
include_attrs=None, exclude_attrs=None):
238
"""Export as CSV."""
239
240
def to_dict(self, object_types=None):
241
"""Convert to dictionary representation."""
242
```
243
244
## Error Handling
245
246
```python { .api }
247
# Custom exceptions for PDF operations
248
class MalformedPDFException(Exception):
249
"""Raised for malformed PDF files."""
250
251
class PdfminerException(Exception):
252
"""Wrapper for pdfminer exceptions."""
253
```
254
255
Common error scenarios:
256
- Invalid PDF files raise `MalformedPDFException`
257
- Repair operations raise a generic (non-pdfplumber) exception when the Ghostscript executable cannot be found
258
- Encrypted PDFs without password raise pdfminer exceptions
259
- Unicode decoding errors are raised when `raise_unicode_errors=True`