Tessl Tile for pypi/pdfplumber@0.11.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

cli.md index.md page-manipulation.md pdf-operations.md table-extraction.md text-extraction.md utilities.md visual-debugging.md

pdf-operations.md docs/

0
# PDF Document Operations
1

2
Core functionality for opening, accessing, and managing PDF documents including metadata extraction, page access, document-level operations, and PDF repair capabilities.
3

4
## Capabilities
5

6
### Opening PDF Documents
7

8
The primary function for opening PDF documents from file paths or file-like objects (such as an open binary file or a `BytesIO` buffer), with comprehensive configuration options.
9

10
```python { .api }
11
def open(path_or_fp, pages=None, laparams=None, password=None, 
12
         strict_metadata=False, unicode_norm=None, repair=False, 
13
         gs_path=None, repair_setting="default", raise_unicode_errors=True):
14
    """
15
    Open PDF document from file path or stream.
16
    
17
    Parameters:
18
    - path_or_fp: str, pathlib.Path, BufferedReader, or BytesIO - PDF source
19
    - pages: List[int] or Tuple[int], optional - Specific pages to parse (1-indexed page numbers)
20
    - laparams: Dict[str, Any], optional - Layout analysis parameters
21
    - password: str, optional - PDF password for encrypted documents
22
    - strict_metadata: bool - Raise errors for malformed metadata
23
    - unicode_norm: str, optional - Unicode normalization ("NFC", "NFKC", "NFD", "NFKD")
24
    - repair: bool - Attempt PDF repair using Ghostscript
25
    - gs_path: str or pathlib.Path, optional - Path to Ghostscript executable
26
    - repair_setting: str - Repair quality setting ("default", "prepress", "printer", "ebook", "screen")
27
    - raise_unicode_errors: bool - Raise errors for unicode decoding issues
28
    
29
    Returns:
30
    PDF object with context manager support
31
    """
32
```
33

34
**Usage Examples:**
35

36
```python
37
# Open from file path
38
with pdfplumber.open("document.pdf") as pdf:
39
    print(f"Document has {len(pdf.pages)} pages")
40

41
# Open specific pages only
42
with pdfplumber.open("large_doc.pdf", pages=[0, 1, 5]) as pdf:
43
    for page in pdf.pages:
44
        print(f"Page {page.page_number}: {page.extract_text()[:100]}")
45

46
# Open encrypted PDF
47
with pdfplumber.open("encrypted.pdf", password="secret") as pdf:
48
    text = pdf.pages[0].extract_text()
49

50
# Open with repair for corrupted PDFs
51
with pdfplumber.open("corrupted.pdf", repair=True) as pdf:
52
    text = pdf.pages[0].extract_text()
53
```
54

55
### PDF Class
56

57
The main PDF document class providing access to pages, metadata, and document-level operations.
58

59
```python { .api }
60
class PDF:
61
    """PDF document container with page access and metadata."""
62
    
63
    def __init__(self, stream, stream_is_external=False, path=None, 
64
                 pages=None, laparams=None, password=None, 
65
                 strict_metadata=False, unicode_norm=None, 
66
                 raise_unicode_errors=True):
67
        """Initialize PDF object from stream."""
68
        
69
    @property
70
    def pages(self) -> List[Page]:
71
        """List of page objects in document."""
72
    
73
    @property
74
    def objects(self) -> Dict[str, T_obj_list]:
75
        """All objects aggregated from all pages by type."""
76
    
77
    @property
78
    def annots(self) -> List[Dict[str, Any]]:
79
        """All annotations from all pages."""
80
    
81
    @property
82
    def hyperlinks(self) -> List[Dict[str, Any]]:
83
        """All hyperlinks from all pages."""
84
    
85
    @property
86
    def structure_tree(self) -> List[Dict[str, Any]]:
87
        """Document structure tree for accessibility."""
88
    
89
    metadata: Dict
90
    """PDF metadata dictionary (instance variable)."""
91
    
92
    def close(self):
93
        """Close PDF and cleanup resources."""
94
    
95
    def __enter__(self):
96
        """Context manager entry."""
97
    
98
    def __exit__(self, exc_type, exc_val, exc_tb):
99
        """Context manager exit with cleanup."""
100
```
101

102
**Usage Examples:**
103

104
```python
105
# Access document metadata
106
pdf = pdfplumber.open("document.pdf")
107
print(f"Title: {pdf.metadata.get('Title', 'No title')}")
108
print(f"Author: {pdf.metadata.get('Author', 'Unknown')}")
109
print(f"Created: {pdf.metadata.get('CreationDate', 'Unknown')}")
110

111
# Get all text objects from document
112
all_chars = pdf.objects.get('chars', [])
113
print(f"Document contains {len(all_chars)} character objects")
114

115
# Access document-level annotations
116
for annot in pdf.annots:
117
    print(f"Annotation: {annot.get('contents', 'No content')}")
118
    
119
pdf.close()
120
```
121

122
### PDF Repair
123

124
Repair corrupted or malformed PDF documents using Ghostscript with various quality settings.
125

126
```python { .api }
127
def repair(path_or_fp, outfile=None, password=None, gs_path=None, 
128
           setting="default"):
129
    """
130
    Repair PDF using Ghostscript.
131
    
132
    Parameters:
133
    - path_or_fp: str, pathlib.Path, BufferedReader, or BytesIO - PDF source
134
    - outfile: str or pathlib.Path, optional - Output file path
135
    - password: str, optional - PDF password
136
    - gs_path: str or pathlib.Path, optional - Path to Ghostscript executable
137
    - setting: str - Quality setting ("default", "prepress", "printer", "ebook", "screen")
138
    
139
    Returns:
140
    BytesIO containing repaired PDF data, or None when outfile is given (the repaired data is written to that file instead)
141
    """
142

143
# Repair setting type
144
T_repair_setting = Literal["default", "prepress", "printer", "ebook", "screen"]
145
```
146

147
**Usage Examples:**
148

149
```python
150
# Repair PDF to memory
151
repaired_data = pdfplumber.repair("corrupted.pdf")
152
with pdfplumber.open(repaired_data) as pdf:
153
    text = pdf.pages[0].extract_text()
154

155
# Repair PDF to file
156
pdfplumber.repair("corrupted.pdf", outfile="repaired.pdf")
157

158
# Repair with specific quality setting
159
pdfplumber.repair("corrupted.pdf", outfile="high_quality.pdf", setting="prepress")
160

161
# Repair encrypted PDF
162
pdfplumber.repair("encrypted_corrupted.pdf", password="secret", outfile="repaired.pdf")
163
```
164

165
### Container Base Class
166

167
Base class providing object property access and serialization methods inherited by PDF and Page classes.
168

169
```python { .api }
170
class Container:
171
    """Base container with object access and serialization."""
172
    
173
    @property
174
    def rects(self) -> T_obj_list:
175
        """Rectangle objects."""
176
    
177
    @property
178
    def lines(self) -> T_obj_list:
179
        """Line objects."""
180
    
181
    @property
182
    def curves(self) -> T_obj_list:
183
        """Curve objects."""
184
    
185
    @property 
186
    def images(self) -> T_obj_list:
187
        """Image objects."""
188
    
189
    @property
190
    def chars(self) -> T_obj_list:
191
        """Character objects."""
192
    
193
    @property
194
    def textboxverticals(self) -> T_obj_list:
195
        """Vertical text box objects."""
196
    
197
    @property
198
    def textboxhorizontals(self) -> T_obj_list:
199
        """Horizontal text box objects."""
200
    
201
    @property
202
    def textlineverticals(self) -> T_obj_list:
203
        """Vertical text line objects."""
204
    
205
    @property
206
    def textlinehorizontals(self) -> T_obj_list:
207
        """Horizontal text line objects."""
208
    
209
    @property
210
    def rect_edges(self) -> T_obj_list:
211
        """Edges derived from rectangles."""
212
    
213
    @property
214
    def curve_edges(self) -> T_obj_list:
215
        """Edges derived from curves."""
216
    
217
    @property
218
    def edges(self) -> T_obj_list:
219
        """All edges (lines + rect_edges + curve_edges)."""
220
    
221
    @property
222
    def horizontal_edges(self) -> T_obj_list:
223
        """Horizontal edges only."""
224
    
225
    @property
226
    def vertical_edges(self) -> T_obj_list:
227
        """Vertical edges only."""
228
    
229
    def flush_cache(self, properties=None):
230
        """Clear cached properties."""
231
    
232
    def to_json(self, stream=None, object_types=None, include_attrs=None, 
233
                exclude_attrs=None, precision=None, indent=None):
234
        """Export as JSON."""
235
    
236
    def to_csv(self, stream=None, object_types=None, precision=None, 
237
               include_attrs=None, exclude_attrs=None):
238
        """Export as CSV."""
239
    
240
    def to_dict(self, object_types=None):
241
        """Convert to dictionary representation."""
242
```
243

244
## Error Handling
245

246
```python { .api }
247
# Custom exceptions for PDF operations
248
class MalformedPDFException(Exception):
249
    """Raised for malformed PDF files."""
250

251
class PdfminerException(Exception):
252
    """Wrapper for pdfminer exceptions."""
253
```
254

255
Common error scenarios:
256
- Invalid PDF files raise `MalformedPDFException`
257
- Missing Ghostscript for repair operations raises standard exceptions
258
- Encrypted PDFs without password raise pdfminer exceptions
259
- Unicode decoding errors are raised when `raise_unicode_errors=True`

Version

Tile

Files

pdf-operations.md docs/

pdf-operations.md docs/