or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cli.md · index.md · page-manipulation.md · pdf-operations.md · table-extraction.md · text-extraction.md · utilities.md · visual-debugging.md

docs/pdf-operations.md

0

# PDF Document Operations

1

2

Core functionality for opening, accessing, and managing PDF documents including metadata extraction, page access, document-level operations, and PDF repair capabilities.

3

4

## Capabilities

5

6

### Opening PDF Documents

7

8

The primary function for opening PDF documents from file paths, file streams, or in-memory byte streams (`BytesIO`), with comprehensive configuration options.

9

10

```python { .api }

11

def open(path_or_fp, pages=None, laparams=None, password=None,

12

strict_metadata=False, unicode_norm=None, repair=False,

13

gs_path=None, repair_setting="default", raise_unicode_errors=True):

14

"""

15

Open PDF document from file path or stream.

16

17

Parameters:

18

- path_or_fp: str, pathlib.Path, BufferedReader, or BytesIO - PDF source

19

- pages: List[int] or Tuple[int], optional - Specific pages to parse, given as 1-indexed page numbers

20

- laparams: Dict[str, Any], optional - Layout analysis parameters

21

- password: str, optional - PDF password for encrypted documents

22

- strict_metadata: bool - Raise errors for malformed metadata

23

- unicode_norm: str, optional - Unicode normalization ("NFC", "NFKC", "NFD", "NFKD")

24

- repair: bool - Attempt PDF repair using Ghostscript

25

- gs_path: str or pathlib.Path, optional - Path to Ghostscript executable

26

- repair_setting: str - Repair quality setting ("default", "prepress", "printer", "ebook", "screen")

27

- raise_unicode_errors: bool - Raise errors for unicode decoding issues

28

29

Returns:

30

PDF object with context manager support

31

"""

32

```

33

34

**Usage Examples:**

35

36

```python

37

# Open from file path

38

with pdfplumber.open("document.pdf") as pdf:

39

print(f"Document has {len(pdf.pages)} pages")

40

41

# Open specific pages only

42

with pdfplumber.open("large_doc.pdf", pages=[1, 2, 6]) as pdf:

43

for page in pdf.pages:

44

print(f"Page {page.page_number}: {page.extract_text()[:100]}")

45

46

# Open encrypted PDF

47

with pdfplumber.open("encrypted.pdf", password="secret") as pdf:

48

text = pdf.pages[0].extract_text()

49

50

# Open with repair for corrupted PDFs

51

with pdfplumber.open("corrupted.pdf", repair=True) as pdf:

52

text = pdf.pages[0].extract_text()

53

```

54

55

### PDF Class

56

57

The main PDF document class providing access to pages, metadata, and document-level operations.

58

59

```python { .api }

60

class PDF:

61

"""PDF document container with page access and metadata."""

62

63

def __init__(self, stream, stream_is_external=False, path=None,

64

pages=None, laparams=None, password=None,

65

strict_metadata=False, unicode_norm=None,

66

raise_unicode_errors=True):

67

"""Initialize PDF object from stream."""

68

69

@property

70

def pages(self) -> List[Page]:

71

"""List of page objects in document."""

72

73

@property

74

def objects(self) -> Dict[str, T_obj_list]:

75

"""All objects aggregated from all pages by type."""

76

77

@property

78

def annots(self) -> List[Dict[str, Any]]:

79

"""All annotations from all pages."""

80

81

@property

82

def hyperlinks(self) -> List[Dict[str, Any]]:

83

"""All hyperlinks from all pages."""

84

85

@property

86

def structure_tree(self) -> List[Dict[str, Any]]:

87

"""Document structure tree for accessibility."""

88

89

metadata: Dict

90

"""PDF metadata dictionary (instance variable)."""

91

92

def close(self):

93

"""Close PDF and cleanup resources."""

94

95

def __enter__(self):

96

"""Context manager entry."""

97

98

def __exit__(self, exc_type, exc_val, exc_tb):

99

"""Context manager exit with cleanup."""

100

```

101

102

**Usage Examples:**

103

104

```python

105

# Access document metadata

106

pdf = pdfplumber.open("document.pdf")

107

print(f"Title: {pdf.metadata.get('Title', 'No title')}")

108

print(f"Author: {pdf.metadata.get('Author', 'Unknown')}")

109

print(f"Created: {pdf.metadata.get('CreationDate', 'Unknown')}")

110

111

# Get all character objects from the document

112

all_chars = pdf.objects.get('chars', [])

113

print(f"Document contains {len(all_chars)} character objects")

114

115

# Access document-level annotations

116

for annot in pdf.annots:

117

print(f"Annotation: {annot.get('contents', 'No content')}")

118

119

pdf.close()

120

```

121

122

### PDF Repair

123

124

Repair corrupted or malformed PDF documents using Ghostscript with various quality settings.

125

126

```python { .api }

127

def repair(path_or_fp, outfile=None, password=None, gs_path=None,

128

setting="default"):

129

"""

130

Repair PDF using Ghostscript.

131

132

Parameters:

133

- path_or_fp: str, pathlib.Path, BufferedReader, or BytesIO - PDF source

134

- outfile: str or pathlib.Path, optional - Output file path

135

- password: str, optional - PDF password

136

- gs_path: str or pathlib.Path, optional - Path to Ghostscript executable

137

- setting: str - Quality setting ("default", "prepress", "printer", "ebook", "screen")

138

139

Returns:

140

BytesIO containing repaired PDF data

141

"""

142

143

# Repair setting type

144

T_repair_setting = Literal["default", "prepress", "printer", "ebook", "screen"]

145

```

146

147

**Usage Examples:**

148

149

```python

150

# Repair PDF to memory

151

repaired_data = pdfplumber.repair("corrupted.pdf")

152

with pdfplumber.open(repaired_data) as pdf:

153

text = pdf.pages[0].extract_text()

154

155

# Repair PDF to file

156

pdfplumber.repair("corrupted.pdf", outfile="repaired.pdf")

157

158

# Repair with specific quality setting

159

pdfplumber.repair("corrupted.pdf", outfile="high_quality.pdf", setting="prepress")

160

161

# Repair encrypted PDF

162

pdfplumber.repair("encrypted_corrupted.pdf", password="secret", outfile="repaired.pdf")

163

```

164

165

### Container Base Class

166

167

Base class, inherited by the PDF and Page classes, that provides object-property access and serialization methods.

168

169

```python { .api }

170

class Container:

171

"""Base container with object access and serialization."""

172

173

@property

174

def rects(self) -> T_obj_list:

175

"""Rectangle objects."""

176

177

@property

178

def lines(self) -> T_obj_list:

179

"""Line objects."""

180

181

@property

182

def curves(self) -> T_obj_list:

183

"""Curve objects."""

184

185

@property

186

def images(self) -> T_obj_list:

187

"""Image objects."""

188

189

@property

190

def chars(self) -> T_obj_list:

191

"""Character objects."""

192

193

@property

194

def textboxverticals(self) -> T_obj_list:

195

"""Vertical text box objects."""

196

197

@property

198

def textboxhorizontals(self) -> T_obj_list:

199

"""Horizontal text box objects."""

200

201

@property

202

def textlineverticals(self) -> T_obj_list:

203

"""Vertical text line objects."""

204

205

@property

206

def textlinehorizontals(self) -> T_obj_list:

207

"""Horizontal text line objects."""

208

209

@property

210

def rect_edges(self) -> T_obj_list:

211

"""Edges derived from rectangles."""

212

213

@property

214

def curve_edges(self) -> T_obj_list:

215

"""Edges derived from curves."""

216

217

@property

218

def edges(self) -> T_obj_list:

219

"""All edges (lines + rect_edges + curve_edges)."""

220

221

@property

222

def horizontal_edges(self) -> T_obj_list:

223

"""Horizontal edges only."""

224

225

@property

226

def vertical_edges(self) -> T_obj_list:

227

"""Vertical edges only."""

228

229

def flush_cache(self, properties=None):

230

"""Clear cached properties."""

231

232

def to_json(self, stream=None, object_types=None, include_attrs=None,

233

exclude_attrs=None, precision=None, indent=None):

234

"""Export as JSON."""

235

236

def to_csv(self, stream=None, object_types=None, precision=None,

237

include_attrs=None, exclude_attrs=None):

238

"""Export as CSV."""

239

240

def to_dict(self, object_types=None):

241

"""Convert to dictionary representation."""

242

```

243

244

## Error Handling

245

246

```python { .api }

247

# Custom exceptions for PDF operations

248

class MalformedPDFException(Exception):

249

"""Raised for malformed PDF files."""

250

251

class PdfminerException(Exception):

252

"""Wrapper for pdfminer exceptions."""

253

```

254

255

Common error scenarios:

256

- Invalid PDF files raise `MalformedPDFException`

257

- Missing Ghostscript for repair operations raises standard exceptions

258

- Encrypted PDFs without password raise pdfminer exceptions

259

- Unicode decoding errors are raised when `raise_unicode_errors=True` (the default)