or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

annotations.md form-fields.md index.md metadata.md page-operations.md reading-writing.md text-extraction.md utilities.md

docs/text-extraction.md

0

# Text Extraction

1

2

Advanced text extraction capabilities with multiple extraction modes, layout preservation, and customizable text processing options. pypdf provides sophisticated text extraction that can handle complex PDF layouts while maintaining readability.

3

4

## Capabilities

5

6

### Text Extraction Methods

7

8

Extract text from PDF pages with various modes and customization options to handle different document types and layout requirements.

9

10

```python { .api }

11

def extract_text(

12

self,

13

orientations: tuple | int = (0, 90, 180, 270),

14

space_width: float = 200.0,

15

visitor_operand_before=None,

16

visitor_operand_after=None,

17

visitor_text=None,

18

extraction_mode: str = "plain"

19

) -> str:

20

"""

21

Extract text from the page with advanced options.

22

23

Args:

24

orientations: Text orientations to consider in degrees (default: (0, 90, 180, 270))

25

space_width: Minimum width threshold for inserting spaces (default: 200.0)

26

visitor_operand_before: Callback function called before processing operands

27

visitor_operand_after: Callback function called after processing operands

28

visitor_text: Custom text visitor function for advanced processing

29

extraction_mode: Text extraction mode ("plain" or "layout", default: "plain")

30

- "plain": Simple text extraction without layout preservation (default)

31

- "layout": Preserves spatial layout and formatting

32

33

Returns:

34

Extracted text as string

35

"""

36

```

37

38

### Text Visitor Functions

39

40

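Custom text processing through visitor functions for advanced text extraction scenarios. Visitor callbacks are called for their side effects: pypdf does not use their return values, so a visitor that selects text must store it itself (for example, by appending to a list).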

41

42

```python { .api }

43

def mult(m: list[float], n: list[float]) -> list[float]:
    """
    Matrix multiplication utility for text transformation calculations.

    NOTE(review): the layout of the flat lists is not shown here. Presumably
    both arguments are 6-element PDF transformation matrices
    [a, b, c, d, e, f], the same form visitors receive as ``cm`` and ``tm``.
    Confirm against the pypdf source before relying on it.

    Args:
        m: First matrix as list of floats
        n: Second matrix as list of floats

    Returns:
        Result of matrix multiplication
    """

54

```

55

56

## Usage Examples

57

58

### Basic Text Extraction

59

60

```python

61

from pypdf import PdfReader

62

63

reader = PdfReader("document.pdf")

# Text of the first page only
first_page = reader.pages[0]
print(first_page.extract_text())

# Text of every page; each page is followed by a blank-line separator
full_text = "".join(f"{page.extract_text()}\n\n" for page in reader.pages)
print(full_text)

77

```

78

79

### Layout-Preserving Extraction

80

81

```python

82

from pypdf import PdfReader

83

84

reader = PdfReader("formatted_document.pdf")

for page_num, page in enumerate(reader.pages, start=1):
    # Layout mode has to be requested explicitly; the default mode is "plain".
    # The two layout_mode_* values below are pypdf's defaults, spelled out
    # only to show where they can be tuned.
    layout_text = page.extract_text(
        extraction_mode="layout",
        layout_mode_space_vertically=True,
        layout_mode_scale_weight=1.25,
    )

    print(f"Page {page_num}:")
    print(layout_text)
    print("-" * 50)

97

```

98

99

### Plain Text Extraction

100

101

```python

102

from pypdf import PdfReader

103

104

reader = PdfReader("document.pdf")

# "plain" mode returns the text without trying to reproduce the page layout
for page in reader.pages:
    print(page.extract_text(extraction_mode="plain"))

110

```

111

112

### Handling Rotated Text

113

114

```python

115

from pypdf import PdfReader

116

117

reader = PdfReader("rotated_content.pdf")

for page in reader.pages:
    # In plain mode the orientations argument selects which text is kept.
    # The layout_mode_* options are only read when extraction_mode="layout",
    # so they are not passed here; in layout mode use
    # layout_mode_strip_rotated=False to keep rotated text.
    text_all_orientations = page.extract_text(orientations=(0, 90, 180, 270))

    # Only horizontal (0 degree) text
    text_horizontal_only = page.extract_text(orientations=(0,))

    print("All orientations:")
    print(text_all_orientations)
    print("\nHorizontal only:")
    print(text_horizontal_only)

136

```

137

138

### Custom Space Width Handling

139

140

```python

141

from pypdf import PdfReader

142

143

reader = PdfReader("document.pdf")

for page in reader.pages:
    # space_width is the space width pypdf assumes when the font does not
    # provide one, so it only matters for such fonts. Compare a smaller and
    # a larger value to see which separates words best for a given document.
    narrow_space_text = page.extract_text(space_width=100.0)
    wide_space_text = page.extract_text(space_width=300.0)

    print("space_width=100.0:")
    print(narrow_space_text[:200], "...")
    print("\nspace_width=300.0:")
    print(wide_space_text[:200], "...")

156

```

157

158

### Advanced Text Processing with Visitor

159

160

```python

161

from pypdf import PdfReader

162

163

def custom_text_visitor(text, cm, tm, font_dict, font_size, min_font_size=12):
    """
    Select a text fragment by font size.

    pypdf calls a ``visitor_text`` callback with the first five arguments and
    does not use its return value. To filter extracted text with this
    function, call it from a visitor that stores the result (for example by
    appending it to a list) and join the stored fragments afterwards.

    Args:
        text: Extracted text
        cm: Current transformation matrix
        tm: Text matrix
        font_dict: Font dictionary
        font_size: Font size
        min_font_size: Smallest font size to keep (default: 12)

    Returns:
        ``text`` when ``font_size`` is at least ``min_font_size``, otherwise
        an empty string.
    """
    # Example: keep only text set at 12pt or larger (with the default threshold)
    return text if font_size >= min_font_size else ""

178

179

def extract_large_text(page):
    """Return the fragments of ``page`` that custom_text_visitor keeps."""
    kept_fragments = []

    def collect(text, cm, tm, font_dict, font_size):
        # extract_text() ignores what a visitor returns and always returns
        # the full page text, so the selected fragments are stored here.
        kept_fragments.append(custom_text_visitor(text, cm, tm, font_dict, font_size))

    page.extract_text(visitor_text=collect)
    return "".join(kept_fragments)


reader = PdfReader("document.pdf")

for page in reader.pages:
    # Extract only large text
    large_text_only = extract_large_text(page)
    print(large_text_only)

185

```

186

187

### Extracting Text from Specific Regions

188

189

```python

190

from pypdf import PdfReader, PageObject

191

192

def extract_text_from_region(page: PageObject, x1: float, y1: float, x2: float, y2: float) -> str:
    """
    Extract text from a specific rectangular region of a page.

    Changing the crop box only changes which part of a page is displayed; it
    does not limit what extract_text() reads from the content stream. The
    region is therefore applied with a text visitor that keeps the fragments
    whose starting position lies inside the rectangle. The page itself is not
    modified.

    Args:
        page: PageObject to extract from
        x1, y1: Bottom-left coordinates
        x2, y2: Top-right coordinates

    Returns:
        Extracted text from the region
    """
    # Accept the corners in either order
    left, right = sorted((x1, x2))
    bottom, top = sorted((y1, y2))
    fragments = []

    def keep_if_inside(text, cm, tm, font_dict, font_size):
        # Map the text-space origin (tm[4], tm[5]) through the current
        # transformation matrix to get the position in page coordinates.
        # NOTE(review): assumes cm and tm are the raw 6-element matrices
        # [a, b, c, d, e, f]; verify against the installed pypdf version.
        x = tm[4] * cm[0] + tm[5] * cm[2] + cm[4]
        y = tm[4] * cm[1] + tm[5] * cm[3] + cm[5]
        if left <= x <= right and bottom <= y <= top:
            fragments.append(text)

    # The return value is the whole page text; only the collected fragments are used
    page.extract_text(visitor_text=keep_if_inside)
    return "".join(fragments)

218

219

reader = PdfReader("document.pdf")
page = reader.pages[0]

# Page size as reported by the media box
width, height = float(page.mediabox.width), float(page.mediabox.height)

# Top-left quarter: left half horizontally, upper half vertically
half_width = width / 2
half_height = height / 2
top_left_text = extract_text_from_region(page, 0, half_height, half_width, height)

print("Top-left quarter text:")
print(top_left_text)

231

```

232

233

### Text Extraction with Error Handling

234

235

```python

236

from pypdf import PdfReader

237

from pypdf.errors import PdfReadError, PdfStreamError

238

239

def safe_extract_text(pdf_path: str) -> list[str]:
    """
    Safely extract text from all pages with error handling.

    Problems are reported with print() rather than raised. A file that cannot
    be opened yields an empty list. A page that cannot be processed yields an
    empty string at its position, so the result keeps one entry per page.

    Args:
        pdf_path: Path to PDF file

    Returns:
        List of extracted text strings (one per page)
    """
    try:
        # Materialise the page list here so a broken page tree is reported
        # as a problem with the file, not with an individual page.
        pages = list(PdfReader(pdf_path).pages)
    except Exception as e:
        print(f"Error opening PDF {pdf_path}: {e}")
        return []

    texts = []
    for page_num, page in enumerate(pages, start=1):
        try:
            texts.append(page.extract_text())
        except Exception as e:
            # Best-effort boundary: any failure on one page, not only pypdf's
            # own error types, must not abort the remaining pages.
            print(f"Error extracting text from page {page_num}: {e}")
            texts.append("")  # Empty string for failed pages

    return texts

266

267

# Extract text safely and summarise the outcome for each page
page_texts = safe_extract_text("problematic.pdf")
for page_number, text in enumerate(page_texts, start=1):
    if text:
        print(f"Page {page_number}: {len(text)} characters extracted")
    else:
        # An empty string means extraction failed or the page simply has no
        # extractable text, so the message does not claim a failure.
        print(f"Page {page_number}: No text extracted")

274

```

275

276

### Batch Text Extraction

277

278

```python

279

from pypdf import PdfReader

280

import os

281

from pathlib import Path

282

283

def extract_text_from_directory(directory_path: str, output_dir: str = None) -> dict[str, str]:

284

"""

285

Extract text from all PDF files in a directory.

286

287

Args:

288

directory_path: Directory containing PDF files

289

output_dir: Optional directory to save text files

290

291

Returns:

292

Dictionary mapping PDF filenames to extracted text

293

"""

294

pdf_texts = {}

295

296

for file_path in Path(directory_path).glob("*.pdf"):

297

try:

298

reader = PdfReader(str(file_path))

299

300

# Extract all text

301

full_text = ""

302

for page in reader.pages:

303

full_text += page.extract_text()

304

full_text += "\n\n"

305

306

pdf_texts[file_path.name] = full_text

307

308

# Optionally save to text file

309

if output_dir:

310

output_path = Path(output_dir) / f"{file_path.stem}.txt"

311

output_path.parent.mkdir(parents=True, exist_ok=True)

312

output_path.write_text(full_text, encoding='utf-8')

313

314

except Exception as e:

315

print(f"Error processing {file_path.name}: {e}")

316

pdf_texts[file_path.name] = ""

317

318

return pdf_texts

319

320

# Run the batch extraction and report how many PDFs were handled
texts = extract_text_from_directory("pdf_documents/", "extracted_text/")
processed_count = len(texts)
print(f"Processed {processed_count} PDF files")

323

```