0
# Text Extraction
1
2
Advanced text extraction capabilities with multiple extraction modes, layout preservation, and customizable text processing options. pypdf provides sophisticated text extraction that can handle complex PDF layouts while maintaining readability.
3
4
## Capabilities
5
6
### Text Extraction Methods
7
8
Extract text from PDF pages with various modes and customization options to handle different document types and layout requirements.
9
10
```python { .api }
11
def extract_text(
12
self,
13
orientations: tuple | int = (0, 90, 180, 270),
14
space_width: float = 200.0,
15
visitor_operand_before=None,
16
visitor_operand_after=None,
17
visitor_text=None,
18
extraction_mode: str = "plain"
19
) -> str:
20
"""
21
Extract text from the page with advanced options.
22
23
Args:
24
orientations: Text orientations to consider in degrees (default: (0, 90, 180, 270))
25
space_width: Minimum width threshold for inserting spaces (default: 200.0)
26
visitor_operand_before: Callback function called before processing operands
27
visitor_operand_after: Callback function called after processing operands
28
visitor_text: Custom text visitor function for advanced processing
29
extraction_mode: Text extraction mode ("plain" or "layout", default: "plain")
30
- "plain": Simple text extraction without layout preservation (default)
31
- "layout": Preserves spatial layout and formatting
32
33
Returns:
34
Extracted text as string
35
"""
36
```
37
38
### Text Visitor Functions
39
40
Helpers for custom text processing in visitor functions. Visitor callbacks receive transformation matrices, and `mult` combines two of them.
41
42
```python { .api }
43
def mult(m: list[float], n: list[float]) -> list[float]:
    """
    Matrix multiplication utility for text transformation calculations.

    Typically used to combine a text matrix with a current transformation
    matrix inside a visitor function.

    Args:
        m: First (left-hand) matrix as a flat list of floats
        n: Second (right-hand) matrix as a flat list of floats

    Returns:
        Result of the matrix multiplication m x n as a flat list of floats
    """
54
```
55
56
## Usage Examples
57
58
### Basic Text Extraction
59
60
```python
61
from pypdf import PdfReader

reader = PdfReader("document.pdf")

# Extract text from first page
page = reader.pages[0]
text = page.extract_text()
print(text)

# Extract text from all pages; each page's text is followed by a blank line.
# str.join avoids repeatedly re-copying a growing string.
full_text = "".join(page.extract_text() + "\n\n" for page in reader.pages)

print(full_text)
77
```
78
79
### Layout-Preserving Extraction
80
81
```python
82
from pypdf import PdfReader

reader = PdfReader("formatted_document.pdf")

for page_num, page in enumerate(reader.pages, start=1):
    # Layout mode must be requested explicitly; the default mode is "plain".
    layout_text = page.extract_text(
        extraction_mode="layout",
        layout_mode_space_vertically=True,
        layout_mode_scale_weight=1.25,
    )

    print(f"Page {page_num}:")
    print(layout_text)
    print("-" * 50)
97
```
98
99
### Plain Text Extraction
100
101
```python
102
from pypdf import PdfReader

reader = PdfReader("document.pdf")

for page in reader.pages:
    # Simple text extraction without layout ("plain" is also the default mode)
    plain_text = page.extract_text(extraction_mode="plain")
    print(plain_text)
110
```
111
112
### Handling Rotated Text
113
114
```python
115
from pypdf import PdfReader

reader = PdfReader("rotated_content.pdf")

for page in reader.pages:
    # Plain mode: `orientations` selects which text rotations are extracted.
    # Include all text orientations
    text_all_orientations = page.extract_text(orientations=(0, 90, 180, 270))

    # Only horizontal text
    text_horizontal_only = page.extract_text(orientations=(0,))

    print("All orientations:")
    print(text_all_orientations)
    print("\nHorizontal only:")
    print(text_horizontal_only)

    # Layout mode: `layout_mode_strip_rotated` is a layout-mode option and has
    # no effect in plain mode. Rotated text is dropped by default in layout
    # mode; pass False to keep it.
    layout_with_rotated = page.extract_text(
        extraction_mode="layout",
        layout_mode_strip_rotated=False,
    )

    print("\nLayout mode (rotated text kept):")
    print(layout_with_rotated)
136
```
137
138
### Custom Space Width Handling
139
140
```python
141
from pypdf import PdfReader

reader = PdfReader("document.pdf")

for page in reader.pages:
    # space_width is the fallback width of a space character, used when the
    # font does not define one. Gaps between text fragments are compared
    # against it, so a smaller value lets smaller gaps count as spaces
    # (more spaces inserted) and a larger value requires wider gaps
    # (fewer spaces inserted).
    narrow_space = page.extract_text(space_width=100.0)
    wide_space = page.extract_text(space_width=300.0)

    print("space_width=100.0:")
    print(narrow_space[:200], "...")
    print("\nspace_width=300.0:")
    print(wide_space[:200], "...")
156
```
157
158
### Advanced Text Processing with Visitor
159
160
```python
161
from pypdf import PdfReader

# Fragments accepted by the visitor for the page currently being processed.
large_text_parts = []


def custom_text_visitor(text, cm, tm, font_dict, font_size):
    """
    Custom text visitor function for advanced text processing.

    The return value of a visitor is ignored by extract_text(), so the
    fragments of interest are collected in ``large_text_parts`` instead.

    Args:
        text: Extracted text
        cm: Current transformation matrix
        tm: Text matrix
        font_dict: Font dictionary
        font_size: Font size
    """
    # Example: Only keep text that is 12pt or larger
    if font_size >= 12:
        large_text_parts.append(text)


reader = PdfReader("document.pdf")

for page in reader.pages:
    # Start from an empty collection for every page
    large_text_parts.clear()

    # The visitor is called once per text fragment while the page is processed;
    # the string returned by extract_text() itself is not filtered.
    page.extract_text(visitor_text=custom_text_visitor)

    # Extract only large text
    large_text_only = "".join(large_text_parts)
    print(large_text_only)
185
```
186
187
### Extracting Text from Specific Regions
188
189
```python
190
from pypdf import PdfReader, PageObject
191
192
def extract_text_from_region(page: PageObject, x1: float, y1: float, x2: float, y2: float) -> str:
    """
    Extract text from a specific rectangular region of a page.

    Text is filtered by position with a visitor function. Changing the
    page's cropbox does not restrict what extract_text() returns, and this
    approach also leaves the page object unmodified.

    Args:
        page: PageObject to extract from
        x1, y1: Bottom-left coordinates
        x2, y2: Top-right coordinates

    Returns:
        Extracted text from the region (empty string if nothing falls inside)
    """
    # Tolerate corners given in either order
    left, right = sorted((x1, x2))
    bottom, top = sorted((y1, y2))

    region_parts = []

    def collect_if_inside(text, cm, tm, font_dict, font_size):
        # Position of the fragment: the text matrix translation mapped through
        # the current transformation matrix (both are [a, b, c, d, e, f]).
        # NOTE(review): fragments are kept or dropped as a whole, based on
        # where they start - confirm this granularity suits your documents.
        x = tm[4] * cm[0] + tm[5] * cm[2] + cm[4]
        y = tm[4] * cm[1] + tm[5] * cm[3] + cm[5]
        if left <= x <= right and bottom <= y <= top:
            region_parts.append(text)

    # The return value holds the whole page's text; only the fragments the
    # visitor accepted are of interest here.
    page.extract_text(visitor_text=collect_if_inside)

    return "".join(region_parts)
218
219
reader = PdfReader("document.pdf")
page = reader.pages[0]

# Extract text from top-left quarter of the page.
# Use the media box corners rather than assuming its origin is (0, 0).
box = page.mediabox
left, bottom = float(box.left), float(box.bottom)
right, top = float(box.right), float(box.top)
mid_x = (left + right) / 2
mid_y = (bottom + top) / 2

top_left_text = extract_text_from_region(page, left, mid_y, mid_x, top)
print("Top-left quarter text:")
print(top_left_text)
231
```
232
233
### Text Extraction with Error Handling
234
235
```python
236
from pypdf import PdfReader
237
from pypdf.errors import PdfReadError, PdfStreamError
238
239
def safe_extract_text(pdf_path: str) -> list[str]:
    """
    Safely extract text from all pages with error handling.

    A page that cannot be processed contributes an empty string, so the
    result keeps one entry per page that was reached. If the PDF cannot be
    opened at all, an empty list is returned. Errors are reported with
    print() and never raised to the caller.

    Args:
        pdf_path: Path to PDF file

    Returns:
        List of extracted text strings (one per page)
    """
    texts = []

    try:
        reader = PdfReader(pdf_path)

        for page_num, page in enumerate(reader.pages, start=1):
            try:
                texts.append(page.extract_text())
            except (PdfReadError, PdfStreamError) as e:
                print(f"Error extracting text from page {page_num}: {e}")
                texts.append("")  # Empty string for failed pages
            except Exception as e:
                # Malformed pages can fail with errors other than pypdf's own;
                # handle them here so one bad page does not abort the rest.
                print(f"Unexpected error extracting text from page {page_num}: {e}")
                texts.append("")  # Empty string for failed pages

    except Exception as e:
        # Reached when the file cannot be opened or its page list cannot be read
        print(f"Error opening PDF {pdf_path}: {e}")

    return texts
266
267
# Extract text safely
page_texts = safe_extract_text("problematic.pdf")
for i, text in enumerate(page_texts, start=1):
    if text:
        print(f"Page {i}: {len(text)} characters extracted")
    else:
        # An empty string means either the extraction failed (an error was
        # printed above) or the page simply contains no extractable text.
        print(f"Page {i}: No text extracted")
274
```
275
276
### Batch Text Extraction
277
278
```python
279
from pypdf import PdfReader
280
import os
281
from pathlib import Path
282
283
def extract_text_from_directory(directory_path: str, output_dir: str = None) -> dict[str, str]:
284
"""
285
Extract text from all PDF files in a directory.
286
287
Args:
288
directory_path: Directory containing PDF files
289
output_dir: Optional directory to save text files
290
291
Returns:
292
Dictionary mapping PDF filenames to extracted text
293
"""
294
pdf_texts = {}
295
296
for file_path in Path(directory_path).glob("*.pdf"):
297
try:
298
reader = PdfReader(str(file_path))
299
300
# Extract all text
301
full_text = ""
302
for page in reader.pages:
303
full_text += page.extract_text()
304
full_text += "\n\n"
305
306
pdf_texts[file_path.name] = full_text
307
308
# Optionally save to text file
309
if output_dir:
310
output_path = Path(output_dir) / f"{file_path.stem}.txt"
311
output_path.parent.mkdir(parents=True, exist_ok=True)
312
output_path.write_text(full_text, encoding='utf-8')
313
314
except Exception as e:
315
print(f"Error processing {file_path.name}: {e}")
316
pdf_texts[file_path.name] = ""
317
318
return pdf_texts
319
320
# Extract text from all PDFs in a directory
texts = extract_text_from_directory("pdf_documents/", "extracted_text/")
print(f"Processed {len(texts)} PDF files")
323
```