Tessl Tile for pypi/pypdf@6.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

annotations.md form-fields.md index.md metadata.md page-operations.md reading-writing.md text-extraction.md utilities.md

text-extraction.mddocs/

0
# Text Extraction
1

2
Advanced text extraction capabilities with multiple extraction modes, layout preservation, and customizable text processing options. pypdf provides sophisticated text extraction that can handle complex PDF layouts while maintaining readability.
3

4
## Capabilities
5

6
### Text Extraction Methods
7

8
Extract text from PDF pages with various modes and customization options to handle different document types and layout requirements.
9

10
```python { .api }
11
def extract_text(
12
    self,
13
    orientations: tuple | int = (0, 90, 180, 270),
14
    space_width: float = 200.0,
15
    visitor_operand_before=None,
16
    visitor_operand_after=None,  
17
    visitor_text=None,
18
    extraction_mode: str = "plain"
19
) -> str:
20
    """
21
    Extract text from the page with advanced options.
22
    
23
    Args:
24
        orientations: Text orientations to consider in degrees (default: (0, 90, 180, 270))
25
        space_width: Minimum width threshold for inserting spaces (default: 200.0)
26
        visitor_operand_before: Callback function called before processing operands
27
        visitor_operand_after: Callback function called after processing operands  
28
        visitor_text: Custom text visitor function for advanced processing
29
        extraction_mode: Text extraction mode ("plain" or "layout", default: "plain")
30
            - "plain": Simple text extraction without layout preservation (default)
31
            - "layout": Preserves spatial layout and formatting
32
        
33
    Returns:
34
        Extracted text as string
35
    """
36
```
37

38
### Text Visitor Functions
39

40
Custom text processing through visitor functions for advanced text extraction scenarios.
41

42
```python { .api }
43
def mult(m: list[float], n: list[float]) -> list[float]:
44
    """
45
    Matrix multiplication utility for text transformation calculations.
46
    
47
    Args:
48
        m: First matrix as list of floats
49
        n: Second matrix as list of floats
50
        
51
    Returns:
52
        Result of matrix multiplication
53
    """
54
```
55

56
## Usage Examples
57

58
### Basic Text Extraction
59

60
```python
61
from pypdf import PdfReader
62

63
reader = PdfReader("document.pdf")
64

65
# Extract text from first page
66
page = reader.pages[0]
67
text = page.extract_text()
68
print(text)
69

70
# Extract text from all pages
71
full_text = ""
72
for page in reader.pages:
73
    full_text += page.extract_text()
74
    full_text += "\n\n"  # Separate pages
75

76
print(full_text)
77
```
78

79
### Layout-Preserving Extraction
80

81
```python
82
from pypdf import PdfReader
83

84
reader = PdfReader("formatted_document.pdf")
85

86
for page_num, page in enumerate(reader.pages):
87
    # Extract with layout preservation (opt-in; the default mode is "plain")
88
    layout_text = page.extract_text(
89
        extraction_mode="layout",
90
        layout_mode_space_vertically=True,
91
        layout_mode_scale_weight=1.25
92
    )
93
    
94
    print(f"Page {page_num + 1}:")
95
    print(layout_text)
96
    print("-" * 50)
97
```
98

99
### Plain Text Extraction
100

101
```python
102
from pypdf import PdfReader
103

104
reader = PdfReader("document.pdf")
105

106
for page in reader.pages:
107
    # Simple text extraction without layout
108
    plain_text = page.extract_text(extraction_mode="plain")
109
    print(plain_text)
110
```
111

112
### Handling Rotated Text
113

114
```python
115
from pypdf import PdfReader
116

117
reader = PdfReader("rotated_content.pdf")
118

119
for page in reader.pages:
120
    # Include all text orientations
121
    text_all_orientations = page.extract_text(
122
        orientations=(0, 90, 180, 270),
123
        layout_mode_strip_rotated=False
124
    )
125
    
126
    # Only horizontal text
127
    text_horizontal_only = page.extract_text(
128
        orientations=(0,),
129
        layout_mode_strip_rotated=True
130
    )
131
    
132
    print("All orientations:")
133
    print(text_all_orientations)
134
    print("\nHorizontal only:")
135
    print(text_horizontal_only)
136
```
137

138
### Custom Space Width Handling
139

140
```python
141
from pypdf import PdfReader
142

143
reader = PdfReader("document.pdf")
144

145
for page in reader.pages:
146
    # Tighter spacing (fewer spaces inserted)
147
    tight_spacing = page.extract_text(space_width=100.0)
148
    
149
    # Looser spacing (more spaces inserted)
150
    loose_spacing = page.extract_text(space_width=300.0)
151
    
152
    print("Tight spacing:")
153
    print(tight_spacing[:200], "...")
154
    print("\nLoose spacing:")
155
    print(loose_spacing[:200], "...")
156
```
157

158
### Advanced Text Processing with Visitor
159

160
```python
161
from pypdf import PdfReader
162

163
def custom_text_visitor(text, cm, tm, font_dict, font_size):
164
    """
165
    Custom text visitor function for advanced text processing.
166
    
167
    Args:
168
        text: Extracted text
169
        cm: Current transformation matrix
170
        tm: Text matrix
171
        font_dict: Font dictionary
172
        font_size: Font size
173
    """
174
    # Example: Only extract text that is 12pt or larger
175
    if font_size >= 12:
176
        return text
177
    return ""
178

179
reader = PdfReader("document.pdf")
180

181
for page in reader.pages:
182
    # Extract only large text
183
    large_text_only = page.extract_text(visitor_text=custom_text_visitor)
184
    print(large_text_only)
185
```
186

187
### Extracting Text from Specific Regions
188

189
```python
190
from pypdf import PdfReader, PageObject
191

192
def extract_text_from_region(page: PageObject, x1: float, y1: float, x2: float, y2: float) -> str:
193
    """
194
    Extract text from a specific rectangular region of a page.
195
    
196
    Args:
197
        page: PageObject to extract from
198
        x1, y1: Bottom-left coordinates
199
        x2, y2: Top-right coordinates
200
        
201
    Returns:
202
        Extracted text from the region
203
    """
204
    # Create a blank page sized to the target region
205
    cropped_page = PageObject.create_blank_page(x2 - x1, y2 - y1)
206
    
207
    # Crop the original page to the desired region
208
    original_cropbox = page.cropbox
209
    page.cropbox = [x1, y1, x2, y2]
210
    
211
    # Merge the cropped content
212
    cropped_page.merge_page(page)
213
    
214
    # Restore original cropbox
215
    page.cropbox = original_cropbox
216
    
217
    return cropped_page.extract_text()
218

219
reader = PdfReader("document.pdf")
220
page = reader.pages[0]
221

222
# Extract text from top-left quarter of the page
223
width = float(page.mediabox.width)
224
height = float(page.mediabox.height)
225

226
top_left_text = extract_text_from_region(
227
    page, 0, height/2, width/2, height
228
)
229
print("Top-left quarter text:")
230
print(top_left_text)
231
```
232

233
### Text Extraction with Error Handling
234

235
```python
236
from pypdf import PdfReader
237
from pypdf.errors import PdfReadError, PdfStreamError
238

239
def safe_extract_text(pdf_path: str) -> list[str]:
240
    """
241
    Safely extract text from all pages with error handling.
242
    
243
    Args:
244
        pdf_path: Path to PDF file
245
        
246
    Returns:
247
        List of extracted text strings (one per page)
248
    """
249
    texts = []
250
    
251
    try:
252
        reader = PdfReader(pdf_path)
253
        
254
        for page_num, page in enumerate(reader.pages):
255
            try:
256
                text = page.extract_text()
257
                texts.append(text)
258
            except (PdfReadError, PdfStreamError) as e:
259
                print(f"Error extracting text from page {page_num + 1}: {e}")
260
                texts.append("")  # Empty string for failed pages
261
                
262
    except Exception as e:
263
        print(f"Error opening PDF {pdf_path}: {e}")
264
        
265
    return texts
266

267
# Extract text safely
268
page_texts = safe_extract_text("problematic.pdf")
269
for i, text in enumerate(page_texts):
270
    if text:
271
        print(f"Page {i + 1}: {len(text)} characters extracted")
272
    else:
273
        print(f"Page {i + 1}: Text extraction failed")
274
```
275

276
### Batch Text Extraction
277

278
```python
279
from pypdf import PdfReader
280
import os
281
from pathlib import Path
282

283
def extract_text_from_directory(directory_path: str, output_dir: str = None) -> dict[str, str]:
284
    """
285
    Extract text from all PDF files in a directory.
286
    
287
    Args:
288
        directory_path: Directory containing PDF files
289
        output_dir: Optional directory to save text files
290
        
291
    Returns:
292
        Dictionary mapping PDF filenames to extracted text
293
    """
294
    pdf_texts = {}
295
    
296
    for file_path in Path(directory_path).glob("*.pdf"):
297
        try:
298
            reader = PdfReader(str(file_path))
299
            
300
            # Extract all text
301
            full_text = ""
302
            for page in reader.pages:
303
                full_text += page.extract_text()
304
                full_text += "\n\n"
305
            
306
            pdf_texts[file_path.name] = full_text
307
            
308
            # Optionally save to text file
309
            if output_dir:
310
                output_path = Path(output_dir) / f"{file_path.stem}.txt"
311
                output_path.parent.mkdir(parents=True, exist_ok=True)
312
                output_path.write_text(full_text, encoding='utf-8')
313
                
314
        except Exception as e:
315
            print(f"Error processing {file_path.name}: {e}")
316
            pdf_texts[file_path.name] = ""
317
    
318
    return pdf_texts
319

320
# Extract text from all PDFs in a directory
321
texts = extract_text_from_directory("pdf_documents/", "extracted_text/")
322
print(f"Processed {len(texts)} PDF files")
323
```

Version

Tile

Files

text-extraction.md docs/

text-extraction.mddocs/