0
# Page Content Extraction
1
2
Comprehensive text and image extraction from document pages with multiple output formats, search capabilities, and detailed layout analysis. PyMuPDF provides powerful extraction tools that preserve formatting and structural information.
3
4
## Capabilities
5
6
### Text Extraction
7
8
Extract text in various formats with layout and formatting information.
9
10
```python { .api }
11
def get_text(
12
page: Page,
13
option: str = "text",
14
*,
15
clip: Rect = None,
16
flags: int = None,
17
textpage: TextPage = None,
18
sort: bool = False,
19
delimiters=None,
20
tolerance=3
21
) -> str:
22
"""
23
Extract text from a page in specified format (standalone utility function).
24
25
Parameters:
26
- page: Page object to extract text from
27
- option: output format ("text", "html", "dict", "json", "rawdict", "rawjson", "xml", "xhtml", "words", "blocks")
28
- clip: Rect to limit extraction area
29
- flags: text extraction flags (TEXT_PRESERVE_LIGATURES, etc.)
30
- textpage: existing TextPage object to reuse
31
- sort: sort text by reading order
32
- delimiters: characters to use as word delimiters (for words option)
33
- tolerance: consider words part of same line if coordinates don't differ more than this
34
35
Returns:
36
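Extracted text in the requested format: a str for the plain-text, markup, and JSON formats; a dict for "dict" and "rawdict"; a list for "words" and "blocks"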
37
"""
38
39
def get_text_blocks(
40
page: Page,
41
clip: Rect = None,
42
flags: int = None,
43
textpage: TextPage = None,
44
sort: bool = False
45
) -> list:
46
"""
47
Return the text blocks on a page.
48
49
Parameters:
50
- page: Page object to extract blocks from
51
- clip: Rect to limit extraction area
52
- flags: text extraction flags
53
- textpage: existing TextPage object to reuse
54
- sort: sort blocks by reading order
55
56
Returns:
57
List of text blocks with coordinates and content
58
"""
59
60
def get_text_words(
61
page: Page,
62
clip: Rect = None,
63
flags: int = None,
64
textpage: TextPage = None,
65
sort: bool = False,
66
delimiters=None,
67
tolerance=3
68
) -> list:
69
"""
70
Return text words as list with bounding box for each word.
71
72
Parameters:
73
- page: Page object to extract words from
74
- clip: Rect to limit extraction area
75
- flags: text extraction flags
76
- textpage: existing TextPage object to reuse
77
- sort: sort words by reading order
78
- delimiters: characters to use as word delimiters
79
- tolerance: consider words part of same line if coordinates don't differ more than this
80
81
Returns:
82
List of words with bounding rectangles
83
"""
84
85
def get_textbox(page: Page, rect: Rect, textpage: TextPage = None) -> str:
86
"""
87
Extract text from specific rectangular area.
88
89
Parameters:
90
- page: Page object
91
- rect: rectangular area to extract text from
92
- textpage: existing TextPage object to reuse
93
94
Returns:
95
Text content within the specified rectangle
96
"""
97
98
def get_text_selection(
99
page: Page,
100
p1: Point,
101
p2: Point,
102
clip: Rect = None,
103
textpage: TextPage = None
104
) -> str:
105
"""
106
Extract text between two points on page.
107
108
Parameters:
109
- page: Page object
110
- p1: start point for text selection
111
- p2: end point for text selection
112
- clip: Rect to limit extraction area
113
- textpage: existing TextPage object to reuse
114
115
Returns:
116
Selected text content
117
"""
118
119
class Page:
120
def get_textpage(self, clip: Rect = None, flags: int = 0, matrix: Matrix = None) -> TextPage:
121
"""
122
Get TextPage object for detailed text analysis.
123
124
Parameters:
125
- clip: rectangle to limit text extraction
126
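- flags: extraction flags for text processing
- matrix: transformation matrix applied to the page while extracting text (None means no transformation)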
127
128
Returns:
129
TextPage object with detailed text information
130
"""
131
```
132
133
### TextPage Class
134
135
Detailed text extraction and analysis with layout information.
136
137
```python { .api }
138
class TextPage:
139
def extractText(self, sort: bool = False) -> str:
140
"""
141
Extract plain text.
142
143
Parameters:
144
- sort: sort text by reading order
145
146
Returns:
147
Plain text string
148
"""
149
150
def extractHTML(self) -> str:
151
"""
152
Extract text as HTML with formatting.
153
154
Returns:
155
HTML formatted text
156
"""
157
158
def extractJSON(self, cb=None) -> str:
159
"""
160
Extract text as JSON with detailed layout info.
161
162
Parameters:
163
- cb: optional callback function
164
165
Returns:
166
JSON string with text blocks, lines, spans, and characters
167
"""
168
169
def extractXHTML(self) -> str:
170
"""
171
Extract text as XHTML.
172
173
Returns:
174
XHTML formatted text
175
"""
176
177
def extractXML(self) -> str:
178
"""
179
Extract text as XML.
180
181
Returns:
182
XML formatted text with structure
183
"""
184
185
def extractDICT(self, cb=None, sort: bool = False) -> dict:
186
"""
187
Extract text as dictionary with detailed information.
188
189
Parameters:
190
- cb: optional callback function
191
- sort: sort text by reading order
192
193
Returns:
194
Dictionary with blocks, lines, spans, and character details
195
"""
196
197
def extractBLOCKS(self) -> list:
198
"""
199
Extract text blocks.
200
201
Returns:
202
List of text blocks with coordinates and content
203
"""
204
205
def extractWORDS(self, delimiters: str = None) -> list:
206
"""
207
Extract individual words with positions.
208
209
Parameters:
210
- delimiters: word delimiter characters
211
212
Returns:
213
List of words with bounding boxes
214
"""
215
216
def search(self, needle: str, hit_max: int = 16, quads: bool = False) -> list:
217
"""
218
Search for text on the page.
219
220
Parameters:
221
- needle: text to search for
222
- hit_max: maximum number of hits
223
- quads: return results as Quad objects instead of Rect
224
225
Returns:
226
List of Rect or Quad objects indicating match locations
227
"""
228
```
229
230
### Text Search
231
232
Search for text with various options and return location information.
233
234
```python { .api }
235
class Page:
236
def search_for(self, needle: str, hit_max: int = 16, quads: bool = False,
237
flags: int = 0, clip: Rect = None) -> list:
238
"""
239
Search for text on page.
240
241
Parameters:
242
- needle: text to search for
243
- hit_max: maximum number of hits to return
244
- quads: return Quad objects instead of Rect objects
245
- flags: search flags for case sensitivity, etc.
246
- clip: limit search to this rectangle
247
248
Returns:
249
List of Rect or Quad objects indicating match locations
250
"""
251
```
252
253
### Image Extraction
254
255
Extract embedded images from document pages.
256
257
```python { .api }
258
class Page:
259
def get_images(self, full: bool = False) -> list:
260
"""
261
Get list of images on page.
262
263
Parameters:
264
- full: include detailed image information
265
266
Returns:
267
List of image entries, one tuple per image, starting with the image xref (xref, smask, width, height, bpc, colorspace, ...)
268
"""
269
270
def get_image_bbox(self, name: str, transform: bool = True) -> Rect:
271
"""
272
Get bounding box of named image.
273
274
Parameters:
275
- name: image name/reference
276
- transform: apply transformation matrix
277
278
Returns:
279
Image bounding rectangle
280
"""
281
282
def get_pixmap(self, matrix: Matrix = None, colorspace: Colorspace = None,
283
clip: Rect = None, alpha: bool = False, annots: bool = True) -> Pixmap:
284
"""
285
Render page to Pixmap for image extraction.
286
287
Parameters:
288
- matrix: transformation matrix
289
- colorspace: target color space
290
- clip: clipping rectangle
291
- alpha: include alpha channel
292
- annots: include annotations
293
294
Returns:
295
Pixmap object with page image
296
"""
297
```
298
299
### Links and Annotations
300
301
Extract interactive elements from pages.
302
303
```python { .api }
304
class Page:
305
def get_links(self) -> list:
306
"""
307
Get list of links on page.
308
309
Returns:
310
List of link dictionaries with kind, from, to, uri, etc.
311
"""
312
313
def first_link(self) -> Link:
314
"""
315
Get first link on page.
316
317
Returns:
318
Link object or None
319
"""
320
321
def load_links(self) -> None:
322
"""Load links from page for iteration."""
323
324
def first_annot(self) -> Annot:
325
"""
326
Get first annotation on page.
327
328
Returns:
329
Annot object or None
330
"""
331
332
def load_annot(self, ident: typing.Union[str, int]) -> Annot:
333
"""
334
Load annotation by identifier.
335
336
Parameters:
337
- ident: annotation identifier (xref number or unique name)
338
339
Returns:
340
Annot object
341
"""
342
343
def annot_names(self) -> list:
344
"""
345
Get list of annotation names on page.
346
347
Returns:
348
List of annotation names
349
"""
350
351
def annots(self, types: list = None) -> list:
352
"""
353
Get list of annotations on page.
354
355
Parameters:
356
- types: filter by annotation types
357
358
Returns:
359
List of Annot objects
360
"""
361
```
362
363
### Drawing and Vector Content
364
365
Extract vector graphics and drawing information.
366
367
```python { .api }
368
class Page:
369
def get_drawings(self, extended: bool = False) -> list:
370
"""
371
Get vector drawings from page.
372
373
Parameters:
374
- extended: include extended path information
375
376
Returns:
377
List of drawing dictionaries with paths, colors, etc.
378
"""
379
380
def get_cdrawings(self, extended: bool = False) -> list:
381
"""
382
Get drawings in compact format.
383
384
Parameters:
385
- extended: include extended information
386
387
Returns:
388
List of compact drawing representations
389
"""
390
```
391
392
## Usage Examples
393
394
### Basic Text Extraction
395
396
```python
397
import pymupdf
398
399
doc = pymupdf.open("document.pdf")
400
page = doc.load_page(0)
401
402
# Extract plain text using standalone function
403
text = pymupdf.get_text(page)
404
print(text)
405
406
# Extract with formatting as HTML
407
html = pymupdf.get_text(page, "html")
408
print(html)
409
410
# Extract detailed layout information
411
layout_dict = pymupdf.get_text(page, "dict")
412
for block in layout_dict["blocks"]:
413
if "lines" in block: # Text block
414
for line in block["lines"]:
415
for span in line["spans"]:
416
print(f"Text: {span['text']}, Font: {span['font']}, Size: {span['size']}")
417
418
# Extract text blocks
419
blocks = pymupdf.get_text_blocks(page)
420
for block in blocks:
421
print(f"Block text: {block[4]}") # block[4] contains the text
422
423
# Extract individual words with coordinates
424
words = pymupdf.get_text_words(page)
425
for word in words:
426
x0, y0, x1, y1, text, block_no, line_no, word_no = word
427
print(f"Word '{text}' at ({x0}, {y0}, {x1}, {y1})")
428
429
doc.close()
430
```
431
432
### Advanced Text Search
433
434
```python
435
import pymupdf
436
437
doc = pymupdf.open("document.pdf")
438
439
# Search across all pages
440
search_term = "important keyword"
441
results = []
442
443
for page_num in range(doc.page_count):
444
page = doc.load_page(page_num)
445
matches = page.search_for(search_term, quads=True)
446
for match in matches:
447
results.append({
448
"page": page_num,
449
"text": search_term,
450
"quad": match,
451
"bbox": match.rect
452
})
453
454
print(f"Found {len(results)} matches")
455
doc.close()
456
```
457
458
### Image Extraction with Details
459
460
```python
461
import pymupdf
462
463
doc = pymupdf.open("document.pdf")
464
page = doc.load_page(0)
465
466
# Get image information
467
images = page.get_images(full=True)
468
469
for img_index, img in enumerate(images):
470
xref = img[0] # Image xref number
471
pix = pymupdf.Pixmap(doc, xref) # Extract image
472
473
if pix.n - pix.alpha < 4: # GRAY or RGB
474
pix.save(f"image_{page.number}_{img_index}.png")
475
else: # CMYK: convert to RGB first
476
pix1 = pymupdf.Pixmap(pymupdf.csRGB, pix)
477
pix1.save(f"image_{page.number}_{img_index}.png")
478
pix1 = None
479
480
pix = None
481
482
doc.close()
483
```
484
485
### Working with TextPage Objects
486
487
```python
488
import pymupdf
489
490
doc = pymupdf.open("document.pdf")
491
page = doc.load_page(0)
492
493
# Create TextPage for detailed analysis
494
textpage = page.get_textpage()
495
496
# Extract words with coordinates
497
words = textpage.extractWORDS()
498
for word in words:
499
x0, y0, x1, y1, text, block_no, line_no, word_no = word
500
print(f"Word: '{text}' at ({x0}, {y0}, {x1}, {y1})")
501
502
# Search within TextPage
503
matches = textpage.search("search term")
504
print(f"Found {len(matches)} matches")
505
506
doc.close()
507
```
508
509
### Link Analysis
510
511
```python
512
import pymupdf
513
514
doc = pymupdf.open("document.pdf")
515
page = doc.load_page(0)
516
517
# Get all links
518
links = page.get_links()
519
520
for link in links:
521
print(f"Link type: {link['kind']}")
522
print(f"From: {link['from']}") # Source rectangle
523
524
if link['kind'] == pymupdf.LINK_URI:
525
print(f"URI: {link['uri']}")
526
elif link['kind'] == pymupdf.LINK_GOTO:
527
print(f"Target page: {link['page']}")
528
if 'to' in link:
529
print(f"Target point: {link['to']}")
530
531
doc.close()
532
```