Tessl Tile for pypi/pymupdf@1.26.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

annotations-forms.md document-creation-modification.md document-operations.md document-rendering.md geometry-transformations.md index.md page-content-extraction.md table-extraction.md

page-content-extraction.mddocs/

0
# Page Content Extraction
1

2
Comprehensive text and image extraction from document pages with multiple output formats, search capabilities, and detailed layout analysis. PyMuPDF provides powerful extraction tools that preserve formatting and structural information.
3

4
## Capabilities
5

6
### Text Extraction
7

8
Extract text in various formats with layout and formatting information.
9

10
```python { .api }
11
def get_text(
12
    page: Page, 
13
    option: str = "text", 
14
    *, 
15
    clip: Rect = None,
16
    flags: int = None,
17
    textpage: TextPage = None,
18
    sort: bool = False,
19
    delimiters=None,
20
    tolerance=3
21
) -> str:
22
    """
23
    Extract text from a page in specified format (standalone utility function).
24
    
25
    Parameters:
26
    - page: Page object to extract text from
27
    - option: output format ("text", "html", "dict", "json", "rawdict", "xml", "xhtml", "words", "blocks")
28
    - clip: Rect to limit extraction area
29
    - flags: text extraction flags (TEXT_PRESERVE_LIGATURES, etc.)
30
    - textpage: existing TextPage object to reuse
31
    - sort: sort text by reading order
32
    - delimiters: characters to use as word delimiters (for words option)
33
    - tolerance: treat words as part of the same line if their coordinates differ by no more than this value
34
    
35
    Returns:
36
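    Extracted content in the requested format. Structured options do not
    return a plain string: for example, "dict" yields a dictionary with a
    "blocks" key (see the Basic Text Extraction usage example).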
37
    """
38

39
def get_text_blocks(
40
    page: Page,
41
    clip: Rect = None,
42
    flags: int = None,
43
    textpage: TextPage = None,
44
    sort: bool = False
45
) -> list:
46
    """
47
    Return the text blocks on a page.
48
    
49
    Parameters:
50
    - page: Page object to extract blocks from
51
    - clip: Rect to limit extraction area
52
    - flags: text extraction flags
53
    - textpage: existing TextPage object to reuse
54
    - sort: sort blocks by reading order
55
    
56
    Returns:
57
    List of text blocks with coordinates and content
58
    """
59

60
def get_text_words(
61
    page: Page,
62
    clip: Rect = None,
63
    flags: int = None,
64
    textpage: TextPage = None,
65
    sort: bool = False,
66
    delimiters=None,
67
    tolerance=3
68
) -> list:
69
    """
70
    Return text words as list with bounding box for each word.
71
    
72
    Parameters:
73
    - page: Page object to extract words from
74
    - clip: Rect to limit extraction area  
75
    - flags: text extraction flags
76
    - textpage: existing TextPage object to reuse
77
    - sort: sort words by reading order
78
    - delimiters: characters to use as word delimiters
79
    - tolerance: treat words as part of the same line if their coordinates differ by no more than this value
80
    
81
    Returns:
82
    List of words with bounding rectangles
83
    """
84

85
def get_textbox(page: Page, rect: Rect, textpage: TextPage = None) -> str:
86
    """
87
    Extract text from specific rectangular area.
88
    
89
    Parameters:
90
    - page: Page object
91
    - rect: rectangular area to extract text from
92
    - textpage: existing TextPage object to reuse
93
    
94
    Returns:
95
    Text content within the specified rectangle
96
    """
97

98
def get_text_selection(
99
    page: Page,
100
    p1: Point,
101
    p2: Point,
102
    clip: Rect = None,
103
    textpage: TextPage = None
104
) -> str:
105
    """
106
    Extract text between two points on page.
107
    
108
    Parameters:
109
    - page: Page object
110
    - p1: start point for text selection
111
    - p2: end point for text selection
112
    - clip: Rect to limit extraction area
113
    - textpage: existing TextPage object to reuse
114
    
115
    Returns:
116
    Selected text content
117
    """
118

119
class Page:
120
    def get_textpage(self, clip: Rect = None, flags: int = 0, matrix: Matrix = None) -> TextPage:
121
        """
122
        Get TextPage object for detailed text analysis.
123
        
124
        Parameters:
125
        - clip: rectangle to limit text extraction
126
        - flags: extraction flags for text processing
127
        
128
        Returns:
129
        TextPage object with detailed text information
130
        """
131
```
132

133
### TextPage Class
134

135
Detailed text extraction and analysis with layout information.
136

137
```python { .api }
138
class TextPage:
139
    def extractText(self, sort: bool = False) -> str:
140
        """
141
        Extract plain text.
142
        
143
        Parameters:
144
        - sort: sort text by reading order
145
        
146
        Returns:
147
        Plain text string
148
        """
149
    
150
    def extractHTML(self) -> str:
151
        """
152
        Extract text as HTML with formatting.
153
        
154
        Returns:
155
        HTML formatted text
156
        """
157
    
158
    def extractJSON(self, cb=None) -> str:
159
        """
160
        Extract text as JSON with detailed layout info.
161
        
162
        Parameters:
163
        - cb: optional callback function
164
        
165
        Returns:
166
        JSON string with text blocks, lines, spans, and characters
167
        """
168
    
169
    def extractXHTML(self) -> str:
170
        """
171
        Extract text as XHTML.
172
        
173
        Returns:
174
        XHTML formatted text
175
        """
176
    
177
    def extractXML(self) -> str:
178
        """
179
        Extract text as XML.
180
        
181
        Returns:
182
        XML formatted text with structure
183
        """
184
    
185
    def extractDICT(self, cb=None, sort: bool = False) -> dict:
186
        """
187
        Extract text as dictionary with detailed information.
188
        
189
        Parameters:
190
        - cb: optional callback function
191
        - sort: sort text by reading order
192
        
193
        Returns:
194
        Dictionary with blocks, lines, spans, and character details
195
        """
196
    
197
    def extractBLOCKS(self) -> list:
198
        """
199
        Extract text blocks.
200
        
201
        Returns:
202
        List of text blocks with coordinates and content
203
        """
204
    
205
    def extractWORDS(self, delimiters: str = None) -> list:
206
        """
207
        Extract individual words with positions.
208
        
209
        Parameters:
210
        - delimiters: word delimiter characters
211
        
212
        Returns:
213
        List of words with bounding boxes
214
        """
215
    
216
    def search(self, needle: str, hit_max: int = 16, quads: bool = False) -> list:
217
        """
218
        Search for text on the page.
219
        
220
        Parameters:
221
        - needle: text to search for
222
        - hit_max: maximum number of hits
223
        - quads: return results as Quad objects instead of Rect
224
        
225
        Returns:
226
        List of Rect or Quad objects indicating match locations
227
        """
228
```
229

230
### Text Search
231

232
Search for text with various options and return location information.
233

234
```python { .api }
235
class Page:
236
    def search_for(self, needle: str, hit_max: int = 16, quads: bool = False, 
237
                   flags: int = 0, clip: Rect = None) -> list:
238
        """
239
        Search for text on page.
240
        
241
        Parameters:
242
        - needle: text to search for
243
        - hit_max: maximum number of hits to return
244
        - quads: return Quad objects instead of Rect objects
245
        - flags: search flags for case sensitivity, etc.
246
        - clip: limit search to this rectangle
247
        
248
        Returns:
249
        List of Rect or Quad objects indicating match locations
250
        """
251
```
252

253
### Image Extraction
254

255
Extract embedded images from document pages.
256

257
```python { .api }
258
class Page:
259
    def get_images(self, full: bool = False) -> list:
260
        """
261
        Get list of images on page.
262
        
263
        Parameters:
264
        - full: include detailed image information
265
        
266
        Returns:
267
        List of image entries, one per image. Each entry is indexed by
        position, with the image xref as its first item (the usage example
        reads it as img[0]).
268
        """
269
    
270
    def get_image_bbox(self, name: str, transform: bool = True) -> Rect:
271
        """
272
        Get bounding box of named image.
273
        
274
        Parameters:
275
        - name: image name/reference
276
        - transform: apply transformation matrix
277
        
278
        Returns:
279
        Image bounding rectangle
280
        """
281
    
282
    def get_pixmap(self, matrix: Matrix = None, colorspace: Colorspace = None, 
283
                   clip: Rect = None, alpha: bool = False, annots: bool = True) -> Pixmap:
284
        """
285
        Render page to Pixmap for image extraction.
286
        
287
        Parameters:
288
        - matrix: transformation matrix
289
        - colorspace: target color space
290
        - clip: clipping rectangle
291
        - alpha: include alpha channel
292
        - annots: include annotations
293
        
294
        Returns:
295
        Pixmap object with page image
296
        """
297
```
298

299
### Links and Annotations
300

301
Extract interactive elements from pages.
302

303
```python { .api }
304
class Page:
305
    def get_links(self) -> list:
306
        """
307
        Get list of links on page.
308
        
309
        Returns:
310
        List of link dictionaries with kind, from, to, uri, etc.
311
        """
312
    
313
    def first_link(self) -> Link:
314
        """
315
        Get first link on page.
316
        
317
        Returns:
318
        Link object or None
319
        """
320
    
321
    def load_links(self) -> None:
322
        """Load links from page for iteration."""
323
    
324
    def first_annot(self) -> Annot:
325
        """
326
        Get first annotation on page.
327
        
328
        Returns:
329
        Annot object or None
330
        """
331
    
332
    def load_annot(self, ident: typing.Union[str, int]) -> Annot:
333
        """
334
        Load annotation by identifier.
335
        
336
        Parameters:
337
        - ident: annotation identifier (xref number or unique name)
338
        
339
        Returns:
340
        Annot object
341
        """
342
    
343
    def annot_names(self) -> list:
344
        """
345
        Get list of annotation names on page.
346
        
347
        Returns:
348
        List of annotation names
349
        """
350
    
351
    def annots(self, types: list = None) -> list:
352
        """
353
        Get list of annotations on page.
354
        
355
        Parameters:
356
        - types: filter by annotation types
357
        
358
        Returns:
359
        List of Annot objects
360
        """
361
```
362

363
### Drawing and Vector Content
364

365
Extract vector graphics and drawing information.
366

367
```python { .api }
368
class Page:
369
    def get_drawings(self, extended: bool = False) -> list:
370
        """
371
        Get vector drawings from page.
372
        
373
        Parameters:
374
        - extended: include extended path information
375
        
376
        Returns:
377
        List of drawing dictionaries with paths, colors, etc.
378
        """
379
    
380
    def get_cdrawings(self, extended: bool = False) -> list:
381
        """
382
        Get drawings in compact format.
383
        
384
        Parameters:
385
        - extended: include extended information
386
        
387
        Returns:  
388
        List of compact drawing representations
389
        """
390
```
391

392
## Usage Examples
393

394
### Basic Text Extraction
395

396
```python
397
import pymupdf
398

399
doc = pymupdf.open("document.pdf")
400
page = doc.load_page(0)
401

402
# Extract plain text using standalone function
403
text = pymupdf.get_text(page)
404
print(text)
405

406
# Extract with formatting as HTML
407
html = pymupdf.get_text(page, "html")
408
print(html)
409

410
# Extract detailed layout information
411
layout_dict = pymupdf.get_text(page, "dict")
412
for block in layout_dict["blocks"]:
413
    if "lines" in block:  # Text block
414
        for line in block["lines"]:
415
            for span in line["spans"]:
416
                print(f"Text: {span['text']}, Font: {span['font']}, Size: {span['size']}")
417

418
# Extract text blocks  
419
blocks = pymupdf.get_text_blocks(page)
420
for block in blocks:
421
    print(f"Block text: {block[4]}")  # block[4] contains the text
422

423
# Extract individual words with coordinates
424
words = pymupdf.get_text_words(page)
425
for word in words:
426
    x0, y0, x1, y1, text, block_no, line_no, word_no = word
427
    print(f"Word '{text}' at ({x0}, {y0}, {x1}, {y1})")
428

429
doc.close()
430
```
431

432
### Advanced Text Search
433

434
```python
435
import pymupdf
436

437
doc = pymupdf.open("document.pdf")
438

439
# Search across all pages
440
search_term = "important keyword"
441
results = []
442

443
for page_num in range(doc.page_count):
444
    page = doc.load_page(page_num)
445
    matches = page.search_for(search_term, quads=True)
446
    for match in matches:
447
        results.append({
448
            "page": page_num,
449
            "text": search_term,
450
            "quad": match,
451
            "bbox": match.rect
452
        })
453

454
print(f"Found {len(results)} matches")
455
doc.close()
456
```
457

458
### Image Extraction with Details
459

460
```python
461
import pymupdf
462

463
doc = pymupdf.open("document.pdf") 
464
page = doc.load_page(0)
465

466
# Get image information
467
images = page.get_images(full=True)
468

469
for img_index, img in enumerate(images):
470
    xref = img[0]  # Image xref number
471
    pix = pymupdf.Pixmap(doc, xref)  # Extract image
472
    
473
    if pix.n - pix.alpha < 4:  # GRAY or RGB
474
        pix.save(f"image_{page.number}_{img_index}.png")
475
    else:  # CMYK: convert to RGB first
476
        pix1 = pymupdf.Pixmap(pymupdf.csRGB, pix)
477
        pix1.save(f"image_{page.number}_{img_index}.png")
478
        pix1 = None
479
    
480
    pix = None
481

482
doc.close()
483
```
484

485
### Working with TextPage Objects
486

487
```python
488
import pymupdf
489

490
doc = pymupdf.open("document.pdf")
491
page = doc.load_page(0)
492

493
# Create TextPage for detailed analysis
494
textpage = page.get_textpage()
495

496
# Extract words with coordinates
497
words = textpage.extractWORDS()
498
for word in words:
499
    x0, y0, x1, y1, text, block_no, line_no, word_no = word
500
    print(f"Word: '{text}' at ({x0}, {y0}, {x1}, {y1})")
501

502
# Search within TextPage
503
matches = textpage.search("search term")
504
print(f"Found {len(matches)} matches")
505

506
doc.close()
507
```
508

509
### Link Analysis
510

511
```python
512
import pymupdf
513

514
doc = pymupdf.open("document.pdf")
515
page = doc.load_page(0)
516

517
# Get all links
518
links = page.get_links()
519

520
for link in links:
521
    print(f"Link type: {link['kind']}")
522
    print(f"From: {link['from']}")  # Source rectangle
523
    
524
    if link['kind'] == pymupdf.LINK_URI:
525
        print(f"URI: {link['uri']}")
526
    elif link['kind'] == pymupdf.LINK_GOTO:
527
        print(f"Target page: {link['page']}")
528
        if 'to' in link:
529
            print(f"Target point: {link['to']}")
530

531
doc.close()
532
```

Version

Tile

Files

page-content-extraction.mddocs/

page-content-extraction.mddocs/