or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

annotations-forms.md, document-creation-modification.md, document-operations.md, document-rendering.md, geometry-transformations.md, index.md, page-content-extraction.md, table-extraction.md

docs/page-content-extraction.md

0

# Page Content Extraction

1

2

Comprehensive text and image extraction from document pages with multiple output formats, search capabilities, and detailed layout analysis. PyMuPDF provides powerful extraction tools that preserve formatting and structural information.

3

4

## Capabilities

5

6

### Text Extraction

7

8

Extract text in various formats with layout and formatting information.

9

10

```python { .api }

11

def get_text(

12

page: Page,

13

option: str = "text",

14

*,

15

clip: Rect = None,

16

flags: int = None,

17

textpage: TextPage = None,

18

sort: bool = False,

19

delimiters=None,

20

tolerance=3

21

) -> str:

22

"""

23

Extract text from a page in specified format (standalone utility function).

24

25

Parameters:

26

- page: Page object to extract text from

27

- option: output format ("text", "html", "dict", "json", "rawdict", "xml", "xhtml", "words", "blocks")

28

- clip: Rect to limit extraction area

29

- flags: text extraction flags (TEXT_PRESERVE_LIGATURES, etc.)

30

- textpage: existing TextPage object to reuse

31

- sort: sort text by reading order

32

- delimiters: characters to use as word delimiters (for words option)

33

- tolerance: consider words part of same line if coordinates don't differ more than this

34

35

Returns:

36

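Extracted content in the requested format: a string for "text", "html", "json", "xml" and "xhtml"; a dict for "dict" and "rawdict"; a list for "words" and "blocks"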

37

"""

38

39

def get_text_blocks(

40

page: Page,

41

clip: Rect = None,

42

flags: int = None,

43

textpage: TextPage = None,

44

sort: bool = False

45

) -> list:

46

"""

47

Return the text blocks on a page.

48

49

Parameters:

50

- page: Page object to extract blocks from

51

- clip: Rect to limit extraction area

52

- flags: text extraction flags

53

- textpage: existing TextPage object to reuse

54

- sort: sort blocks by reading order

55

56

Returns:

57

List of text blocks with coordinates and content

58

"""

59

60

def get_text_words(

61

page: Page,

62

clip: Rect = None,

63

flags: int = None,

64

textpage: TextPage = None,

65

sort: bool = False,

66

delimiters=None,

67

tolerance=3

68

) -> list:

69

"""

70

Return text words as list with bounding box for each word.

71

72

Parameters:

73

- page: Page object to extract words from

74

- clip: Rect to limit extraction area

75

- flags: text extraction flags

76

- textpage: existing TextPage object to reuse

77

- sort: sort words by reading order

78

- delimiters: characters to use as word delimiters

79

- tolerance: consider words part of same line if coordinates don't differ more than this

80

81

Returns:

82

List of words with bounding rectangles

83

"""

84

85

def get_textbox(page: Page, rect: Rect, textpage: TextPage = None) -> str:

86

"""

87

Extract text from specific rectangular area.

88

89

Parameters:

90

- page: Page object

91

- rect: rectangular area to extract text from

92

- textpage: existing TextPage object to reuse

93

94

Returns:

95

Text content within the specified rectangle

96

"""

97

98

def get_text_selection(

99

page: Page,

100

p1: Point,

101

p2: Point,

102

clip: Rect = None,

103

textpage: TextPage = None

104

) -> str:

105

"""

106

Extract text between two points on page.

107

108

Parameters:

109

- page: Page object

110

- p1: start point for text selection

111

- p2: end point for text selection

112

- clip: Rect to limit extraction area

113

- textpage: existing TextPage object to reuse

114

115

Returns:

116

Selected text content

117

"""

118

119

class Page:

120

def get_textpage(self, clip: Rect = None, flags: int = 0, matrix: Matrix = None) -> TextPage:

121

"""

122

Get TextPage object for detailed text analysis.

123

124

Parameters:

125

- clip: rectangle to limit text extraction

126

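- flags: extraction flags for text processing

- matrix: optional transformation Matrix applied while the TextPage is built (no transformation if None)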

127

128

Returns:

129

TextPage object with detailed text information

130

"""

131

```

132

133

### TextPage Class

134

135

Detailed text extraction and analysis with layout information.

136

137

```python { .api }

138

class TextPage:

139

def extractText(self, sort: bool = False) -> str:

140

"""

141

Extract plain text.

142

143

Parameters:

144

- sort: sort text by reading order

145

146

Returns:

147

Plain text string

148

"""

149

150

def extractHTML(self) -> str:

151

"""

152

Extract text as HTML with formatting.

153

154

Returns:

155

HTML formatted text

156

"""

157

158

def extractJSON(self, cb=None) -> str:

159

"""

160

Extract text as JSON with detailed layout info.

161

162

Parameters:

163

- cb: optional callback function

164

165

Returns:

166

JSON string with text blocks, lines, spans, and characters

167

"""

168

169

def extractXHTML(self) -> str:

170

"""

171

Extract text as XHTML.

172

173

Returns:

174

XHTML formatted text

175

"""

176

177

def extractXML(self) -> str:

178

"""

179

Extract text as XML.

180

181

Returns:

182

XML formatted text with structure

183

"""

184

185

def extractDICT(self, cb=None, sort: bool = False) -> dict:

186

"""

187

Extract text as dictionary with detailed information.

188

189

Parameters:

190

- cb: optional callback function

191

- sort: sort text by reading order

192

193

Returns:

194

Dictionary with blocks, lines, spans, and character details

195

"""

196

197

def extractBLOCKS(self) -> list:

198

"""

199

Extract text blocks.

200

201

Returns:

202

List of text blocks with coordinates and content

203

"""

204

205

def extractWORDS(self, delimiters: str = None) -> list:

206

"""

207

Extract individual words with positions.

208

209

Parameters:

210

- delimiters: word delimiter characters

211

212

Returns:

213

List of words with bounding boxes

214

"""

215

216

def search(self, needle: str, hit_max: int = 16, quads: bool = False) -> list:

217

"""

218

Search for text on the page.

219

220

Parameters:

221

- needle: text to search for

222

- hit_max: maximum number of hits

223

- quads: return results as Quad objects instead of Rect

224

225

Returns:

226

List of Rect or Quad objects indicating match locations

227

"""

228

```

229

230

### Text Search

231

232

Search for text with various options and return location information.

233

234

```python { .api }

235

class Page:

236

def search_for(self, needle: str, hit_max: int = 16, quads: bool = False,

237

flags: int = 0, clip: Rect = None) -> list:

238

"""

239

Search for text on page.

240

241

Parameters:

242

- needle: text to search for

243

- hit_max: maximum number of hits to return

244

- quads: return Quad objects instead of Rect objects

245

- flags: text extraction flags applied to the text being searched (e.g. TEXT_DEHYPHENATE, TEXT_PRESERVE_LIGATURES)

246

- clip: limit search to this rectangle

247

248

Returns:

249

List of Rect or Quad objects indicating match locations

250

"""

251

```

252

253

### Image Extraction

254

255

Extract embedded images from document pages.

256

257

```python { .api }

258

class Page:

259

def get_images(self, full: bool = False) -> list:

260

"""

261

Get list of images on page.

262

263

Parameters:

264

- full: include detailed image information

265

266

Returns:

267

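List of image tuples, one per image referenced by the page; the first item of each tuple is the image xref, followed by smask, width, height, bpc, colorspace, etc.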

268

"""

269

270

def get_image_bbox(self, name: str, transform: bool = True) -> Rect:

271

"""

272

Get bounding box of named image.

273

274

Parameters:

275

- name: image name/reference

276

- transform: apply transformation matrix

277

278

Returns:

279

Image bounding rectangle

280

"""

281

282

def get_pixmap(self, matrix: Matrix = None, colorspace: Colorspace = None,

283

clip: Rect = None, alpha: bool = False, annots: bool = True) -> Pixmap:

284

"""

285

Render the page to a Pixmap (a raster image of the whole page or of the clip area).

286

287

Parameters:

288

- matrix: transformation matrix

289

- colorspace: target color space

290

- clip: clipping rectangle

291

- alpha: include alpha channel

292

- annots: include annotations

293

294

Returns:

295

Pixmap object with page image

296

"""

297

```

298

299

### Links and Annotations

300

301

Extract interactive elements from pages.

302

303

```python { .api }

304

class Page:

305

def get_links(self) -> list:

306

"""

307

Get list of links on page.

308

309

Returns:

310

List of link dictionaries with kind, from, to, uri, etc.

311

"""

312

313

def first_link(self) -> Link:

314

"""

315

Get first link on page.

316

317

Returns:

318

Link object or None

319

"""

320

321

def load_links(self) -> None:

322

"""Load links from page for iteration."""

323

324

def first_annot(self) -> Annot:

325

"""

326

Get first annotation on page.

327

328

Returns:

329

Annot object or None

330

"""

331

332

def load_annot(self, ident: typing.Union[str, int]) -> Annot:

333

"""

334

Load annotation by identifier.

335

336

Parameters:

337

- ident: annotation identifier (xref number or unique name)

338

339

Returns:

340

Annot object

341

"""

342

343

def annot_names(self) -> list:

344

"""

345

Get list of annotation names on page.

346

347

Returns:

348

List of annotation names

349

"""

350

351

def annots(self, types: list = None) -> list:

352

"""

353

Get list of annotations on page.

354

355

Parameters:

356

- types: filter by annotation types

357

358

Returns:

359

List of Annot objects

360

"""

361

```

362

363

### Drawing and Vector Content

364

365

Extract vector graphics and drawing information.

366

367

```python { .api }

368

class Page:

369

def get_drawings(self, extended: bool = False) -> list:

370

"""

371

Get vector drawings from page.

372

373

Parameters:

374

- extended: include extended path information

375

376

Returns:

377

List of drawing dictionaries with paths, colors, etc.

378

"""

379

380

def get_cdrawings(self, extended: bool = False) -> list:

381

"""

382

Get drawings in compact format.

383

384

Parameters:

385

- extended: include extended information

386

387

Returns:

388

List of compact drawing representations

389

"""

390

```

391

392

## Usage Examples

393

394

### Basic Text Extraction

395

396

```python
import pymupdf

doc = pymupdf.open("document.pdf")
page = doc.load_page(0)

# Extract plain text using standalone function
text = pymupdf.get_text(page)
print(text)

# Extract with formatting as HTML
html = pymupdf.get_text(page, "html")
print(html)

# Extract detailed layout information
layout_dict = pymupdf.get_text(page, "dict")
for block in layout_dict["blocks"]:
    if "lines" in block: # Text block
        for line in block["lines"]:
            for span in line["spans"]:
                print(f"Text: {span['text']}, Font: {span['font']}, Size: {span['size']}")

# Extract text blocks
blocks = pymupdf.get_text_blocks(page)
for block in blocks:
    print(f"Block text: {block[4]}") # block[4] contains the text

# Extract individual words with coordinates
words = pymupdf.get_text_words(page)
for word in words:
    x0, y0, x1, y1, text, block_no, line_no, word_no = word
    print(f"Word '{text}' at ({x0}, {y0}, {x1}, {y1})")

doc.close()
```

431

432

### Advanced Text Search

433

434

```python
import pymupdf

doc = pymupdf.open("document.pdf")

# Search across all pages
search_term = "important keyword"
results = []

for page_num in range(doc.page_count):
    page = doc.load_page(page_num)
    matches = page.search_for(search_term, quads=True)
    for match in matches:
        results.append({
            "page": page_num,
            "text": search_term,
            "quad": match,
            "bbox": match.rect
        })

print(f"Found {len(results)} matches")
doc.close()
```

457

458

### Image Extraction with Details

459

460

```python
import pymupdf

doc = pymupdf.open("document.pdf")
page = doc.load_page(0)

# Get image information
images = page.get_images(full=True)

for img_index, img in enumerate(images):
    xref = img[0] # Image xref number
    pix = pymupdf.Pixmap(doc, xref) # Extract image

    if pix.n - pix.alpha < 4: # GRAY or RGB
        pix.save(f"image_{page.number}_{img_index}.png")
    else: # CMYK: convert to RGB first
        pix1 = pymupdf.Pixmap(pymupdf.csRGB, pix)
        pix1.save(f"image_{page.number}_{img_index}.png")
        pix1 = None

    pix = None

doc.close()
```

484

485

### Working with TextPage Objects

486

487

```python
import pymupdf

doc = pymupdf.open("document.pdf")
page = doc.load_page(0)

# Create TextPage for detailed analysis
textpage = page.get_textpage()

# Extract words with coordinates
words = textpage.extractWORDS()
for word in words:
    x0, y0, x1, y1, text, block_no, line_no, word_no = word
    print(f"Word: '{text}' at ({x0}, {y0}, {x1}, {y1})")

# Search within TextPage
matches = textpage.search("search term")
print(f"Found {len(matches)} matches")

doc.close()
```

508

509

### Link Analysis

510

511

```python
import pymupdf

doc = pymupdf.open("document.pdf")
page = doc.load_page(0)

# Get all links
links = page.get_links()

for link in links:
    print(f"Link type: {link['kind']}")
    print(f"From: {link['from']}") # Source rectangle

    if link['kind'] == pymupdf.LINK_URI:
        print(f"URI: {link['uri']}")
    elif link['kind'] == pymupdf.LINK_GOTO:
        print(f"Target page: {link['page']}")
        if 'to' in link:
            print(f"Target point: {link['to']}")

doc.close()
```