or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

annotations-forms.md, document-creation-modification.md, document-operations.md, document-rendering.md, geometry-transformations.md, index.md, page-content-extraction.md, table-extraction.md

docs/table-extraction.md

0

# Table Extraction

1

2

Advanced table detection and extraction capabilities with support for table structure analysis, cell content extraction, and export to various formats including pandas DataFrames. PyMuPDF provides sophisticated algorithms for identifying and parsing tabular data from PDF documents.

3

4

## Capabilities

5

6

### Table Finding and Detection

7

8

Locate tables within document pages with configurable detection settings.

9

10

```python { .api }

11

class TableFinder:
    """
    Table detector bound to a single page.

    NOTE(review): upstream PyMuPDF exposes detection through
    Page.find_tables(), which returns a TableFinder whose `tables` list holds
    the results -- confirm this method-based spec against the installed release.
    """

    def __init__(self, page: Page):
        """
        Create table finder for a page.

        Parameters:
        - page: Page object to search for tables
        """

    def find_tables(self, clip: Rect = None, strategy: str = "lines_strict",
                    vertical_strategy: str = "lines", horizontal_strategy: str = "lines",
                    explicit_vertical_lines: list = None, explicit_horizontal_lines: list = None,
                    snap_tolerance: float = 3, snap_x_tolerance: float = None,
                    snap_y_tolerance: float = None, join_tolerance: float = 3,
                    join_x_tolerance: float = None, join_y_tolerance: float = None,
                    edge_min_length: float = 3, min_words_vertical: float = 3,
                    min_words_horizontal: float = 1, intersection_tolerance: float = 3,
                    intersection_x_tolerance: float = None, intersection_y_tolerance: float = None,
                    text_tolerance: float = 3, text_x_tolerance: float = None,
                    text_y_tolerance: float = None) -> list:
        """
        Find tables on the page.

        All tolerances are distances in points. Each `*_x_tolerance` /
        `*_y_tolerance` pair refines the general tolerance of the same name for
        one direction; None means "use the general value".

        Parameters:
        - clip: rectangle to limit search area (None searches the whole page)
        - strategy: table detection strategy ("lines_strict", "lines", "text", "explicit")
        - vertical_strategy: strategy for detecting vertical (column) boundaries
        - horizontal_strategy: strategy for detecting horizontal (row) boundaries
        - explicit_vertical_lines: explicit vertical line positions
        - explicit_horizontal_lines: explicit horizontal line positions
        - snap_tolerance: parallel lines closer than this are snapped to a common position
        - snap_x_tolerance: x-direction snap tolerance
        - snap_y_tolerance: y-direction snap tolerance
        - join_tolerance: collinear line segments whose ends are closer than this are joined
        - join_x_tolerance: x-direction join tolerance
        - join_y_tolerance: y-direction join tolerance
        - edge_min_length: lines shorter than this are discarded
        - min_words_vertical: minimum aligned words needed to infer a vertical line (text strategy)
        - min_words_horizontal: minimum aligned words needed to infer a horizontal line (text strategy)
        - intersection_tolerance: maximum distance at which orthogonal lines still count as intersecting
        - intersection_x_tolerance: x-direction intersection tolerance
        - intersection_y_tolerance: y-direction intersection tolerance
        - text_tolerance: maximum spacing between characters that still belong to one word (text strategy)
        - text_x_tolerance: x-direction text tolerance
        - text_y_tolerance: y-direction text tolerance

        Returns:
        List of Table objects found on the page
        """

60

```

61

62

### Table Class

63

64

Individual table representation with extraction and manipulation capabilities.

65

66

```python { .api }

67

class Table:
    """
    A single detected table: its geometry, cells and export helpers.

    NOTE(review): upstream PyMuPDF documents extract(), to_pandas() and
    to_markdown() on Table; to_csv() and to_dict() as specified here should be
    confirmed against the installed release (pandas offers both on the
    DataFrame returned by to_pandas()).
    """

    def __init__(self, page: Page, bbox: Rect):
        """
        Create table object.

        Parameters:
        - page: parent Page object
        - bbox: table bounding rectangle
        """

    def extract(self, x_tolerance: float = 3, y_tolerance: float = 3) -> list:
        """
        Extract table data as list of rows.

        Parameters:
        - x_tolerance: horizontal tolerance for cell alignment
        - y_tolerance: vertical tolerance for cell alignment

        Returns:
        List of lists, one inner list per table row holding that row's cells
        """

    def to_pandas(self, **kwargs) -> 'pandas.DataFrame':
        """
        Convert table to pandas DataFrame.

        Parameters:
        - kwargs: additional pandas DataFrame parameters

        Returns:
        pandas DataFrame with table data
        """

    def to_csv(self, file_path: str = None, **kwargs) -> str:
        """
        Export table to CSV format.

        Parameters:
        - file_path: output file path (None for string return)
        - kwargs: additional CSV export parameters

        Returns:
        CSV string if file_path is None, otherwise None
        """

    def to_dict(self, orient: str = "records") -> typing.Union[list, dict]:
        """
        Convert table to dictionary format.

        Parameters:
        - orient: dictionary orientation ("records", "list", "dict", etc.)

        Returns:
        Table data as a list or dict, depending on `orient`
        """

    @property
    def bbox(self) -> Rect:
        """Table bounding rectangle."""

    @property
    def cells(self) -> list:
        """List of table cells with positions and content."""

    @property
    def rows(self) -> list:
        """List of table rows."""

    @property
    def cols(self) -> list:
        """List of table columns."""

138

```

139

140

### Table Settings and Configuration

141

142

Fine-tune table detection parameters for different document types.

143

144

```python { .api }

145

class TableSettings:
    """Read-only view of the parameters that drive table detection."""

    def __init__(self):
        """Create default table settings."""

    @property
    def vertical_strategy(self) -> str:
        """Strategy for vertical line detection."""

    @property
    def horizontal_strategy(self) -> str:
        """Strategy for horizontal line detection."""

    @property
    def snap_tolerance(self) -> float:
        """Distance within which parallel lines are snapped to a common position."""

    @property
    def join_tolerance(self) -> float:
        """Tolerance for joining line segments."""

    @property
    def edge_min_length(self) -> float:
        """Minimum line length to consider."""

    @property
    def min_words_vertical(self) -> float:
        """Minimum words to form vertical line."""

    @property
    def min_words_horizontal(self) -> float:
        """Minimum words to form horizontal line."""

    @property
    def intersection_tolerance(self) -> float:
        """Tolerance for line intersections."""

    @property
    def text_tolerance(self) -> float:
        """Tolerance for text-based detection."""

184

```

185

186

### Advanced Table Analysis

187

188

Analyze table structure and content for complex data extraction.

189

190

```python { .api }

191

class TableRow:
    """One row of a detected table."""

    @property
    def cells(self) -> list:
        """Cells in this row."""

    @property
    def bbox(self) -> Rect:
        """Row bounding rectangle."""

    @property
    def height(self) -> float:
        """Row height."""

203

204

class TableHeader:
    """Header row of a detected table."""

    @property
    def cells(self) -> list:
        """Header cells."""

    @property
    def bbox(self) -> Rect:
        """Header bounding rectangle."""

212

213

# Cell content analysis

214

class TextMap:
    """Text lookup helper for table analysis."""

    def __init__(self, page: Page):
        """Create text map for table analysis."""

    def get_text_in_bbox(self, bbox: Rect) -> str:
        """Get text within bounding box."""

220

221

class WordMap:
    """Word lookup helper for table analysis."""

    def __init__(self, page: Page):
        """Create word map for table analysis."""

    def get_words_in_bbox(self, bbox: Rect) -> list:
        """Get words within bounding box."""

227

```

228

229

### Simple Table Extraction Function

230

231

Convenient high-level function for basic table extraction.

232

233

```python { .api }

234

def find_tables(page: Page, **kwargs) -> list:
    """
    Find tables on page (convenience function).

    Parameters:
    - page: Page object to search
    - kwargs: table detection parameters

    Returns:
    List of Table objects

    NOTE(review): upstream PyMuPDF returns a TableFinder object whose
    `tables` attribute is the list of Table objects -- confirm the return
    type against the installed release.
    """

245

```

246

247

## Usage Examples

248

249

### Basic Table Extraction

250

251

```python

252

import pymupdf

253

254

# The context manager closes the document even if extraction raises.
with pymupdf.open("document_with_tables.pdf") as doc:
    page = doc.load_page(0)

    # Find tables on the page. find_tables() returns a TableFinder; the
    # detected Table objects are in its `tables` list.
    tables = page.find_tables().tables

    print(f"Found {len(tables)} tables")

    for i, table in enumerate(tables):
        print(f"\nTable {i + 1}:")
        print(f" Bounding box: {table.bbox}")

        # Extract table data
        table_data = table.extract()

        # Print table content
        for row_num, row in enumerate(table_data):
            print(f" Row {row_num}: {row}")

274

```

275

276

### Advanced Table Detection

277

278

```python

279

import pymupdf

280

281

# The context manager closes the document even if detection raises.
with pymupdf.open("complex_document.pdf") as doc:
    page = doc.load_page(0)

    # Detection settings are passed straight to Page.find_tables(); the
    # returned TableFinder keeps the detected tables in its `tables` list.
    table_finder = page.find_tables(
        strategy="lines",       # Use line-based detection
        snap_tolerance=5,       # More lenient line snapping
        join_tolerance=5,       # More aggressive line joining
        edge_min_length=10,     # Longer minimum lines
        min_words_vertical=2,   # Fewer words needed for vertical lines
        text_tolerance=5,       # Text-based detection tolerance
    )
    tables = table_finder.tables

    print(f"Found {len(tables)} tables with custom settings")

    for table in tables:
        # Extract with custom tolerances
        data = table.extract(x_tolerance=5, y_tolerance=3)
        print(f"Table with {len(data)} rows")

305

```

306

307

### Converting Tables to Different Formats

308

309

```python

310

import pymupdf

311

import pandas as pd

312

313

# The context manager closes the document even if a conversion raises.
with pymupdf.open("data_report.pdf") as doc:
    page = doc.load_page(0)

    # find_tables() returns a TableFinder; the tables are in its `tables` list.
    tables = page.find_tables().tables

    for i, table in enumerate(tables):
        # Convert to pandas DataFrame
        try:
            df = table.to_pandas()
            print(f"Table {i + 1}: {df.shape} DataFrame")
            print(df.head())

            # Save as CSV
            df.to_csv(f"table_{i + 1}.csv", index=False)

            # Save as Excel (pandas needs an Excel writer such as openpyxl)
            df.to_excel(f"table_{i + 1}.xlsx", index=False)

        except Exception as e:
            # Best-effort demo: report the failure and move on to the next table.
            print(f"Error converting table {i + 1}: {e}")
            continue

        # The remaining formats are derived from the DataFrame, which
        # provides both conversions directly.

        # Convert to dictionary
        table_dict = df.to_dict(orient="records")
        print(f"Table as dict: {len(table_dict)} records")

        # Convert to CSV string (no path given, so pandas returns the text)
        csv_string = df.to_csv(index=False)
        print(f"CSV length: {len(csv_string)} characters")

343

```

344

345

### Searching for Specific Tables

346

347

```python

348

import pymupdf

349

350

def find_tables_containing_text(page: pymupdf.Page, search_text: str) -> list:
    """
    Find tables that contain specific text.

    The match is a case-insensitive substring test against the string form
    of every non-empty cell.

    Parameters:
    - page: page whose tables are searched
    - search_text: text to look for

    Returns:
    List of matching tables, in detection order, each listed once
    """
    needle = search_text.lower()  # lower-case once instead of once per cell

    def has_match(table) -> bool:
        # any() stops at the first matching cell, so no second membership
        # test is needed to leave the row loop.
        return any(
            cell and needle in str(cell).lower()
            for row in table.extract()
            for cell in row
        )

    return [table for table in page.find_tables() if has_match(table)]

368

369

# The context manager closes the document even if the search raises.
with pymupdf.open("financial_report.pdf") as doc:
    # Search all pages for tables containing "Revenue"
    revenue_tables = []
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        for table in find_tables_containing_text(page, "Revenue"):
            # Capture geometry and cell data while the owning page is still
            # loaded, rather than re-extracting after the loop has moved on.
            revenue_tables.append((page_num, table.bbox, table.extract()))

    print(f"Found {len(revenue_tables)} tables containing 'Revenue'")

    for page_num, bbox, data in revenue_tables:
        print(f"Page {page_num + 1}: Table at {bbox}")
        # Process revenue table data...

386

```

387

388

### Table Structure Analysis

389

390

```python

391

import pymupdf

392

393

def analyze_table_structure(table: pymupdf.Table) -> dict:
    """
    Analyze table structure and provide statistics.

    Parameters:
    - table: table to inspect (its extract() result and bbox are used)

    Returns:
    {"error": "Empty table"} when extraction yields no rows, otherwise a dict
    with the keys "dimensions", "consistent_columns", "empty_cells",
    "total_cells", "fill_rate", "likely_header_row", "numeric_columns"
    and "bbox".
    """
    data = table.extract()
    if not data:
        return {"error": "Empty table"}

    num_rows = len(data)
    num_cols = len(data[0])  # the first row defines the expected width

    # Check for consistent column count
    consistent_cols = all(len(row) == num_cols for row in data)

    # Count cells that are None, empty or whitespace-only
    total_cells = sum(len(row) for row in data)
    empty_cells = sum(
        1
        for row in data
        for cell in row
        if not cell or str(cell).strip() == ""
    )

    # Detect header row (often has different formatting)
    likely_header = 0  # First row is most likely header

    def is_numeric(cell) -> bool:
        # Thousands separators and dollar signs are ignored when parsing.
        try:
            float(str(cell).replace(',', '').replace('$', ''))
        except ValueError:
            return False
        return True

    # A column is numeric when more than 70% of its body rows (header
    # excluded) parse as numbers; rows too short to reach the column count
    # as non-numeric.
    body_rows = data[1:]
    threshold = (num_rows - 1) * 0.7
    numeric_cols = []
    for col_idx in range(num_cols):
        numeric_count = sum(
            1
            for row in body_rows
            if col_idx < len(row) and is_numeric(row[col_idx])
        )
        if numeric_count > threshold:
            numeric_cols.append(col_idx)

    return {
        "dimensions": (num_rows, num_cols),
        "consistent_columns": consistent_cols,
        "empty_cells": empty_cells,
        "total_cells": total_cells,
        "fill_rate": (total_cells - empty_cells) / total_cells if total_cells > 0 else 0,
        "likely_header_row": likely_header,
        "numeric_columns": numeric_cols,
        "bbox": table.bbox,
    }

445

446

# The context manager closes the document even if the analysis raises.
with pymupdf.open("data_tables.pdf") as doc:
    page = doc.load_page(0)
    # find_tables() returns a TableFinder; the tables are in its `tables` list.
    tables = page.find_tables().tables

    for i, table in enumerate(tables):
        analysis = analyze_table_structure(table)
        print(f"\nTable {i + 1} Analysis:")
        for key, value in analysis.items():
            print(f" {key}: {value}")

457

```

458

459

### Merging Tables Across Pages

460

461

```python

462

import pymupdf

463

import pandas as pd

464

465

def extract_all_tables(doc: pymupdf.Document) -> list:
    """
    Extract all tables from all pages.

    Parameters:
    - doc: open document to scan

    Returns:
    List of dicts, one per table in page order, with the keys "page"
    (0-based page number), "bbox", "data" (extracted rows) and "dataframe"
    (pandas DataFrame, or None when the table has no rows)
    """
    all_tables = []

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)

        for table in page.find_tables():
            # Extract once and reuse the rows for the emptiness check
            # instead of running the extraction a second time.
            rows = table.extract()
            all_tables.append({
                "page": page_num,
                "bbox": table.bbox,
                "data": rows,
                "dataframe": table.to_pandas() if rows else None,
            })

    return all_tables

483

484

def merge_similar_tables(tables: list, similarity_threshold: float = 0.8) -> list:
    """
    Group tables with similar column structures.

    Each table is compared with the first member of every existing group and
    joins the first group that matches; otherwise it starts a new group.
    Two tables match when they have the same number of columns and the share
    of column labels they have in common is at least `similarity_threshold`.

    Parameters:
    - tables: table records with a "dataframe" entry (records whose
      dataframe is None are skipped)
    - similarity_threshold: minimum fraction of shared column labels (0..1)

    Returns:
    List of groups, each a non-empty list of the input records
    """
    from collections import Counter

    def columns_match(reference_df, current_df) -> bool:
        ref_cols = list(reference_df.columns)
        cur_cols = list(current_df.columns)
        if len(ref_cols) != len(cur_cols):
            return False
        if not ref_cols:
            # Two column-less frames have the same (empty) layout; returning
            # here also avoids dividing by zero below.
            return True
        # Multiset intersection: repeated labels count once per occurrence,
        # so identical headers with duplicates still score 100%.
        shared = sum((Counter(ref_cols) & Counter(cur_cols)).values())
        return shared / len(ref_cols) >= similarity_threshold

    merged_groups = []

    for table in tables:
        current_df = table["dataframe"]
        if current_df is None:
            continue

        similar_group = next(
            (
                group
                for group in merged_groups
                if columns_match(group[0]["dataframe"], current_df)
            ),
            None,
        )

        if similar_group is None:
            merged_groups.append([table])
        else:
            similar_group.append(table)

    return merged_groups

512

513

# Usage

514

# The context manager closes the document even if extraction raises.
with pymupdf.open("multi_page_report.pdf") as doc:
    all_tables = extract_all_tables(doc)
    print(f"Found {len(all_tables)} total tables")

    # Group similar tables
    table_groups = merge_similar_tables(all_tables)
    print(f"Grouped into {len(table_groups)} similar table groups")

    # Merge each group
    for i, group in enumerate(table_groups):
        if len(group) > 1:
            # Merge DataFrames
            dfs = [table["dataframe"] for table in group if table["dataframe"] is not None]
            merged_df = pd.concat(dfs, ignore_index=True)

            print(f"Group {i + 1}: Merged {len(group)} tables into {merged_df.shape} DataFrame")
            merged_df.to_csv(f"merged_tables_group_{i + 1}.csv", index=False)
        else:
            # Single table
            table = group[0]
            if table["dataframe"] is not None:
                # The group number is part of the file name so that two
                # ungrouped tables on the same page do not overwrite each other.
                table["dataframe"].to_csv(
                    f"single_table_group_{i + 1}_page_{table['page'] + 1}.csv",
                    index=False,
                )

538

```

539

540

### Custom Table Detection Strategies

541

542

```python

543

import pymupdf

544

545

def detect_tables_by_whitespace(page: pymupdf.Page, min_gap: float = 20) -> list:
    """
    Detect tables by analyzing whitespace patterns.

    Words are bucketed into rows by their top y-coordinate. A row with at
    least three words, all separated by more than `min_gap`, is treated as a
    table-row candidate, and the regular table finder is then run on a
    narrow band around each candidate.

    Parameters:
    - page: page to analyze
    - min_gap: minimum horizontal gap that must separate every pair of
      neighbouring words in a candidate row

    Returns:
    List of tables found in the candidate bands (empty if the page has no
    words or no row qualifies)
    """
    # Get all words with positions
    words = page.get_text("words")
    if not words:
        return []

    # Group words by approximate rows based on y-coordinates
    rows = {}
    for x0, y0, x1, _y1, text, *_ids in words:
        y_key = round(y0 / 5) * 5  # Group by 5-point intervals
        rows.setdefault(y_key, []).append((x0, x1, text))

    # Analyze column alignment
    candidate_rows = []
    for y_pos, row_words in sorted(rows.items()):
        if len(row_words) < 3:  # At least 3 columns
            continue
        row_words.sort()  # Sort by x position

        # Gap between the right edge of one word and the left edge of the next
        gaps = [
            right[0] - left[1]
            for left, right in zip(row_words, row_words[1:])
        ]
        if min(gaps) > min_gap:  # Significant gaps between words
            candidate_rows.append((y_pos, row_words))

    # Convert to Table-like objects (simplified)
    tables = []
    for y_pos, row_words in candidate_rows:
        # Create bounding box
        min_x = min(word[0] for word in row_words)
        max_x = max(word[1] for word in row_words)
        bbox = pymupdf.Rect(min_x, y_pos - 5, max_x, y_pos + 15)

        # This would need more sophisticated conversion to actual Table objects
        # For demonstration, we'll use the regular table finder on this area
        tables.extend(page.find_tables(clip=bbox))

    return tables

594

595

# Usage

596

# The context manager closes the document even if detection raises.
with pymupdf.open("whitespace_tables.pdf") as doc:
    page = doc.load_page(0)

    # Try different detection methods. find_tables() returns a TableFinder
    # (tables are in its `tables` list); the whitespace helper returns a
    # plain list.
    regular_tables = page.find_tables().tables
    whitespace_tables = detect_tables_by_whitespace(page)

    print(f"Regular detection: {len(regular_tables)} tables")
    print(f"Whitespace detection: {len(whitespace_tables)} tables")

607

```