or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cli.md · index.md · page-manipulation.md · pdf-operations.md · table-extraction.md · text-extraction.md · utilities.md · visual-debugging.md

docs/table-extraction.md

0

# Table Extraction

1

2

Sophisticated table detection and extraction capabilities with customizable strategies, edge detection algorithms, comprehensive configuration options, and visual debugging support.

3

4

## Capabilities

5

6

### Table Finding

7

8

Detect all tables on a page using various detection strategies and algorithms.

9

10

```python { .api }
def find_tables(table_settings=None):
    """
    Find all tables using detection algorithms.

    Parameters:
    - table_settings: TableSettings or dict, optional - Configuration for detection

    Returns:
    List[Table]: List of detected table objects
    """

def find_table(table_settings=None):
    """
    Find largest table on page.

    Parameters:
    - table_settings: TableSettings or dict, optional - Configuration for detection

    Returns:
    Table or None: Largest detected table or None if no tables found
    """
```

33

34

**Usage Examples:**

35

36

```python
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Find all tables with default settings
    tables = page.find_tables()
    print(f"Found {len(tables)} tables")

    # Find largest table only
    main_table = page.find_table()
    if main_table:
        print(f"Main table area: {main_table.bbox}")

    # Find tables with custom settings
    custom_settings = {
        "vertical_strategy": "text",
        "horizontal_strategy": "lines"
    }
    tables = page.find_tables(table_settings=custom_settings)
```

56

57

### Table Extraction

58

59

Extract table data as structured 2D arrays with various formatting options.

60

61

```python { .api }
def extract_tables(table_settings=None):
    """
    Extract all tables as 2D arrays.

    Parameters:
    - table_settings: TableSettings or dict, optional - Configuration for detection

    Returns:
    List[List[List[Optional[str]]]]: List of tables, each as 2D array of cell strings (None where a cell is missing)
    """

def extract_table(table_settings=None):
    """
    Extract largest table as 2D array.

    Parameters:
    - table_settings: TableSettings or dict, optional - Configuration for detection

    Returns:
    List[List[Optional[str]]] or None: 2D array of cell strings (None where a cell is missing), or None if no table found
    """
```

84

85

**Usage Examples:**

86

87

```python
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Extract all tables
    tables = page.extract_tables()
    for i, table in enumerate(tables):
        print(f"Table {i+1}:")
        for row in table:
            print(" ", row)

    # Extract main table only
    main_table = page.extract_table()
    if main_table:
        # Process header row
        headers = main_table[0]
        data_rows = main_table[1:]

        for row in data_rows:
            row_dict = dict(zip(headers, row))
            print(row_dict)
```

109

110

### Table Class

111

112

Represents a detected table with extraction and analysis capabilities.

113

114

```python { .api }
class Table:
    """Detected table with extraction capabilities."""

    def __init__(self, page, cells):
        """Initialize table from page and cell data."""

    @property
    def bbox(self) -> T_bbox:
        """Table bounding box coordinates."""

    @property
    def cells(self) -> List[T_bbox]:
        """List of cell bounding boxes."""

    @property
    def rows(self) -> List[CellGroup]:
        """Table rows as CellGroup objects."""

    @property
    def columns(self) -> List[CellGroup]:
        """Table columns as CellGroup objects."""

    def extract(self, **kwargs):
        """
        Extract table data as 2D array.

        Parameters:
        - **kwargs: Text extraction options for cell content

        Returns:
        List[List[Optional[str]]]: 2D array of cell text content (None where a cell is missing)
        """
```

148

149

**Usage Examples:**

150

151

```python
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    tables = page.find_tables()
    for table in tables:
        print(f"Table at {table.bbox}")
        print(f"Dimensions: {len(table.rows)} rows × {len(table.columns)} columns")

        # Extract with custom text options
        data = table.extract(layout=True, x_tolerance=1)

        # Analyze cell structure
        for i, row in enumerate(table.rows):
            print(f"Row {i}: {len(row.cells)} cells")
```

167

168

### TableFinder Class

169

170

Handles the table detection algorithm implementation and provides debugging capabilities.

171

172

```python { .api }
class TableFinder:
    """Table detection algorithm implementation."""

    def __init__(self, page, settings=None):
        """Initialize TableFinder with page and settings."""

    @property
    def page(self) -> Page:
        """Source page object."""

    @property
    def settings(self) -> TableSettings:
        """Table detection settings."""

    @property
    def edges(self) -> T_obj_list:
        """Detected edges for table detection."""

    @property
    def intersections(self) -> T_intersections:
        """Edge intersection points."""

    @property
    def cells(self) -> List[T_bbox]:
        """Detected table cells."""

    @property
    def tables(self) -> List[Table]:
        """Detected table objects."""

    def get_edges(self):
        """Get edges based on detection strategy."""
```

206

207

### TableSettings Class

208

209

Comprehensive configuration class for table detection parameters and strategies.

210

211

```python { .api }
class TableSettings:
    """Configuration for table detection parameters."""

    def __init__(self, vertical_strategy="lines", horizontal_strategy="lines",
                 explicit_vertical_lines=None, explicit_horizontal_lines=None,
                 snap_tolerance=3, snap_x_tolerance=None, snap_y_tolerance=None,
                 join_tolerance=3, join_x_tolerance=None, join_y_tolerance=None,
                 edge_min_length=3, min_words_vertical=3, min_words_horizontal=1,
                 intersection_tolerance=3, intersection_x_tolerance=None,
                 intersection_y_tolerance=None, text_settings=None):
        """Initialize table detection settings."""

    @classmethod
    def resolve(cls, settings):
        """
        Create TableSettings from dict or existing instance.

        Parameters:
        - settings: dict, TableSettings, or None

        Returns:
        TableSettings: Resolved settings object
        """

    # Detection strategy options
    vertical_strategy: str  # "lines", "lines_strict", "text", "explicit"
    horizontal_strategy: str  # "lines", "lines_strict", "text", "explicit"

    # Explicit line positions
    explicit_vertical_lines: Optional[List[T_num]]
    explicit_horizontal_lines: Optional[List[T_num]]

    # Edge processing tolerances
    snap_tolerance: T_num
    snap_x_tolerance: Optional[T_num]
    snap_y_tolerance: Optional[T_num]
    join_tolerance: T_num
    join_x_tolerance: Optional[T_num]
    join_y_tolerance: Optional[T_num]
    edge_min_length: T_num

    # Text-based detection parameters
    min_words_vertical: int
    min_words_horizontal: int

    # Intersection detection
    intersection_tolerance: T_num
    intersection_x_tolerance: Optional[T_num]
    intersection_y_tolerance: Optional[T_num]

    # Text extraction settings for cells
    text_settings: Optional[Dict[str, Any]]
```

265

266

**Usage Examples:**

267

268

```python
from pdfplumber.table import TableSettings

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Custom settings for line-based detection
    line_settings = TableSettings(
        vertical_strategy="lines_strict",
        horizontal_strategy="lines_strict",
        snap_tolerance=2,
        edge_min_length=10
    )

    # Custom settings for text-based detection
    text_settings = TableSettings(
        vertical_strategy="text",
        horizontal_strategy="text",
        min_words_vertical=2,
        min_words_horizontal=1
    )

    # Explicit line positions
    explicit_settings = TableSettings(
        vertical_strategy="explicit",
        horizontal_strategy="explicit",
        explicit_vertical_lines=[100, 200, 300, 400],
        explicit_horizontal_lines=[50, 100, 150, 200]
    )

    # Use settings
    tables = page.find_tables(table_settings=line_settings)
```

301

302

### Table Debugging

303

304

Visual debugging capabilities for understanding table detection algorithms.

305

306

```python { .api }
def debug_tablefinder(table_settings=None):
    """
    Get TableFinder for debugging table detection.

    Parameters:
    - table_settings: TableSettings or dict, optional

    Returns:
    TableFinder: TableFinder object for algorithm inspection
    """
```

318

319

**Usage Examples:**

320

321

```python
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Debug table detection process
    finder = page.debug_tablefinder()

    print(f"Detected {len(finder.edges)} edges")
    print(f"Found {len(finder.intersections)} intersections")
    print(f"Identified {len(finder.cells)} cells")
    print(f"Grouped into {len(finder.tables)} tables")

    # Visualize detection process
    im = page.to_image()
    im.debug_tablefinder(table_settings=finder.settings)
    im.save("table_debug.png")
```

338

339

### Cell Group Classes

340

341

Helper classes for table structure analysis.

342

343

```python { .api }
class CellGroup:
    """Base class for table rows and columns."""

    @property
    def cells(self) -> List[T_bbox]:
        """Cell bounding boxes in this group."""

    @property
    def bbox(self) -> T_bbox:
        """Bounding box of entire group."""

class Row(CellGroup):
    """Table row representation."""

class Column(CellGroup):
    """Table column representation."""
```

361

362

## Advanced Table Detection Strategies

363

364

### Line-Based Detection

365

366

```python
# Strict line detection - only uses actual PDF line objects
settings = TableSettings(
    vertical_strategy="lines_strict",
    horizontal_strategy="lines_strict"
)

# Flexible line detection - includes rectangle edges
settings = TableSettings(
    vertical_strategy="lines",
    horizontal_strategy="lines"
)
```

379

380

### Text-Based Detection

381

382

```python
# Use text alignment to infer table structure
settings = TableSettings(
    vertical_strategy="text",
    horizontal_strategy="text",
    min_words_vertical=3,  # Minimum words to establish column
    min_words_horizontal=2  # Minimum words to establish row
)
```

391

392

### Explicit Line Detection

393

394

```python
# Manually specify table grid lines
settings = TableSettings(
    vertical_strategy="explicit",
    horizontal_strategy="explicit",
    explicit_vertical_lines=[72, 144, 216, 288],  # X coordinates
    explicit_horizontal_lines=[100, 130, 160, 190]  # Y coordinates
)
```

403

404

### Hybrid Detection

405

406

```python
# Combine different strategies for horizontal and vertical
settings = TableSettings(
    vertical_strategy="text",  # Use text alignment for columns
    horizontal_strategy="lines",  # Use lines for rows
    snap_tolerance=5,  # Snap nearby elements together
    join_tolerance=2  # Join connected elements
)
```