Tessl Tile for pypi/pdfplumber@0.11.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

cli.md index.md page-manipulation.md pdf-operations.md table-extraction.md text-extraction.md utilities.md visual-debugging.md

docs/table-extraction.md

0
# Table Extraction
1

2
Sophisticated table detection and extraction capabilities with customizable strategies, edge detection algorithms, comprehensive configuration options, and visual debugging support.
3

4
## Capabilities
5

6
### Table Finding
7

8
Detect all tables on a page, or only the largest one, using configurable detection strategies.
9

10
```python { .api }
11
def find_tables(table_settings=None):
12
    """
13
    Find all tables using detection algorithms.
14
    
15
    Parameters:
16
    - table_settings: TableSettings or dict, optional - Configuration for detection
17
    
18
    Returns:
19
    List[Table]: List of detected table objects
20
    """
21

22
def find_table(table_settings=None):
23
    """
24
    Find largest table on page.
25
    
26
    Parameters:
27
    - table_settings: TableSettings or dict, optional - Configuration for detection
28
    
29
    Returns:
30
    Table or None: Largest detected table or None if no tables found
31
    """
32
```
33

34
**Usage Examples:**
35

36
```python
37
with pdfplumber.open("document.pdf") as pdf:
38
    page = pdf.pages[0]
39
    
40
    # Find all tables with default settings
41
    tables = page.find_tables()
42
    print(f"Found {len(tables)} tables")
43
    
44
    # Find largest table only
45
    main_table = page.find_table()
46
    if main_table:
47
        print(f"Main table area: {main_table.bbox}")
48
    
49
    # Find tables with custom settings
50
    custom_settings = {
51
        "vertical_strategy": "text",
52
        "horizontal_strategy": "lines"
53
    }
54
    tables = page.find_tables(table_settings=custom_settings)
55
```
56

57
### Table Extraction
58

59
Extract table data as structured 2D arrays with various formatting options.
60

61
```python { .api }
62
def extract_tables(table_settings=None):
63
    """
64
    Extract all tables as 2D arrays.
65
    
66
    Parameters:
67
    - table_settings: TableSettings or dict, optional - Configuration for detection
68
    
69
    Returns:
70
    List[List[List[str]]]: List of tables, each as 2D array of strings
71
    """
72

73
def extract_table(table_settings=None):
74
    """
75
    Extract largest table as 2D array.
76
    
77
    Parameters:
78
    - table_settings: TableSettings or dict, optional - Configuration for detection
79
    
80
    Returns:
81
    List[List[str]] or None: 2D array of strings or None if no table found
82
    """
83
```
84

85
**Usage Examples:**
86

87
```python
88
with pdfplumber.open("document.pdf") as pdf:
89
    page = pdf.pages[0]
90
    
91
    # Extract all tables
92
    tables = page.extract_tables()
93
    for i, table in enumerate(tables):
94
        print(f"Table {i+1}:")
95
        for row in table:
96
            print("  ", row)
97
    
98
    # Extract main table only  
99
    main_table = page.extract_table()
100
    if main_table:
101
        # Process header row
102
        headers = main_table[0]
103
        data_rows = main_table[1:]
104
        
105
        for row in data_rows:
106
            row_dict = dict(zip(headers, row))
107
            print(row_dict)
108
```
109

110
### Table Class
111

112
Represents a detected table with extraction and analysis capabilities.
113

114
```python { .api }
115
class Table:
116
    """Detected table with extraction capabilities."""
117
    
118
    def __init__(self, page, cells):
119
        """Initialize table from page and cell data."""
120
    
121
    @property
122
    def bbox(self) -> T_bbox:
123
        """Table bounding box coordinates."""
124
    
125
    @property
126
    def cells(self) -> List[T_bbox]:
127
        """List of cell bounding boxes."""
128
    
129
    @property
130
    def rows(self) -> List[CellGroup]:
131
        """Table rows as CellGroup objects."""
132
    
133
    @property
134
    def columns(self) -> List[CellGroup]:
135
        """Table columns as CellGroup objects."""
136
    
137
    def extract(self, **kwargs):
138
        """
139
        Extract table data as 2D array.
140
        
141
        Parameters:
142
        - **kwargs: Text extraction options for cell content
143
        
144
        Returns:
145
        List[List[str]]: 2D array of cell text content
146
        """
147
```
148

149
**Usage Examples:**
150

151
```python
152
with pdfplumber.open("document.pdf") as pdf:
153
    page = pdf.pages[0]
154
    
155
    tables = page.find_tables()
156
    for table in tables:
157
        print(f"Table at {table.bbox}")
158
        print(f"Dimensions: {len(table.rows)} rows × {len(table.columns)} columns")
159
        
160
        # Extract with custom text options
161
        data = table.extract(layout=True, x_tolerance=1)
162
        
163
        # Analyze cell structure
164
        for i, row in enumerate(table.rows):
165
            print(f"Row {i}: {len(row.cells)} cells")
166
```
167

168
### TableFinder Class
169

170
Implements the table detection algorithm and exposes its intermediate results (edges, intersections, cells, and tables) for debugging.
171

172
```python { .api }
173
class TableFinder:
174
    """Table detection algorithm implementation."""
175
    
176
    def __init__(self, page, settings=None):
177
        """Initialize TableFinder with page and settings."""
178
    
179
    @property
180
    def page(self) -> Page:
181
        """Source page object."""
182
    
183
    @property
184
    def settings(self) -> TableSettings:
185
        """Table detection settings."""
186
    
187
    @property
188
    def edges(self) -> T_obj_list:
189
        """Detected edges for table detection."""
190
    
191
    @property
192
    def intersections(self) -> T_intersections:
193
        """Edge intersection points."""
194
    
195
    @property
196
    def cells(self) -> List[T_bbox]:
197
        """Detected table cells."""
198
    
199
    @property
200
    def tables(self) -> List[Table]:
201
        """Detected table objects."""
202
    
203
    def get_edges(self):
204
        """Get edges based on detection strategy."""
205
```
206

207
### TableSettings Class
208

209
Comprehensive configuration class for table detection parameters and strategies.
210

211
```python { .api }
212
class TableSettings:
213
    """Configuration for table detection parameters."""
214
    
215
    def __init__(self, vertical_strategy="lines", horizontal_strategy="lines",
216
                 explicit_vertical_lines=None, explicit_horizontal_lines=None,
217
                 snap_tolerance=3, snap_x_tolerance=None, snap_y_tolerance=None,
218
                 join_tolerance=3, join_x_tolerance=None, join_y_tolerance=None,
219
                 edge_min_length=3, min_words_vertical=3, min_words_horizontal=1,
220
                 intersection_tolerance=3, intersection_x_tolerance=None,
221
                 intersection_y_tolerance=None, text_settings=None):
222
        """Initialize table detection settings."""
223
    
224
    @classmethod
225
    def resolve(cls, settings):
226
        """
227
        Create TableSettings from dict or existing instance.
228
        
229
        Parameters:
230
        - settings: dict, TableSettings, or None
231
        
232
        Returns:
233
        TableSettings: Resolved settings object
234
        """
235
    
236
    # Detection strategy options
237
    vertical_strategy: str  # "lines", "lines_strict", "text", "explicit"
238
    horizontal_strategy: str  # "lines", "lines_strict", "text", "explicit"
239
    
240
    # Explicit line positions
241
    explicit_vertical_lines: Optional[List[T_num]]
242
    explicit_horizontal_lines: Optional[List[T_num]]
243
    
244
    # Edge processing tolerances
245
    snap_tolerance: T_num
246
    snap_x_tolerance: Optional[T_num]
247
    snap_y_tolerance: Optional[T_num]
248
    join_tolerance: T_num
249
    join_x_tolerance: Optional[T_num]
250
    join_y_tolerance: Optional[T_num]
251
    edge_min_length: T_num
252
    
253
    # Text-based detection parameters
254
    min_words_vertical: int
255
    min_words_horizontal: int
256
    
257
    # Intersection detection
258
    intersection_tolerance: T_num
259
    intersection_x_tolerance: Optional[T_num]
260
    intersection_y_tolerance: Optional[T_num]
261
    
262
    # Text extraction settings for cells
263
    text_settings: Optional[Dict[str, Any]]
264
```
265

266
**Usage Examples:**
267

268
```python
269
from pdfplumber.table import TableSettings
270

271
with pdfplumber.open("document.pdf") as pdf:
272
    page = pdf.pages[0]
273
    
274
    # Custom settings for line-based detection
275
    line_settings = TableSettings(
276
        vertical_strategy="lines_strict",
277
        horizontal_strategy="lines_strict",
278
        snap_tolerance=2,
279
        edge_min_length=10
280
    )
281
    
282
    # Custom settings for text-based detection
283
    text_settings = TableSettings(
284
        vertical_strategy="text",
285
        horizontal_strategy="text", 
286
        min_words_vertical=2,
287
        min_words_horizontal=1
288
    )
289
    
290
    # Explicit line positions
291
    explicit_settings = TableSettings(
292
        vertical_strategy="explicit",
293
        horizontal_strategy="explicit",
294
        explicit_vertical_lines=[100, 200, 300, 400],
295
        explicit_horizontal_lines=[50, 100, 150, 200]
296
    )
297
    
298
    # Use settings
299
    tables = page.find_tables(table_settings=line_settings)
300
```
301

302
### Table Debugging
303

304
Visual debugging capabilities for understanding table detection algorithms.
305

306
```python { .api }
307
def debug_tablefinder(table_settings=None):
308
    """
309
    Get TableFinder for debugging table detection.
310
    
311
    Parameters:
312
    - table_settings: TableSettings or dict, optional
313
    
314
    Returns:
315
    TableFinder: TableFinder object for algorithm inspection
316
    """
317
```
318

319
**Usage Examples:**
320

321
```python
322
with pdfplumber.open("document.pdf") as pdf:
323
    page = pdf.pages[0]
324
    
325
    # Debug table detection process
326
    finder = page.debug_tablefinder()
327
    
328
    print(f"Detected {len(finder.edges)} edges")
329
    print(f"Found {len(finder.intersections)} intersections")
330
    print(f"Identified {len(finder.cells)} cells")
331
    print(f"Grouped into {len(finder.tables)} tables")
332
    
333
    # Visualize detection process
334
    im = page.to_image()
335
    im.debug_tablefinder(table_settings=finder.settings)
336
    im.save("table_debug.png")
337
```
338

339
### Cell Group Classes
340

341
Helper classes for table structure analysis.
342

343
```python { .api }
344
class CellGroup:
345
    """Base class for table rows and columns."""
346
    
347
    @property
348
    def cells(self) -> List[T_bbox]:
349
        """Cell bounding boxes in this group."""
350
    
351
    @property
352
    def bbox(self) -> T_bbox:
353
        """Bounding box of entire group."""
354

355
class Row(CellGroup):
356
    """Table row representation."""
357

358
class Column(CellGroup):
359
    """Table column representation."""
360
```
361

362
## Advanced Table Detection Strategies
363

364
### Line-Based Detection
365

366
```python
367
# Strict line detection - only uses actual PDF line objects
368
settings = TableSettings(
369
    vertical_strategy="lines_strict",
370
    horizontal_strategy="lines_strict"
371
)
372

373
# Flexible line detection - includes rectangle edges
374
settings = TableSettings(
375
    vertical_strategy="lines",
376
    horizontal_strategy="lines"
377
)
378
```
379

380
### Text-Based Detection
381

382
```python
383
# Use text alignment to infer table structure
384
settings = TableSettings(
385
    vertical_strategy="text",
386
    horizontal_strategy="text",
387
    min_words_vertical=3,  # Minimum words to establish column
388
    min_words_horizontal=2  # Minimum words to establish row
389
)
390
```
391

392
### Explicit Line Detection
393

394
```python
395
# Manually specify table grid lines
396
settings = TableSettings(
397
    vertical_strategy="explicit",
398
    horizontal_strategy="explicit",
399
    explicit_vertical_lines=[72, 144, 216, 288],  # X coordinates
400
    explicit_horizontal_lines=[100, 130, 160, 190]  # Y coordinates
401
)
402
```
403

404
### Hybrid Detection
405

406
```python
407
# Combine different strategies for horizontal and vertical
408
settings = TableSettings(
409
    vertical_strategy="text",      # Use text alignment for columns
410
    horizontal_strategy="lines",   # Use lines for rows
411
    snap_tolerance=5,             # Snap nearby elements together
412
    join_tolerance=2              # Join connected elements
413
)
414
```

Version

Tile

Files

docs/table-extraction.md

docs/table-extraction.md