tessl install tessl/pypi-paddleocr@3.3.0

Industry-leading OCR and document AI engine that converts documents and images into structured, AI-friendly data formats, with comprehensive solutions ranging from text extraction to intelligent document understanding.
Advanced table recognition pipeline supporting both wired (bordered) and wireless (borderless) tables. Converts table images to structured HTML with cell detection, OCR, and intelligent table structure recognition.
class TableRecognitionPipelineV2:
"""
Advanced table recognition for wired and wireless tables.
Combines table classification, structure recognition, cell detection,
and OCR to convert table images to structured HTML format.
"""
def __init__(
self,
# Table recognition models
table_classification_model_name: str = None,
table_classification_model_dir: str = None,
table_structure_recognition_model_name: str = None,
table_structure_recognition_model_dir: str = None,
table_cells_detection_model_name: str = None,
table_cells_detection_model_dir: str = None,
# OCR models
text_detection_model_name: str = None,
text_detection_model_dir: str = None,
text_recognition_model_name: str = None,
text_recognition_model_dir: str = None,
text_recognition_batch_size: int = None,
# Layout detection
layout_detection_model_name: str = None,
layout_detection_model_dir: str = None,
use_layout_detection: bool = None,
layout_threshold: float = None,
# Document preprocessing
doc_orientation_classify_model_name: str = None,
doc_unwarping_model_name: str = None,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
# OCR parameters
text_det_limit_side_len: int = None,
text_det_thresh: float = None,
text_det_box_thresh: float = None,
text_rec_score_thresh: float = None,
# Common parameters
paddlex_config: str = None,
device: str = None,
use_hpi: bool = None,
**kwargs
):
"""
Initialize table recognition pipeline.
Args:
table_classification_model_name (str, optional): Table type classifier
Default: 'PP-LCNet_x1_0_table_cls'
table_structure_recognition_model_name (str, optional): Structure model
Default: 'SLANet'
table_cells_detection_model_name (str, optional): Cell detection model
Default: 'RT-DETR-L_wired_table_cell_det'
use_layout_detection (bool, optional): Enable layout detection for
automatic table region detection
text_rec_score_thresh (float, optional): Text recognition threshold
device (str, optional): Device for inference ('cpu', 'gpu')
paddlex_config (str or dict, optional): Configuration file or dict
"""def predict(
self,
input,
*,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
use_layout_detection: bool = None,
use_ocr_model: bool = None,
overall_ocr_res: list = None,
layout_det_res: list = None,
text_det_limit_side_len: int = None,
text_det_limit_type: str = None,
text_det_thresh: float = None,
text_det_box_thresh: float = None,
text_det_unclip_ratio: float = None,
text_rec_score_thresh: float = None,
use_e2e_wired_table_rec_model: bool = False,
use_e2e_wireless_table_rec_model: bool = False,
use_wired_table_cells_trans_to_html: bool = False,
use_wireless_table_cells_trans_to_html: bool = False,
use_table_orientation_classify: bool = True,
use_ocr_results_with_table_cells: bool = True,
**kwargs
) -> list:
"""
Recognize tables and convert to HTML.
Args:
input: Image/PDF path, numpy array, PIL Image, directory, or list
use_layout_detection (bool, optional): Override layout detection
use_ocr_model (bool, optional): Use OCR for cell text recognition
overall_ocr_res (list, optional): Pre-computed OCR results
layout_det_res (list, optional): Pre-computed layout results
text_rec_score_thresh (float, optional): Text recognition threshold
use_e2e_wired_table_rec_model (bool, optional): Use end-to-end model for wired tables
use_e2e_wireless_table_rec_model (bool, optional): Use end-to-end model for wireless tables
use_wired_table_cells_trans_to_html (bool, optional): Convert wired table cells to HTML
use_wireless_table_cells_trans_to_html (bool, optional): Convert wireless table cells to HTML
use_table_orientation_classify (bool, optional): Classify table orientation
use_ocr_results_with_table_cells (bool, optional): Use OCR with cell detection
Returns:
list: Table recognition results with HTML output
"""
def predict_iter(self, input, **kwargs):
"""Iterate over table recognition results for memory efficiency."""
def close(self) -> None:
"""Close the pipeline and free resources."""
def export_paddlex_config_to_yaml(self, yaml_path: str) -> None:
"""Export configuration to YAML."""from paddleocr import TableRecognitionPipelineV2
# Initialize pipeline
pipeline = TableRecognitionPipelineV2()

# Recognize table
result = pipeline.predict('table.jpg')

# Get HTML output
for item in result:
    for table in item.get('table_result', []):
        print(table['html'])

pipeline.close()

from paddleocr import TableRecognitionPipelineV2
# Enable layout detection to find tables automatically
pipeline = TableRecognitionPipelineV2(
    use_layout_detection=True
)

# Process document
result = pipeline.predict('report.pdf')

# Extract all tables
for item in result:
    tables = item.get('table_result', [])
    print(f"Found {len(tables)} tables")
    for i, table in enumerate(tables, 1):
        # Save HTML
        with open(f'table_{i}.html', 'w') as f:
            f.write(table['html'])
        print(f"Table {i} saved")

pipeline.close()

from paddleocr import TableRecognitionPipelineV2
pipeline = TableRecognitionPipelineV2()

# The pipeline automatically classifies and handles both types
result = pipeline.predict(['wired_table.jpg', 'wireless_table.jpg'])

for item in result:
    for table in item.get('table_result', []):
        table_type = table.get('table_type', 'unknown')
        print(f"Table type: {table_type}")
        print(f"HTML: {table['html']}\n")

pipeline.close()

from paddleocr import TableRecognitionPipelineV2
# Configure for high quality
pipeline = TableRecognitionPipelineV2(
    text_rec_score_thresh=0.7,  # Higher threshold filters low-confidence text
    use_doc_orientation_classify=True,
    use_doc_unwarping=True
)

result = pipeline.predict('complex_table.jpg')

pipeline.close()

from paddleocr import TableRecognitionPipelineV2
import pandas as pd

pipeline = TableRecognitionPipelineV2()
result = pipeline.predict('table.jpg')

for item in result:
    for table in item.get('table_result', []):
        html = table['html']
        # Convert HTML to pandas DataFrame
        df = pd.read_html(html)[0]
        # Save as CSV
        df.to_csv('table.csv', index=False)
        # Save as Excel
        df.to_excel('table.xlsx', index=False)
        # Print table
        print(df)

pipeline.close()

from paddleocr import TableRecognitionPipelineV2
import os

pipeline = TableRecognitionPipelineV2(
    use_layout_detection=True
)

# Process quarterly reports
results = pipeline.predict('quarterly_reports/')

# Save tables organized by document
for item in results:
    doc_name = os.path.basename(item['input_path'])
    output_dir = f'tables/{doc_name}'
    os.makedirs(output_dir, exist_ok=True)
    tables = item.get('table_result', [])
    for i, table in enumerate(tables):
        html_path = os.path.join(output_dir, f'table_{i}.html')
        with open(html_path, 'w') as f:
            f.write(table['html'])

pipeline.close()

from paddleocr import PaddleOCR, TableRecognitionPipelineV2
# First, run OCR
ocr = PaddleOCR()
ocr_results = ocr.predict('document_with_tables.pdf')

# Use OCR results for table recognition
pipeline = TableRecognitionPipelineV2()
result = pipeline.predict(
    'document_with_tables.pdf',
    overall_ocr_res=ocr_results
)

ocr.close()
pipeline.close()

from paddleocr import TableRecognitionPipelineV2
pipeline = TableRecognitionPipelineV2()

# Enable cell-to-HTML conversion
result = pipeline.predict(
    'table.jpg',
    use_wired_table_cells_trans_to_html=True,
    use_wireless_table_cells_trans_to_html=True
)

pipeline.close()

from paddleocr import TableRecognitionPipelineV2
import json
pipeline = TableRecognitionPipelineV2(use_layout_detection=True)
# Process large collection
table_data = []
for result in pipeline.predict_iter('large_dataset/'):
doc = result['input_path']
for table in result.get('table_result', []):
table_data.append({
'document': doc,
'html': table['html'],
'bbox': table['bbox']
})
# Save incrementally
with open('tables.jsonl', 'a') as f:
f.write(json.dumps({'doc': doc, 'tables': result.get('table_result', [])}) + '\n')
pipeline.close()[
{
"input_path": "path/to/document.pdf",
"table_result": [
{
"bbox": [x1, y1, x2, y2], # Table bounding box
"table_type": "wired", # 'wired' or 'wireless'
"html": "<html><body><table>...</table></body></html>",
"structure": {
"rows": 5,
"cols": 3,
"cells": [
{
"bbox": [x1, y1, x2, y2],
"text": "Cell content",
"row": 0,
"col": 0,
"rowspan": 1,
"colspan": 1
}
]
},
"confidence": 0.95
}
],
"layout_result": [ # If use_layout_detection=True
{
"bbox": [x1, y1, x2, y2],
"label": "table",
"score": 0.98
}
],
"ocr_result": [ # If use_ocr_model=True
{
"dt_polys": [[x1,y1], [x2,y2], ...],
"rec_text": "text content",
"rec_score": 0.96
}
]
}
]

Tip: pass device='gpu' for faster processing.

Default Models:
The pipeline generates clean HTML table output:
<html>
<body>
<table>
<thead>
<tr>
<th>Header 1</th>
<th>Header 2</th>
</tr>
</thead>
<tbody>
<tr>
<td>Cell 1</td>
<td>Cell 2</td>
</tr>
</tbody>
</table>
</body>
</html>

Supports: