tessl install tessl/pypi-paddleocr@3.3.0

Industry-leading OCR and document AI engine that converts documents and images into structured, AI-friendly data formats with comprehensive solutions from text extraction to intelligent document understanding.
Comprehensive document structure recognition pipeline that converts complex PDFs and document images into Markdown and JSON files while preserving original structure. Includes layout detection, OCR, table recognition, formula recognition, seal recognition, and chart recognition.
Initialize PP-StructureV3 with configuration for various recognition modules.
class PPStructureV3:
    """
    Comprehensive document structure recognition pipeline.

    Converts complex PDFs and document images into Markdown/JSON while preserving
    structure through layout detection, OCR, table recognition, formula recognition,
    seal recognition, and chart recognition.
    """

    def __init__(
        self,
        layout_detection_model_name: str = None,
        layout_detection_model_dir: str = None,
        layout_threshold: float = None,
        layout_nms: bool = None,
        layout_unclip_ratio: float = None,
        layout_merge_bboxes_mode: str = None,
        chart_recognition_model_name: str = None,
        chart_recognition_model_dir: str = None,
        chart_recognition_batch_size: int = None,
        region_detection_model_name: str = None,
        region_detection_model_dir: str = None,
        doc_orientation_classify_model_name: str = None,
        doc_orientation_classify_model_dir: str = None,
        doc_unwarping_model_name: str = None,
        doc_unwarping_model_dir: str = None,
        text_detection_model_name: str = None,
        text_detection_model_dir: str = None,
        text_det_limit_side_len: int = None,
        text_det_limit_type: str = None,
        text_det_thresh: float = None,
        text_det_box_thresh: float = None,
        text_det_unclip_ratio: float = None,
        textline_orientation_model_name: str = None,
        textline_orientation_model_dir: str = None,
        textline_orientation_batch_size: int = None,
        text_recognition_model_name: str = None,
        text_recognition_model_dir: str = None,
        text_recognition_batch_size: int = None,
        text_rec_score_thresh: float = None,
        table_classification_model_name: str = None,
        table_classification_model_dir: str = None,
        wired_table_structure_recognition_model_name: str = None,
        wired_table_structure_recognition_model_dir: str = None,
        wireless_table_structure_recognition_model_name: str = None,
        wireless_table_structure_recognition_model_dir: str = None,
        wired_table_cells_detection_model_name: str = None,
        wired_table_cells_detection_model_dir: str = None,
        wireless_table_cells_detection_model_name: str = None,
        wireless_table_cells_detection_model_dir: str = None,
        table_orientation_classify_model_name: str = None,
        table_orientation_classify_model_dir: str = None,
        seal_text_detection_model_name: str = None,
        seal_text_detection_model_dir: str = None,
        seal_det_limit_side_len: int = None,
        seal_det_limit_type: str = None,
        seal_det_thresh: float = None,
        seal_det_box_thresh: float = None,
        seal_det_unclip_ratio: float = None,
        seal_text_recognition_model_name: str = None,
        seal_text_recognition_model_dir: str = None,
        seal_text_recognition_batch_size: int = None,
        seal_rec_score_thresh: float = None,
        formula_recognition_model_name: str = None,
        formula_recognition_model_dir: str = None,
        formula_recognition_batch_size: int = None,
        use_doc_orientation_classify: bool = None,
        use_doc_unwarping: bool = None,
        use_textline_orientation: bool = None,
        use_seal_recognition: bool = None,
        use_table_recognition: bool = None,
        use_formula_recognition: bool = None,
        use_chart_recognition: bool = None,
        use_region_detection: bool = None,
        lang: str = None,
        ocr_version: str = None,
        paddlex_config: str = None,
        device: str = None,
        use_hpi: bool = None,
        **kwargs
    ):
        """
        Initialize PP-StructureV3 pipeline.

        Every named argument defaults to None (presumably meaning "use the
        pipeline default" -- confirm against the upstream PaddleOCR docs).

        Args:
            lang (str, optional): Language code for OCR (same as PaddleOCR)
            ocr_version (str, optional): PP-OCR version ('PP-OCRv3', 'PP-OCRv4', 'PP-OCRv5')
            use_doc_orientation_classify (bool, optional): Enable document orientation classification
            use_doc_unwarping (bool, optional): Enable document unwarping
            use_textline_orientation (bool, optional): Enable text line orientation
            use_seal_recognition (bool, optional): Enable seal/stamp recognition
            use_table_recognition (bool, optional): Enable table recognition
            use_formula_recognition (bool, optional): Enable formula recognition
            use_chart_recognition (bool, optional): Enable chart recognition
            use_region_detection (bool, optional): Enable region detection
            layout_threshold (float, optional): Threshold for layout detection
            layout_nms (bool, optional): Use layout-aware NMS
            layout_unclip_ratio (float, optional): Unclip ratio for layout boxes
            layout_merge_bboxes_mode (str, optional): Mode for merging layout boxes
            text_det_* parameters: Same as PaddleOCR
            seal_det_* parameters: Parameters for seal detection
            text_rec_score_thresh, seal_rec_score_thresh (float, optional):
                Not described in this reference; presumably score thresholds
                for text / seal-text recognition -- confirm upstream
            *_model_name, *_model_dir (str, optional): One pair per sub-module;
                not described in this reference -- see the upstream PaddleOCR docs
            *_batch_size (int, optional): One per recognition sub-module;
                not described in this reference -- see the upstream PaddleOCR docs
            device (str, optional): Device for inference
            use_hpi (bool, optional): Not described in this reference
            paddlex_config (str or dict, optional): Configuration file or dict
                (the signature annotates this as ``str``; confirm whether a
                dict is accepted)
            **kwargs: Additional keyword arguments; not described in this reference
        """


# Perform comprehensive document parsing to extract structure and content.
def predict(
    self,
    input,
    *,
    use_doc_orientation_classify: bool = None,
    use_doc_unwarping: bool = None,
    use_textline_orientation: bool = None,
    use_seal_recognition: bool = None,
    use_table_recognition: bool = None,
    use_formula_recognition: bool = None,
    use_chart_recognition: bool = None,
    use_region_detection: bool = None,
    layout_threshold: float = None,
    layout_nms: bool = None,
    layout_unclip_ratio: float = None,
    layout_merge_bboxes_mode: str = None,
    text_det_limit_side_len: int = None,
    text_det_limit_type: str = None,
    text_det_thresh: float = None,
    text_det_box_thresh: float = None,
    text_det_unclip_ratio: float = None,
    text_rec_score_thresh: float = None,
    seal_det_limit_side_len: int = None,
    seal_det_limit_type: str = None,
    seal_det_thresh: float = None,
    seal_det_box_thresh: float = None,
    seal_det_unclip_ratio: float = None,
    seal_rec_score_thresh: float = None,
    use_wired_table_cells_trans_to_html: bool = False,
    use_wireless_table_cells_trans_to_html: bool = False,
    use_table_orientation_classify: bool = True,
    use_ocr_results_with_table_cells: bool = True,
    use_e2e_wired_table_rec_model: bool = False,
    use_e2e_wireless_table_rec_model: bool = True,
    **kwargs
) -> list:
    """
    Parse document structure and content.

    All options after ``input`` are keyword-only. Options that default to
    None are per-call overrides of the constructor arguments of the same name.

    Args:
        input: Image/PDF path, numpy array, PIL Image, directory, or list
        use_doc_orientation_classify (bool, optional): Override orientation classification
        use_doc_unwarping (bool, optional): Override document unwarping
        use_textline_orientation (bool, optional): Override text orientation
        use_seal_recognition (bool, optional): Override seal recognition
        use_table_recognition (bool, optional): Override table recognition
        use_formula_recognition (bool, optional): Override formula recognition
        use_chart_recognition (bool, optional): Override chart recognition
        use_region_detection (bool, optional): Override region detection
        layout_threshold (float, optional): Override layout detection threshold
        layout_nms (bool, optional): Override layout NMS setting
        layout_unclip_ratio (float, optional): Override layout unclip ratio
        layout_merge_bboxes_mode (str, optional): Override bbox merging mode
        text_det_* parameters: Override text detection parameters
        seal_det_* parameters: Override seal detection parameters
        text_rec_score_thresh, seal_rec_score_thresh (float, optional):
            Override the constructor arguments of the same name
        use_wired_table_cells_trans_to_html (bool): Convert wired tables to HTML
        use_wireless_table_cells_trans_to_html (bool): Convert wireless tables to HTML
        use_table_orientation_classify (bool): Use table orientation classification
        use_ocr_results_with_table_cells (bool): Use OCR with table cells
        use_e2e_wired_table_rec_model (bool): Use end-to-end wired table recognition
        use_e2e_wireless_table_rec_model (bool): Use end-to-end wireless table recognition
        **kwargs: Additional keyword arguments; not described in this reference

    Returns:
        list: List of dictionaries containing parsing results. Each dict contains:
            - input_path (str): Path to input file
            - markdown (str): Markdown representation of document
            - layout_result (list): Layout detection results
            - ocr_result (list): OCR results for text regions
            - table_result (list): Table recognition results (if enabled)
            - formula_result (list): Formula recognition results (if enabled)
            - seal_result (list): Seal recognition results (if enabled)
            - chart_result (list): Chart parsing results (if enabled)
    """
def predict_iter(
    self,
    input,
    *,
    use_doc_orientation_classify: bool = None,
    use_doc_unwarping: bool = None,
    use_textline_orientation: bool = None,
    use_seal_recognition: bool = None,
    use_table_recognition: bool = None,
    use_formula_recognition: bool = None,
    use_chart_recognition: bool = None,
    use_region_detection: bool = None,
    **kwargs
):
    """
    Parse documents with iterator for memory efficiency.

    Args: Same as predict(). Only the use_* switches are named in this
        signature; the remaining predict() options are presumably accepted
        through **kwargs -- confirm against the upstream PaddleOCR docs.

    Yields:
        dict: Parsing result for each document (same format as predict())
    """


def concatenate_markdown_pages(self, markdown_list: list) -> str:
    """
    Concatenate markdown from multiple pages into single document.

    Args:
        markdown_list (list): List of markdown strings from different pages

    Returns:
        str: Concatenated markdown document
    """


def close(self) -> None:
    """Close the pipeline and free resources."""
def export_paddlex_config_to_yaml(self, yaml_path: str) -> None:
    """
    Export pipeline configuration to YAML file.

    Args:
        yaml_path (str): Path to save YAML configuration
    """


from paddleocr import PPStructureV3
# Initialize pipeline
structure = PPStructureV3(lang='en')
try:
    # Parse document
    result = structure.predict('document.pdf')
    # Extract markdown
    for item in result:
        markdown = item.get('markdown', '')
        print(markdown)
finally:
    # Close the pipeline even if parsing raised
    structure.close()

from paddleocr import PPStructureV3
# Enable all recognition modules
structure = PPStructureV3(
    lang='en',
    ocr_version='PP-OCRv5',
    use_seal_recognition=True,
    use_table_recognition=True,
    use_formula_recognition=True,
    use_chart_recognition=True
)
try:
    result = structure.predict('complex_document.pdf')
    for item in result:
        # Markdown output
        print("Markdown:")
        print(item.get('markdown', ''))
        # Layout structure
        print("\nLayout elements:")
        for layout in item.get('layout_result', []):
            print(f" Type: {layout['label']}, Bbox: {layout['bbox']}")
        # Table results
        if 'table_result' in item:
            print("\nTables found:")
            for table in item['table_result']:
                print(table.get('html', ''))
        # Formula results
        if 'formula_result' in item:
            print("\nFormulas found:")
            for formula in item['formula_result']:
                print(formula.get('latex', ''))
finally:
    # Close the pipeline even if parsing or printing raised
    structure.close()

from paddleocr import PPStructureV3
structure = PPStructureV3(lang='en')
try:
    # Process PDF
    results = structure.predict('multi_page_document.pdf')
    # Collect markdown from all pages
    markdown_pages = [item['markdown'] for item in results]
    # Concatenate into single document
    full_markdown = structure.concatenate_markdown_pages(markdown_pages)
    print(full_markdown)
finally:
    # Close the pipeline even if parsing raised
    structure.close()

from paddleocr import PPStructureV3
structure = PPStructureV3(
    lang='en',
    use_table_recognition=True
)
try:
    # Convert tables to HTML
    result = structure.predict(
        'document_with_tables.pdf',
        use_wired_table_cells_trans_to_html=True,
        use_wireless_table_cells_trans_to_html=True
    )
    for item in result:
        for table in item.get('table_result', []):
            html = table.get('html', '')
            print(html)
finally:
    # Close the pipeline even if parsing raised
    structure.close()

from paddleocr import PPStructureV3
# Only use OCR and layout detection (faster, lower resource usage)
structure = PPStructureV3(
    lang='en',
    use_seal_recognition=False,
    use_table_recognition=False,
    use_formula_recognition=False,
    use_chart_recognition=False
)
try:
    result = structure.predict('simple_document.jpg')
finally:
    # Close the first pipeline before building the next one
    structure.close()

# Only extract tables from document
structure_tables = PPStructureV3(
    lang='en',
    use_table_recognition=True,
    use_seal_recognition=False,
    use_formula_recognition=False,
    use_chart_recognition=False
)
try:
    result = structure_tables.predict('document_with_tables.pdf')
finally:
    structure_tables.close()

from paddleocr import PPStructureV3
from pathlib import Path

structure = PPStructureV3(lang='en')
pdf_files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
try:
    # Memory-efficient processing
    for result in structure.predict_iter(pdf_files):
        markdown = result.get('markdown', '')
        # Save to file. with_suffix() swaps only the final extension, so a
        # '.pdf' elsewhere in the path is left alone and an input that does
        # not end in '.pdf' (e.g. a .jpg) no longer maps back onto itself.
        output_path = Path(result['input_path']).with_suffix('.md')
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown)
finally:
    # Close the pipeline even if a document failed part-way through
    structure.close()

from paddleocr import PPStructureV3
# Adjust layout detection parameters
structure = PPStructureV3(
    lang='en',
    layout_threshold=0.5,  # Detection confidence threshold
    layout_nms=True,  # Enable layout-aware NMS
    layout_unclip_ratio=1.0,  # Box expansion ratio
    # Upstream PaddleOCR documents 'large', 'small' and 'union' as the
    # merge modes; 'large' keeps the outer box when detections overlap.
    layout_merge_bboxes_mode='large'
)
try:
    result = structure.predict('complex_layout.pdf')
finally:
    # Close the pipeline even if parsing raised
    structure.close()

from paddleocr import PPStructureV3
# Enable preprocessing for skewed or distorted documents
structure = PPStructureV3(
    lang='en',
    use_doc_orientation_classify=True,
    use_doc_unwarping=True,
    use_textline_orientation=True
)
try:
    result = structure.predict('skewed_document.jpg')
finally:
    # Close the pipeline even if parsing raised
    structure.close()

from paddleocr import PPStructureV3
# Create custom configuration
structure = PPStructureV3(
    lang='en',
    ocr_version='PP-OCRv5',
    use_table_recognition=True,
    use_formula_recognition=True,
    layout_threshold=0.5
)
try:
    # Export configuration
    structure.export_paddlex_config_to_yaml('my_structure_config.yaml')
finally:
    # Close the exporting pipeline even if the export raised
    structure.close()

# Reuse configuration
structure_reloaded = PPStructureV3(
    paddlex_config='my_structure_config.yaml'
)
try:
    result = structure_reloaded.predict('document.pdf')
finally:
    structure_reloaded.close()

# The predict() and predict_iter() methods return results with the following structure:
[
{
"input_path": "path/to/document.pdf",
"markdown": "# Document Title\n\nContent...",
"layout_result": [
{
"bbox": [x1, y1, x2, y2],
"label": "text", # or "title", "table", "figure", etc.
"score": 0.95
},
# ...
],
"ocr_result": [
{
"dt_polys": [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]]],
"rec_text": ["recognized text"],
"rec_score": [0.98]
},
# ...
],
"table_result": [ # If use_table_recognition=True
{
"bbox": [x1, y1, x2, y2],
"html": "<table>...</table>",
"cells": [...],
"structure": "wired" # or "wireless"
},
# ...
],
"formula_result": [ # If use_formula_recognition=True
{
"bbox": [x1, y1, x2, y2],
"latex": "E = mc^2",
"score": 0.96
},
# ...
],
"seal_result": [ # If use_seal_recognition=True
{
"bbox": [x1, y1, x2, y2],
"text": "Official Seal Text",
"score": 0.92
},
# ...
],
"chart_result": [ # If use_chart_recognition=True
{
"bbox": [x1, y1, x2, y2],
"table_data": {...},
"chart_type": "bar"
},
# ...
]
}
]

The layout_result can contain the following element types: