tessl install tessl/pypi-paddleocr@3.3.0

Industry-leading OCR and document AI engine that converts documents and images into structured, AI-friendly data formats with comprehensive solutions from text extraction to intelligent document understanding.
Advanced document understanding pipeline with conversational capabilities, retrieval-augmented generation (RAG), and multimodal LLM integration. Provides comprehensive document analysis combining layout detection, OCR, table recognition, seal recognition, formula recognition, and chat-based information extraction.
class PPChatOCRv4Doc:
    """
    Advanced document understanding with conversational capabilities.

    Combines visual document analysis with retrieval-augmented generation
    for intelligent information extraction through natural language queries.

    This is an API reference stub: each method below lists only its
    signature and docstring; no implementation is shown here.
    """

    def __init__(
        self,
        # Layout detection
        layout_detection_model_name: str = None,
        layout_detection_model_dir: str = None,
        # Document preprocessing
        doc_orientation_classify_model_name: str = None,
        doc_orientation_classify_model_dir: str = None,
        doc_unwarping_model_name: str = None,
        doc_unwarping_model_dir: str = None,
        # Text detection and recognition
        text_detection_model_name: str = None,
        text_detection_model_dir: str = None,
        textline_orientation_model_name: str = None,
        textline_orientation_model_dir: str = None,
        textline_orientation_batch_size: int = None,
        text_recognition_model_name: str = None,
        text_recognition_model_dir: str = None,
        text_recognition_batch_size: int = None,
        # Table recognition
        table_structure_recognition_model_name: str = None,
        table_structure_recognition_model_dir: str = None,
        # Seal recognition
        seal_text_detection_model_name: str = None,
        seal_text_detection_model_dir: str = None,
        seal_text_recognition_model_name: str = None,
        seal_text_recognition_model_dir: str = None,
        seal_text_recognition_batch_size: int = None,
        # Feature flags
        use_doc_orientation_classify: bool = None,
        use_doc_unwarping: bool = None,
        use_textline_orientation: bool = None,
        use_seal_recognition: bool = None,
        use_table_recognition: bool = None,
        # Layout parameters
        layout_threshold: float = None,
        layout_nms: bool = None,
        layout_unclip_ratio: float = None,
        layout_merge_bboxes_mode: str = None,
        # Text detection parameters
        text_det_limit_side_len: int = None,
        text_det_limit_type: str = None,
        text_det_thresh: float = None,
        text_det_box_thresh: float = None,
        text_det_unclip_ratio: float = None,
        # Text recognition parameters
        text_rec_score_thresh: float = None,
        # Seal detection parameters
        seal_det_limit_side_len: int = None,
        seal_det_limit_type: str = None,
        seal_det_thresh: float = None,
        seal_det_box_thresh: float = None,
        seal_det_unclip_ratio: float = None,
        seal_rec_score_thresh: float = None,
        # Retrieval and chat configuration
        retriever_config: dict = None,
        mllm_chat_bot_config: dict = None,
        chat_bot_config: dict = None,
        # Common parameters
        paddlex_config: str = None,
        device: str = None,
        use_hpi: bool = None,
        **kwargs
    ):
        """
        Initialize PPChatOCRv4Doc pipeline.

        Every named parameter defaults to None. Only a subset of the
        signature is described below; the remaining parameters (model
        names and directories, batch sizes, and the other detection and
        layout settings) are not described in this reference.

        Args:
            use_table_recognition (bool, optional): Enable table structure recognition
            use_seal_recognition (bool, optional): Enable seal/stamp recognition
            use_doc_orientation_classify (bool, optional): Enable document orientation correction
            use_doc_unwarping (bool, optional): Enable document image unwarping
            use_textline_orientation (bool, optional): Enable text line orientation classification
            layout_threshold (float, optional): Confidence threshold for layout detection
            layout_nms (bool, optional): Whether to apply NMS in layout detection
            text_det_limit_side_len (int, optional): Limit on text detection image side length
            text_det_thresh (float, optional): Pixel threshold for text detection
            text_det_box_thresh (float, optional): Box confidence threshold for text detection
            text_rec_score_thresh (float, optional): Text recognition confidence threshold
            seal_det_thresh (float, optional): Seal detection threshold
            seal_rec_score_thresh (float, optional): Seal recognition confidence threshold
            retriever_config (dict, optional): Configuration for vector retrieval
            mllm_chat_bot_config (dict, optional): Multimodal LLM chatbot configuration
            chat_bot_config (dict, optional): General chatbot configuration
            device (str, optional): Device for inference ('cpu', 'gpu', 'gpu:0', etc.)
            paddlex_config (str or dict, optional): PaddleX configuration file path or dict.
                NOTE(review): the signature annotates this parameter as `str`
                only - confirm whether a dict is accepted.
            use_hpi (bool, optional): Enable high-performance inference
            **kwargs: Additional keyword arguments; their handling is not
                described in this reference.
        """

    def visual_predict(
        self,
        input,
        *,
        use_doc_orientation_classify: bool = None,
        use_doc_unwarping: bool = None,
        use_textline_orientation: bool = None,
        use_seal_recognition: bool = None,
        use_table_recognition: bool = None,
        layout_threshold: float = None,
        layout_nms: bool = None,
        layout_unclip_ratio: float = None,
        layout_merge_bboxes_mode: str = None,
        text_det_limit_side_len: int = None,
        text_det_limit_type: str = None,
        text_det_thresh: float = None,
        text_det_box_thresh: float = None,
        text_det_unclip_ratio: float = None,
        text_rec_score_thresh: float = None,
        seal_det_limit_side_len: int = None,
        seal_det_limit_type: str = None,
        seal_det_thresh: float = None,
        seal_det_box_thresh: float = None,
        seal_det_unclip_ratio: float = None,
        seal_rec_score_thresh: float = None,
        **kwargs
    ) -> list:
        """
        Perform visual analysis of document.

        All parameters after `input` are keyword-only and default to None.

        Args:
            input: Image/PDF path, numpy array, PIL Image, directory, or list
            use_seal_recognition (bool, optional): Override seal recognition
            use_table_recognition (bool, optional): Override table recognition
            use_formula_recognition (bool, optional): Override formula recognition.
                NOTE(review): not a named parameter of this signature; it
                could only be passed through **kwargs - confirm it is supported.
            use_textline_orientation (bool, optional): Override text orientation
            text_rec_score_thresh (float, optional): Text recognition score threshold
            return_word_box (bool, optional): Return character-level boxes.
                NOTE(review): not a named parameter of this signature; it
                could only be passed through **kwargs - confirm it is supported.

        Returns:
            list: Visual analysis results with layout, text, tables, formulas, seals
        """

    def visual_predict_iter(self, input, **kwargs):
        """Iterate over visual analysis results for memory efficiency."""

    def build_vector(
        self,
        visual_info: list,
        *,
        min_characters: int = 3500,
        block_size: int = 300,
        flag_save_bytes_vector: bool = False,
        retriever_config: dict = None
    ) -> dict:
        """
        Build vector index for retrieval-augmented generation.

        Args:
            visual_info (list): Output from visual_predict()
            min_characters (int, optional): Minimum characters to split into blocks (default: 3500)
            block_size (int, optional): Characters per block (default: 300)
            flag_save_bytes_vector (bool, optional): Save byte representation (default: False)
            retriever_config (dict, optional): Custom retriever configuration

        Returns:
            dict: Vector information for retrieval
        """

    def save_vector(
        self,
        vector_info: dict,
        save_path: str,
        retriever_config: dict = None
    ) -> None:
        """Save vector embeddings to file."""

    def load_vector(
        self,
        data_path: str,
        retriever_config: dict = None
    ) -> dict:
        """Load vector embeddings from file."""

    def save_visual_info_list(
        self,
        visual_info: list,
        save_path: str
    ) -> None:
        """Save visual information to file."""

    def load_visual_info_list(
        self,
        data_path: str
    ) -> list:
        """Load visual information from file."""

    def chat(
        self,
        key_list: list,
        visual_info: list,
        *,
        use_vector_retrieval: bool = True,
        vector_info: dict = None,
        min_characters: int = 3500,
        text_task_description: str = None,
        text_output_format: str = None,
        text_rules_str: str = None,
        text_few_shot_demo_text_content: str = None,
        text_few_shot_demo_key_value_list: list = None,
        table_task_description: str = None,
        table_output_format: str = None,
        table_rules_str: str = None,
        table_few_shot_demo_text_content: str = None,
        table_few_shot_demo_key_value_list: list = None,
        mllm_predict_info: dict = None,
        mllm_integration_strategy: str = "integration",
        chat_bot_config: dict = None,
        retriever_config: dict = None
    ) -> list:
        """
        Extract information through conversational queries.

        All parameters after `visual_info` are keyword-only. This signature
        takes no **kwargs, so only the names listed here can be passed.

        Args:
            key_list (list): List of questions/queries to ask
            visual_info (list): Output from visual_predict()
            use_vector_retrieval (bool, optional): Use RAG for large documents (default: True)
            vector_info (dict, optional): Pre-built vector index
            min_characters (int, optional): Min chars to trigger RAG (default: 3500)
            text_task_description (str, optional): Custom task description for text extraction
            text_output_format (str, optional): Desired output format for text
            text_rules_str (str, optional): Extraction rules for text regions
            text_few_shot_demo_text_content (str, optional): Few-shot example text
            text_few_shot_demo_key_value_list (list, optional): Few-shot key-value examples for text
            table_task_description (str, optional): Custom task description for table extraction
            table_output_format (str, optional): Desired output format for tables
            table_rules_str (str, optional): Extraction rules for table regions
            table_few_shot_demo_text_content (str, optional): Few-shot example for tables
            table_few_shot_demo_key_value_list (list, optional): Few-shot key-value examples for tables
            mllm_predict_info (dict, optional): Pre-computed multimodal LLM predictions
            mllm_integration_strategy (str, optional): Strategy for integrating MLLM results (default: 'integration')
            chat_bot_config (dict, optional): LLM chatbot configuration
            retriever_config (dict, optional): Vector retrieval configuration

        Returns:
            list: Extraction results for each query
        """

    def mllm_pred(
        self,
        input,
        key_list: list,
        *,
        mllm_chat_bot_config: dict = None
    ) -> list:
        """
        Direct multimodal LLM prediction without visual analysis.

        Args:
            input: Image/PDF path or data
            key_list (list): List of questions/queries
            mllm_chat_bot_config (dict, optional): MLLM configuration

        Returns:
            list: Prediction results for each query
        """

    def close(self) -> None:
        """Close the pipeline and free resources."""

    def export_paddlex_config_to_yaml(self, yaml_path: str) -> None:
        """Export configuration to YAML."""


from paddleocr import PPChatOCRv4Doc
# Initialize pipeline.
# Only arguments listed in the PPChatOCRv4Doc.__init__ signature are passed.
# `lang` and `ocr_version` are not among its documented parameters (they
# could only fall into **kwargs), so they are not used here.
pipeline = PPChatOCRv4Doc(
    use_table_recognition=True,
    use_seal_recognition=True,
)

# Analyze document
visual_info = pipeline.visual_predict('contract.pdf')

# Extract information via chat
queries = [
    "What is the contract number?",
    "Who are the parties involved?",
    "What is the contract value?",
]
results = pipeline.chat(queries, visual_info)

# chat() returns one result per query, so the two lists can be paired up.
for query, result in zip(queries, results):
    print(f"Q: {query}")
    print(f"A: {result}")
pipeline.close()

from paddleocr import PPChatOCRv4Doc
pipeline = PPChatOCRv4Doc()

# Process large document
visual_info = pipeline.visual_predict('large_report.pdf')

# Build vector index for efficient retrieval
vector_info = pipeline.build_vector(
    visual_info,
    min_characters=3500,
    block_size=300,
)

# Save for future use
pipeline.save_vector(vector_info, 'report_vectors.pkl')
pipeline.save_visual_info_list(visual_info, 'report_visual.pkl')

# Query with RAG
queries = [
    "Summarize the financial highlights",
    "What are the key risks mentioned?",
    "List all recommendations",
]
# Only keywords declared by chat() are passed: its signature has no
# `retrieval_result_num` parameter and accepts no **kwargs, so passing that
# keyword would raise TypeError.
results = pipeline.chat(
    queries,
    visual_info,
    use_vector_retrieval=True,
    vector_info=vector_info,
)
pipeline.close()

from paddleocr import PPChatOCRv4Doc
pipeline = PPChatOCRv4Doc()

# Restore the analysis artifacts written by an earlier run
visual_info = pipeline.load_visual_info_list('report_visual.pkl')
vector_info = pipeline.load_vector('report_vectors.pkl')

# Ask a new question against the restored data; visual_predict() is not called again
follow_up_queries = ["What is the projected growth rate?"]
results = pipeline.chat(follow_up_queries, visual_info, vector_info=vector_info)
pipeline.close()

from paddleocr import PPChatOCRv4Doc
pipeline = PPChatOCRv4Doc()

# Settings handed to the multimodal LLM for this call
mllm_config = dict(
    model_name='custom-vl-model',
    temperature=0.7,
    max_tokens=500,
)

# Query the image directly; visual_predict() is skipped entirely
queries = ["What is shown in this image?"]
results = pipeline.mllm_pred('document.png', queries, mllm_chat_bot_config=mllm_config)
pipeline.close()

[
{
"input_path": "path/to/document.pdf",
"layout_result": [
{
"bbox": [x1, y1, x2, y2],
"label": "text",
"ocr_result": {
"dt_polys": [[x1,y1], [x2,y2], ...],
"rec_text": "recognized text",
"rec_score": 0.95
}
}
],
"table_result": [ # If use_table_recognition=True
{
"bbox": [x1, y1, x2, y2],
"html": "<table>...</table>"
}
],
"seal_result": [ # If use_seal_recognition=True
{
"bbox": [x1, y1, x2, y2],
"text": "seal text",
"score": 0.92
}
],
"formula_result": [ # If use_formula_recognition=True
{
"bbox": [x1, y1, x2, y2],
"latex": "x^2 + y^2 = r^2"
}
]
}
]

[
{
"query": "What is the contract number?",
"answer": "CT-2024-001",
"confidence": 0.95,
"source_blocks": [
{
"page": 1,
"bbox": [x1, y1, x2, y2],
"text": "Contract Number: CT-2024-001"
}
]
}
]