or run

tessl search
Log in

Version

Workspace
tessl
Visibility
Public
Created
Last updated
Describes
pypi: pkg:pypi/paddleocr@3.3.x

docs

index.md, utilities.md
tile.json

tessl/pypi-paddleocr

tessl install tessl/pypi-paddleocr@3.3.0

Industry-leading OCR and document AI engine that converts documents and images into structured, AI-friendly data formats with comprehensive solutions from text extraction to intelligent document understanding.

docs/pipelines/pp-chatocrv4.md

PPChatOCRv4Doc Pipeline

Advanced document understanding pipeline with conversational capabilities, retrieval-augmented generation (RAG), and multimodal LLM integration. Provides comprehensive document analysis combining layout detection, OCR, table recognition, seal recognition, formula recognition, and chat-based information extraction.

Capabilities

Pipeline Initialization

class PPChatOCRv4Doc:
    """
    Advanced document understanding with conversational capabilities.

    Combines visual document analysis with retrieval-augmented generation
    for intelligent information extraction through natural language queries.

    This is an API reference stub: it records the constructor signature and
    its documentation only, not an implementation.
    """
    def __init__(
        self,
        # Layout detection
        layout_detection_model_name: str = None,
        layout_detection_model_dir: str = None,
        # Document preprocessing
        doc_orientation_classify_model_name: str = None,
        doc_orientation_classify_model_dir: str = None,
        doc_unwarping_model_name: str = None,
        doc_unwarping_model_dir: str = None,
        # Text detection and recognition
        text_detection_model_name: str = None,
        text_detection_model_dir: str = None,
        textline_orientation_model_name: str = None,
        textline_orientation_model_dir: str = None,
        textline_orientation_batch_size: int = None,
        text_recognition_model_name: str = None,
        text_recognition_model_dir: str = None,
        text_recognition_batch_size: int = None,
        # Table recognition
        table_structure_recognition_model_name: str = None,
        table_structure_recognition_model_dir: str = None,
        # Seal recognition
        seal_text_detection_model_name: str = None,
        seal_text_detection_model_dir: str = None,
        seal_text_recognition_model_name: str = None,
        seal_text_recognition_model_dir: str = None,
        seal_text_recognition_batch_size: int = None,
        # Feature flags
        use_doc_orientation_classify: bool = None,
        use_doc_unwarping: bool = None,
        use_textline_orientation: bool = None,
        use_seal_recognition: bool = None,
        use_table_recognition: bool = None,
        # Layout parameters
        layout_threshold: float = None,
        layout_nms: bool = None,
        layout_unclip_ratio: float = None,
        layout_merge_bboxes_mode: str = None,
        # Text detection parameters
        text_det_limit_side_len: int = None,
        text_det_limit_type: str = None,
        text_det_thresh: float = None,
        text_det_box_thresh: float = None,
        text_det_unclip_ratio: float = None,
        # Text recognition parameters
        text_rec_score_thresh: float = None,
        # Seal detection parameters
        seal_det_limit_side_len: int = None,
        seal_det_limit_type: str = None,
        seal_det_thresh: float = None,
        seal_det_box_thresh: float = None,
        seal_det_unclip_ratio: float = None,
        seal_rec_score_thresh: float = None,
        # Retrieval and chat configuration
        retriever_config: dict = None,
        mllm_chat_bot_config: dict = None,
        chat_bot_config: dict = None,
        # Common parameters
        paddlex_config: str = None,
        device: str = None,
        use_hpi: bool = None,
        **kwargs
    ):
        """
        Initialize PPChatOCRv4Doc pipeline.

        Every named parameter defaults to None in this signature (presumably
        meaning "use the pipeline's configured default" -- confirm against
        the upstream PaddleOCR documentation).

        Model selection:
            Each of the following modules has a ``<module>_model_name`` and a
            ``<module>_model_dir`` string parameter (presumably the model
            identifier and a local model directory -- confirm):
            ``layout_detection``, ``doc_orientation_classify``,
            ``doc_unwarping``, ``text_detection``, ``textline_orientation``,
            ``text_recognition``, ``table_structure_recognition``,
            ``seal_text_detection``, ``seal_text_recognition``.

            ``textline_orientation_batch_size``,
            ``text_recognition_batch_size`` and
            ``seal_text_recognition_batch_size`` (int, optional) are accepted
            for the corresponding modules (presumably the inference batch
            size -- confirm).

        Args:
            use_doc_orientation_classify (bool, optional): Enable document orientation correction
            use_doc_unwarping (bool, optional): Enable document image unwarping
            use_textline_orientation (bool, optional): Enable text line orientation classification
            use_seal_recognition (bool, optional): Enable seal/stamp recognition
            use_table_recognition (bool, optional): Enable table structure recognition
            layout_threshold (float, optional): Confidence threshold for layout detection
            layout_nms (bool, optional): Whether to apply NMS in layout detection
            layout_unclip_ratio (float, optional): Layout parameter; not described in
                this reference -- TODO confirm semantics upstream
            layout_merge_bboxes_mode (str, optional): Layout parameter; not described in
                this reference -- TODO confirm accepted values upstream
            text_det_limit_side_len (int, optional): Limit on text detection image side length
            text_det_limit_type (str, optional): Text detection parameter; presumably selects
                how ``text_det_limit_side_len`` is applied -- TODO confirm
            text_det_thresh (float, optional): Pixel threshold for text detection
            text_det_box_thresh (float, optional): Box confidence threshold for text detection
            text_det_unclip_ratio (float, optional): Text detection parameter; not described
                in this reference -- TODO confirm semantics upstream
            text_rec_score_thresh (float, optional): Text recognition confidence threshold
            seal_det_limit_side_len (int, optional): Seal detection parameter; presumably the
                seal counterpart of ``text_det_limit_side_len`` -- TODO confirm
            seal_det_limit_type (str, optional): Seal detection parameter; presumably the
                seal counterpart of ``text_det_limit_type`` -- TODO confirm
            seal_det_thresh (float, optional): Seal detection threshold
            seal_det_box_thresh (float, optional): Seal detection parameter; presumably the
                seal counterpart of ``text_det_box_thresh`` -- TODO confirm
            seal_det_unclip_ratio (float, optional): Seal detection parameter; presumably the
                seal counterpart of ``text_det_unclip_ratio`` -- TODO confirm
            seal_rec_score_thresh (float, optional): Seal recognition confidence threshold
            retriever_config (dict, optional): Configuration for vector retrieval
            mllm_chat_bot_config (dict, optional): Multimodal LLM chatbot configuration
            chat_bot_config (dict, optional): General chatbot configuration
            paddlex_config (str or dict, optional): PaddleX configuration file path or dict.
                NOTE(review): the signature annotates this parameter as ``str`` only;
                confirm whether a dict is really accepted.
            device (str, optional): Device for inference ('cpu', 'gpu', 'gpu:0', etc.)
            use_hpi (bool, optional): Enable high-performance inference
            **kwargs: Additional keyword arguments; which names are accepted is not
                documented in this reference.
        """

Visual Document Analysis

def visual_predict(
    self,
    input,
    *,
    use_doc_orientation_classify: bool = None,
    use_doc_unwarping: bool = None,
    use_textline_orientation: bool = None,
    use_seal_recognition: bool = None,
    use_table_recognition: bool = None,
    layout_threshold: float = None,
    layout_nms: bool = None,
    layout_unclip_ratio: float = None,
    layout_merge_bboxes_mode: str = None,
    text_det_limit_side_len: int = None,
    text_det_limit_type: str = None,
    text_det_thresh: float = None,
    text_det_box_thresh: float = None,
    text_det_unclip_ratio: float = None,
    text_rec_score_thresh: float = None,
    seal_det_limit_side_len: int = None,
    seal_det_limit_type: str = None,
    seal_det_thresh: float = None,
    seal_det_box_thresh: float = None,
    seal_det_unclip_ratio: float = None,
    seal_rec_score_thresh: float = None,
    **kwargs
) -> list:
    """
    Perform visual analysis of document.

    Everything after ``input`` is keyword-only. The named options carry the
    same names as constructor parameters and default to None (presumably
    "keep the value chosen at construction time" -- confirm upstream).

    Args:
        input: Image/PDF path, numpy array, PIL Image, directory, or list
        use_doc_orientation_classify (bool, optional): Override document orientation
            classification
        use_doc_unwarping (bool, optional): Override document unwarping
        use_textline_orientation (bool, optional): Override text orientation
        use_seal_recognition (bool, optional): Override seal recognition
        use_table_recognition (bool, optional): Override table recognition
        layout_threshold, layout_nms, layout_unclip_ratio, layout_merge_bboxes_mode:
            Per-call layout detection settings (same names as the constructor
            parameters)
        text_det_limit_side_len, text_det_limit_type, text_det_thresh,
        text_det_box_thresh, text_det_unclip_ratio:
            Per-call text detection settings (same names as the constructor
            parameters)
        text_rec_score_thresh (float, optional): Text recognition score threshold
        seal_det_limit_side_len, seal_det_limit_type, seal_det_thresh,
        seal_det_box_thresh, seal_det_unclip_ratio, seal_rec_score_thresh:
            Per-call seal detection/recognition settings (same names as the
            constructor parameters)
        **kwargs: Additional keyword arguments. NOTE(review): earlier
            documentation listed ``use_formula_recognition`` (bool, override
            formula recognition) and ``return_word_box`` (bool, return
            character-level boxes) here; neither is a named parameter of this
            signature, so they could only be passed through ``**kwargs`` --
            confirm they are actually supported before relying on them.

    Returns:
        list: Visual analysis results with layout, text, tables, formulas, seals
    """

def visual_predict_iter(self, input, **kwargs):
    """
    Iterate over visual analysis results for memory efficiency.

    Args:
        input: Document(s) to analyze (presumably the same input kinds as
            visual_predict() -- confirm)
        **kwargs: Additional keyword arguments (presumably the same per-call
            overrides accepted by visual_predict() -- confirm)
    """

Vector-Based Retrieval

def build_vector(
    self,
    visual_info: list,
    *,
    min_characters: int = 3500,
    block_size: int = 300,
    flag_save_bytes_vector: bool = False,
    retriever_config: dict = None
) -> dict:
    """
    Build vector index for retrieval-augmented generation.

    All options after ``visual_info`` are keyword-only.

    Args:
        visual_info (list): Output from visual_predict()
        min_characters (int, optional): Minimum characters to split into blocks
            (default 3500). NOTE(review): presumably documents shorter than
            this are not split -- confirm.
        block_size (int, optional): Characters per block (default 300)
        flag_save_bytes_vector (bool, optional): Save byte representation
            (default False)
        retriever_config (dict, optional): Custom retriever configuration

    Returns:
        dict: Vector information for retrieval
    """

def save_vector(
    self,
    vector_info: dict,
    save_path: str,
    retriever_config: dict = None
) -> None:
    """
    Save vector embeddings to file.

    Args:
        vector_info (dict): Vector information, e.g. the value returned by
            build_vector()
        save_path (str): Destination file path
        retriever_config (dict, optional): Retriever configuration

    Returns:
        None
    """

def load_vector(
    self,
    data_path: str,
    retriever_config: dict = None
) -> dict:
    """
    Load vector embeddings from file.

    Args:
        data_path (str): Path of a file to load (e.g. one written by
            save_vector())
        retriever_config (dict, optional): Retriever configuration

    Returns:
        dict: Vector information, usable as the ``vector_info`` argument of
        chat()
    """

def save_visual_info_list(
    self,
    visual_info: list,
    save_path: str
) -> None:
    """
    Save visual information to file.

    Args:
        visual_info (list): Visual analysis results, e.g. the value returned
            by visual_predict()
        save_path (str): Destination file path

    Returns:
        None
    """

def load_visual_info_list(
    self,
    data_path: str
) -> list:
    """
    Load visual information from file.

    Args:
        data_path (str): Path of a file to load (e.g. one written by
            save_visual_info_list())

    Returns:
        list: Visual information, usable as the ``visual_info`` argument of
        chat()
    """

Chat-Based Information Extraction

def chat(
    self,
    key_list: list,
    visual_info: list,
    *,
    use_vector_retrieval: bool = True,
    vector_info: dict = None,
    min_characters: int = 3500,
    text_task_description: str = None,
    text_output_format: str = None,
    text_rules_str: str = None,
    text_few_shot_demo_text_content: str = None,
    text_few_shot_demo_key_value_list: list = None,
    table_task_description: str = None,
    table_output_format: str = None,
    table_rules_str: str = None,
    table_few_shot_demo_text_content: str = None,
    table_few_shot_demo_key_value_list: list = None,
    mllm_predict_info: dict = None,
    mllm_integration_strategy: str = "integration",
    chat_bot_config: dict = None,
    retriever_config: dict = None
) -> list:
    """
    Extract information through conversational queries.

    Everything after ``visual_info`` is keyword-only, and the signature has
    no ``**kwargs``: passing a keyword that is not listed below raises
    ``TypeError``.

    Args:
        key_list (list): List of questions/queries to ask
        visual_info (list): Output from visual_predict()
        use_vector_retrieval (bool, optional): Use RAG for large documents (default True)
        vector_info (dict, optional): Pre-built vector index
        min_characters (int, optional): Min chars to trigger RAG (default 3500)
        text_task_description (str, optional): Custom task description for text extraction
        text_output_format (str, optional): Desired output format for text
        text_rules_str (str, optional): Extraction rules for text regions
        text_few_shot_demo_text_content (str, optional): Few-shot example text
        text_few_shot_demo_key_value_list (list, optional): Few-shot key-value examples for text
        table_task_description (str, optional): Custom task description for table extraction
        table_output_format (str, optional): Desired output format for tables
        table_rules_str (str, optional): Extraction rules for table regions
        table_few_shot_demo_text_content (str, optional): Few-shot example for tables
        table_few_shot_demo_key_value_list (list, optional): Few-shot key-value examples for tables
        mllm_predict_info (dict, optional): Pre-computed multimodal LLM predictions
        mllm_integration_strategy (str, optional): Strategy for integrating MLLM results (default: 'integration')
        chat_bot_config (dict, optional): LLM chatbot configuration
        retriever_config (dict, optional): Vector retrieval configuration

    Returns:
        list: Extraction results for each query
    """

def mllm_pred(
    self,
    input,
    key_list: list,
    *,
    mllm_chat_bot_config: dict = None
) -> list:
    """
    Direct multimodal LLM prediction without visual analysis.

    Args:
        input: Image/PDF path or data
        key_list (list): List of questions/queries
        mllm_chat_bot_config (dict, optional): MLLM configuration
            (keyword-only)

    Returns:
        list: Prediction results for each query
    """

Resource Management

def close(self) -> None:
    """
    Close the pipeline and free resources.

    Returns:
        None
    """

def export_paddlex_config_to_yaml(self, yaml_path: str) -> None:
    """
    Export configuration to YAML.

    Args:
        yaml_path (str): Path of the YAML file to write

    Returns:
        None
    """

Usage Examples

Basic Chat-Based Extraction

from paddleocr import PPChatOCRv4Doc

# Initialize pipeline.
# Only keyword arguments that appear in the PPChatOCRv4Doc constructor
# signature are passed. `lang` and `ocr_version` are not parameters of this
# pipeline, so they are not used here.
pipeline = PPChatOCRv4Doc(
    use_table_recognition=True,
    use_seal_recognition=True,
)

# Analyze document
visual_info = pipeline.visual_predict('contract.pdf')

# Extract information via chat
queries = [
    "What is the contract number?",
    "Who are the parties involved?",
    "What is the contract value?",
]

results = pipeline.chat(queries, visual_info)

# chat() is documented to return one extraction result per query
for query, result in zip(queries, results):
    print(f"Q: {query}")
    print(f"A: {result}")

pipeline.close()

Large Document with RAG

from paddleocr import PPChatOCRv4Doc

pipeline = PPChatOCRv4Doc()

# Process large document
visual_info = pipeline.visual_predict('large_report.pdf')

# Build vector index for efficient retrieval
vector_info = pipeline.build_vector(
    visual_info,
    min_characters=3500,
    block_size=300,
)

# Save for future use
pipeline.save_vector(vector_info, 'report_vectors.pkl')
pipeline.save_visual_info_list(visual_info, 'report_visual.pkl')

# Query with RAG
queries = [
    "Summarize the financial highlights",
    "What are the key risks mentioned?",
    "List all recommendations",
]

# Only keywords that exist in the chat() signature are passed: chat() takes
# no **kwargs, so an unlisted keyword such as `retrieval_result_num` would
# raise TypeError.
results = pipeline.chat(
    queries,
    visual_info,
    use_vector_retrieval=True,
    vector_info=vector_info,
)

pipeline.close()

Reusing Cached Analysis

from paddleocr import PPChatOCRv4Doc

pipeline = PPChatOCRv4Doc()

# Restore the artifacts written by an earlier run instead of analyzing again
cached_visual_info = pipeline.load_visual_info_list('report_visual.pkl')
cached_vector_info = pipeline.load_vector('report_vectors.pkl')

# Ask a follow-up question against the restored data
followup_queries = ["What is the projected growth rate?"]
answers = pipeline.chat(
    followup_queries, cached_visual_info, vector_info=cached_vector_info
)

pipeline.close()

Direct MLLM Prediction

from paddleocr import PPChatOCRv4Doc

pipeline = PPChatOCRv4Doc()

# Settings handed to the multimodal LLM for this call
# NOTE(review): the accepted configuration keys are not documented in this
# reference -- confirm them against the upstream MLLM chatbot configuration.
mllm_settings = dict(
    model_name='custom-vl-model',
    temperature=0.7,
    max_tokens=500,
)

# Query the MLLM about the image directly; visual_predict() is not called
image_questions = ["What is shown in this image?"]
answers = pipeline.mllm_pred(
    'document.png', image_questions, mllm_chat_bot_config=mllm_settings
)

pipeline.close()

Return Value Structure

visual_predict() Output

[
    {
        "input_path": "path/to/document.pdf",
        "layout_result": [
            {
                "bbox": [x1, y1, x2, y2],
                "label": "text",
                "ocr_result": {
                    "dt_polys": [[x1,y1], [x2,y2], ...],
                    "rec_text": "recognized text",
                    "rec_score": 0.95
                }
            }
        ],
        "table_result": [  # If use_table_recognition=True
            {
                "bbox": [x1, y1, x2, y2],
                "html": "<table>...</table>"
            }
        ],
        "seal_result": [  # If use_seal_recognition=True
            {
                "bbox": [x1, y1, x2, y2],
                "text": "seal text",
                "score": 0.92
            }
        ],
        "formula_result": [  # If use_formula_recognition=True
            {
                "bbox": [x1, y1, x2, y2],
                "latex": "x^2 + y^2 = r^2"
            }
        ]
    }
]

chat() Output

[
    {
        "query": "What is the contract number?",
        "answer": "CT-2024-001",
        "confidence": 0.95,
        "source_blocks": [
            {
                "page": 1,
                "bbox": [x1, y1, x2, y2],
                "text": "Contract Number: CT-2024-001"
            }
        ]
    }
]