or run

tessl search

Version

Workspace: tessl
Visibility: Public
Created: 10 days ago
Last updated: 1 day ago
Describes: pkg:pypi/paddleocr@3.3.x

docs

models

pipelines

doc-preprocessor.md doc-understanding.md formula-recognition.md paddleocr-vl.md paddleocr.md pp-chatocrv4.md pp-doctranslation.md pp-structurev3.md seal-recognition.md table-recognition.md

index.md utilities.md

tile.json

tessl/pypi-paddleocr

tessl install tessl/pypi-paddleocr@3.3.0

Industry-leading OCR and document AI engine that converts documents and images into structured, AI-friendly data formats with comprehensive solutions from text extraction to intelligent document understanding.

PPDocTranslation Pipeline

Document translation pipeline combining layout parsing, OCR, and LLM-based translation. Converts documents to markdown while preserving structure, then translates content using language models with support for glossaries, custom rules, and few-shot examples.

Capabilities

Pipeline Initialization

class PPDocTranslation:
    """
    Document translation pipeline with structure preservation.

    Analyzes document layout and content, converts to markdown,
    then translates using LLM while maintaining formatting.

    Typical flow, as shown in the usage examples: construct the pipeline,
    call visual_predict() (or load_from_markdown()), pass the result to
    translate(), then call close().
    """
    def __init__(
        self,
        # Layout detection
        layout_detection_model_name: str = None,
        layout_detection_model_dir: str = None,
        layout_threshold: float = None,
        layout_nms: bool = None,
        layout_unclip_ratio: float = None,
        layout_merge_bboxes_mode: str = None,
        # Document preprocessing
        doc_orientation_classify_model_name: str = None,
        doc_orientation_classify_model_dir: str = None,
        doc_unwarping_model_name: str = None,
        doc_unwarping_model_dir: str = None,
        use_doc_orientation_classify: bool = None,
        use_doc_unwarping: bool = None,
        # OCR models
        text_detection_model_name: str = None,
        text_detection_model_dir: str = None,
        text_recognition_model_name: str = None,
        text_recognition_model_dir: str = None,
        text_recognition_batch_size: int = None,
        textline_orientation_model_name: str = None,
        textline_orientation_model_dir: str = None,
        use_textline_orientation: bool = None,
        # Table recognition
        use_table_recognition: bool = None,
        table_classification_model_name: str = None,
        table_structure_recognition_model_name: str = None,
        # Formula recognition
        use_formula_recognition: bool = None,
        formula_recognition_model_name: str = None,
        # Chart recognition
        use_chart_recognition: bool = None,
        chart_parsing_model_name: str = None,
        # Language and version
        lang: str = None,
        ocr_version: str = None,
        # Common parameters
        # NOTE(review): annotated as str, but the Args section below documents
        # "str or dict" - confirm which types are accepted.
        paddlex_config: str = None,
        device: str = None,
        use_hpi: bool = None,
        **kwargs
    ):
        """
        Initialize PPDocTranslation pipeline.

        Every parameter is optional and defaults to None; additional keyword
        arguments are accepted through **kwargs.

        Args:
            lang (str, optional): Source language code
            ocr_version (str, optional): OCR version ('PP-OCRv3', 'PP-OCRv4', 'PP-OCRv5')
            use_table_recognition (bool, optional): Enable table recognition
            use_formula_recognition (bool, optional): Enable formula recognition
            use_chart_recognition (bool, optional): Enable chart recognition
            use_doc_orientation_classify (bool, optional): Enable orientation correction
            use_doc_unwarping (bool, optional): Enable document unwarping
            device (str, optional): Device for inference ('cpu', 'gpu')
            paddlex_config (str or dict, optional): Configuration file or dict

        Note:
            The *_model_name / *_model_dir parameters, the layout_* options,
            text_recognition_batch_size, use_textline_orientation and use_hpi
            are not described in this reference. Presumably each name/dir
            pair selects a model by name or by local directory - TODO confirm
            against the PaddleOCR documentation.
        """

Visual Document Analysis

def visual_predict(
    self,
    input,
    *,
    use_doc_orientation_classify: bool = None,
    use_doc_unwarping: bool = None,
    use_textline_orientation: bool = None,
    use_table_recognition: bool = None,
    use_formula_recognition: bool = None,
    use_chart_recognition: bool = None,
    layout_threshold: float = None,
    layout_nms: bool = None,
    layout_unclip_ratio: float = None,
    layout_merge_bboxes_mode: str = None,
    text_det_limit_side_len: int = None,
    text_det_thresh: float = None,
    text_det_box_thresh: float = None,
    text_rec_score_thresh: float = None,
    **kwargs
) -> list:
    """
    Analyze document and convert to markdown.

    Every option after ``input`` is keyword-only and defaults to None.

    Args:
        input: Image/PDF path, numpy array, PIL Image, directory, or list
        use_table_recognition (bool, optional): Override table recognition
        use_formula_recognition (bool, optional): Override formula recognition
        use_chart_recognition (bool, optional): Override chart recognition
        text_rec_score_thresh (float, optional): Text recognition threshold
        layout_threshold (float, optional): Layout detection threshold

    Note:
        use_doc_orientation_classify, use_doc_unwarping,
        use_textline_orientation, layout_nms, layout_unclip_ratio,
        layout_merge_bboxes_mode and the text_det_* options are not described
        in this reference. Presumably they are per-call overrides of the
        pipeline settings, like the documented use_* flags - TODO confirm.

    Returns:
        list: Analysis results with markdown and layout information.
            Per the "Return Value Structure" section, each item is a dict
            with "input_path", "markdown" and "layout_result", plus
            "table_result" / "formula_result" when the matching recognition
            is enabled.
    """

def visual_predict_iter(self, input, **kwargs):
    """
    Iterate over visual analysis results for memory efficiency.

    Options are passed as keyword arguments. Presumably accepts the same
    input and options as visual_predict() and yields the items that method
    would return in its list - TODO confirm.
    """

Translation

def translate(
    self,
    ori_md_info_list: list,
    *,
    target_language: str = "zh",
    chunk_size: int = 5000,
    task_description: str = None,
    output_format: str = None,
    rules_str: str = None,
    few_shot_demo_text_content: str = None,
    few_shot_demo_key_value_list: list = None,
    glossary: dict = None,
    llm_request_interval: float = 0.0,
    chat_bot_config: dict = None,
    **kwargs
) -> list:
    """
    Translate markdown content to target language.

    Every option after ``ori_md_info_list`` is keyword-only.

    Args:
        ori_md_info_list (list): Output from visual_predict(); the usage
            examples also pass the output of load_from_markdown()
        target_language (str, optional): Target language code (default: 'zh')
        chunk_size (int, optional): Max characters per translation chunk
            (default: 5000)
        task_description (str, optional): Custom translation task description
        output_format (str, optional): Desired output format description
        rules_str (str, optional): Translation rules and guidelines
        few_shot_demo_text_content (str, optional): Few-shot example text
        few_shot_demo_key_value_list (list, optional): Few-shot key-value examples
        glossary (dict, optional): Term translations (source -> target)
        llm_request_interval (float, optional): Delay between LLM requests,
            in seconds per the batch usage example (default: 0.0)
        chat_bot_config (dict, optional): LLM configuration; the batch usage
            example passes 'model_name', 'temperature' and 'max_tokens' keys

    Returns:
        list: Translated markdown for each page. Per the "Return Value
            Structure" section, each item is a dict with "input_path",
            "original_markdown", "translated_markdown" and
            "target_language".
    """

def translate_iter(self, ori_md_info_list, **kwargs):
    """
    Iterate over translation results for memory efficiency.

    Options are passed as keyword arguments. Presumably accepts the same
    options as translate() and yields the items that method would return in
    its list - TODO confirm.
    """

Utility Methods

def load_from_markdown(
    self,
    input
) -> list:
    """
    Load content from existing markdown files.

    The "Loading Pre-Processed Markdown" example passes the returned list
    straight to translate() in place of visual_predict() output.

    Args:
        input: Markdown file path or list of paths

    Returns:
        list: Loaded markdown information
    """

def concatenate_markdown_pages(
    self,
    markdown_list: list
) -> str:
    """
    Concatenate markdown from multiple pages.

    Args:
        markdown_list (list): List of markdown strings
            NOTE(review): the basic usage example passes the list returned
            by translate(), which the "Return Value Structure" section
            documents as a list of dicts, not strings - confirm the element
            type actually expected here.

    Returns:
        str: Combined markdown
    """

def close(self) -> None:
    """
    Close the pipeline and free resources.

    The usage examples call this once, after all prediction and translation
    work is finished.
    """

def export_paddlex_config_to_yaml(self, yaml_path: str) -> None:
    """
    Export configuration to YAML.

    Args:
        yaml_path (str): Presumably the destination path of the YAML file
            to write - TODO confirm.
    """

Usage Examples

Basic Document Translation

from paddleocr import PPDocTranslation

# Initialize pipeline
pipeline = PPDocTranslation(
    lang='en',
    ocr_version='PP-OCRv5',
    use_table_recognition=True
)

# try/finally guarantees the pipeline's resources are released even if
# analysis, translation, or file writing raises.
try:
    # Analyze document
    visual_info = pipeline.visual_predict('english_doc.pdf')

    # Translate to Chinese
    translated = pipeline.translate(
        visual_info,
        target_language='zh'
    )

    # Build the combined markdown before opening the output file, so a
    # failure here does not leave behind an empty or truncated file.
    combined = pipeline.concatenate_markdown_pages(translated)

    # Save translated markdown
    with open('chinese_doc.md', 'w', encoding='utf-8') as f:
        f.write(combined)
finally:
    pipeline.close()

Translation with Glossary

from paddleocr import PPDocTranslation

pipeline = PPDocTranslation()

# try/finally ensures close() runs even if analysis or translation raises.
try:
    # Analyze technical document
    visual_info = pipeline.visual_predict('technical_spec.pdf')

    # Define domain-specific glossary
    glossary = {
        "API": "应用程序接口",
        "machine learning": "机器学习",
        "neural network": "神经网络",
        "inference": "推理"
    }

    # Translate with glossary
    translated = pipeline.translate(
        visual_info,
        target_language='zh',
        glossary=glossary
    )
finally:
    pipeline.close()

Translation with Custom Rules

from paddleocr import PPDocTranslation

pipeline = PPDocTranslation(
    use_formula_recognition=True,
    use_chart_recognition=True
)

# try/finally ensures close() runs even if analysis or translation raises.
try:
    visual_info = pipeline.visual_predict('research_paper.pdf')

    # Define translation rules
    rules = """
1. Keep all mathematical formulas in original LaTeX
2. Preserve technical terms in English with Chinese translation in parentheses
3. Maintain formal academic tone
4. Keep citations and references unchanged
"""

    # Translate with rules
    translated = pipeline.translate(
        visual_info,
        target_language='zh',
        rules_str=rules,
        task_description="Translate academic research paper"
    )
finally:
    pipeline.close()

Few-Shot Translation

from paddleocr import PPDocTranslation

pipeline = PPDocTranslation()

# try/finally ensures close() runs even if analysis or translation raises.
try:
    visual_info = pipeline.visual_predict('contract.pdf')

    # Provide few-shot examples
    few_shot_examples = [
        {"source": "Party A", "target": "甲方"},
        {"source": "Party B", "target": "乙方"},
        {"source": "effective date", "target": "生效日期"}
    ]

    # Example text showing desired style
    few_shot_text = """
English: This Agreement is entered into on January 1, 2024.
Chinese: 本协议于2024年1月1日订立。
"""

    translated = pipeline.translate(
        visual_info,
        target_language='zh',
        few_shot_demo_text_content=few_shot_text,
        few_shot_demo_key_value_list=few_shot_examples,
        output_format="Formal legal document style"
    )
finally:
    pipeline.close()

Batch Translation with Rate Limiting

from paddleocr import PPDocTranslation

pipeline = PPDocTranslation()

# try/finally ensures close() runs even if a document in the batch fails
# during analysis or translation.
try:
    # Process multiple documents
    visual_info = pipeline.visual_predict('documents/')

    # Custom LLM configuration
    llm_config = {
        'model_name': 'gpt-4',
        'temperature': 0.3,
        'max_tokens': 2000
    }

    # Translate with rate limiting
    translated = pipeline.translate(
        visual_info,
        target_language='fr',
        chunk_size=3000,
        llm_request_interval=1.0,  # 1 second between requests
        chat_bot_config=llm_config
    )
finally:
    pipeline.close()

Loading Pre-Processed Markdown

from paddleocr import PPDocTranslation

pipeline = PPDocTranslation()

# try/finally ensures close() runs even if loading or translation raises.
try:
    # Load existing markdown files
    md_info = pipeline.load_from_markdown('document.md')

    # Translate without re-analyzing
    translated = pipeline.translate(
        md_info,
        target_language='es',
        task_description="Translate to Spanish (Spain)"
    )
finally:
    pipeline.close()

Supported Languages

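PPDocTranslation supports translation between most major languages. Pass the source language code as `lang` when creating the pipeline and the target language code as `target_language` when calling `translate()`. Commonly used codes include: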

Chinese (Simplified/Traditional): zh, cht
European: en, fr, es, de, it, pt, ru, nl, pl, etc.
Asian: ja, ko, ar, th, hi, vi, etc.
And many more (depends on LLM backend)

Return Value Structure

visual_predict() Output

[
    {
        "input_path": "path/to/document.pdf",
        "markdown": "# Original Markdown\n\nContent...",
        "layout_result": [
            {
                "bbox": [x1, y1, x2, y2],
                "label": "text",
                "content": "original text"
            }
        ],
        "table_result": [  # If use_table_recognition=True
            {
                "bbox": [x1, y1, x2, y2],
                "html": "<table>...</table>"
            }
        ],
        "formula_result": [  # If use_formula_recognition=True
            {
                "bbox": [x1, y1, x2, y2],
                "latex": "E = mc^2"
            }
        ]
    }
]

translate() Output

[
    {
        "input_path": "path/to/document.pdf",
        "original_markdown": "# Original\n\nContent...",
        "translated_markdown": "# 翻译后的标题\n\n内容...",
        "target_language": "zh"
    }
]