tessl install tessl/pypi-paddleocr@3.3.0

Industry-leading OCR and document AI engine that converts documents and images into structured, AI-friendly data formats with comprehensive solutions from text extraction to intelligent document understanding.
Document translation pipeline combining layout parsing, OCR, and LLM-based translation. Converts documents to markdown while preserving structure, then translates content using language models with support for glossaries, custom rules, and few-shot examples.
class PPDocTranslation:
"""
Document translation pipeline with structure preservation.
Analyzes document layout and content, converts to markdown,
then translates using LLM while maintaining formatting.
"""
def __init__(
self,
# Layout detection
layout_detection_model_name: str = None,
layout_detection_model_dir: str = None,
layout_threshold: float = None,
layout_nms: bool = None,
layout_unclip_ratio: float = None,
layout_merge_bboxes_mode: str = None,
# Document preprocessing
doc_orientation_classify_model_name: str = None,
doc_orientation_classify_model_dir: str = None,
doc_unwarping_model_name: str = None,
doc_unwarping_model_dir: str = None,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
# OCR models
text_detection_model_name: str = None,
text_detection_model_dir: str = None,
text_recognition_model_name: str = None,
text_recognition_model_dir: str = None,
text_recognition_batch_size: int = None,
textline_orientation_model_name: str = None,
textline_orientation_model_dir: str = None,
use_textline_orientation: bool = None,
# Table recognition
use_table_recognition: bool = None,
table_classification_model_name: str = None,
table_structure_recognition_model_name: str = None,
# Formula recognition
use_formula_recognition: bool = None,
formula_recognition_model_name: str = None,
# Chart recognition
use_chart_recognition: bool = None,
chart_parsing_model_name: str = None,
# Language and version
lang: str = None,
ocr_version: str = None,
# Common parameters
paddlex_config: str = None,
device: str = None,
use_hpi: bool = None,
**kwargs
):
"""
Initialize PPDocTranslation pipeline.
Args:
lang (str, optional): Source language code
ocr_version (str, optional): OCR version ('PP-OCRv3', 'PP-OCRv4', 'PP-OCRv5')
use_table_recognition (bool, optional): Enable table recognition
use_formula_recognition (bool, optional): Enable formula recognition
use_chart_recognition (bool, optional): Enable chart recognition
use_doc_orientation_classify (bool, optional): Enable orientation correction
use_doc_unwarping (bool, optional): Enable document unwarping
device (str, optional): Device for inference ('cpu', 'gpu')
paddlex_config (str or dict, optional): Configuration file or dict
"""

def visual_predict(
self,
input,
*,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
use_textline_orientation: bool = None,
use_table_recognition: bool = None,
use_formula_recognition: bool = None,
use_chart_recognition: bool = None,
layout_threshold: float = None,
layout_nms: bool = None,
layout_unclip_ratio: float = None,
layout_merge_bboxes_mode: str = None,
text_det_limit_side_len: int = None,
text_det_thresh: float = None,
text_det_box_thresh: float = None,
text_rec_score_thresh: float = None,
**kwargs
) -> list:
"""
Analyze document and convert to markdown.
Args:
input: Image/PDF path, numpy array, PIL Image, directory, or list
use_table_recognition (bool, optional): Override table recognition
use_formula_recognition (bool, optional): Override formula recognition
use_chart_recognition (bool, optional): Override chart recognition
text_rec_score_thresh (float, optional): Text recognition threshold
layout_threshold (float, optional): Layout detection threshold
Returns:
list: Analysis results with markdown and layout information
"""
def visual_predict_iter(self, input, **kwargs):
"""Iterate over visual analysis results for memory efficiency."""

def translate(
self,
ori_md_info_list: list,
*,
target_language: str = "zh",
chunk_size: int = 5000,
task_description: str = None,
output_format: str = None,
rules_str: str = None,
few_shot_demo_text_content: str = None,
few_shot_demo_key_value_list: list = None,
glossary: dict = None,
llm_request_interval: float = 0.0,
chat_bot_config: dict = None,
**kwargs
) -> list:
"""
Translate markdown content to target language.
Args:
ori_md_info_list (list): Output from visual_predict()
target_language (str, optional): Target language code (default: 'zh')
chunk_size (int, optional): Max characters per translation chunk
task_description (str, optional): Custom translation task description
output_format (str, optional): Desired output format description
rules_str (str, optional): Translation rules and guidelines
few_shot_demo_text_content (str, optional): Few-shot example text
few_shot_demo_key_value_list (list, optional): Few-shot key-value examples
glossary (dict, optional): Term translations (source -> target)
llm_request_interval (float, optional): Delay between LLM requests
chat_bot_config (dict, optional): LLM configuration
Returns:
list: Translated markdown for each page
"""
def translate_iter(self, ori_md_info_list, **kwargs):
"""Iterate over translation results for memory efficiency."""

def load_from_markdown(
self,
input
) -> list:
"""
Load content from existing markdown files.
Args:
input: Markdown file path or list of paths
Returns:
list: Loaded markdown information
"""
def concatenate_markdown_pages(
self,
markdown_list: list
) -> str:
"""
Concatenate markdown from multiple pages.
Args:
markdown_list (list): List of markdown strings
Returns:
str: Combined markdown
"""
def close(self) -> None:
"""Close the pipeline and free resources."""
def export_paddlex_config_to_yaml(self, yaml_path: str) -> None:
"""Export configuration to YAML."""

from paddleocr import PPDocTranslation
# Initialize pipeline
pipeline = PPDocTranslation(
lang='en',
ocr_version='PP-OCRv5',
use_table_recognition=True
)
# Analyze document
visual_info = pipeline.visual_predict('english_doc.pdf')
# Translate to Chinese
translated = pipeline.translate(
visual_info,
target_language='zh'
)
# Save translated markdown
with open('chinese_doc.md', 'w', encoding='utf-8') as f:
combined = pipeline.concatenate_markdown_pages(translated)
f.write(combined)
pipeline.close()

from paddleocr import PPDocTranslation
pipeline = PPDocTranslation()
# Analyze technical document
visual_info = pipeline.visual_predict('technical_spec.pdf')
# Define domain-specific glossary
glossary = {
"API": "应用程序接口",
"machine learning": "机器学习",
"neural network": "神经网络",
"inference": "推理"
}
# Translate with glossary
translated = pipeline.translate(
visual_info,
target_language='zh',
glossary=glossary
)
pipeline.close()

from paddleocr import PPDocTranslation
pipeline = PPDocTranslation(
use_formula_recognition=True,
use_chart_recognition=True
)
visual_info = pipeline.visual_predict('research_paper.pdf')
# Define translation rules
rules = """
1. Keep all mathematical formulas in original LaTeX
2. Preserve technical terms in English with Chinese translation in parentheses
3. Maintain formal academic tone
4. Keep citations and references unchanged
"""
# Translate with rules
translated = pipeline.translate(
visual_info,
target_language='zh',
rules_str=rules,
task_description="Translate academic research paper"
)
pipeline.close()

from paddleocr import PPDocTranslation
pipeline = PPDocTranslation()
visual_info = pipeline.visual_predict('contract.pdf')
# Provide few-shot examples
few_shot_examples = [
{"source": "Party A", "target": "甲方"},
{"source": "Party B", "target": "乙方"},
{"source": "effective date", "target": "生效日期"}
]
# Example text showing desired style
few_shot_text = """
English: This Agreement is entered into on January 1, 2024.
Chinese: 本协议于2024年1月1日订立。
"""
translated = pipeline.translate(
visual_info,
target_language='zh',
few_shot_demo_text_content=few_shot_text,
few_shot_demo_key_value_list=few_shot_examples,
output_format="Formal legal document style"
)
pipeline.close()

from paddleocr import PPDocTranslation
pipeline = PPDocTranslation()
# Process multiple documents
visual_info = pipeline.visual_predict('documents/')
# Custom LLM configuration
llm_config = {
'model_name': 'gpt-4',
'temperature': 0.3,
'max_tokens': 2000
}
# Translate with rate limiting
translated = pipeline.translate(
visual_info,
target_language='fr',
chunk_size=3000,
llm_request_interval=1.0, # 1 second between requests
chat_bot_config=llm_config
)
pipeline.close()

from paddleocr import PPDocTranslation
pipeline = PPDocTranslation()
# Load existing markdown files
md_info = pipeline.load_from_markdown('document.md')
# Translate without re-analyzing
translated = pipeline.translate(
md_info,
target_language='es',
task_description="Translate to Spanish (Spain)"
)
pipeline.close()

PPDocTranslation supports translation to/from all major languages including:
[
{
"input_path": "path/to/document.pdf",
"markdown": "# Original Markdown\n\nContent...",
"layout_result": [
{
"bbox": [x1, y1, x2, y2],
"label": "text",
"content": "original text"
}
],
"table_result": [ # If use_table_recognition=True
{
"bbox": [x1, y1, x2, y2],
"html": "<table>...</table>"
}
],
"formula_result": [ # If use_formula_recognition=True
{
"bbox": [x1, y1, x2, y2],
"latex": "E = mc^2"
}
]
}
]

[
{
"input_path": "path/to/document.pdf",
"original_markdown": "# Original\n\nContent...",
"translated_markdown": "# 翻译后的标题\n\n内容...",
"target_language": "zh"
}
]