tessl install tessl/pypi-paddleocr@3.3.0

Industry-leading OCR and document AI engine that converts documents and images into structured, AI-friendly data formats with comprehensive solutions from text extraction to intelligent document understanding.
Vision-Language OCR pipeline using the compact 0.9B PaddleOCR-VL model for multilingual document parsing. Supports 109 languages with minimal resource consumption and excels at recognizing complex elements like text, tables, formulas, and charts.
class PaddleOCRVL:
    """
    Vision-Language document parsing pipeline using PaddleOCR-VL model.

    Uses a compact 0.9B VLM (vision-language model) for efficient document parsing
    across 109 languages with state-of-the-art accuracy on complex elements.
    """

    def __init__(
        self,
        layout_detection_model_name: str = None,
        layout_detection_model_dir: str = None,
        layout_threshold: float = None,
        layout_nms: bool = None,
        layout_unclip_ratio: float = None,
        layout_merge_bboxes_mode: str = None,
        vl_rec_model_name: str = None,
        vl_rec_model_dir: str = None,
        vl_rec_backend: str = None,
        vl_rec_server_url: str = None,
        vl_rec_max_concurrency: int = None,
        vl_rec_api_model_name: str = None,
        vl_rec_api_key: str = None,
        doc_orientation_classify_model_name: str = None,
        doc_orientation_classify_model_dir: str = None,
        doc_unwarping_model_name: str = None,
        doc_unwarping_model_dir: str = None,
        use_doc_orientation_classify: bool = None,
        use_doc_unwarping: bool = None,
        use_layout_detection: bool = None,
        use_chart_recognition: bool = None,
        format_block_content: bool = None,
        paddlex_config: str = None,
        device: str = None,
        use_hpi: bool = None,
        **kwargs
    ):
        """
        Initialize PaddleOCR-VL pipeline.

        Every parameter defaults to None. Parameters that appear in the
        signature but are not listed below (the ``*_model_name`` /
        ``*_model_dir`` options, the ``layout_*`` tuning options and
        ``use_hpi``) are not described in this reference.

        Args:
            vl_rec_backend (str, optional): VL recognition backend:
                - 'native': Local model inference (default)
                - 'vllm-server': vLLM server
                - 'sglang-server': SGLang server
                - 'fastdeploy-server': FastDeploy server
            vl_rec_server_url (str, optional): Server URL for remote backends
            vl_rec_max_concurrency (int, optional): Max concurrent requests for server backends
            vl_rec_api_model_name (str, optional): API model name for server backends
            vl_rec_api_key (str, optional): API key for authentication
            use_layout_detection (bool, optional): Enable layout detection preprocessing
            use_chart_recognition (bool, optional): Enable chart-to-table conversion
            use_doc_orientation_classify (bool, optional): Enable orientation correction
            use_doc_unwarping (bool, optional): Enable document unwarping
            format_block_content (bool, optional): Format block content in output
            device (str, optional): Device for inference
            paddlex_config (str or dict, optional): Configuration file or dict
        """

    def predict(
        self,
        input,
        *,
        use_doc_orientation_classify: bool = None,
        use_doc_unwarping: bool = None,
        use_layout_detection: bool = None,
        use_chart_recognition: bool = None,
        layout_threshold: float = None,
        layout_nms: bool = None,
        layout_unclip_ratio: float = None,
        layout_merge_bboxes_mode: str = None,
        use_queues: bool = None,
        prompt_label: str = None,
        format_block_content: bool = None,
        repetition_penalty: float = None,
        temperature: float = None,
        top_p: float = None,
        min_pixels: int = None,
        max_pixels: int = None,
        **kwargs
    ) -> list:
        """
        Parse document using vision-language model.

        Every option after ``input`` is keyword-only and defaults to None.
        Options that appear in the signature but are not listed below are
        not described in this reference.

        Args:
            input: Image/PDF path, numpy array, PIL Image, directory, or list
            use_layout_detection (bool, optional): Override layout detection
            use_chart_recognition (bool, optional): Override chart recognition
            prompt_label (str, optional): Custom prompt for VL model
            format_block_content (bool, optional): Format block content
            repetition_penalty (float, optional): Repetition penalty for generation
            temperature (float, optional): Temperature for generation
            top_p (float, optional): Top-p sampling parameter
            min_pixels (int, optional): Minimum image pixels for VL model
            max_pixels (int, optional): Maximum image pixels for VL model

        Returns:
            list: Parsing results with markdown, layout, and element information
        """

    def predict_iter(self, input, **kwargs):
        """
        Iterate over parsing results for memory efficiency.

        Args:
            input: Document input (presumably the same forms accepted by
                ``predict`` - confirm against the library documentation)
            **kwargs: Additional options (presumably the ``predict``
                options - confirm against the library documentation)
        """

    def concatenate_markdown_pages(self, markdown_list: list) -> str:
        """
        Concatenate markdown from multiple pages.

        Args:
            markdown_list (list): Per-page markdown items to join

        Returns:
            str: The concatenated markdown
        """

    def close(self) -> None:
        """Close the pipeline and free resources."""

    def export_paddlex_config_to_yaml(self, yaml_path: str) -> None:
        """
        Export configuration to YAML.

        Args:
            yaml_path (str): Destination path of the YAML file
        """


from paddleocr import PaddleOCRVL
# Initialize PaddleOCR-VL
ocr_vl = PaddleOCRVL()
# Parse document
result = ocr_vl.predict('document.pdf')
# Get markdown output
for item in result:
    print(item['markdown'])
ocr_vl.close()

from paddleocr import PaddleOCRVL
# PaddleOCR-VL supports 109 languages natively
ocr_vl = PaddleOCRVL()
# Parse documents in different languages
result_en = ocr_vl.predict('english_doc.pdf')
result_zh = ocr_vl.predict('chinese_doc.pdf')
result_ar = ocr_vl.predict('arabic_doc.pdf')
result_hi = ocr_vl.predict('hindi_doc.pdf')
ocr_vl.close()

from paddleocr import PaddleOCRVL
ocr_vl = PaddleOCRVL(
    use_layout_detection=True,
    use_chart_recognition=True
)
result = ocr_vl.predict('document_with_charts.pdf')
ocr_vl.close()

from paddleocr import PaddleOCRVL
# Connect to vLLM server
ocr_vl = PaddleOCRVL(
    vl_rec_backend='vllm-server',
    vl_rec_server_url='http://localhost:8000',
    vl_rec_max_concurrency=4
)
result = ocr_vl.predict('document.pdf')
ocr_vl.close()

from paddleocr import PaddleOCRVL
ocr_vl = PaddleOCRVL()
result = ocr_vl.predict(
    'document.pdf',
    repetition_penalty=1.2,
    temperature=0.7,
    top_p=0.9
)
ocr_vl.close()

PaddleOCR-VL natively supports 109 languages including all major world languages, scripts (Latin, CJK, Cyrillic, Arabic, Devanagari, Thai, etc.), and specialized use cases like handwritten text and historical documents.
[
    {
        "input_path": "path/to/document.pdf",
        "markdown": "# Markdown output\n\nContent...",
        "layout_result": [
            {
                "bbox": [x1, y1, x2, y2],
                "label": "text",
                "content": "recognized content"
            },
            # ...
        ],
        "chart_result": [  # If use_chart_recognition=True
            {
                "bbox": [x1, y1, x2, y2],
                "table_data": {...}
            }
        ]
    }
]