tessl install tessl/pypi-paddleocr@3.3.0
Industry-leading OCR and document AI engine that converts documents and images into structured, AI-friendly data formats with comprehensive solutions from text extraction to intelligent document understanding.
PaddleOCR is an industry-leading, production-ready OCR and document AI engine that converts documents and images into structured, AI-friendly data formats (JSON and Markdown) with industry-leading accuracy. The library provides comprehensive solutions from text extraction to intelligent document understanding, including multilingual OCR, document parsing, table recognition, formula recognition, and intelligent information extraction.
# Step 1: Install PaddlePaddle framework first (REQUIRED dependency)
pip install paddlepaddle
# Step 2: Install PaddleOCR
pip install paddleocr
Critical Installation Notes:
pip install paddlepaddle (suitable for development/testing)
pip install paddlepaddle-gpu (recommended for production)
import paddleocr
Import pipelines (most common):
from paddleocr import PaddleOCR, PPStructureV3, PaddleOCRVLImport individual models:
from paddleocr import (
TextDetection,
TextRecognition,
LayoutDetection,
FormulaRecognition,
TableStructureRecognition
)Import all exports:
from paddleocr import (
# Pipelines
PaddleOCR,
PPStructureV3,
PaddleOCRVL,
PPChatOCRv4Doc,
PPDocTranslation,
DocPreprocessor,
DocUnderstanding,
FormulaRecognitionPipeline,
SealRecognition,
TableRecognitionPipelineV2,
# Models
TextDetection,
TextRecognition,
LayoutDetection,
FormulaRecognition,
TableStructureRecognition,
ChartParsing,
DocImgOrientationClassification,
DocVLM,
SealTextDetection,
TableCellsDetection,
TableClassification,
TextImageUnwarping,
TextLineOrientationClassification,
# Utilities
benchmark,
logger,
__version__
)from paddleocr import PaddleOCR
# Initialize OCR pipeline
ocr = PaddleOCR(lang='en') # lang can be 'ch', 'en', 'fr', 'es', etc.
# Perform OCR on an image
result = ocr.predict('path/to/image.jpg')
# Process results
for item in result:
for line in item.get('rec_result', []):
print(f"Text: {line['rec_text']}, Score: {line['rec_score']}")
# IMPORTANT: Always close pipeline when done
ocr.close()Error Handling Pattern:
from paddleocr import PaddleOCR
def safe_ocr(image_path, lang='en', device='gpu'):
    """Run OCR on ``image_path``, retrying on CPU when the requested device fails.

    Args:
        image_path: Input handed unchanged to ``PaddleOCR.predict``.
        lang: Language code used to build the pipeline.
        device: Device tried first (``'gpu'`` by default).

    Returns:
        Whatever ``predict`` returns, or an empty list when the file is
        missing or an unexpected non-``RuntimeError`` failure occurs.

    Raises:
        RuntimeError: If the failure message mentions neither CUDA nor GPU.
            Errors raised by the CPU retry itself also propagate.
    """
    ocr = None
    try:
        # First attempt on the requested device.
        ocr = PaddleOCR(lang=lang, device=device)
        return ocr.predict(image_path)
    except RuntimeError as e:
        message = str(e)
        if 'CUDA' not in message and 'GPU' not in message:
            raise
        print(f"GPU error: {e}, falling back to CPU")
        if ocr is not None:
            ocr.close()
            # Drop the reference so ``finally`` cannot close the same pipeline
            # a second time if building the CPU pipeline raises.
            ocr = None
        # Second attempt on CPU.
        ocr = PaddleOCR(lang=lang, device='cpu')
        return ocr.predict(image_path)
    except FileNotFoundError:
        print(f"Error: File not found: {image_path}")
        return []
    except Exception as e:
        print(f"Unexpected error: {e}")
        return []
    finally:
        if ocr is not None:
            ocr.close()


from paddleocr import PPStructureV3
# Initialize document structure pipeline
structure = PPStructureV3(lang='en')
# Parse document to markdown
result = structure.predict('document.pdf')
# Extract markdown content
for item in result:
markdown = item.get('markdown', '')
print(markdown)
structure.close()from paddleocr import PaddleOCRVL
# Initialize PaddleOCR-VL pipeline (supports 109 languages)
ocr_vl = PaddleOCRVL()
# Parse document
result = ocr_vl.predict('document_image.jpg')
# Get markdown output
for item in result:
print(item.get('markdown', ''))
ocr_vl.close()PaddleOCR is built around a modular architecture with three main layers:
High-level workflows that combine multiple models for complete document processing tasks:
Each pipeline can be configured with specific models, parameters, and processing options.
13 individual model components for specific OCR and document AI tasks:
All models inherit from PaddleXPredictorWrapper and provide consistent predict(), predict_iter(), and close() methods.
close() methods and iterator-based APIs for memory efficiency
PaddleOCR provides a complete OCR pipeline combining text detection and recognition with support for 109 languages. The PP-OCRv5 model achieves state-of-the-art accuracy for multilingual text recognition.
class PaddleOCR:
def __init__(
self,
lang: str = None,
ocr_version: str = None,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
use_textline_orientation: bool = None,
text_det_limit_side_len: int = None,
text_det_limit_type: str = None,
text_det_thresh: float = None,
text_det_box_thresh: float = None,
text_det_unclip_ratio: float = None,
text_rec_score_thresh: float = None,
return_word_box: bool = None,
device: str = None,
**kwargs
): ...
def predict(
self,
input,
*,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
use_textline_orientation: bool = None,
text_det_limit_side_len: int = None,
text_det_limit_type: str = None,
text_det_thresh: float = None,
text_det_box_thresh: float = None,
text_det_unclip_ratio: float = None,
text_rec_score_thresh: float = None,
return_word_box: bool = None
) -> list: ...
def predict_iter(self, input, **kwargs): ...
def close(self) -> None: ...Key Parameters for Agents:
lang: Language code (e.g., 'en', 'ch', 'fr') - affects model selection
ocr_version: 'PP-OCRv3', 'PP-OCRv4', or 'PP-OCRv5' (default: v5, highest accuracy)
device: 'cpu' or 'gpu:0' - GPU provides 3-10x speedup
text_det_thresh: Lower values (0.2-0.4) detect more text but increase false positives
text_rec_score_thresh: Filter results by confidence (0.0-1.0, default: ~0.5)
return_word_box: Enable for character-level bounding boxes (useful for text editing)
Performance Characteristics:
predict_iter() for >10 images to reduce memory usageDecision Tree for Agents:
use_doc_orientation_classify=True, use_doc_unwarping=Truepredict_iter() with batch_size=1return_word_box=TrueConfidence Validation Pattern:
from paddleocr import PaddleOCR
def ocr_with_validation(image_path, min_confidence=0.7):
    """Run English OCR on the GPU and split recognized lines by confidence.

    Returns one summary dict per result item with the lines scoring at least
    ``min_confidence``, the remaining lines, and the corresponding counts.
    """
    ocr = PaddleOCR(lang='en', device='gpu')
    try:
        summaries = []
        for item in ocr.predict(image_path):
            accepted, rejected = [], []
            for line in item.get('rec_result', []):
                # Route each line to the bucket matching its score.
                bucket = accepted if line['rec_score'] >= min_confidence else rejected
                bucket.append(line)
            summary = {
                'input_path': item.get('input_path'),
                'high_confidence': accepted,
                'low_confidence': rejected,
                'total_detected': len(item.get('rec_result', [])),
                'passed_validation': len(accepted),
            }
            summaries.append(summary)
        return summaries
    finally:
        ocr.close()


# PP-StructureV3 intelligently converts complex PDFs and document images into Markdown and JSON files while preserving original structure. Supports layout detection, table recognition, formula recognition, seal recognition, and chart recognition.
class PPStructureV3:
def __init__(
self,
lang: str = None,
ocr_version: str = None,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
use_seal_recognition: bool = None,
use_table_recognition: bool = None,
use_formula_recognition: bool = None,
use_chart_recognition: bool = None,
device: str = None,
**kwargs
): ...
def predict(
self,
input,
*,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
use_seal_recognition: bool = None,
use_table_recognition: bool = None,
use_formula_recognition: bool = None,
use_chart_recognition: bool = None,
**kwargs
) -> list: ...
def predict_iter(self, input, **kwargs): ...
def concatenate_markdown_pages(self, markdown_list: list) -> str: ...
def close(self) -> None: ...When to Use PP-StructureV3:
Performance vs Capability Trade-offs:
Resource Usage:
Selective Feature Usage Pattern:
from paddleocr import PPStructureV3
def detect_document_type_and_process(document_path):
    """Parse a document in two passes, enabling only the recognizers it needs.

    The first pass runs with table, formula, seal and chart recognition turned
    off. If the string form of any item's ``layout_result`` mentions "table"
    or "formula", a second pipeline is built with the matching recognizers
    enabled and its output is returned; otherwise the first-pass output is
    returned.
    """
    light = PPStructureV3(
        lang='en',
        use_table_recognition=False,
        use_formula_recognition=False,
        use_seal_recognition=False,
        use_chart_recognition=False
    )
    try:
        first_pass = light.predict(document_path)

        def mentions(keyword):
            # Substring search over the stringified layout of each item.
            return any(
                keyword in str(item.get('layout_result', []))
                for item in first_pass
            )

        need_tables = mentions('table')
        need_formulas = mentions('formula')

        # Nothing specialised found: the lightweight result is enough.
        if not (need_tables or need_formulas):
            return first_pass

        detailed = PPStructureV3(
            lang='en',
            use_table_recognition=need_tables,
            use_formula_recognition=need_formulas
        )
        try:
            return detailed.predict(document_path)
        finally:
            detailed.close()
    finally:
        light.close()


# PaddleOCR-VL uses a 0.9B vision-language model supporting 109 languages for efficient multilingual document parsing with minimal resource consumption. Excels at recognizing complex elements like text, tables, formulas, and charts.
class PaddleOCRVL:
def __init__(
self,
vl_rec_backend: str = None,
vl_rec_server_url: str = None,
use_layout_detection: bool = None,
use_chart_recognition: bool = None,
device: str = None,
**kwargs
): ...
def predict(
self,
input,
*,
use_layout_detection: bool = None,
use_chart_recognition: bool = None,
prompt_label: str = None,
format_block_content: bool = None,
**kwargs
) -> list: ...
def predict_iter(self, input, **kwargs): ...
def concatenate_markdown_pages(self, markdown_list: list) -> str: ...
def close(self) -> None: ...PaddleOCRVL vs PPStructureV3 Decision Guide:
Performance Comparison:
PaddleOCRVL:
- Speed: ~1-2 seconds/page (GPU)
- Memory: ~2GB model + ~400MB per page
- Accuracy: 90-94% (varies by language)
- Languages: 109 automatic
PPStructureV3:
- Speed: ~2-5 seconds/page (GPU, all features)
- Memory: ~1GB base + ~200MB per page
- Accuracy: 92-96% (single language optimized)
- Languages: Requires lang parameterMultilingual Document Handling Pattern:
from paddleocr import PaddleOCRVL, PaddleOCR
def process_multilingual_document(document_path, known_languages=None):
    """Pick a pipeline based on how many languages the caller knows about.

    With exactly one known language the document goes through ``PaddleOCR``
    configured for that language. With no language list, or more than one
    entry, it goes through ``PaddleOCRVL``. The returned dict records the
    method used, the raw result and the language information.
    """
    use_vlm = known_languages is None or len(known_languages) > 1

    if not use_vlm:
        # Single known language: dedicated OCR pipeline.
        language = known_languages[0]
        ocr = PaddleOCR(lang=language)
        try:
            return {
                'method': 'OCR',
                'result': ocr.predict(document_path),
                'languages': known_languages[0]
            }
        finally:
            ocr.close()

    # Unknown or mixed languages: let the vision-language pipeline handle it.
    ocr_vl = PaddleOCRVL()
    try:
        return {
            'method': 'VLM',
            'result': ocr_vl.predict(document_path),
            'languages': 'auto-detected'
        }
    finally:
        ocr_vl.close()


# PP-ChatOCRv4 integrates ERNIE 4.5 to precisely extract key information from documents with conversational capabilities and vector retrieval support.
class PPChatOCRv4Doc:
def __init__(self, device: str = None, **kwargs): ...
def visual_predict(self, input, **kwargs) -> list: ...
def visual_predict_iter(self, input, **kwargs): ...
def build_vector(
self,
visual_info,
*,
min_characters: int = 3500,
block_size: int = 300,
retriever_config: dict = None
) -> dict: ...
def chat(
self,
key_list,
visual_info,
*,
use_vector_retrieval: bool = True,
vector_info: dict = None,
**kwargs
) -> list: ...
def save_vector(self, vector_info, save_path: str, retriever_config: dict = None) -> None: ...
def load_vector(self, data_path: str, retriever_config: dict = None) -> dict: ...
def close(self) -> None: ...Use Cases for PP-ChatOCRv4:
RAG Threshold Decision Tree:
build_vector() (better accuracy, ~2-4s initial + ~0.5s per query)Memory Requirements:
Complete RAG Workflow Pattern:
from paddleocr import PPChatOCRv4Doc
import os
def intelligent_document_qa(document_path, questions, cache_dir='./cache'):
    """Answer ``questions`` about a document, caching the analysis on disk.

    Args:
        document_path: Document handed to ``PPChatOCRv4Doc.visual_predict``.
        questions: Keys/questions passed to ``PPChatOCRv4Doc.chat``.
        cache_dir: Directory for cached visual and vector data (created if
            missing).

    Returns:
        The value returned by ``PPChatOCRv4Doc.chat``.

    Notes:
        The cache key is derived from the path string only, so a document
        whose content changes under the same path reuses the stale cache.
        Documents below the 3500-character threshold are never cached because
        no vector index is built for them.
    """
    import hashlib  # local import keeps this snippet self-contained

    os.makedirs(cache_dir, exist_ok=True)

    # Built-in hash() of a str is salted per interpreter process, so it would
    # produce a different file name on every run and the cache would never be
    # found again. A SHA-256 digest of the path is stable across sessions.
    doc_hash = hashlib.sha256(str(document_path).encode('utf-8')).hexdigest()
    visual_cache = os.path.join(cache_dir, f'{doc_hash}_visual.pkl')
    vector_cache = os.path.join(cache_dir, f'{doc_hash}_vector.pkl')

    pipeline = PPChatOCRv4Doc()
    try:
        # Reuse the cached analysis only when both parts are present.
        if os.path.exists(visual_cache) and os.path.exists(vector_cache):
            print("Loading cached analysis...")
            visual_info = pipeline.load_visual_info_list(visual_cache)
            vector_info = pipeline.load_vector(vector_cache)
        else:
            print("Analyzing document...")
            visual_info = pipeline.visual_predict(document_path)

            # Rough document size: length of the stringified layout results.
            total_chars = sum(
                len(str(item.get('layout_result', [])))
                for item in visual_info
            )
            if total_chars >= 3500:
                print("Building vector index for large document...")
                vector_info = pipeline.build_vector(
                    visual_info,
                    min_characters=3500,
                    block_size=300
                )
                # Cache for future use
                pipeline.save_visual_info_list(visual_info, visual_cache)
                pipeline.save_vector(vector_info, vector_cache)
            else:
                vector_info = None

        print(f"Answering {len(questions)} questions...")
        results = pipeline.chat(
            questions,
            visual_info,
            use_vector_retrieval=(vector_info is not None),
            vector_info=vector_info
        )
        return results
    finally:
        pipeline.close()
# Usage
questions = [
"What is the document date?",
"Who are the parties mentioned?",
"What is the total amount?"
]
answers = intelligent_document_qa('contract.pdf', questions)PP-DocTranslation combines layout parsing, OCR, and LLM-based translation to translate complex documents while preserving structure.
class PPDocTranslation:
def __init__(self, device: str = None, **kwargs): ...
def visual_predict(self, input, **kwargs) -> list: ...
def translate(
self,
ori_md_info_list,
*,
target_language: str = "zh",
chunk_size: int = 5000,
task_description: str = None,
glossary: str = None,
**kwargs
) -> list: ...
def load_from_markdown(self, input) -> list: ...
def concatenate_markdown_pages(self, markdown_list: list) -> str: ...
def close(self) -> None: ...Translation Strategy:
glossary parameter for domain termschunk_size (larger = better context, slower)task_description with tone guidanceload_from_markdown() to skip OCRDocument preprocessing pipeline for orientation correction and image unwarping.
class DocPreprocessor:
def __init__(
self,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
device: str = None,
**kwargs
): ...
def predict(
self,
input,
*,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None
) -> list: ...
def predict_iter(self, input, **kwargs): ...
def close(self) -> None: ...When to Use DocPreprocessor:
use_doc_orientation_classify=True onlyuse_doc_unwarping=True onlyPerformance Impact:
Vision-language model for document question answering and understanding.
class DocUnderstanding:
def __init__(self, device: str = None, **kwargs): ...
def predict(self, input, **kwargs) -> list: ...
def predict_iter(self, input, **kwargs): ...
def close(self) -> None: ...Input Format:
# For VQA (Visual Question Answering)
input = {'image': 'doc.jpg', 'prompt': 'What is the total?'}
# For general understanding
input = 'doc.jpg'Complete pipeline for recognizing mathematical formulas in documents.
class FormulaRecognitionPipeline:
def __init__(
self,
use_layout_detection: bool = None,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
device: str = None,
**kwargs
): ...
def predict(
self,
input,
*,
use_layout_detection: bool = None,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
**kwargs
) -> list: ...
def predict_iter(self, input, **kwargs): ...
def close(self) -> None: ...Supported Formula Elements:
Specialized pipeline for recognizing text in seals and stamps.
class SealRecognition:
def __init__(
self,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
use_layout_detection: bool = None,
device: str = None,
**kwargs
): ...
def predict(
self,
input,
*,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
use_layout_detection: bool = None,
**kwargs
) -> list: ...
def predict_iter(self, input, **kwargs): ...
def close(self) -> None: ...Seal Types Supported:
Advanced table recognition supporting both wired and wireless tables.
class TableRecognitionPipelineV2:
def __init__(
self,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
use_layout_detection: bool = None,
use_ocr_model: bool = None,
device: str = None,
**kwargs
): ...
def predict(
self,
input,
*,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
use_layout_detection: bool = None,
use_ocr_model: bool = None,
**kwargs
) -> list: ...
def predict_iter(self, input, **kwargs): ...
def close(self) -> None: ...Table Types:
PaddleOCR also exports individual model classes that can be used independently for specific tasks.
class TextDetection:
def __init__(
self,
model_name: str = None,
model_dir: str = None,
limit_side_len: int = None,
thresh: float = None,
box_thresh: float = None,
unclip_ratio: float = None,
device: str = None,
**kwargs
): ...
def predict(self, input, **kwargs) -> list: ...
def predict_iter(self, input, **kwargs): ...
def close(self) -> None: ...
class TextRecognition:
def __init__(
self,
model_name: str = None,
model_dir: str = None,
device: str = None,
**kwargs
): ...
def predict(self, input, **kwargs) -> list: ...
def predict_iter(self, input, **kwargs): ...
def close(self) -> None: ...
class LayoutDetection:
def __init__(
self,
model_name: str = None,
model_dir: str = None,
threshold: float = None,
device: str = None,
**kwargs
): ...
def predict(self, input, **kwargs) -> list: ...
def predict_iter(self, input, **kwargs): ...
def close(self) -> None: ...
class FormulaRecognition:
def __init__(
self,
model_name: str = None,
model_dir: str = None,
device: str = None,
**kwargs
): ...
def predict(self, input, **kwargs) -> list: ...
def predict_iter(self, input, **kwargs): ...
def close(self) -> None: ...
class TableStructureRecognition:
def __init__(
self,
model_name: str = None,
model_dir: str = None,
device: str = None,
**kwargs
): ...
def predict(self, input, **kwargs) -> list: ...
def predict_iter(self, input, **kwargs): ...
def close(self) -> None: ...When to Use Individual Models vs Pipelines:
PaddleOCR provides utility objects for benchmarking, logging, and version management.
# benchmark is a Benchmark object instance (not a function)
benchmark: Benchmark # Instance of paddlex.inference.utils.benchmark.Benchmark
# Objects
logger: logging.Logger
__version__: strPaddleOCR supports 109 languages including:
Language Selection Strategy:
lang='en', lang='ch', etc. (faster, more accurate)
Always close pipelines and models when done to free resources:
from paddleocr import PaddleOCR
ocr = PaddleOCR(lang='en')
try:
result = ocr.predict('image.jpg')
# Process result
finally:
ocr.close()Why Closing Matters:
Memory Leak Example:
# BAD: Memory leak in loop
for img in images:
ocr = PaddleOCR(lang='en') # Creates new instance each time
result = ocr.predict(img)
# ocr.close() not called - leaks ~500MB per iteration
# GOOD: Reuse instance
ocr = PaddleOCR(lang='en')
try:
for img in images:
result = ocr.predict(img)
finally:
ocr.close() # Frees all resources at endContext Manager Pattern (Best Practice):
from paddleocr import PaddleOCR
from contextlib import contextmanager
@contextmanager
def ocr_pipeline(lang='en', **kwargs):
    """Yield a PaddleOCR pipeline and close it when the ``with`` block exits.

    ``lang`` and any extra keyword arguments are forwarded to ``PaddleOCR``.
    The pipeline is closed whether the block finishes normally or raises.
    """
    pipeline = PaddleOCR(lang=lang, **kwargs)
    try:
        yield pipeline
    finally:
        pipeline.close()
# Usage
with ocr_pipeline(lang='en', device='gpu') as ocr:
result = ocr.predict('image.jpg')
# Automatically closed even if exception occursUse predict_iter() for memory-efficient batch processing:
from paddleocr import PaddleOCR
ocr = PaddleOCR(lang='en')
image_paths = ['img1.jpg', 'img2.jpg', 'img3.jpg']
for result in ocr.predict_iter(image_paths):
# Process each result
print(result)
ocr.close()Batch Processing Guidelines:
predict(): Loads all results into memory - use for <100 images
predict_iter(): Memory-efficient iterator - use for >100 images or limited RAM
predict() uses N×image_memory, predict_iter() uses constant memory
Advanced Batch Processing with Progress Tracking:
from paddleocr import PaddleOCR
from tqdm import tqdm
import json
def batch_process_with_progress(image_dir, output_file, lang='en'):
    """OCR every ``*.jpg`` and ``*.png`` in ``image_dir`` and write a JSON report.

    Args:
        image_dir: Directory searched (non-recursively) for images.
        output_file: Path of the JSON report to write.
        lang: Language code for the OCR pipeline.

    Returns:
        ``(results, errors)``: per-file recognized text and scores, and
        per-file failures, in the same form as written to ``output_file``.
    """
    from pathlib import Path

    # Get all image files
    image_files = list(Path(image_dir).glob('*.jpg')) + \
        list(Path(image_dir).glob('*.png'))

    ocr = PaddleOCR(lang=lang, device='gpu')
    results = []
    errors = []
    try:
        # The ``with`` form closes the progress bar even when predict_iter
        # raises part-way through the batch.
        with tqdm(total=len(image_files), desc="Processing images") as pbar:
            # NOTE(review): assumes predict_iter yields exactly one result per
            # input path, in order - confirm against the pipeline.
            for i, result in enumerate(ocr.predict_iter([str(f) for f in image_files])):
                file_name = str(image_files[i])
                try:
                    rec_lines = result[0].get('rec_result') if result else None
                    if rec_lines:
                        results.append({
                            'file': file_name,
                            'text': [line['rec_text'] for line in rec_lines],
                            'scores': [line['rec_score'] for line in rec_lines]
                        })
                    else:
                        errors.append({
                            'file': file_name,
                            'error': 'No text detected'
                        })
                except Exception as e:
                    # A malformed result for one file must not abort the batch.
                    errors.append({
                        'file': file_name,
                        'error': str(e)
                    })
                pbar.update(1)

        # Save results; explicit encoding avoids platform-dependent defaults.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                'results': results,
                'errors': errors,
                'summary': {
                    'total': len(image_files),
                    'successful': len(results),
                    'failed': len(errors)
                }
            }, f, indent=2)
        return results, errors
    finally:
        ocr.close()


# Specify device for inference:
from paddleocr import PaddleOCR
# Use GPU (default GPU)
ocr = PaddleOCR(lang='en', device='gpu')
# Use CPU
ocr = PaddleOCR(lang='en', device='cpu')
# Use specific GPU
ocr = PaddleOCR(lang='en', device='gpu:0')Device Selection Decision Tree:
device='cpu' (no GPU setup required)
device='gpu' (3-10x faster)
device='gpu:0' or device='gpu:1' for load balancing
GPU Availability Check Pattern:
def create_ocr_with_fallback(lang='en', preferred_device='gpu'):
    """Build a PaddleOCR pipeline on ``preferred_device``, falling back to CPU.

    Args:
        lang: Language code for the pipeline.
        preferred_device: Device string such as ``'gpu'``, ``'gpu:1'`` or
            ``'cpu'``.

    Returns:
        A ``PaddleOCR`` instance. The caller is responsible for closing it.

    Raises:
        Exception: Any construction error whose message mentions neither
            CUDA nor GPU is re-raised unchanged.
    """
    from paddleocr import PaddleOCR

    # torch is only a convenience probe here; it is not required by PaddleOCR.
    # NOTE(review): torch's view of CUDA may differ from the Paddle build's -
    # the guarded construction below is the authoritative check.
    try:
        import torch
    except ImportError:
        torch = None

    device = preferred_device
    if torch is not None and device.startswith('gpu'):
        if not torch.cuda.is_available():
            print("Warning: CUDA not available, using CPU")
            return PaddleOCR(lang=lang, device='cpu')

        # Check specific GPU
        gpu_id = 0
        if ':' in device:
            try:
                gpu_id = int(device.split(':')[1])
            except ValueError:
                # Unparseable index: skip the pre-check and let PaddleOCR
                # validate the device string itself.
                gpu_id = None
        if gpu_id is not None and gpu_id >= torch.cuda.device_count():
            print(f"Warning: GPU {gpu_id} not found, using GPU 0")
            # Fall through so GPU 0 also gets the guarded CPU fallback below.
            device = 'gpu:0'

    try:
        return PaddleOCR(lang=lang, device=device)
    except Exception as e:
        if 'CUDA' in str(e) or 'GPU' in str(e):
            print(f"GPU initialization failed: {e}, using CPU")
            return PaddleOCR(lang=lang, device='cpu')
        raise


# Enable high-performance inference for faster processing:
from paddleocr import PaddleOCR
ocr = PaddleOCR(lang='en', use_hpi=True)HPI (High-Performance Inference) Notes:
HPI Setup and Validation:
from paddleocr import PaddleOCR
import time
def _time_predict_runs(ocr, image_path, num_runs):
    """Return the duration in seconds of each of ``num_runs`` predict calls."""
    durations = []
    for _ in range(num_runs):
        start = time.perf_counter()
        ocr.predict(image_path)
        durations.append(time.perf_counter() - start)
    return durations


def benchmark_hpi_vs_standard(image_path, lang='en', num_runs=10):
    """Compare HPI vs standard inference performance on one image.

    Args:
        image_path: Image passed to ``predict`` on every run.
        lang: Language code for both pipelines.
        num_runs: Number of timed runs per pipeline (at least 1).

    Returns:
        Dict with the average latency of each mode in milliseconds, the
        duration of the first (untimed-average) HPI call, and the speedup
        ratio ``standard / hpi``.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    # Fail before loading any model; averaging zero runs would otherwise end
    # in a ZeroDivisionError after both pipelines were built.
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    # Standard inference
    ocr_standard = PaddleOCR(lang=lang, device='gpu', use_hpi=False)
    try:
        times_standard = _time_predict_runs(ocr_standard, image_path, num_runs)
    finally:
        # Release the pipeline even when a prediction fails.
        ocr_standard.close()

    # HPI inference
    ocr_hpi = PaddleOCR(lang=lang, device='gpu', use_hpi=True)
    try:
        # First run is measured separately and kept out of the average.
        warmup_start = time.perf_counter()
        ocr_hpi.predict(image_path)
        warmup_time = time.perf_counter() - warmup_start

        # Subsequent runs
        times_hpi = _time_predict_runs(ocr_hpi, image_path, num_runs)
    finally:
        ocr_hpi.close()

    avg_standard = sum(times_standard) / len(times_standard)
    avg_hpi = sum(times_hpi) / len(times_hpi)
    return {
        'standard_avg_ms': avg_standard * 1000,
        'hpi_avg_ms': avg_hpi * 1000,
        'hpi_warmup_ms': warmup_time * 1000,
        'speedup': avg_standard / avg_hpi
    }


# Export pipeline configuration to YAML:
from paddleocr import PaddleOCR
ocr = PaddleOCR(lang='en')
ocr.export_paddlex_config_to_yaml('config.yaml')Load configuration from YAML:
from paddleocr import PaddleOCR
ocr = PaddleOCR(paddlex_config='config.yaml')Configuration Management Benefits:
Choose the right pipeline for your task:
| Task | Recommended Pipeline | Alternative | Performance | Notes |
|---|---|---|---|---|
| Extract text from images | PaddleOCR | - | 0.1-0.5s (GPU) | Fastest, simplest for pure OCR |
| Parse complex documents | PPStructureV3 | PaddleOCRVL | 2-5s (GPU) | PPStructureV3 for specialized elements |
| Multilingual documents | PaddleOCRVL | PaddleOCR | 1-2s (GPU) | VL model handles 109 languages automatically |
| Question answering | PPChatOCRv4Doc | DocUnderstanding | 2-4s first, 0.5s subsequent | ChatOCR for complex queries with RAG |
| Extract tables | TableRecognitionPipelineV2 | PPStructureV3 | 0.5-2s per table | Dedicated table pipeline for best accuracy |
| Extract formulas | FormulaRecognitionPipeline | PPStructureV3 | 0.3-1s per formula | Dedicated pipeline converts to LaTeX |
| Recognize seals/stamps | SealRecognition | PPStructureV3 | 0.2-0.8s per seal | Specialized for circular/curved text |
| Translate documents | PPDocTranslation | - | 5-10s per page | Preserves structure during translation |
| Preprocess photos | DocPreprocessor | - | 0.4-0.7s | Use before OCR for photos/scanned docs |
Decision Flowchart for Agents:
Start
├─ Need text only? → PaddleOCR
├─ Need document structure (markdown)?
│ ├─ Single language? → PPStructureV3
│ └─ Multiple/unknown languages? → PaddleOCRVL
├─ Need Q&A / information extraction?
│ ├─ Simple questions? → DocUnderstanding
│ └─ Complex/long documents? → PPChatOCRv4Doc
├─ Need specific element?
│ ├─ Tables only → TableRecognitionPipelineV2
│ ├─ Formulas only → FormulaRecognitionPipeline
│ └─ Seals only → SealRecognition
└─ Need translation? → PPDocTranslationImportError: No module named 'paddle'
# Solution: Install paddlepaddle first
# pip install paddlepaddleCUDA Error / GPU Not Found
from paddleocr import PaddleOCR
# Solution: Fallback to CPU or install correct CUDA version
try:
ocr = PaddleOCR(lang='en', device='gpu')
except Exception as e:
print(f"GPU initialization failed: {e}, falling back to CPU")
ocr = PaddleOCR(lang='en', device='cpu')OutOfMemoryError (GPU/CPU)
from paddleocr import PaddleOCR
# Solution: Use iterator mode or reduce batch size
ocr = PaddleOCR(lang='en')
for result in ocr.predict_iter(images): # Process one at a time
process(result)
ocr.close()Model Download Failures
from paddleocr import PaddleOCR
# Models are downloaded automatically on first use
# If download fails:
# 1. Check internet connection
# 2. Try specifying model directory manually
# 3. Download models manually from PaddleOCR repository
ocr = PaddleOCR(
lang='en',
text_detection_model_dir='./models/det',
text_recognition_model_dir='./models/rec'
)No Text Detected (Empty Results)
from paddleocr import PaddleOCR
# Solution: Adjust detection thresholds
ocr = PaddleOCR(
lang='en',
text_det_thresh=0.2, # Lower threshold for more sensitive detection
text_det_box_thresh=0.4 # Lower box threshold
)Low Recognition Accuracy
from paddleocr import PaddleOCR
# Solution: Enable preprocessing and use higher resolution
ocr = PaddleOCR(
lang='en',
use_doc_orientation_classify=True,
use_doc_unwarping=True,
text_det_limit_side_len=2048 # Higher resolution
)from paddleocr import PaddleOCR
import logging
import traceback
class OCRErrorHandler:
    """Context manager that owns a PaddleOCR pipeline and reports errors as dicts.

    Entering builds a GPU pipeline, optionally falling back to CPU when the
    failure message mentions CUDA or GPU. Leaving closes the pipeline.
    ``predict_safe`` never raises; it returns a dict with a ``success`` flag.
    """

    def __init__(self, lang='en', fallback_to_cpu=True):
        """Store the configuration; the pipeline itself is built on ``__enter__``."""
        self.lang = lang
        self.fallback_to_cpu = fallback_to_cpu
        self.logger = logging.getLogger(__name__)
        self.ocr = None

    def __enter__(self):
        try:
            self.ocr = PaddleOCR(lang=self.lang, device='gpu')
            return self
        except Exception as e:
            message = str(e)
            # Same GPU-failure test as the other fallback helpers in this
            # document: either marker in the message triggers the CPU retry.
            if self.fallback_to_cpu and ('CUDA' in message or 'GPU' in message):
                self.logger.warning(f"GPU failed: {e}, using CPU")
                self.ocr = PaddleOCR(lang=self.lang, device='cpu')
                return self
            raise

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.ocr is not None:
            self.ocr.close()
            # Forget the closed pipeline so it cannot be reused or closed twice.
            self.ocr = None
        # Never suppress exceptions raised inside the ``with`` block.
        return False

    def predict_safe(self, image_path, min_confidence=0.5):
        """Predict with validation and error recovery.

        Args:
            image_path: Path of the image to recognize; must exist on disk.
            min_confidence: Minimum ``rec_score`` for a line to count as
                high confidence.

        Returns:
            Dict with ``success``; on failure also ``error`` and, where
            applicable, ``results``, ``suggestion`` or ``traceback``; on
            success ``results``, ``high_confidence_count`` and
            ``total_detected``.
        """
        try:
            # Validate input
            from pathlib import Path
            if not Path(image_path).exists():
                return {
                    'success': False,
                    'error': f'File not found: {image_path}',
                    'results': None
                }

            # Calling this outside a ``with`` block would otherwise surface
            # as an opaque NoneType attribute error.
            if self.ocr is None:
                return {
                    'success': False,
                    'error': 'OCR pipeline is not initialized; '
                             'use OCRErrorHandler in a with block',
                    'results': None
                }

            # Perform OCR
            result = self.ocr.predict(image_path)

            # Validate results
            if not result or not result[0].get('rec_result'):
                return {
                    'success': False,
                    'error': 'No text detected',
                    'results': result,
                    'suggestion': 'Try lowering detection thresholds'
                }

            # Filter by confidence
            rec_lines = result[0]['rec_result']
            high_conf = [
                line for line in rec_lines
                if line['rec_score'] >= min_confidence
            ]
            if not high_conf:
                return {
                    'success': False,
                    'error': f'No results above confidence threshold {min_confidence}',
                    'results': result,
                    'suggestion': 'Lower min_confidence or improve image quality'
                }

            return {
                'success': True,
                'results': result,
                'high_confidence_count': len(high_conf),
                'total_detected': len(rec_lines)
            }
        except Exception as e:
            # Format the traceback once and reuse it for the log and the reply.
            trace = traceback.format_exc()
            self.logger.error("OCR error: %s\n%s", e, trace)
            return {
                'success': False,
                'error': str(e),
                'traceback': trace,
                'results': None
            }
# Usage
with OCRErrorHandler(lang='en') as handler:
result = handler.predict_safe('image.jpg', min_confidence=0.7)
if result['success']:
print(f"Success: {result['high_confidence_count']} lines detected")
else:
print(f"Error: {result['error']}")
if 'suggestion' in result:
print(f"Suggestion: {result['suggestion']}")device='gpu')use_hpi=True)predict_iter() for large batchesuse_seal_recognition=False)PaddleOCR accepts multiple input formats:
from paddleocr import PaddleOCR
import cv2
from PIL import Image
ocr = PaddleOCR(lang='en')
# File path (string)
result = ocr.predict('image.jpg')
# Directory path (all images in directory)
result = ocr.predict('images/')
# NumPy array
img = cv2.imread('image.jpg')
result = ocr.predict(img)
# PIL Image
img = Image.open('image.jpg')
result = ocr.predict(img)
# List of any above types
result = ocr.predict(['img1.jpg', img_array, pil_img])
# PDF files (automatically converted to images per page)
result = ocr.predict('document.pdf')
ocr.close()Supported Formats:
Format Conversion Utility:
import cv2
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR
def normalize_input_for_ocr(input_data):
    """Convert various input formats to an OCR-compatible form.

    Args:
        input_data: A path (``str`` or ``os.PathLike``), a NumPy image array,
            or a PIL image.

    Returns:
        The path as a string, or a NumPy array in BGR channel order.

    Raises:
        ValueError: If the type is unsupported or an array is neither 2-D
            nor 3-D.
    """
    import os

    # File path - return as is (pathlib.Path is reduced to its string form).
    if isinstance(input_data, (str, os.PathLike)):
        return os.fspath(input_data)

    if isinstance(input_data, np.ndarray):
        if input_data.ndim == 2:
            # Grayscale - convert to BGR
            return cv2.cvtColor(input_data, cv2.COLOR_GRAY2BGR)
        if input_data.ndim != 3:
            # Previously this surfaced as an IndexError on ``shape[2]``.
            raise ValueError(f"Unsupported array shape: {input_data.shape}")
        if input_data.shape[2] == 4:
            # NOTE(review): 4-channel arrays are treated as RGBA while
            # 3-channel arrays are assumed to be BGR already - confirm the
            # expected channel order with callers.
            return cv2.cvtColor(input_data, cv2.COLOR_RGBA2BGR)
        return input_data

    if isinstance(input_data, Image.Image):
        # PIL Image - convert to NumPy BGR
        img_array = np.array(input_data)
        if img_array.ndim == 2:
            return cv2.cvtColor(img_array, cv2.COLOR_GRAY2BGR)
        channels = img_array.shape[2]
        if channels == 3:
            # RGB to BGR
            return cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
        if channels == 4:
            # RGBA to BGR
            return cv2.cvtColor(img_array, cv2.COLOR_RGBA2BGR)
        # Other modes (e.g. "LA") used to fall through and return None;
        # let PIL convert them to RGB first.
        rgb_array = np.array(input_data.convert('RGB'))
        return cv2.cvtColor(rgb_array, cv2.COLOR_RGB2BGR)

    raise ValueError(f"Unsupported input type: {type(input_data)}")


# Preprocess then OCR:
from paddleocr import DocPreprocessor, PaddleOCR
# Pattern 1: Sequential processing with separate pipelines
preprocessor = DocPreprocessor(use_doc_orientation_classify=True, use_doc_unwarping=True)
ocr = PaddleOCR(lang='en')
preprocessed = preprocessor.predict('photo.jpg')
corrected_img = preprocessed[0]['img']
result = ocr.predict(corrected_img)
preprocessor.close()
ocr.close()
# Pattern 2: Integrated preprocessing (simpler, recommended)
ocr = PaddleOCR(
lang='en',
use_doc_orientation_classify=True,
use_doc_unwarping=True
)
result = ocr.predict('photo.jpg') # Preprocessing happens automatically
ocr.close()Extract different elements from same document:
from paddleocr import PPStructureV3
# Efficient: Run structure recognition once, get all elements
structure = PPStructureV3(
    lang='en',
    use_table_recognition=True,
    use_formula_recognition=True,
    use_seal_recognition=True
)
try:
    result = structure.predict('document.pdf')
    # Extract specific elements
    # `result` can hold several items (a PDF is processed page by page), so
    # collect across all of them rather than rebinding the names each
    # iteration, which would keep only the last item's elements.
    tables, formulas, seals = [], [], []
    for item in result:
        tables.extend(item.get('table_result') or [])
        formulas.extend(item.get('formula_result') or [])
        seals.extend(item.get('seal_result') or [])
finally:
    # Release the pipeline even if prediction or extraction fails.
    structure.close()
# Inefficient: Don't run separate pipelines for each element type
# This downloads/loads models multiple times
For repeated queries on same document:
from paddleocr import PPChatOCRv4Doc
# Pattern: Analyze once, query multiple times
pipeline = PPChatOCRv4Doc()
# Step 1: Analyze document (expensive operation)
visual_info = pipeline.visual_predict('document.pdf')
vector_info = pipeline.build_vector(visual_info)
# Step 2: Cache for reuse
pipeline.save_visual_info_list(visual_info, 'doc_visual.pkl')
pipeline.save_vector(vector_info, 'doc_vectors.pkl')
# Step 3: Multiple queries reuse cached data (fast)
queries = [
    "What is the contract date?",
    "Who are the parties?",
    "What is the payment amount?"
]
results = pipeline.chat(queries, visual_info, vector_info=vector_info)
# Later session: Load cached data instead of reprocessing
visual_info = pipeline.load_visual_info_list('doc_visual.pkl')
vector_info = pipeline.load_vector('doc_vectors.pkl')
# Follow-up questions for the reloaded document. This is an example value:
# the name was previously used without being defined, which raised NameError.
new_queries = ["What is the termination clause?"]
new_results = pipeline.chat(new_queries, visual_info, vector_info=vector_info)
pipeline.close()
PaddleOCR supports multiple PP-OCR versions:
Version Selection:
Concrete Performance Comparison (1024x768 images, GPU):
PP-OCRv3: ~0.10s per image, 90.0% accuracy
PP-OCRv4: ~0.12s per image, 92.5% accuracy
PP-OCRv5: ~0.15s per image, 94.0% accuracy
Default models are "server" models (larger, more accurate). Mobile models are also available:
from paddleocr import PaddleOCR
# Server models (default): Higher accuracy, larger size, slower
ocr = PaddleOCR(lang='en')
# Mobile models: Faster, smaller size, slightly lower accuracy
# Only the detection and recognition model names are overridden here; no other
# argument differs from the server example above.
# NOTE(review): `ocr` is rebound without closing the instance created above -
# the two assignments read as alternatives, not as steps to run back to back.
ocr = PaddleOCR(
    lang='en',
    text_detection_model_name='PP-OCRv4_mobile_det',
    text_recognition_model_name='PP-OCRv4_mobile_rec'
)
Model Selection Guidelines:
Concrete Comparison:
Server models:
- Detection model size: ~120MB
- Recognition model size: ~15MB
- Speed (CPU): ~1-3s per image
- Speed (GPU): ~0.15s per image
- Accuracy: 94%
Mobile models:
- Detection model size: ~4MB
- Recognition model size: ~2MB
- Speed (CPU): ~0.5-1s per image
- Speed (GPU): ~0.05s per image
- Accuracy: 91%
PaddleOCR pipelines are not thread-safe. For concurrent requests:
from paddleocr import PaddleOCR
from threading import Thread
# Wrong: Don't share pipeline across threads
ocr = PaddleOCR(lang='en') # Shared instance
# Multiple threads calling ocr.predict() - UNSAFE
# Correct: Create instance per thread
def process_image(image_path):
    """Run OCR on one image with a pipeline owned by the calling thread.

    Args:
        image_path: Image to recognize.

    Returns:
        Whatever ``PaddleOCR.predict`` returns for the image.
    """
    ocr = PaddleOCR(lang='en')  # Instance per thread
    try:
        return ocr.predict(image_path)
    finally:
        # Always release the per-thread instance, even when predict() raises.
        ocr.close()


# Thread discards its target's return value, so results are stored explicitly.
results = {}


def _store_result(image_path):
    """Thread target: keep the OCR result under its image path."""
    results[image_path] = process_image(image_path)


threads = [Thread(target=_store_result, args=(img,)) for img in images]
# The threads must be started and joined; building the list alone runs nothing.
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()
# Multi-Processing Pattern for Production (Better than threading):
from paddleocr import PaddleOCR
from multiprocessing import Pool, cpu_count
import os
def init_worker():
    """Initialize worker process with OCR instance.

    Passed to ``Pool`` as ``initializer``, so it runs once in every worker
    process and stores the pipeline in the module-level ``worker_ocr``.
    """
    global worker_ocr
    # Each process gets its own OCR instance
    worker_ocr = PaddleOCR(lang='en', device='cpu')


def process_image_worker(image_path):
    """Worker function for multiprocessing.

    Args:
        image_path: Image to recognize with this process's ``worker_ocr``.

    Returns:
        ``{'success': True, 'path': ..., 'result': ...}`` on success, or
        ``{'success': False, 'path': ..., 'error': str}`` if anything raised,
        so one bad image does not abort the whole ``pool.map`` call.
    """
    try:
        result = worker_ocr.predict(image_path)
    except Exception as e:
        return {'success': False, 'path': image_path, 'error': str(e)}
    return {'success': True, 'path': image_path, 'result': result}


def cleanup_worker():
    """Cleanup worker process.

    Closes this process's ``worker_ocr`` if one exists. It is a no-op when
    ``init_worker`` never ran and is safe to call more than once. It is not
    invoked automatically: ``multiprocessing.Pool`` offers no per-worker
    finalizer hook.
    """
    ocr = globals().pop('worker_ocr', None)
    if ocr is not None:
        ocr.close()


def batch_process_multiprocess(image_paths, num_workers=None):
    """Process images using multiple processes.

    Args:
        image_paths: Iterable of image paths.
        num_workers: Number of worker processes. Defaults to the CPU count,
            capped at 4 to avoid memory issues.

    Returns:
        One result dict per input path, in input order (see
        ``process_image_worker``). An empty input returns ``[]`` without
        starting any worker.
    """
    image_paths = list(image_paths)
    if not image_paths:
        # Avoid loading models in worker processes that would have no work.
        return []
    if num_workers is None:
        num_workers = min(cpu_count(), 4)  # Limit to 4 to avoid memory issues
    # Never start more model-loading workers than there are images.
    num_workers = min(num_workers, len(image_paths))
    # Pool accepts `initializer` but has no `finalizer` keyword; passing one
    # raises TypeError. The workers are shut down when the with-block exits.
    with Pool(num_workers, initializer=init_worker) as pool:
        return pool.map(process_image_worker, image_paths)
# For long-running services:
from paddleocr import PaddleOCR
# Pattern: Process, get result, close immediately
def process_document(doc_path):
    """Run OCR on one document with a pipeline that lives only for this call.

    Args:
        doc_path: Document to recognize.

    Returns:
        The value produced by ``extract_info`` for the OCR result.
    """
    pipeline = PaddleOCR(lang='en')
    try:
        ocr_output = pipeline.predict(doc_path)
        # Extract needed data
        return extract_info(ocr_output)
    finally:
        pipeline.close()  # Critical: Free resources
# Avoid: Keeping pipeline open for entire service lifetime
# class DocumentService:
#     def __init__(self):
#         self.ocr = PaddleOCR(lang='en')  # Holds memory indefinitely
Service Pattern with Connection Pooling:
from paddleocr import PaddleOCR
from queue import Queue
from threading import Lock
import atexit
class OCRPool:
    """Connection pool for OCR instances.

    ``pool_size`` PaddleOCR instances are created up front and handed out
    one at a time through ``acquire``/``release``. All pooled instances are
    closed at interpreter exit via ``atexit``.
    """

    def __init__(self, pool_size=3, lang='en', device='gpu'):
        """Create the pool and pre-load its instances.

        Args:
            pool_size: Number of instances to create; also the queue bound.
            lang: Language passed to every PaddleOCR instance.
            device: Device passed to every PaddleOCR instance.
        """
        self.pool = Queue(maxsize=pool_size)
        # Not used by the pool itself; kept because callers may reference it.
        self.lock = Lock()
        self.lang = lang
        self.device = device
        # Pre-create instances
        try:
            for _ in range(pool_size):
                self.pool.put(self._create_instance())
        except Exception:
            # Do not leak the instances that were already created.
            self.cleanup()
            raise
        # Register cleanup
        atexit.register(self.cleanup)

    def _create_instance(self):
        """Create new OCR instance."""
        return PaddleOCR(lang=self.lang, device=self.device)

    def acquire(self, timeout=30):
        """Get an OCR instance from pool.

        Args:
            timeout: Seconds to wait for a free instance.

        Raises:
            queue.Empty: If no instance becomes free within ``timeout``.
        """
        return self.pool.get(timeout=timeout)

    def release(self, ocr):
        """Return OCR instance to pool."""
        self.pool.put(ocr)

    def cleanup(self):
        """Close all instances in pool.

        Only instances currently in the queue are closed; an instance that is
        checked out is not reachable from here. A failure to close one
        instance is logged and does not stop the others from being closed.
        """
        # Local imports: the surrounding file imports only Queue from queue.
        import logging
        from queue import Empty

        while True:
            try:
                ocr = self.pool.get_nowait()
            except Empty:
                break
            try:
                ocr.close()
            except Exception:
                # Best effort during shutdown, but no longer a bare except,
                # so KeyboardInterrupt and SystemExit are not swallowed.
                logging.getLogger(__name__).warning(
                    "Failed to close pooled OCR instance", exc_info=True
                )
# Global pool instance
ocr_pool = OCRPool(pool_size=3, lang='en', device='gpu')


def process_request(image_path):
    """Run OCR on one image using an instance borrowed from ``ocr_pool``.

    The borrowed instance goes back to the pool whether or not the prediction
    succeeds.
    """
    pooled_ocr = ocr_pool.acquire()
    try:
        return pooled_ocr.predict(image_path)
    finally:
        ocr_pool.release(pooled_ocr)
# Implement retry logic with exponential backoff:
import time
from paddleocr import PaddleOCR
def robust_ocr(image_path, max_retries=3):
    """Run OCR with retries, exponential backoff and a GPU-to-CPU fallback.

    Every attempt tries the GPU first. If that fails with a ``RuntimeError``
    whose message mentions ``CUDA`` or ``GPU``, the same attempt is repeated
    on the CPU. After a failed attempt the function sleeps ``2 ** attempt``
    seconds and tries again.

    Args:
        image_path: Image to recognize.
        max_retries: Total number of attempts; must be at least 1.

    Returns:
        The result of the first successful ``predict`` call.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        Exception: The error from the final attempt, once every attempt has
            failed. Previously a non-GPU ``RuntimeError`` was swallowed and
            the function fell through to return ``None``.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        try:
            try:
                return _predict_once(image_path, 'gpu')
            except RuntimeError as e:
                if 'CUDA' not in str(e) and 'GPU' not in str(e):
                    # Not a device problem: handle it like any other failure.
                    raise
                # GPU error - fallback to CPU
                print(f"GPU error on attempt {attempt + 1}, trying CPU")
                return _predict_once(image_path, 'cpu')
        except Exception:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # Exponential backoff


def _predict_once(image_path, device):
    """Run a single prediction on *device*, always closing the pipeline."""
    ocr = PaddleOCR(lang='en', device=device)
    try:
        return ocr.predict(image_path)
    finally:
        # Runs on failure too; previously a failing predict() leaked it.
        ocr.close()
from paddleocr import PaddleOCR
import time
import logging
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict
import json
@dataclass
class OCRMetrics:
    """Counters and aggregates collected while running OCR requests."""
    # Request counts
    total_requests: int = 0
    successful: int = 0
    failed: int = 0
    # Aggregates
    total_time: float = 0.0
    avg_confidence: float = 0.0
    # Occurrences per error name; None means "start with a fresh counter"
    errors: Dict[str, int] = None

    def __post_init__(self):
        # Built per instance so two OCRMetrics objects never share one mapping.
        self.errors = defaultdict(int) if self.errors is None else self.errors
class MonitoredOCR:
    """OCR wrapper with monitoring and metrics.

    Wraps a PaddleOCR pipeline and records, for every ``predict`` call, the
    outcome, the elapsed time, the exception type on failure and the mean
    recognition confidence. Can be used as a context manager; leaving the
    ``with`` block calls ``close``.
    """

    def __init__(self, lang='en', device='gpu'):
        """Create the wrapped pipeline and empty metrics."""
        self.ocr = PaddleOCR(lang=lang, device=device)
        self.metrics = OCRMetrics()
        self.logger = logging.getLogger(__name__)
        # Number of predictions that contributed a confidence value. Kept
        # separately from metrics.successful because a successful prediction
        # with no recognized text contributes nothing to the average.
        self._confidence_samples = 0

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def predict(self, image_path, **kwargs):
        """Predict with monitoring.

        Args:
            image_path: Input forwarded to the wrapped pipeline.
            **kwargs: Extra arguments forwarded to ``predict``.

        Returns:
            The wrapped pipeline's result, unchanged.

        Raises:
            Exception: Whatever the wrapped ``predict`` raises; the failure is
                counted and logged before being re-raised.
        """
        self.metrics.total_requests += 1
        start_time = time.perf_counter()
        # Only the pipeline call is guarded, so a request is counted either
        # as failed or as successful, never both.
        try:
            result = self.ocr.predict(image_path, **kwargs)
        except Exception as e:
            self.metrics.failed += 1
            self.metrics.errors[type(e).__name__] += 1
            self.logger.error("OCR failed: %s, error: %s", image_path, e)
            raise
        elapsed = time.perf_counter() - start_time

        # Update metrics
        self.metrics.successful += 1
        self.metrics.total_time += elapsed
        self._record_confidence(result)
        self.logger.info("OCR success: %s, time: %.2fs", image_path, elapsed)
        return result

    def _record_confidence(self, result):
        """Fold the first result item's mean ``rec_score`` into the average.

        NOTE(review): assumes ``result[0]`` is dict-like with a 'rec_result'
        list of dicts carrying 'rec_score' - confirm against the pipeline's
        result schema.
        """
        lines = result[0].get('rec_result') if result else None
        if not lines:
            return
        mean_score = sum(line['rec_score'] for line in lines) / len(lines)
        # Running average over the predictions that actually had scores.
        self._confidence_samples += 1
        self.metrics.avg_confidence += (
            (mean_score - self.metrics.avg_confidence) / self._confidence_samples
        )

    def get_metrics(self):
        """Get current metrics.

        Returns:
            A plain dict with request counts, success rate, mean time per
            successful request, mean confidence and per-type error counts.
        """
        avg_time = (
            self.metrics.total_time / self.metrics.successful
            if self.metrics.successful > 0 else 0
        )
        return {
            'total_requests': self.metrics.total_requests,
            'successful': self.metrics.successful,
            'failed': self.metrics.failed,
            'success_rate': (
                self.metrics.successful / self.metrics.total_requests
                if self.metrics.total_requests > 0 else 0
            ),
            'avg_time_seconds': avg_time,
            'avg_confidence': self.metrics.avg_confidence,
            'errors': dict(self.metrics.errors)
        }

    def export_metrics(self, filepath):
        """Export metrics to JSON file."""
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.get_metrics(), f, indent=2)

    def close(self):
        """Close OCR and log final metrics."""
        self.logger.info("Final metrics: %s", self.get_metrics())
        self.ocr.close()
# Usage
monitored_ocr = MonitoredOCR(lang='en', device='gpu')
try:
    for image in images:
        result = monitored_ocr.predict(image)
        # Process result
finally:
    try:
        monitored_ocr.export_metrics('ocr_metrics.json')
    finally:
        # Close the pipeline even when the metrics file cannot be written.
        monitored_ocr.close()
# Task: Extract all text from an image
from paddleocr import PaddleOCR
ocr = PaddleOCR(lang='en')
result = ocr.predict('image.jpg')
# Flatten every recognized line of every result item into one
# newline-separated string.
# NOTE(review): assumes each item has a 'rec_result' list of dicts with a
# 'rec_text' key - confirm against the pipeline's result schema.
text = '\n'.join([line['rec_text'] for item in result for line in item['rec_result']])
ocr.close()
# Task: Convert PDF to markdown
from paddleocr import PPStructureV3
structure = PPStructureV3(lang='en')
result = structure.predict('document.pdf')
# One 'markdown' entry is taken from each result item and the entries are
# merged by concatenate_markdown_pages.
markdown = structure.concatenate_markdown_pages([item['markdown'] for item in result])
structure.close()
# Task: Extract all tables from document
from paddleocr import TableRecognitionPipelineV2
table_pipeline = TableRecognitionPipelineV2(use_layout_detection=True)
result = table_pipeline.predict('report.pdf')
# item.get(..., []) keeps the comprehension working for items that have no
# 'table_result' key.
tables = [table['html'] for item in result for table in item.get('table_result', [])]
table_pipeline.close()
# Task: Answer questions about document
from paddleocr import PPChatOCRv4Doc
chat = PPChatOCRv4Doc()
visual_info = chat.visual_predict('contract.pdf')
# Both questions are sent in a single chat() call together with the output of
# visual_predict.
answers = chat.chat(["What is the date?", "Who signed?"], visual_info)
chat.close()
# Task: Extract formulas as LaTeX
from paddleocr import FormulaRecognitionPipeline
formula_pipeline = FormulaRecognitionPipeline(use_layout_detection=True)
result = formula_pipeline.predict('math_paper.pdf')
# Same flattening as the table task, reading 'latex' from each
# 'formula_result' entry.
latex_formulas = [formula['latex'] for item in result for formula in item.get('formula_result', [])]
formula_pipeline.close()