tessl install tessl/pypi-paddleocr@3.3.0

Industry-leading OCR and document AI engine that converts documents and images into structured, AI-friendly data formats with comprehensive solutions from text extraction to intelligent document understanding.
PaddleOCR provides 13 individual model classes that can be used independently for specific tasks. These models are building blocks used by the pipelines but can also be used directly for fine-grained control.
All models share a common base API with predict(), predict_iter(), and close() methods.
class BaseModel:
"""Base interface for all PaddleOCR models."""
def __init__(
self,
model_name: str = None,
model_dir: str = None,
device: str = None,
use_hpi: bool = None,
hpi_params: dict = None,
**kwargs
):
"""
Initialize model.
Args:
model_name (str, optional): Name of pretrained model
model_dir (str, optional): Path to custom model directory
device (str, optional): Device for inference ('cpu', 'gpu', 'gpu:0', etc.)
use_hpi (bool, optional): Enable high-performance inference
hpi_params (dict, optional): High-performance inference parameters
"""
def predict(self, input, **kwargs) -> list:
"""
Perform prediction.
Args:
input: Image path, numpy array, PIL Image, or list
Returns:
list: Prediction results
"""
def predict_iter(self, input, **kwargs):
"""Iterate over prediction results."""
def close(self) -> None:
"""Close model and free resources."""Model Usage Patterns:
Detects text regions in images using the PP-OCRv5 detection model.
class TextDetection:
"""Text detection model for locating text regions."""
def __init__(
self,
model_name: str = None,
model_dir: str = None,
limit_side_len: int = None,
limit_type: str = None,
thresh: float = None,
box_thresh: float = None,
unclip_ratio: float = None,
input_shape: tuple = None,
device: str = None,
**kwargs
):
"""
Initialize text detection model.
Default model: PP-OCRv5_server_det
Args:
model_name (str, optional): Pretrained model name
Options: 'PP-OCRv5_server_det', 'PP-OCRv5_mobile_det', 'PP-OCRv4_server_det', 'PP-OCRv4_mobile_det'
limit_side_len (int, optional): Limit on image side length (default: 960)
Higher values: Better for large images, more memory, slower
Lower values: Faster, less memory, may miss small text
limit_type (str, optional): Limit type ('min' or 'max', default: 'max')
'max': Resize if longest side > limit_side_len
'min': Resize if shortest side < limit_side_len
thresh (float, optional): Pixel threshold for detection (default: 0.3)
Range: 0.0-1.0
Lower: More sensitive, may include noise
Higher: More selective, may miss faint text
box_thresh (float, optional): Box confidence threshold (default: 0.6)
Range: 0.0-1.0
Lower: More detected boxes, more false positives
Higher: Fewer boxes, fewer false positives
unclip_ratio (float, optional): Expansion ratio for text boxes (default: 1.5)
Range: 1.0-2.0
Higher: Larger boxes, ensures full text capture
Lower: Tighter boxes, may clip text edges
input_shape (tuple, optional): Input shape (C, H, W)
"""Usage:
from paddleocr import TextDetection
det = TextDetection()
result = det.predict('image.jpg')
# Returns: [{'dt_polys': [...], 'dt_scores': [...]}]
det.close()

Performance Tuning:
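The detection thresholds documented above trade recall against speed and precision. The sketch below shows two configurations built only from those parameters; the specific values are illustrative starting points, not recommended defaults.

from paddleocr import TextDetection

# Faster, tighter setup for small, clean images (illustrative values)
fast_det = TextDetection(
    model_name='PP-OCRv5_mobile_det',
    limit_side_len=736,   # smaller resize limit: faster, less memory
    limit_type='max',
    box_thresh=0.7,       # stricter box confidence: fewer false positives
)

# More sensitive setup for large scans with faint or small text (illustrative values)
sensitive_det = TextDetection(
    model_name='PP-OCRv5_server_det',
    limit_side_len=1536,  # keep more resolution so small text survives resizing
    thresh=0.2,           # lower pixel threshold: more sensitive
    unclip_ratio=1.8,     # expand boxes so glyph edges are not clipped
)

fast_det.close()
sensitive_det.close()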
Recognizes text from cropped text regions.
class TextRecognition:
"""Text recognition model for reading text from images."""
def __init__(
self,
model_name: str = None,
model_dir: str = None,
input_shape: tuple = None,
device: str = None,
**kwargs
):
"""
Initialize text recognition model.
Default model: PP-OCRv5_server_rec
Args:
model_name (str, optional): Pretrained model name
Options: 'PP-OCRv5_server_rec', 'PP-OCRv5_mobile_rec', language-specific models
input_shape (tuple, optional): Input shape (C, H, W)
device (str, optional): Device for inference
Notes:
- Recognition model selection affects supported characters
- Language-specific models provide better accuracy for that language
- Server models support more character types than mobile models
"""Usage:
from paddleocr import TextRecognition
rec = TextRecognition()
result = rec.predict('cropped_text.jpg')
# Returns: [{'rec_text': 'Hello', 'rec_score': 0.98}]
rec.close()

Character Sets:
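As the notes above state, the choice of recognition model determines which characters and languages are supported, with server variants covering more character types than mobile ones. A small sketch of switching variants via model_name; the cropped-image paths are placeholders, and list input follows the base predict() contract.

from paddleocr import TextRecognition

# Mobile variant: smaller and faster, narrower character coverage
rec = TextRecognition(model_name='PP-OCRv5_mobile_rec')
for res in rec.predict(['crop_0.jpg', 'crop_1.jpg']):   # list input is accepted
    print(res['rec_text'], res['rec_score'])
rec.close()

# Server variant: larger, broader character coverage
rec = TextRecognition(model_name='PP-OCRv5_server_rec')
print(rec.predict('crop_0.jpg')[0]['rec_text'])
rec.close()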
Detects layout elements in document images.
class LayoutDetection:
"""Layout detection model for identifying document structure."""
def __init__(
self,
model_name: str = None,
model_dir: str = None,
img_size: int = None,
threshold: float = None,
layout_nms: bool = None,
layout_unclip_ratio: float = None,
layout_merge_bboxes_mode: str = None,
device: str = None,
**kwargs
):
"""
Initialize layout detection model.
Default model: PP-DocLayout_plus-L
Args:
model_name (str, optional): Pretrained model name
img_size (int, optional): Input image size (default: 800)
Larger: Better for detailed layouts, slower
Smaller: Faster, may miss small elements
threshold (float, optional): Detection confidence threshold (default: 0.5)
Range: 0.0-1.0
layout_nms (bool, optional): Use layout-aware NMS (default: False)
Enable for overlapping elements
layout_unclip_ratio (float, optional): Box expansion ratio (default: 0.0)
layout_merge_bboxes_mode (str, optional): Box merging mode
Options: 'h' (horizontal), 'v' (vertical), 'hv' (both)
Detected Element Types:
- text: Paragraph text
- title: Headings
- figure: Images/diagrams
- table: Tables
- formula: Mathematical formulas
- list: Bullet/numbered lists
- header: Page headers
- footer: Page footers
- reference: Citations/references
"""Usage:
from paddleocr import LayoutDetection
layout = LayoutDetection()
result = layout.predict('document.pdf')
# Returns: [{'bbox': [...], 'label': 'text', 'score': 0.95}, ...]
layout.close()

Performance:
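img_size and threshold are the main speed/recall levers documented above. The sketch below uses illustrative values and filters results by the element labels listed in the docstring; the result format follows the example output shown above.

from paddleocr import LayoutDetection

layout = LayoutDetection(
    img_size=640,      # smaller than the 800 default: faster, may miss small elements
    threshold=0.6,     # drop low-confidence regions
    layout_nms=True,   # suppress overlapping element boxes
)
results = layout.predict('document.pdf')
tables = [r for r in results if r['label'] == 'table']   # keep only table regions
print(len(tables), 'tables found')
layout.close()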
Classifies document image orientation (0°, 90°, 180°, 270°).
class DocImgOrientationClassification:
"""Document orientation classification model."""
def __init__(
self,
topk: int = None,
model_name: str = None,
model_dir: str = None,
device: str = None,
**kwargs
):
"""
Initialize orientation classification model.
Default model: PP-LCNet_x1_0_doc_ori
Args:
topk (int, optional): Top-k predictions to return (default: 1)
Returns Angles:
0: No rotation needed
90: 90° clockwise rotation needed
180: 180° rotation needed
270: 270° clockwise (90° counter-clockwise) rotation needed
"""Usage:
from paddleocr import DocImgOrientationClassification
classifier = DocImgOrientationClassification()
result = classifier.predict('rotated_doc.jpg')
# Returns: [{'angle': 90, 'score': 0.99}]
classifier.close()

Classifies text line orientation (useful for vertical/horizontal text detection).
class TextLineOrientationClassification:
"""Text line orientation classification model."""
def __init__(
self,
topk: int = None,
model_name: str = None,
model_dir: str = None,
device: str = None,
**kwargs
):
"""
Initialize text line orientation model.
Default model: PP-LCNet_x0_25_textline_ori
Classifies:
- Horizontal text (0°)
- Vertical text (90°)
- Upside-down text (180°)
"""Corrects distorted or warped document images.
class TextImageUnwarping:
"""Document unwarping model for correcting distortions."""
def __init__(
self,
model_name: str = None,
model_dir: str = None,
device: str = None,
**kwargs
):
"""
Initialize unwarping model.
Default model: UVDoc
Corrects:
- Perspective distortion (photos taken at an angle)
- Curved pages (book scanning)
- Warped documents
Use Cases:
- Mobile phone photos of documents
- Scanned book pages
- Curved or folded documents
"""Performance Impact:
Recognizes mathematical formulas and converts them to LaTeX.
class FormulaRecognition:
"""Formula recognition model for mathematical expressions."""
def __init__(
self,
model_name: str = None,
model_dir: str = None,
device: str = None,
**kwargs
):
"""
Initialize formula recognition model.
Default model: PP-FormulaNet_plus-M
Supported Elements:
- Basic operators: +, -, ×, ÷, =
- Fractions: \\frac{a}{b}
- Exponents/subscripts: x^2, a_i
- Roots: \\sqrt{x}, \\sqrt[n]{x}
- Greek letters: \\alpha, \\beta, etc.
- Operators: \\sum, \\int, \\prod, \\lim
- Matrices and arrays
- Complex nested expressions
Accuracy:
- Printed formulas: 90-95%
- Clean handwritten: 70-80%
- Complex nested: 85-90%
"""Usage:
from paddleocr import FormulaRecognition
formula = FormulaRecognition()
result = formula.predict('formula.jpg')
# Returns: [{'latex': 'E = mc^2', 'score': 0.96}]
formula.close()

Detects text in seals and stamps (circular/curved text).
class SealTextDetection:
"""Seal text detection model for stamps and seals."""
def __init__(
self,
model_name: str = None,
model_dir: str = None,
limit_side_len: int = None,
limit_type: str = None,
thresh: float = None,
box_thresh: float = None,
unclip_ratio: float = None,
input_shape: tuple = None,
device: str = None,
**kwargs
):
"""
Initialize seal detection model.
Default model: PP-OCRv4_mobile_seal_det
Specialized Features:
- Detects circular/arc text arrangements
- Handles rotated text in seals
- Processes multi-line seal text
- Supports various seal shapes (circular, oval, rectangular)
Args: Same as TextDetection, but optimized for curved text
Recommended Settings:
- unclip_ratio: 2.0-2.5 (larger for very curved text)
- thresh: 0.3 (lower for faint seal impressions)
"""Converts chart images to structured table data.
class ChartParsing:
"""Chart parsing model for converting charts to tables."""
def __init__(
self,
model_name: str = None,
model_dir: str = None,
device: str = None,
**kwargs
):
"""
Initialize chart parsing model.
Default model: PP-Chart2Table
Supported Chart Types:
- Bar charts (horizontal/vertical)
- Line charts
- Pie charts
- Scatter plots
- Area charts
Output Format:
- Structured data as table
- X/Y axis labels
- Data series information
- Legend mapping
"""Usage:
from paddleocr import ChartParsing
chart = ChartParsing()
result = chart.predict('chart.jpg')
# Returns: [{'table_data': {...}, 'chart_type': 'bar'}]
chart.close()

Classifies table type (wired vs wireless).
class TableClassification:
"""Table classification model."""
def __init__(
self,
topk: int = None,
model_name: str = None,
model_dir: str = None,
device: str = None,
**kwargs
):
"""
Initialize table classification model.
Default model: PP-LCNet_x1_0_table_cls
Classifications:
- wired: Tables with visible borders/gridlines
- wireless: Borderless tables (whitespace-separated)
Accuracy: ~95% for clear table images
Purpose: Routes table to appropriate recognition model
"""Recognizes table structure (rows, columns, cells).
class TableStructureRecognition:
"""Table structure recognition model."""
def __init__(
self,
model_name: str = None,
model_dir: str = None,
device: str = None,
**kwargs
):
"""
Initialize table structure model.
Default model: SLANet
Recognizes:
- Cell boundaries
- Row/column structure
- Merged cells (colspan/rowspan)
- Header rows
Output:
- HTML table structure
- Cell coordinates
- Cell relationships
"""Detects individual cells in tables.
class TableCellsDetection:
"""Table cells detection model."""
def __init__(
self,
model_name: str = None,
model_dir: str = None,
img_size: int = None,
threshold: float = None,
layout_nms: bool = None,
layout_unclip_ratio: float = None,
layout_merge_bboxes_mode: str = None,
device: str = None,
**kwargs
):
"""
Initialize table cells detection model.
Default model: RT-DETR-L_wired_table_cell_det
Args: Same as LayoutDetection
Use Cases:
- Precise cell boundary detection
- Complex table structures
- Borderless table cell separation
"""Vision-language model for document question answering.
class DocVLM:
"""Document vision-language model for visual Q&A."""
def __init__(
self,
model_name: str = None,
model_dir: str = None,
device: str = None,
**kwargs
):
"""
Initialize document VLM.
Default model: PP-DocBee2-3B
Model Specifications:
- Parameters: 3 billion
- Type: Vision-Language Model
- Languages: English, Chinese, and others
- Context: Document understanding and Q&A
Capabilities:
- Visual question answering
- Document classification
- Information extraction
- Content summarization
- Multi-language support
Performance:
- Model size: ~6GB
- GPU memory required: ~8GB
- Inference time: ~1-3s per query (GPU)
"""Usage:
from paddleocr import DocVLM
vlm = DocVLM()
result = vlm.predict({
'image': 'document.jpg',
'prompt': 'What is the main topic?'
})
vlm.close()

Build a custom pipeline by combining models:
import cv2
import numpy as np
from paddleocr import TextDetection, TextRecognition
# Detect text
det = TextDetection()
det_result = det.predict('image.jpg')
# Recognize text from detected regions
rec = TextRecognition()
image = cv2.imread('image.jpg')
for item in det_result:
    for poly in item['dt_polys']:
        # Crop the detected region via its axis-aligned bounding box, then recognize it
        x, y, w, h = cv2.boundingRect(np.asarray(poly, dtype=np.int32))
        cropped = image[y:y + h, x:x + w]
        rec_result = rec.predict(cropped)
        print(rec_result[0]['rec_text'], rec_result[0]['rec_score'])
det.close()
rec.close()

Benefits of Custom Pipelines:
Load custom trained models:
from paddleocr import TextRecognition
# Load custom trained model
rec = TextRecognition(model_dir='./my_custom_model')
result = rec.predict('image.jpg')
rec.close()

Custom Model Requirements:
Choose specific model versions:
from paddleocr import TextDetection
# Use specific model version
det = TextDetection(model_name='PP-OCRv4_mobile_det')
result = det.predict('image.jpg')
det.close()

Available Model Variants:
- *_server_* (larger, more accurate)
- *_mobile_* (smaller, faster)
- Versions: PP-OCRv3, PP-OCRv4, PP-OCRv5

Text detection models:

| Model | Size | CPU Speed | GPU Speed | Accuracy |
|---|---|---|---|---|
| PP-OCRv5_server_det | ~120MB | ~1s | ~0.1s | 95% |
| PP-OCRv5_mobile_det | ~4MB | ~0.3s | ~0.05s | 92% |
| PP-OCRv4_server_det | ~100MB | ~0.8s | ~0.08s | 93% |
| PP-OCRv4_mobile_det | ~3MB | ~0.25s | ~0.04s | 90% |
Text recognition models:

| Model | Size | CPU Speed | GPU Speed | Accuracy |
|---|---|---|---|---|
| PP-OCRv5_server_rec | ~15MB | ~0.5s | ~0.05s | 94% |
| PP-OCRv5_mobile_rec | ~2MB | ~0.2s | ~0.03s | 91% |
Note: Times are per image on typical hardware (Intel i7 CPU, NVIDIA RTX 3080 GPU)
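Latency depends heavily on hardware, image size, and configuration, so it is worth measuring on your own setup. A rough timing harness built only on the documented predict() call; the image path, warmup count, and run count are placeholders.

import time
from paddleocr import TextDetection

def average_latency(model, image, warmup=2, runs=10):
    """Warm up, then return the mean per-image predict() time in seconds."""
    for _ in range(warmup):
        model.predict(image)
    start = time.perf_counter()
    for _ in range(runs):
        model.predict(image)
    return (time.perf_counter() - start) / runs

det = TextDetection(model_name='PP-OCRv5_mobile_det', device='cpu')
print(f"avg detection latency: {average_latency(det, 'image.jpg'):.3f}s")
det.close()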
Common model errors and solutions:
from paddleocr import TextDetection

try:
    det = TextDetection(device='gpu')
    result = det.predict('image.jpg')
except RuntimeError as e:
    if 'CUDA' in str(e):
        # Fall back to CPU
        det = TextDetection(device='cpu')
        result = det.predict('image.jpg')
    else:
        raise
except FileNotFoundError:
    # Model not downloaded
    print("Model downloading on first use...")
    # Retry or check internet connection
except Exception as e:
    print(f"Unexpected error: {e}")
finally:
    if 'det' in locals():
        det.close()

Best Practices:

- Call .close() to free resources
- Use predict_iter() for large datasets
- Set device='gpu' for significant speedup

Use Individual Models When:
Use Pipelines When:
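For comparison, the same OCR task can be handled either with the individual building blocks or with a prebuilt pipeline. The sketch below assumes the package's top-level PaddleOCR pipeline class, which is documented separately from the model classes covered here.

from paddleocr import PaddleOCR, TextDetection, TextRecognition

# Pipeline: detection, recognition, and pre/post-processing wired together for you
ocr = PaddleOCR()
print(ocr.predict('image.jpg'))

# Individual models: you control cropping, ordering, and any extra steps in between
det = TextDetection()
rec = TextRecognition()
# ... custom glue code as in the custom-pipeline example above ...
det.close()
rec.close()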
from paddleocr import TextDetection
# Enable HPI for 2-3x speedup
det = TextDetection(
device='gpu',
use_hpi=True,
hpi_params={
'precision': 'fp16', # Use FP16 for speed
'max_batch_size': 8
}
)

HPI Requirements:
For specific image sizes:
from paddleocr import TextDetection
# Optimize for specific image dimensions
det = TextDetection(
input_shape=(3, 640, 640), # (C, H, W)
limit_side_len=640
)

When to Customize:
Models are automatically downloaded on first use:
from paddleocr import TextDetection
# First use: Downloads model (~120MB for server model)
det = TextDetection() # Downloads to ~/.paddleocr/
# Subsequent uses: Loads from cache (fast)
det2 = TextDetection()  # Instant

Cache Location:
- Linux/macOS: ~/.paddleocr/
- Windows: C:\Users\<username>\.paddleocr\

Manual Model Management:
# Specify custom model location
det = TextDetection(model_dir='/path/to/models')
# Useful for:
# - Offline environments
# - Shared model storage
# - Version control
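For offline or shared-storage setups, one pattern is to resolve the model directory from configuration rather than hard-coding a path. The environment variable name and the per-model directory layout below are hypothetical conventions, not part of PaddleOCR; only the model_dir parameter itself comes from the API above.

import os
from paddleocr import TextDetection

# Hypothetical convention: PADDLEOCR_MODEL_ROOT points at a shared, pre-populated model store
model_root = os.environ.get('PADDLEOCR_MODEL_ROOT', os.path.expanduser('~/.paddleocr'))
det = TextDetection(model_dir=os.path.join(model_root, 'PP-OCRv5_server_det'))  # hypothetical layout
result = det.predict('image.jpg')
det.close()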