tessl install tessl/pypi-paddleocr@3.3.0

Industry-leading OCR and document AI engine that converts documents and images into structured, AI-friendly data formats with comprehensive solutions from text extraction to intelligent document understanding.
Specialized pipeline for recognizing text in seals, stamps, and circular/curved text. Combines seal-specific text detection with standard text recognition, with optional layout detection for locating seal regions in documents.
class SealRecognition:
"""
Seal and stamp text recognition pipeline.
Specialized for circular, curved, and rotated text in seals/stamps.
Uses seal-optimized detection models for accurate text extraction.
"""
def __init__(
self,
# Seal detection and recognition
seal_text_detection_model_name: str = None,
seal_text_detection_model_dir: str = None,
text_recognition_model_name: str = None,
text_recognition_model_dir: str = None,
text_recognition_batch_size: int = None,
seal_det_limit_side_len: int = None,
seal_det_limit_type: str = None,
seal_det_thresh: float = None,
seal_det_box_thresh: float = None,
seal_det_unclip_ratio: float = None,
seal_rec_score_thresh: float = None,
# Layout detection
layout_detection_model_name: str = None,
layout_detection_model_dir: str = None,
use_layout_detection: bool = None,
layout_threshold: float = None,
layout_nms: bool = None,
layout_unclip_ratio: float = None,
layout_merge_bboxes_mode: str = None,
# Document preprocessing
doc_orientation_classify_model_name: str = None,
doc_orientation_classify_model_dir: str = None,
doc_unwarping_model_name: str = None,
doc_unwarping_model_dir: str = None,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
# Common parameters
paddlex_config: str = None,
device: str = None,
use_hpi: bool = None,
**kwargs
):
"""
Initialize seal recognition pipeline.
Args:
seal_text_detection_model_name (str, optional): Seal detection model
Default: 'PP-OCRv4_mobile_seal_det'
text_recognition_model_name (str, optional): Text recognition model
seal_rec_score_thresh (float, optional): Recognition score threshold
use_layout_detection (bool, optional): Enable layout detection to
locate seal regions automatically
seal_det_limit_side_len (int, optional): Limit on detection side length
seal_det_thresh (float, optional): Detection pixel threshold
seal_det_box_thresh (float, optional): Detection box threshold
seal_det_unclip_ratio (float, optional): Text region expansion ratio
device (str, optional): Device for inference ('cpu', 'gpu')
paddlex_config (str or dict, optional): Configuration file or dict
"""def predict(
self,
input,
*,
use_doc_orientation_classify: bool = None,
use_doc_unwarping: bool = None,
use_layout_detection: bool = None,
layout_det_res: list = None,
layout_threshold: float = None,
layout_nms: bool = None,
layout_unclip_ratio: float = None,
layout_merge_bboxes_mode: str = None,
seal_det_limit_side_len: int = None,
seal_det_limit_type: str = None,
seal_det_thresh: float = None,
seal_det_box_thresh: float = None,
seal_det_unclip_ratio: float = None,
seal_rec_score_thresh: float = None,
**kwargs
) -> list:
"""
Recognize text in seals and stamps.
Args:
input: Image/PDF path, numpy array, PIL Image, directory, or list
use_layout_detection (bool, optional): Override layout detection
use_doc_orientation_classify (bool, optional): Override orientation correction
use_doc_unwarping (bool, optional): Override unwarping
layout_det_res (list, optional): Pre-computed layout detection results
seal_det_thresh (float, optional): Override detection threshold
seal_det_box_thresh (float, optional): Override box threshold
seal_det_unclip_ratio (float, optional): Override unclip ratio
seal_rec_score_thresh (float, optional): Override recognition threshold
Returns:
list: Seal recognition results with text and positions
"""
def predict_iter(self, input, **kwargs):
"""Iterate over seal recognition results for memory efficiency."""
def close(self) -> None:
"""Close the pipeline and free resources."""
def export_paddlex_config_to_yaml(self, yaml_path: str) -> None:
"""Export configuration to YAML."""from paddleocr import SealRecognition
# Initialize seal recognition
pipeline = SealRecognition()
# Recognize seal text
result = pipeline.predict('company_seal.jpg')
# Print recognized text
for item in result:
for seal in item.get('seal_result', []):
print(f"Text: {seal['text']}")
print(f"Confidence: {seal['score']:.2f}")
print(f"Position: {seal['bbox']}\n")
pipeline.close()

from paddleocr import SealRecognition
# Enable layout detection to find seal regions
pipeline = SealRecognition(
use_layout_detection=True
)
# Process document with multiple seals
result = pipeline.predict('contract.pdf')
for item in result:
seals = item.get('seal_result', [])
print(f"Found {len(seals)} seals:")
for i, seal in enumerate(seals, 1):
print(f"\nSeal {i}:")
print(f" Text: {seal['text']}")
print(f" Confidence: {seal['score']:.2f}")
pipeline.close()

from paddleocr import SealRecognition
# Configure for high precision
pipeline = SealRecognition(
seal_det_thresh=0.3, # Lower threshold for more detections
seal_det_box_thresh=0.6, # Higher box threshold for quality
seal_rec_score_thresh=0.7, # Higher recognition threshold
seal_det_unclip_ratio=2.0 # Larger expansion for curved text
)
result = pipeline.predict('official_document.pdf')
pipeline.close()

from paddleocr import SealRecognition
import json
pipeline = SealRecognition(use_layout_detection=True)
# Process multiple documents
results = pipeline.predict('stamped_documents/')
# Save seal data
seal_database = []
for item in results:
doc_name = item['input_path']
for seal in item.get('seal_result', []):
seal_database.append({
'document': doc_name,
'text': seal['text'],
'confidence': seal['score'],
'bbox': seal['bbox']
})
with open('seals.json', 'w', encoding='utf-8') as f:
json.dump(seal_database, f, ensure_ascii=False, indent=2)
pipeline.close()

from paddleocr import SealRecognition
# Enable preprocessing for scanned documents
pipeline = SealRecognition(
use_doc_orientation_classify=True,
use_doc_unwarping=True,
use_layout_detection=True
)
# Process scanned contract
result = pipeline.predict('scanned_contract.jpg')
pipeline.close()

from paddleocr import SealRecognition
import cv2
pipeline = SealRecognition()
# Recognize seals
result = pipeline.predict('document_with_seal.jpg')
# Extract seal regions for verification
img = cv2.imread('document_with_seal.jpg')
for item in result:
for i, seal in enumerate(item.get('seal_result', [])):
x1, y1, x2, y2 = seal['bbox']
seal_img = img[y1:y2, x1:x2]
# Save individual seal
cv2.imwrite(f'seal_{i}.jpg', seal_img)
print(f"Seal {i}: {seal['text']}")
pipeline.close()

from paddleocr import LayoutDetection, SealRecognition
# Detect layout first
layout_model = LayoutDetection()
layout_results = layout_model.predict('contract.pdf')
# Use for seal recognition
pipeline = SealRecognition()
result = pipeline.predict(
'contract.pdf',
layout_det_res=layout_results
)
layout_model.close()
pipeline.close()

from paddleocr import SealRecognition
pipeline = SealRecognition(use_layout_detection=True)
# Process large batch with iterator
seal_texts = []
for result in pipeline.predict_iter('archive/'):
for seal in result.get('seal_result', []):
seal_texts.append(seal['text'])
# Analyze seal patterns
from collections import Counter
common_seals = Counter(seal_texts).most_common(10)
print("Most common seals:")
for text, count in common_seals:
print(f" {text}: {count}")
pipeline.close()

[
{
"input_path": "path/to/document.pdf",
"seal_result": [
{
"bbox": [x1, y1, x2, y2], # Seal bounding box
"text": "公司印章 COMPANY SEAL", # Recognized text
"score": 0.95, # Overall confidence
"dt_polys": [ # Individual text line polygons
[[x1,y1], [x2,y2], [x3,y3], [x4,y4]],
# ...
],
"rec_text": [ # Text for each polygon
"公司印章",
"COMPANY SEAL"
],
"rec_score": [0.96, 0.94] # Scores for each line
}
],
"layout_result": [ # If use_layout_detection=True
{
"bbox": [x1, y1, x2, y2],
"label": "seal",
"score": 0.98
}
]
}
]

- Tune seal_det_thresh and seal_rec_score_thresh for your seal types
- Use device='gpu' for faster processing

Default Models:
The seal detection model is specifically trained to: