tessl install tessl/pypi-paddleocr@3.3.0

Industry-leading OCR and document AI engine that converts documents and images into structured, AI-friendly data formats, with comprehensive solutions ranging from text extraction to intelligent document understanding.
Complete pipeline for recognizing mathematical formulas in documents. Detects formula regions using layout detection and converts them to LaTeX format, with optional document preprocessing for scanned or photographed materials.
class FormulaRecognitionPipeline:
    """
    Mathematical formula recognition pipeline.

    Combines layout detection and formula recognition to extract
    mathematical expressions from documents and convert them to LaTeX.

    This is an API stub: the methods carry signatures and documentation only.
    """

    def __init__(
        self,
        # Formula recognition
        formula_recognition_model_name: str = None,
        formula_recognition_model_dir: str = None,
        # Layout detection
        layout_detection_model_name: str = None,
        layout_detection_model_dir: str = None,
        layout_threshold: float = None,
        layout_nms: bool = None,
        layout_unclip_ratio: float = None,
        layout_merge_bboxes_mode: str = None,
        # Document preprocessing
        doc_orientation_classify_model_name: str = None,
        doc_orientation_classify_model_dir: str = None,
        doc_unwarping_model_name: str = None,
        doc_unwarping_model_dir: str = None,
        use_doc_orientation_classify: bool = None,
        use_doc_unwarping: bool = None,
        use_layout_detection: bool = None,
        # Common parameters
        paddlex_config: str = None,
        device: str = None,
        use_hpi: bool = None,
        **kwargs
    ):
        """
        Initialize formula recognition pipeline.

        Every parameter defaults to None.

        Args:
            formula_recognition_model_name (str, optional): Formula recognition model
                Default: 'PP-FormulaNet_plus-M'
            formula_recognition_model_dir (str, optional): Presumably a local
                directory holding the formula recognition model -- confirm
                against the upstream documentation.
            layout_detection_model_name (str, optional): Layout detection model.
            layout_detection_model_dir (str, optional): Presumably a local
                directory holding the layout detection model -- confirm.
            layout_threshold (float, optional): Threshold for layout detection
            layout_nms (bool, optional): Layout detection option; semantics are
                not described here -- see the upstream documentation.
            layout_unclip_ratio (float, optional): Layout detection option;
                semantics are not described here -- see the upstream
                documentation.
            layout_merge_bboxes_mode (str, optional): Layout detection option;
                accepted values are not described here -- see the upstream
                documentation.
            doc_orientation_classify_model_name (str, optional): Model used for
                orientation correction.
            doc_orientation_classify_model_dir (str, optional): Presumably a
                local directory holding that model -- confirm.
            doc_unwarping_model_name (str, optional): Model used for document
                unwarping.
            doc_unwarping_model_dir (str, optional): Presumably a local
                directory holding that model -- confirm.
            use_layout_detection (bool, optional): Enable layout detection for
                automatic formula region detection
            use_doc_orientation_classify (bool, optional): Enable orientation correction
            use_doc_unwarping (bool, optional): Enable document unwarping
            device (str, optional): Device for inference ('cpu', 'gpu')
            paddlex_config (str or dict, optional): Configuration file or dict
                NOTE(review): the signature annotates this as ``str`` only;
                confirm that a dict is accepted as documented.
            use_hpi (bool, optional): Not described here; presumably toggles
                high-performance inference -- TODO confirm the flag name and
                meaning against the upstream documentation.
            **kwargs: Additional options, not described here.
        """

    def predict(
        self,
        input,
        *,
        use_layout_detection: bool = None,
        use_doc_orientation_classify: bool = None,
        use_doc_unwarping: bool = None,
        layout_det_res: list = None,
        layout_threshold: float = None,
        layout_nms: bool = None,
        layout_unclip_ratio: float = None,
        layout_merge_bboxes_mode: str = None,
        **kwargs
    ) -> list:
        """
        Recognize formulas in documents.

        All options after ``input`` are keyword-only and default to None.

        Args:
            input: Image/PDF path, numpy array, PIL Image, directory, or list
            use_layout_detection (bool, optional): Override layout detection
            use_doc_orientation_classify (bool, optional): Override orientation correction
            use_doc_unwarping (bool, optional): Override unwarping
            layout_det_res (list, optional): Pre-computed layout detection results
            layout_threshold (float, optional): Layout detection threshold
            layout_nms (bool, optional): Presumably a per-call override of the
                constructor option of the same name -- confirm.
            layout_unclip_ratio (float, optional): Presumably a per-call
                override of the constructor option of the same name -- confirm.
            layout_merge_bboxes_mode (str, optional): Presumably a per-call
                override of the constructor option of the same name -- confirm.
            **kwargs: Additional options, not described here.

        Returns:
            list: Formula recognition results with LaTeX output
        """

    def predict_iter(self, input, **kwargs):
        """Iterate over formula recognition results for memory efficiency."""

    def close(self) -> None:
        """Close the pipeline and free resources."""

    def export_paddlex_config_to_yaml(self, yaml_path: str) -> None:
        """Export configuration to YAML."""


# --- Example: basic usage ---
from paddleocr import FormulaRecognitionPipeline
# Initialize pipeline
pipeline = FormulaRecognitionPipeline()

# Recognize formulas in a document
result = pipeline.predict('math_paper.pdf')

# Print LaTeX for each formula
for item in result:
    # Items without a 'formula_result' key are treated as having no formulas.
    for formula in item.get('formula_result', []):
        print(f"LaTeX: {formula['latex']}")
        print(f"Position: {formula['bbox']}\n")

pipeline.close()


# --- Example: automatic formula region detection ---
from paddleocr import FormulaRecognitionPipeline
# Enable automatic formula region detection
pipeline = FormulaRecognitionPipeline(
    use_layout_detection=True
)

# Process document - formulas will be detected automatically
result = pipeline.predict('textbook_page.png')

for item in result:
    formulas = item.get('formula_result', [])
    print(f"Found {len(formulas)} formulas:")
    # Number the formulas starting at 1 for display.
    for i, formula in enumerate(formulas, 1):
        print(f"{i}. {formula['latex']}")

pipeline.close()


# --- Example: preprocessing for scanned documents ---
from paddleocr import FormulaRecognitionPipeline
# Enable preprocessing for scanned documents
pipeline = FormulaRecognitionPipeline(
    use_doc_orientation_classify=True,
    use_doc_unwarping=True,
    use_layout_detection=True
)

# Process scanned or photographed document
result = pipeline.predict('scanned_paper.jpg')

pipeline.close()


# --- Example: saving formulas from multiple papers to JSON ---
from paddleocr import FormulaRecognitionPipeline
import json

pipeline = FormulaRecognitionPipeline(
    use_layout_detection=True
)

# Process multiple papers
results = pipeline.predict('papers/')

# Save formulas with metadata
formulas_db = []
for item in results:
    doc_name = item['input_path']
    for formula in item.get('formula_result', []):
        formulas_db.append({
            'document': doc_name,
            'latex': formula['latex'],
            'bbox': formula['bbox'],
            # Fall back to page 0 when the entry carries no page number.
            'page': formula.get('page', 0)
        })

# Explicit encoding keeps the output file independent of the platform default.
with open('formulas.json', 'w', encoding='utf-8') as f:
    json.dump(formulas_db, f, indent=2)

pipeline.close()


# --- Example: adjusting the layout detection threshold ---
from paddleocr import FormulaRecognitionPipeline
# Adjust threshold for better formula detection
pipeline = FormulaRecognitionPipeline(
    use_layout_detection=True,
    layout_threshold=0.7  # Higher = more confident detections
)

result = pipeline.predict('complex_document.pdf')

pipeline.close()


# --- Example: reusing separately computed layout results ---
from paddleocr import LayoutDetection, FormulaRecognitionPipeline
# First, detect layout separately
layout_model = LayoutDetection()
layout_results = layout_model.predict('document.pdf')

# Use layout results for formula recognition
pipeline = FormulaRecognitionPipeline()
result = pipeline.predict(
    'document.pdf',
    layout_det_res=layout_results
)

# Both objects are closed explicitly once the results are in hand.
layout_model.close()
pipeline.close()


# --- Example: converting formulas to different output formats ---
from paddleocr import FormulaRecognitionPipeline
import re  # NOTE(review): `re` is not used anywhere in this example.

pipeline = FormulaRecognitionPipeline(use_layout_detection=True)
result = pipeline.predict('math_doc.pdf')

# Extract formulas and convert to different formats
for item in result:
    for formula in item.get('formula_result', []):
        latex = formula['latex']

        # LaTeX (original)
        print(f"LaTeX: {latex}")

        # Inline LaTeX for markdown
        inline = f"${latex}$"
        print(f"Markdown inline: {inline}")

        # Display LaTeX for markdown
        display = f"$$\n{latex}\n$$"
        print(f"Markdown display: {display}\n")

pipeline.close()


# --- Example: streaming a large collection with predict_iter ---
from paddleocr import FormulaRecognitionPipeline
pipeline = FormulaRecognitionPipeline(use_layout_detection=True)
# Process large collection with iterator
with open('all_formulas.txt', 'w') as f:
for result in pipeline.predict_iter('large_paper_collection/'):
doc = result['input_path']
formulas = result.get('formula_result', [])
f.write(f"\n=== {doc} ===\n")
for formula in formulas:
f.write(f"{formula['latex']}\n")
pipeline.close()[
    {
        "input_path": "path/to/document.pdf",
        "formula_result": [
            {
                "bbox": [x1, y1, x2, y2],  # Formula bounding box
                "latex": "E = mc^2",  # LaTeX representation
                "confidence": 0.95,  # Recognition confidence
                "page": 0  # Page number (for PDFs)
            },
            {
                "bbox": [x1, y1, x2, y2],
                "latex": "\\frac{-b \\pm \\sqrt{b^2-4ac}}{2a}",
                "confidence": 0.92,
                "page": 0
            }
        ],
        "layout_result": [  # If use_layout_detection=True
            {
                "bbox": [x1, y1, x2, y2],
                "label": "formula",
                "score": 0.98
            }
        ]
    }
]

The PP-FormulaNet_plus-M model supports:
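- Basic arithmetic: +, -, ×, ÷
- Fractions: \frac{a}{b}
- Superscripts and subscripts: x^2, a_i
- Roots: \sqrt{x}, \sqrt[n]{x}
- Greek letters: \alpha, \beta, \gamma, etc.
- Operators: \sum, \int, \prod, \lim

Tips:
- Use device='gpu' for faster processing.
- Adjust layout_threshold for optimal detection.

Default Model: PP-FormulaNet_plus-M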