tessl install tessl/pypi-paddleocr@3.3.0

Industry-leading OCR and document AI engine that converts documents and images into structured, AI-friendly data formats, with comprehensive solutions from text extraction to intelligent document understanding.
The DocUnderstanding pipeline combines vision-language models with document processing for intelligent question answering and information extraction. It uses multimodal AI to understand document content and answer queries about images and documents.
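At a glance, the pipeline supports two modes: visual question answering, where the input is a dict with 'image' and 'prompt' keys, and general document understanding, where an image or PDF path is passed directly. A minimal sketch; the file names are placeholders and the result fields match the output formats shown later in this section:

from paddleocr import DocUnderstanding

model = DocUnderstanding()

# Mode 1: visual question answering with an explicit prompt
vqa = model.predict({'image': 'invoice.jpg', 'prompt': 'What is the invoice number?'})
print(vqa[0]['answer'])

# Mode 2: general understanding of a document without a prompt
overview = model.predict('report.pdf')
print(overview[0]['description'])

model.close()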
class DocUnderstanding:
    """
    Document understanding through vision-language models.

    Uses a VLM (Vision-Language Model) for visual question answering,
    document parsing, and intelligent content extraction.
    """

    def __init__(
        self,
        doc_understanding_model_name: str = None,
        doc_understanding_model_dir: str = None,
        doc_understanding_batch_size: int = None,
        paddlex_config: str = None,
        device: str = None,
        use_hpi: bool = None,
        **kwargs
    ):
        """
        Initialize the DocUnderstanding pipeline.

        Args:
            doc_understanding_model_name (str, optional): VLM model name.
                Default: 'PP-DocBee2-3B'
            doc_understanding_model_dir (str, optional): Model directory
            doc_understanding_batch_size (int, optional): Batch size for inference
            device (str, optional): Device for inference ('cpu', 'gpu')
            use_hpi (bool, optional): Use high-performance inference
            paddlex_config (str or dict, optional): Configuration file or dict
        """

    def predict(
        self,
        input,
        **kwargs
    ) -> list:
        """
        Understand document content and answer questions.

        Args:
            input: Can be:
                - Image/PDF path
                - Numpy array or PIL Image
                - Directory path
                - List of paths/images
                - Dict with 'image' and 'prompt' keys for VQA

        Returns:
            list: Understanding results with answers and analysis
        """

    def predict_iter(
        self,
        input,
        **kwargs
    ):
        """
        Iterate over understanding results for memory efficiency.

        Args:
            input: Same as predict()

        Yields:
            dict: Understanding result for each input
        """

    def close(self) -> None:
        """Close the pipeline and free resources."""

    def export_paddlex_config_to_yaml(self, yaml_path: str) -> None:
        """Export configuration to YAML."""
from paddleocr import DocUnderstanding

# Initialize pipeline
model = DocUnderstanding()
# Ask questions about an image
vqa_input = {
'image': 'invoice.jpg',
'prompt': 'What is the total amount on this invoice?'
}
result = model.predict(vqa_input)
print(result[0]['answer'])
model.close()

from paddleocr import DocUnderstanding
model = DocUnderstanding()
# Prepare multiple questions
questions = [
{'image': 'contract.pdf', 'prompt': 'Who are the parties in this contract?'},
{'image': 'contract.pdf', 'prompt': 'What is the contract date?'},
{'image': 'contract.pdf', 'prompt': 'What is the contract value?'}
]
# Get answers
results = model.predict(questions)
for q, r in zip(questions, results):
print(f"Q: {q['prompt']}")
print(f"A: {r['answer']}\n")
model.close()

from paddleocr import DocUnderstanding
model = DocUnderstanding()
# General document understanding
result = model.predict('diagram.png')
# Get document description
print(result[0]['description'])
model.close()

from paddleocr import DocUnderstanding
model = DocUnderstanding(
doc_understanding_batch_size=4
)
# Process multiple documents with questions
inputs = [
{'image': 'receipt1.jpg', 'prompt': 'Extract the date and total'},
{'image': 'receipt2.jpg', 'prompt': 'Extract the date and total'},
{'image': 'receipt3.jpg', 'prompt': 'Extract the date and total'},
{'image': 'receipt4.jpg', 'prompt': 'Extract the date and total'}
]
results = model.predict(inputs)
for i, result in enumerate(results):
print(f"Receipt {i+1}: {result['answer']}")
model.close()

from paddleocr import DocUnderstanding
model = DocUnderstanding()
# Extract structured information
vqa_input = {
'image': 'resume.pdf',
'prompt': '''Extract the following information in JSON format:
- Name
- Email
- Phone
- Work Experience (company, role, duration)
- Education
'''
}
result = model.predict(vqa_input)
print(result[0]['answer'])
model.close()

from paddleocr import DocUnderstanding
model = DocUnderstanding()
# Ask about table contents
vqa_input = {
'image': 'financial_table.png',
'prompt': 'What are the top 3 performing products by revenue?'
}
result = model.predict(vqa_input)
print(result[0]['answer'])
model.close()

from paddleocr import DocUnderstanding
model = DocUnderstanding()
# Process large dataset efficiently
questions = [
{'image': f'doc_{i}.pdf', 'prompt': 'Summarize the main points'}
for i in range(100)
]
# Use iterator to avoid loading all results in memory
for result in model.predict_iter(questions):
# Process one at a time
with open('summaries.txt', 'a') as f:
f.write(f"{result['input_path']}: {result['answer']}\n")
model.close()

# The following short snippets assume an initialized DocUnderstanding instance named model.

# Classify document types
result = model.predict({
'image': 'document.pdf',
'prompt': 'What type of document is this? (invoice, contract, receipt, etc.)'
})

# Verify specific information
result = model.predict({
'image': 'id_card.jpg',
'prompt': 'Is this person over 18 years old?'
})

# Analyze visualizations
result = model.predict({
'image': 'sales_chart.png',
'prompt': 'What trend does this chart show?'
})

# Handle multiple languages (PP-DocBee2-3B supports many languages)
result = model.predict({
'image': 'chinese_form.jpg',
'prompt': 'What is the applicant name? (答案用中文)'  # "答案用中文" asks for the answer in Chinese
})

Output format for VQA inputs (dict with 'image' and 'prompt'):

[
{
"input_path": "path/to/document.jpg",
"prompt": "What is the total amount?",
"answer": "$1,234.56",
"confidence": 0.95,
"model": "PP-DocBee2-3B"
}
]

Output format for general understanding (no prompt):

[
{
"input_path": "path/to/document.jpg",
"description": "This image shows a sales invoice with...",
"detected_elements": ["text", "table", "logo"],
"confidence": 0.92,
"model": "PP-DocBee2-3B"
}
]
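Because each VQA result reports a confidence score alongside the answer, low-confidence answers can be routed for manual review. A small sketch assuming the field names shown above; the file names and the 0.8 threshold are illustrative:

from paddleocr import DocUnderstanding

model = DocUnderstanding()
results = model.predict([
    {'image': 'invoice_1.jpg', 'prompt': 'What is the total amount?'},
    {'image': 'invoice_2.jpg', 'prompt': 'What is the total amount?'},
])

THRESHOLD = 0.8  # illustrative confidence cutoff
for res in results:
    if res.get('confidence', 0.0) < THRESHOLD:
        print(f"Review manually: {res['input_path']} -> {res['answer']}")
    else:
        print(f"{res['input_path']}: {res['answer']}")

model.close()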
Default Model: PP-DocBee2-3B

Tips:
- Adjust doc_understanding_batch_size for your GPU memory
- Set device='gpu' for faster inference
- Use predict_iter() for large datasets to manage memory
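Putting the tips together, a hedged configuration sketch; the device string and batch size are illustrative, and the YAML export uses export_paddlex_config_to_yaml from the API above:

from paddleocr import DocUnderstanding

# Illustrative settings: tune the batch size to available GPU memory
model = DocUnderstanding(
    device='gpu',
    doc_understanding_batch_size=8,
)

# Persist the resolved pipeline configuration for reuse or review
model.export_paddlex_config_to_yaml('doc_understanding.yaml')

model.close()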