Azure AI Document Intelligence client library for Python - a client for the cloud service that uses machine learning to analyze text and structured data from documents
76
Core document processing functionality for analyzing single documents, processing batches, and classifying documents. These operations support both prebuilt models (layout, invoice, receipt, etc.) and custom models with advanced features like high-resolution OCR, language detection, and structured data extraction.
Analyzes individual documents using specified models to extract text, tables, key-value pairs, and structured data. Returns enhanced LRO poller with operation metadata.
def begin_analyze_document(
model_id: str,
body: Union[AnalyzeDocumentRequest, JSON, IO[bytes]],
*,
pages: Optional[str] = None,
locale: Optional[str] = None,
string_index_type: Optional[Union[str, StringIndexType]] = None,
features: Optional[List[Union[str, DocumentAnalysisFeature]]] = None,
query_fields: Optional[List[str]] = None,
output_content_format: Optional[Union[str, DocumentContentFormat]] = None,
output: Optional[List[Union[str, AnalyzeOutputOption]]] = None,
**kwargs: Any
) -> AnalyzeDocumentLROPoller[AnalyzeResult]:
"""
Analyzes document with the specified model.
Parameters:
- model_id (str): Model ID for analysis (e.g., "prebuilt-layout", "prebuilt-invoice")
- body: Document data as AnalyzeDocumentRequest, JSON dict, or file bytes
- pages (str, optional): Page range specification (e.g., "1-3,5")
- locale (str, optional): Locale hint for better recognition
- string_index_type (StringIndexType, optional): Character indexing scheme
- features (List[DocumentAnalysisFeature], optional): Additional features to enable
- query_fields (List[str], optional): Custom field extraction queries
- output_content_format (DocumentContentFormat, optional): Content format (text/markdown)
- output (List[AnalyzeOutputOption], optional): Additional outputs (pdf/figures)
Returns:
AnalyzeDocumentLROPoller[AnalyzeResult]: Enhanced poller with operation metadata
"""Usage example:
# Analyze with file upload
with open("document.pdf", "rb") as f:
poller = client.begin_analyze_document(
model_id="prebuilt-layout",
body=f,
features=["languages", "barcodes"],
output_content_format="markdown"
)
result = poller.result()
# Access operation metadata
operation_id = poller.details["operation_id"]
# Analyze with custom fields
with open("invoice.pdf", "rb") as f:
poller = client.begin_analyze_document(
"prebuilt-invoice",
f,
query_fields=["Tax ID", "Purchase Order"]
)
result = poller.result()

Processes multiple documents in a single operation for efficient bulk processing. Supports Azure Blob Storage as the document source with flexible file selection.
def begin_analyze_batch_documents(
model_id: str,
body: Union[AnalyzeBatchDocumentsRequest, JSON, IO[bytes]],
**kwargs: Any
) -> LROPoller[AnalyzeBatchResult]:
"""
Analyzes multiple documents in batch.
Parameters:
- model_id (str): Model ID for batch analysis
- body: Batch request with Azure Blob source configuration
Returns:
LROPoller[AnalyzeBatchResult]: Batch operation poller
"""Retrieves and manages batch processing results with support for listing operations and accessing individual results.
def list_analyze_batch_results(
model_id: str,
*,
skip: Optional[int] = None,
top: Optional[int] = None,
**kwargs: Any
) -> Iterable[AnalyzeBatchOperation]:
"""
Lists batch analysis operations for the specified model.
Parameters:
- model_id (str): Model ID to filter operations
- skip (int, optional): Number of operations to skip
- top (int, optional): Maximum operations to return
Returns:
Iterable[AnalyzeBatchOperation]: Paginated batch operations
"""
def get_analyze_batch_result(
continuation_token: str,
**kwargs: Any
) -> LROPoller[AnalyzeBatchResult]:
"""
Continues batch analysis operation from continuation token.
Parameters:
- continuation_token (str): Continuation token for resuming batch operation
Returns:
LROPoller[AnalyzeBatchResult]: Batch operation poller
"""
def delete_analyze_batch_result(
model_id: str,
result_id: str,
**kwargs: Any
) -> None:
"""
Deletes batch analysis result.
Parameters:
- model_id (str): Model ID used for analysis
- result_id (str): Batch operation result ID to delete
"""Classifies documents using trained classifiers to automatically determine document types and route processing workflows.
def begin_classify_document(
classifier_id: str,
body: Union[ClassifyDocumentRequest, JSON, IO[bytes]],
*,
string_index_type: Optional[Union[str, StringIndexType]] = None,
split_mode: Optional[Union[str, SplitMode]] = None,
pages: Optional[str] = None,
**kwargs: Any
) -> LROPoller[AnalyzeResult]:
"""
Classifies document using specified classifier.
Parameters:
- classifier_id (str): Document classifier ID
- body: Document data as ClassifyDocumentRequest, JSON dict, or file bytes
- string_index_type (StringIndexType, optional): Character indexing scheme
- split_mode (SplitMode, optional): Document splitting behavior
- pages (str, optional): Page range specification
Returns:
LROPoller[AnalyzeResult]: Classification result poller
"""Retrieves analysis outputs in various formats including searchable PDFs and extracted figure images.
def get_analyze_result_pdf(
model_id: str,
result_id: str,
**kwargs: Any
) -> Iterator[bytes]:
"""
Gets analysis result as searchable PDF.
Parameters:
- model_id (str): Model ID used for analysis
- result_id (str): Analysis result ID
Returns:
Iterator[bytes]: PDF content stream
"""
def get_analyze_result_figure(
model_id: str,
result_id: str,
figure_id: str,
**kwargs: Any
) -> Iterator[bytes]:
"""
Gets extracted figure as image.
Parameters:
- model_id (str): Model ID used for analysis
- result_id (str): Analysis result ID
- figure_id (str): Figure identifier
Returns:
Iterator[bytes]: Image content stream
"""
def delete_analyze_result(
model_id: str,
result_id: str,
**kwargs: Any
) -> None:
"""
Deletes analysis result.
Parameters:
- model_id (str): Model ID used for analysis
- result_id (str): Analysis result ID to delete
"""class AnalyzeDocumentRequest:
"""Request for single document analysis."""
url_source: Optional[str]
base64_source: Optional[str]
pages: Optional[str]
locale: Optional[str]
string_index_type: Optional[StringIndexType]
features: Optional[List[DocumentAnalysisFeature]]
query_fields: Optional[List[str]]
output_content_format: Optional[DocumentContentFormat]
output: Optional[List[AnalyzeOutputOption]]
class AnalyzeBatchDocumentsRequest:
"""Request for batch document analysis."""
azure_blob_source: Optional[AzureBlobContentSource]
azure_blob_file_list_source: Optional[AzureBlobFileListContentSource]
result_container_url: str
result_prefix: Optional[str]
overwrite_existing: Optional[bool]
pages: Optional[str]
locale: Optional[str]
string_index_type: Optional[StringIndexType]
features: Optional[List[DocumentAnalysisFeature]]
query_fields: Optional[List[str]]
output_content_format: Optional[DocumentContentFormat]
output: Optional[List[AnalyzeOutputOption]]
class ClassifyDocumentRequest:
"""Request for document classification."""
url_source: Optional[str]
base64_source: Optional[str]
pages: Optional[str]
string_index_type: Optional[StringIndexType]
split_mode: Optional[SplitMode]

class AnalyzeResult:
"""Main analysis result containing extracted content and metadata."""
api_version: Optional[str]
model_id: str
string_index_type: Optional[StringIndexType]
content: Optional[str]
pages: Optional[List[DocumentPage]]
paragraphs: Optional[List[DocumentParagraph]]
tables: Optional[List[DocumentTable]]
figures: Optional[List[DocumentFigure]]
sections: Optional[List[DocumentSection]]
key_value_pairs: Optional[List[DocumentKeyValuePair]]
styles: Optional[List[DocumentStyle]]
languages: Optional[List[DocumentLanguage]]
documents: Optional[List[AnalyzedDocument]]
warnings: Optional[List[DocumentIntelligenceWarning]]
class AnalyzeBatchResult:
"""Results from batch document analysis."""
succeeded_count: int
failed_count: int
skipped_count: int
details: List[AnalyzeBatchOperationDetail]
class AnalyzeBatchOperation:
"""Batch operation metadata and status."""
operation_id: str
status: DocumentIntelligenceOperationStatus
created_date_time: datetime
last_updated_date_time: datetime
percent_completed: Optional[int]
result: Optional[AnalyzeBatchResult]
error: Optional[DocumentIntelligenceError]

class AnalyzeDocumentLROPoller(LROPoller[AnalyzeResult]):
"""Enhanced poller for document analysis operations."""
@property
def details(self) -> Dict[str, Any]:
"""
Returns operation metadata including operation_id.
Returns:
Dict containing operation_id extracted from Operation-Location header
"""
@classmethod
def from_continuation_token(
cls,
polling_method: PollingMethod,
continuation_token: str,
**kwargs: Any
) -> "AnalyzeDocumentLROPoller[AnalyzeResult]":
"""Resume operation from continuation token."""def send_request(
request: HttpRequest,
*,
stream: bool = False,
**kwargs: Any
) -> HttpResponse:
"""
Sends custom HTTP request using the client's pipeline.
Parameters:
- request (HttpRequest): HTTP request to send
- stream (bool): Whether to stream the response
Returns:
HttpResponse: Raw HTTP response
"""
def close() -> None:
"""Close the client and release resources."""Install with Tessl CLI
npx tessl i tessl/pypi-azure-ai-documentintelligencedocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10