Google Cloud Document AI is a client library for extracting structured information from documents using machine learning.

This guide covers core document processing operations using Google Cloud Document AI, including synchronous processing, handling different document formats, and extracting structured data.
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, RawDocument


def process_document_from_file(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> "Document":
    """Process a local document file with a Document AI processor.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location (e.g. 'us', 'eu').
        processor_id: Document processor ID.
        file_path: Path to the document file on disk.
        mime_type: MIME type of the document.

    Returns:
        Document: Processed document with extracted data.
    """
    client = DocumentProcessorServiceClient()

    # Fully-qualified processor resource name.
    processor_name = client.processor_path(project_id, location, processor_id)

    # Load the raw file bytes to send inline with the request.
    with open(file_path, "rb") as handle:
        payload = handle.read()

    request = ProcessRequest(
        name=processor_name,
        raw_document=RawDocument(content=payload, mime_type=mime_type),
    )
    return client.process_document(request=request).document


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, GcsDocument


def process_gcs_document(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_uri: str,
    mime_type: str,
) -> "Document":
    """Process a document stored in Google Cloud Storage.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location.
        processor_id: Document processor ID.
        gcs_uri: Cloud Storage URI (gs://bucket/path/file.pdf).
        mime_type: MIME type of the document.

    Returns:
        Document: Processed document with extracted data.
    """
    client = DocumentProcessorServiceClient()
    processor_name = client.processor_path(project_id, location, processor_id)

    # Reference the document by its Cloud Storage URI instead of raw bytes.
    gcs_document = GcsDocument(gcs_uri=gcs_uri, mime_type=mime_type)

    request = ProcessRequest(name=processor_name, gcs_document=gcs_document)
    return client.process_document(request=request).document


from google.cloud.documentai.types import ProcessRequest, OcrConfig, ProcessOptions
def process_with_ocr_options(
    client: DocumentProcessorServiceClient,
    processor_name: str,
    raw_document: "RawDocument",
    enable_native_pdf_parsing: bool = True,
    enable_image_quality_scores: bool = False,
    enable_symbol: bool = False,
) -> "Document":
    """Process a document with an explicit OCR configuration.

    Args:
        client: DocumentProcessorServiceClient instance.
        processor_name: Full processor resource name.
        raw_document: Raw document to process.
        enable_native_pdf_parsing: Use native PDF parsing when possible.
        enable_image_quality_scores: Include image quality scores.
        enable_symbol: Enable symbol detection.

    Returns:
        Document: Processed document.
    """
    # Bundle the OCR flags into process options for the request.
    options = ProcessOptions(
        ocr_config=OcrConfig(
            enable_native_pdf_parsing=enable_native_pdf_parsing,
            enable_image_quality_scores=enable_image_quality_scores,
            enable_symbol=enable_symbol,
        )
    )

    request = ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
        process_options=options,
    )
    return client.process_document(request=request).document


from google.cloud.documentai.types import ProcessRequest
from google.protobuf.field_mask_pb2 import FieldMask


def process_with_field_mask(
    client: DocumentProcessorServiceClient,
    processor_name: str,
    raw_document: "RawDocument",
    fields: list[str],
) -> "Document":
    """Process a document, returning only the requested fields.

    Args:
        client: DocumentProcessorServiceClient instance.
        processor_name: Full processor resource name.
        raw_document: Raw document to process.
        fields: Field paths to return (e.g. ['text', 'pages.blocks']).

    Returns:
        Document: Processed document restricted to the requested fields.
    """
    request = ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
        # Restrict the response payload to just these paths.
        field_mask=FieldMask(paths=fields),
    )
    return client.process_document(request=request).document


from google.cloud.documentai.types import Document
def analyze_document_text(document: Document) -> dict:
"""
Analyze text content and layout from processed document.
Args:
document: Processed Document object
Returns:
dict: Analysis results including text statistics and layout info
"""
analysis = {
"total_text": document.text,
"text_length": len(document.text),
"pages": [],
"text_segments": []
}
# Analyze each page
for page_idx, page in enumerate(document.pages):
page_info = {
"page_number": page_idx + 1,
"dimensions": {
"width": page.dimension.width,
"height": page.dimension.height,
"unit": page.dimension.unit
},
"blocks": len(page.blocks),
"paragraphs": len(page.paragraphs),
"lines": len(page.lines),
"tokens": len(page.tokens)
}
# Extract text segments from page
for block in page.blocks:
if block.layout and block.layout.text_anchor:
text_segment = extract_text_from_anchor(
document.text,
block.layout.text_anchor
)
analysis["text_segments"].append({
"type": "block",
"page": page_idx + 1,
"text": text_segment,
"confidence": block.layout.confidence
})
analysis["pages"].append(page_info)
return analysis
def extract_text_from_anchor(full_text: str, text_anchor: "Document.TextAnchor") -> str:
"""
Extract text segment using TextAnchor.
Args:
full_text: Full document text
text_anchor: TextAnchor specifying text location
Returns:
str: Extracted text segment
"""
text_segments = []
for segment in text_anchor.text_segments:
start_index = int(segment.start_index) if segment.start_index else 0
end_index = int(segment.end_index) if segment.end_index else len(full_text)
text_segments.append(full_text[start_index:end_index])
return "".join(text_segments)from google.cloud.documentai.types import Document
def extract_entities(document: Document) -> dict:
"""
Extract and organize entities from processed document.
Args:
document: Processed Document object
Returns:
dict: Organized entities by type with confidence scores
"""
entities_by_type = {}
for entity in document.entities:
entity_type = entity.type_
if entity_type not in entities_by_type:
entities_by_type[entity_type] = []
# Extract entity information
entity_info = {
"text": entity.mention_text,
"confidence": entity.confidence,
"page_refs": []
}
# Add page references if available
if entity.page_anchor:
for page_ref in entity.page_anchor.page_refs:
entity_info["page_refs"].append({
"page": page_ref.page + 1, # Convert to 1-based
"layout_type": page_ref.layout_type,
"layout_id": page_ref.layout_id
})
# Add text anchor information
if entity.text_anchor:
entity_info["text_segments"] = []
for segment in entity.text_anchor.text_segments:
entity_info["text_segments"].append({
"start_index": int(segment.start_index or 0),
"end_index": int(segment.end_index or 0)
})
# Add properties if available
if entity.properties:
entity_info["properties"] = []
for prop in entity.properties:
prop_info = {
"type": prop.type_,
"text": prop.mention_text,
"confidence": prop.confidence
}
entity_info["properties"].append(prop_info)
entities_by_type[entity_type].append(entity_info)
return entities_by_typefrom google.cloud.documentai.types import Document
def extract_tables(document: Document) -> list[dict]:
"""
Extract table data from processed document.
Args:
document: Processed Document object
Returns:
list[dict]: List of tables with structured data
"""
tables = []
for page_idx, page in enumerate(document.pages):
for table_idx, table in enumerate(page.tables):
table_data = {
"page": page_idx + 1,
"table_index": table_idx,
"rows": [],
"header_rows": [],
"body_rows": []
}
# Process table rows
for row in table.header_rows:
header_row = extract_table_row(document.text, row)
table_data["header_rows"].append(header_row)
table_data["rows"].append(header_row)
for row in table.body_rows:
body_row = extract_table_row(document.text, row)
table_data["body_rows"].append(body_row)
table_data["rows"].append(body_row)
tables.append(table_data)
return tables
def extract_table_row(full_text: str, row: "Document.Page.Table.TableRow") -> list[dict]:
"""
Extract data from a table row.
Args:
full_text: Full document text
row: Table row object
Returns:
list[dict]: List of cell data
"""
cells = []
for cell in row.cells:
cell_data = {
"text": "",
"row_span": cell.row_span,
"col_span": cell.col_span
}
# Extract cell text
if cell.layout and cell.layout.text_anchor:
cell_data["text"] = extract_text_from_anchor(
full_text,
cell.layout.text_anchor
).strip()
cells.append(cell_data)
return cellsfrom google.cloud.documentai.types import Document
def extract_form_fields(document: Document) -> dict:
"""
Extract form fields (key-value pairs) from processed document.
Args:
document: Processed Document object
Returns:
dict: Form fields organized as key-value pairs
"""
form_fields = {}
for page in document.pages:
for form_field in page.form_fields:
# Extract field name (key)
field_name = ""
if form_field.field_name and form_field.field_name.text_anchor:
field_name = extract_text_from_anchor(
document.text,
form_field.field_name.text_anchor
).strip()
# Extract field value
field_value = ""
if form_field.field_value and form_field.field_value.text_anchor:
field_value = extract_text_from_anchor(
document.text,
form_field.field_value.text_anchor
).strip()
# Store form field with confidence
if field_name:
form_fields[field_name] = {
"value": field_value,
"name_confidence": form_field.field_name.confidence if form_field.field_name else 0.0,
"value_confidence": form_field.field_value.confidence if form_field.field_value else 0.0
}
return form_fieldsimport asyncio
from google.cloud.documentai import DocumentProcessorServiceAsyncClient
from google.cloud.documentai.types import ProcessRequest, RawDocument


async def process_document_async(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> "Document":
    """Process a local document with the async Document AI client.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location.
        processor_id: Document processor ID.
        file_path: Path to document file.
        mime_type: MIME type of document.

    Returns:
        Document: Processed document.
    """
    client = DocumentProcessorServiceAsyncClient()
    processor_name = client.processor_path(project_id, location, processor_id)

    # NOTE(review): this file read is synchronous and briefly blocks the
    # event loop; fine for small files, confirm for very large ones.
    with open(file_path, "rb") as handle:
        content = handle.read()

    request = ProcessRequest(
        name=processor_name,
        raw_document=RawDocument(content=content, mime_type=mime_type),
    )

    response = await client.process_document(request=request)
    await client.close()
    return response.document
# Example usage
async def main():
    """Process a sample PDF and report the extracted text length."""
    document = await process_document_async(
        project_id="my-project",
        location="us",
        processor_id="abc123",
        file_path="document.pdf",
        mime_type="application/pdf",
    )
    print(f"Processed document: {len(document.text)} characters")


if __name__ == "__main__":
    # Guard the event-loop entry point so importing this module does not
    # trigger a network call (the original ran asyncio.run unconditionally).
    asyncio.run(main())


# Supported MIME types for document processing
# Mapping of accepted MIME types to a short human-readable description.
SUPPORTED_MIME_TYPES = {
    # PDF Documents
    "application/pdf": "PDF documents",
    # Image formats
    "image/jpeg": "JPEG images",
    "image/jpg": "JPG images",
    "image/png": "PNG images",
    "image/bmp": "BMP images",
    "image/tiff": "TIFF images",
    "image/tif": "TIF images",
    "image/gif": "GIF images (first frame only)",
    "image/webp": "WebP images",
    # Office documents (with OCR)
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "Word documents",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PowerPoint files",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "Excel files",
}


def validate_mime_type(mime_type: str) -> bool:
    """Report whether a MIME type is supported for processing.

    Args:
        mime_type: MIME type to validate.

    Returns:
        bool: True if supported, False otherwise.
    """
    return mime_type in SUPPORTED_MIME_TYPES


# Document processing limits
PROCESSING_LIMITS = {
"max_file_size_bytes": 20 * 1024 * 1024, # 20 MB
"max_pages_per_document": 2000,
"max_image_dimensions": {
"width": 10000,
"height": 10000
},
"timeout_seconds": 300 # 5 minutes
}
def validate_document_size(file_path: str) -> tuple[bool, str]:
"""
Validate document meets size requirements.
Args:
file_path: Path to document file
Returns:
tuple[bool, str]: (is_valid, error_message)
"""
import os
file_size = os.path.getsize(file_path)
if file_size > PROCESSING_LIMITS["max_file_size_bytes"]:
return False, f"File size ({file_size} bytes) exceeds limit ({PROCESSING_LIMITS['max_file_size_bytes']} bytes)"
return True, ""from google.cloud.documentai import DocumentProcessorServiceClient
from google.api_core.exceptions import (
    NotFound,
    InvalidArgument,
    ResourceExhausted,
    DeadlineExceeded
)
from google.cloud.exceptions import GoogleCloudError


def robust_process_document(
    client: DocumentProcessorServiceClient,
    request: ProcessRequest,
    max_retries: int = 3
) -> "ProcessResponse":
    """Process a document with explicit error handling and retries.

    Non-retryable errors (unknown processor, malformed request) are
    re-raised immediately; rate limits and other Google Cloud errors are
    retried with exponential backoff; timeouts are retried directly.

    Args:
        client: DocumentProcessorServiceClient instance
        request: Process request
        max_retries: Maximum number of retry attempts

    Returns:
        ProcessResponse: Processing result

    Raises:
        Exception: If processing fails after all retries. The original
            error is preserved as __cause__ via exception chaining.
    """
    import time

    for attempt in range(max_retries + 1):
        try:
            return client.process_document(request=request)
        except NotFound as e:
            # Processor not found - retrying cannot help.
            raise Exception(f"Processor not found: {e}") from e
        except InvalidArgument as e:
            # Malformed request - retrying cannot help.
            raise Exception(f"Invalid request: {e}") from e
        except ResourceExhausted as e:
            # Rate limit exceeded - back off exponentially, then retry.
            if attempt < max_retries:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Rate limit exceeded, waiting {wait_time}s (attempt {attempt + 1})")
                time.sleep(wait_time)
                continue
            raise Exception(f"Rate limit exceeded after {max_retries} retries: {e}") from e
        except DeadlineExceeded as e:
            # Timeout - retry immediately.
            if attempt < max_retries:
                print(f"Request timeout, retrying (attempt {attempt + 1})")
                continue
            raise Exception(f"Request timeout after {max_retries} retries: {e}") from e
        except GoogleCloudError as e:
            # Other (possibly transient) Google Cloud errors - back off, retry.
            if attempt < max_retries:
                wait_time = 2 ** attempt
                print(f"Google Cloud error, retrying in {wait_time}s: {e}")
                time.sleep(wait_time)
                continue
            raise Exception(f"Google Cloud error after {max_retries} retries: {e}") from e
        except Exception as e:
            # Anything else is unexpected - fail fast, keep the cause attached.
            raise Exception(f"Unexpected error: {e}") from e
    raise Exception("Maximum retries exceeded")


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ReviewDocumentRequest, Document


def submit_document_for_review(
    project_id: str,
    location: str,
    processor_id: str,
    document: Document,
    enable_schema_validation: bool = True,
) -> "Operation":
    """Send a processed document to the processor's human review queue.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location.
        processor_id: Processor ID.
        document: Processed document to review.
        enable_schema_validation: Enable schema validation during review.

    Returns:
        Operation: Long-running operation tracking the review.
    """
    client = DocumentProcessorServiceClient()

    # The human review config is a child of the processor resource.
    human_review_config = (
        f"projects/{project_id}/locations/{location}"
        f"/processors/{processor_id}/humanReviewConfig"
    )

    request = ReviewDocumentRequest(
        human_review_config=human_review_config,
        inline_document=document,
        enable_schema_validation=enable_schema_validation,
    )

    operation = client.review_document(request=request)
    print("Document submitted for human review")
    print(f"Operation: {operation.operation.name}")
    return operation
def check_review_status(operation: "Operation") -> dict:
"""
Check the status of a human review operation.
Args:
operation: Review operation object
Returns:
dict: Review status information
"""
if operation.done():
if operation.exception():
return {
"status": "failed",
"error": str(operation.exception())
}
else:
result = operation.result()
return {
"status": "completed",
"gcs_destination": result.gcs_destination,
"rejection_reason": result.rejection_reason
}
else:
return {"status": "in_progress"}from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, RawDocument
def complete_document_processing_example():
"""Complete example of document processing with analysis."""
# Initialize client
client = DocumentProcessorServiceClient()
# Configuration
project_id = "my-project"
location = "us"
processor_id = "abc123def456"
file_path = "sample_invoice.pdf"
# Process document
document = process_document_from_file(
project_id=project_id,
location=location,
processor_id=processor_id,
file_path=file_path,
mime_type="application/pdf"
)
# Analyze results
print("=== DOCUMENT ANALYSIS ===")
# 1. Basic text analysis
text_analysis = analyze_document_text(document)
print(f"Total text length: {text_analysis['text_length']} characters")
print(f"Number of pages: {len(text_analysis['pages'])}")
# 2. Extract entities
entities = extract_entities(document)
print(f"\nFound {len(entities)} entity types:")
for entity_type, entity_list in entities.items():
print(f" {entity_type}: {len(entity_list)} instances")
for entity in entity_list[:3]: # Show first 3
print(f" - {entity['text']} (confidence: {entity['confidence']:.2f})")
# 3. Extract tables
tables = extract_tables(document)
print(f"\nFound {len(tables)} tables:")
for table in tables:
print(f" Table on page {table['page']}: {len(table['rows'])} rows")
# 4. Extract form fields
form_fields = extract_form_fields(document)
print(f"\nFound {len(form_fields)} form fields:")
for field_name, field_info in form_fields.items():
print(f" {field_name}: {field_info['value']}")
if __name__ == "__main__":
complete_document_processing_example()This comprehensive guide covers all aspects of document processing with Google Cloud Document AI, from basic operations to advanced analysis and error handling.
Install with Tessl CLI
npx tessl i tessl/pypi-google-cloud-documentai