Google Cloud Document AI client library for extracting structured information from documents using machine learning
—
This guide covers comprehensive processor lifecycle management including creation, configuration, deployment, training, and monitoring of document processors.
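Document processors follow this lifecycle: create the processor, enable it, train, evaluate, and deploy processor versions (choosing one as the default), then disable and finally delete the processor when it is no longer needed. The snippets below cover each of these steps.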
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ListProcessorsRequest
def list_processors(project_id: str, location: str) -> list["Processor"]:
    """
    List all processors in a project location.

    Args:
        project_id: Google Cloud project ID
        location: Processor location (e.g., 'us', 'eu')

    Returns:
        list[Processor]: Every processor under the location, collected
            across all result pages
    """
    client = DocumentProcessorServiceClient()

    # Build parent path
    parent = client.common_location_path(project_id, location)

    # Create request
    request = ListProcessorsRequest(parent=parent)

    # list_processors returns a pager. Iterating the pager itself follows the
    # next-page tokens; reading `.processors` off it only exposes the first
    # page and silently truncates larger result sets.
    return list(client.list_processors(request=request))
def display_processor_info(processors: list["Processor"]) -> None:
    """
    Print a human-readable summary of each processor.

    Args:
        processors: List of processor objects; each is read for its
            display name, resource name, type, state, default version
            and creation time
    """
    separator = "-" * 80

    print(f"Found {len(processors)} processors:")
    print(separator)

    for item in processors:
        # The processor ID is the last segment of the full resource name.
        print(f"Name: {item.display_name}")
        print(f"ID: {item.name.split('/')[-1]}")
        print(f"Type: {item.type_}")
        print(f"State: {item.state}")
        print(f"Default Version: {item.default_processor_version}")
        print(f"Created: {item.create_time}")
        print(separator)


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import FetchProcessorTypesRequest
def fetch_processor_types(project_id: str, location: str) -> list["ProcessorType"]:
    """
    Fetch the processor types offered in a location.

    Args:
        project_id: Google Cloud project ID
        location: Processor location

    Returns:
        list[ProcessorType]: Available processor types
    """
    client = DocumentProcessorServiceClient()

    # The request is scoped to the project/location parent path.
    request = FetchProcessorTypesRequest(
        parent=client.common_location_path(project_id, location)
    )
    result = client.fetch_processor_types(request=request)

    # Copy the response's repeated field into a plain list for the caller.
    return list(result.processor_types)


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ListProcessorTypesRequest
def list_processor_types(project_id: str, location: str) -> list["ProcessorType"]:
    """
    List available processor types for creation.

    Args:
        project_id: Google Cloud project ID
        location: Processor location

    Returns:
        list[ProcessorType]: Every available processor type, collected
            across all result pages
    """
    client = DocumentProcessorServiceClient()

    # Build parent path
    parent = client.common_location_path(project_id, location)

    # Create request
    request = ListProcessorTypesRequest(parent=parent)

    # list_processor_types returns a pager. Iterate the pager itself so that
    # follow-up pages are fetched; `.processor_types` on the pager would only
    # yield the first page.
    return list(client.list_processor_types(request=request))
def display_processor_types(processor_types: list["ProcessorType"]) -> None:
    """
    Print the available processor types, grouped by category.

    Args:
        processor_types: List of ProcessorType objects
    """
    print(f"Available processor types ({len(processor_types)}):")
    print("-" * 60)

    # Bucket the types by category, keeping first-seen category order.
    grouped = {}
    for entry in processor_types:
        grouped.setdefault(entry.category, []).append(entry)

    for category, members in grouped.items():
        print(f"\n{category}:")
        for entry in members:
            print(f" - {entry.display_name}")
            print(f" Type: {entry.type_}")
            if entry.allow_creation:
                print(" ✓ Available for creation")
            print()


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import GetProcessorRequest
def get_processor(project_id: str, location: str, processor_id: str) -> "Processor":
    """
    Look up a single processor by its ID.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID

    Returns:
        Processor: Processor details
    """
    client = DocumentProcessorServiceClient()

    # The request addresses the processor by its full resource name.
    resource_name = client.processor_path(project_id, location, processor_id)
    return client.get_processor(request=GetProcessorRequest(name=resource_name))


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import CreateProcessorRequest, Processor
def create_processor(
    project_id: str,
    location: str,
    display_name: str,
    processor_type: str
) -> "Processor":
    """
    Create a document processor and report its name and ID.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        display_name: Human-readable name for the processor
        processor_type: Type of processor to create (e.g., 'OCR_PROCESSOR')

    Returns:
        Processor: Created processor instance
    """
    client = DocumentProcessorServiceClient()

    # Describe the new processor and the location it should live under.
    request = CreateProcessorRequest(
        parent=client.common_location_path(project_id, location),
        processor=Processor(display_name=display_name, type_=processor_type),
    )

    result = client.create_processor(request=request)

    # The processor ID is the last segment of the returned resource name.
    print(f"Created processor: {result.display_name}")
    print(f"Processor ID: {result.name.split('/')[-1]}")
    return result
def create_common_processors(project_id: str, location: str) -> dict[str, "Processor"]:
    """
    Create a fixed set of commonly used processors on a best-effort basis.

    A failure to create one processor is printed and does not stop the
    remaining ones from being attempted.

    Args:
        project_id: Google Cloud project ID
        location: Processor location

    Returns:
        dict[str, Processor]: Successfully created processors keyed by
            processor type
    """
    # (processor type, display name) pairs to create, in order.
    wanted = (
        ("OCR_PROCESSOR", "General OCR Processor"),
        ("FORM_PARSER_PROCESSOR", "Form Parser"),
        ("INVOICE_PROCESSOR", "Invoice Processor"),
        ("RECEIPT_PROCESSOR", "Receipt Processor"),
    )

    created = {}
    for type_name, label in wanted:
        try:
            created[type_name] = create_processor(
                project_id=project_id,
                location=location,
                display_name=label,
                processor_type=type_name,
            )
        except Exception as e:
            print(f"Failed to create {type_name}: {e}")
    return created


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import (
EnableProcessorRequest,
DisableProcessorRequest
)
def enable_processor(project_id: str, location: str, processor_id: str) -> "EnableProcessorResponse":
    """
    Enable a disabled processor and block until the operation finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID to enable

    Returns:
        EnableProcessorResponse: Operation response
    """
    client = DocumentProcessorServiceClient()
    request = EnableProcessorRequest(
        name=client.processor_path(project_id, location, processor_id)
    )

    # Enabling is a long-running operation: start it, then wait on it.
    pending = client.enable_processor(request=request)
    print(f"Enabling processor {processor_id}...")

    outcome = pending.result()
    print(f"Processor {processor_id} enabled successfully")
    return outcome
def disable_processor(project_id: str, location: str, processor_id: str) -> "DisableProcessorResponse":
    """
    Disable an active processor and block until the operation finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID to disable

    Returns:
        DisableProcessorResponse: Operation response
    """
    client = DocumentProcessorServiceClient()
    request = DisableProcessorRequest(
        name=client.processor_path(project_id, location, processor_id)
    )

    # Disabling is a long-running operation: start it, then wait on it.
    pending = client.disable_processor(request=request)
    print(f"Disabling processor {processor_id}...")

    outcome = pending.result()
    print(f"Processor {processor_id} disabled successfully")
    return outcome


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import DeleteProcessorRequest
def delete_processor(project_id: str, location: str, processor_id: str) -> None:
    """
    Delete a processor permanently.

    The processor is looked up first and, if it is currently enabled, it is
    disabled before the delete request is sent. A failure of that pre-check
    is reported as a warning and the delete is still attempted.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID to delete

    Note:
        This operation is irreversible. Ensure the processor is disabled first.
    """
    # Imported locally so this snippet does not depend on another section's
    # imports for the State enum.
    from google.cloud.documentai.types import Processor

    client = DocumentProcessorServiceClient()

    # Build processor name
    name = client.processor_path(project_id, location, processor_id)

    # First, ensure processor is disabled
    try:
        processor = get_processor(project_id, location, processor_id)
        # `state` is a Processor.State enum member, not a string; comparing it
        # against the literal "ENABLED" is always False and would skip the
        # disable step entirely.
        if processor.state == Processor.State.ENABLED:
            print("Processor is enabled. Disabling first...")
            disable_processor(project_id, location, processor_id)
    except Exception as e:
        # Best effort: the delete below is still attempted.
        print(f"Warning: Could not check processor state: {e}")

    # Create delete request
    request = DeleteProcessorRequest(name=name)

    # Delete processor (this is a long-running operation)
    operation = client.delete_processor(request=request)
    print(f"Deleting processor {processor_id}...")

    # Wait for operation to complete
    operation.result()
    print(f"Processor {processor_id} deleted successfully")


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ListProcessorVersionsRequest
def list_processor_versions(
    project_id: str,
    location: str,
    processor_id: str
) -> list["ProcessorVersion"]:
    """
    List all versions of a processor.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID

    Returns:
        list[ProcessorVersion]: Every version of the processor, collected
            across all result pages
    """
    client = DocumentProcessorServiceClient()

    # Build processor path as parent
    parent = client.processor_path(project_id, location, processor_id)

    # Create request
    request = ListProcessorVersionsRequest(parent=parent)

    # list_processor_versions returns a pager. Iterate the pager itself so
    # later pages are fetched; `.processor_versions` on the pager would only
    # yield the first page.
    return list(client.list_processor_versions(request=request))
def display_processor_versions(versions: list["ProcessorVersion"]) -> None:
    """
    Print a human-readable summary of each processor version.

    Model type and latest evaluation are printed only when they are set.

    Args:
        versions: List of ProcessorVersion objects
    """
    separator = "-" * 70

    print(f"Found {len(versions)} processor versions:")
    print(separator)

    for item in versions:
        # The version ID is the last segment of the full resource name.
        short_id = item.name.split('/')[-1]
        print(f"Version ID: {short_id}")
        print(f"Display Name: {item.display_name}")
        print(f"State: {item.state}")
        print(f"Created: {item.create_time}")

        if item.model_type:
            print(f"Model Type: {item.model_type}")
        if item.latest_evaluation:
            print(f"Latest Evaluation: {item.latest_evaluation}")

        print(separator)


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import DeployProcessorVersionRequest
def deploy_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str
) -> "DeployProcessorVersionResponse":
    """
    Deploy a processor version for serving and wait for it to finish.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID to deploy

    Returns:
        DeployProcessorVersionResponse: Deployment response
    """
    client = DocumentProcessorServiceClient()
    version_name = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # Deployment is a long-running operation: start it, then wait on it.
    pending = client.deploy_processor_version(
        request=DeployProcessorVersionRequest(name=version_name)
    )
    print(f"Deploying processor version {version_id}...")

    outcome = pending.result()
    print(f"Processor version {version_id} deployed successfully")
    return outcome
def undeploy_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str
) -> "UndeployProcessorVersionResponse":
    """
    Undeploy a processor version from serving.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID to undeploy

    Returns:
        UndeployProcessorVersionResponse: Undeploy response
    """
    # The section imports only DeployProcessorVersionRequest, so the undeploy
    # request type is imported here to avoid a NameError at call time.
    from google.cloud.documentai.types import UndeployProcessorVersionRequest

    client = DocumentProcessorServiceClient()

    # Build processor version name
    name = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # Create request
    request = UndeployProcessorVersionRequest(name=name)

    # Undeploy version (this is a long-running operation)
    operation = client.undeploy_processor_version(request=request)
    print(f"Undeploying processor version {version_id}...")

    # Wait for operation to complete
    response = operation.result()
    print(f"Processor version {version_id} undeployed successfully")
    return response


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import SetDefaultProcessorVersionRequest
def set_default_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str
) -> "SetDefaultProcessorVersionResponse":
    """
    Make one version the processor's default and wait for completion.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID to set as default

    Returns:
        SetDefaultProcessorVersionResponse: Response with updated processor
    """
    client = DocumentProcessorServiceClient()

    # The request names both the processor and the version to promote.
    request = SetDefaultProcessorVersionRequest(
        processor=client.processor_path(project_id, location, processor_id),
        default_processor_version=client.processor_version_path(
            project_id, location, processor_id, version_id
        ),
    )

    # Changing the default is a long-running operation: start it, then wait.
    pending = client.set_default_processor_version(request=request)
    print(f"Setting default version to {version_id}...")

    outcome = pending.result()
    print(f"Default version set to {version_id} successfully")
    return outcome


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import (
TrainProcessorVersionRequest,
DocumentSchema
)
def train_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_display_name: str,
    training_dataset: str,
    test_dataset: "str | None" = None,
    document_schema: "DocumentSchema | None" = None
) -> "Operation":
    """
    Start training a new version of a custom processor.

    The training request is submitted and the long-running operation handle
    is returned immediately; this function does not wait for training to
    finish.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID to train
        version_display_name: Display name for new version
        training_dataset: GCS URI prefix of the training documents
        test_dataset: Optional GCS URI prefix of the test documents
        document_schema: Optional document schema for training

    Returns:
        Operation: The long-running operation returned by the client. Call
            ``.result()`` on it to block for the final
            TrainProcessorVersionResponse, or pass
            ``operation.operation.name`` to a status-polling helper.
    """
    client = DocumentProcessorServiceClient()

    # Build processor path as parent
    parent = client.processor_path(project_id, location, processor_id)

    # Create processor version configuration
    processor_version = {
        "display_name": version_display_name
    }

    # Add document schema if provided
    if document_schema:
        processor_version["document_schema"] = document_schema

    # Create training input configuration
    input_data = {
        "training_documents": {
            "gcs_prefix": {"gcs_uri_prefix": training_dataset}
        }
    }

    # Add test dataset if provided
    if test_dataset:
        input_data["test_documents"] = {
            "gcs_prefix": {"gcs_uri_prefix": test_dataset}
        }

    # Create request
    request = TrainProcessorVersionRequest(
        parent=parent,
        processor_version=processor_version,
        input_data=input_data
    )

    # Start training (this is a long-running operation)
    operation = client.train_processor_version(request=request)
    print(f"Starting training for processor version: {version_display_name}")
    print("This operation may take several hours to complete...")

    # For production, you'd typically not wait for completion here
    # Instead, you'd check the operation status periodically
    print(f"Training operation name: {operation.operation.name}")
    return operation
def check_training_progress(operation_name: str) -> dict:
    """
    Check the progress of a training operation.

    Args:
        operation_name: Name of the training operation

    Returns:
        dict: Operation status with the keys ``name``, ``done``,
            ``metadata``, ``result`` and ``error``. ``metadata`` is None when
            the operation carries none; ``result`` and ``error`` stay None
            until the operation is done, after which exactly one of them is
            populated.
    """
    # operations_v1.OperationsClient is constructed from a channel and does
    # not accept a `credentials=` keyword, so reuse the operations client the
    # Document AI transport already exposes.
    # NOTE(review): this uses the client's default endpoint; operations in a
    # non-default location may need a regional api_endpoint - confirm.
    client = DocumentProcessorServiceClient()
    operation = client.transport.operations_client.get_operation(operation_name)

    status_info = {
        "name": operation.name,
        "done": operation.done,
        "metadata": None,
        "result": None,
        "error": None
    }

    # Use field presence rather than truthiness: these are message-typed
    # fields, so the attribute is an object even when it was never set.
    if operation.HasField("metadata"):
        status_info["metadata"] = operation.metadata

    if operation.done:
        if operation.HasField("error"):
            status_info["error"] = operation.error
        else:
            status_info["result"] = operation.response

    return status_info


from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import (
EvaluateProcessorVersionRequest,
EvaluationReference
)
def evaluate_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str,
    evaluation_documents: str
) -> "EvaluateProcessorVersionResponse":
    """
    Run an evaluation of a processor version and wait for it to finish.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID to evaluate
        evaluation_documents: GCS path to evaluation documents

    Returns:
        EvaluateProcessorVersionResponse: Evaluation response
    """
    client = DocumentProcessorServiceClient()

    # The evaluation set is given as a GCS URI prefix.
    request = EvaluateProcessorVersionRequest(
        processor_version=client.processor_version_path(
            project_id, location, processor_id, version_id
        ),
        evaluation_documents={
            "gcs_prefix": {"gcs_uri_prefix": evaluation_documents}
        },
    )

    # Evaluation is a long-running operation: start it, then wait on it.
    pending = client.evaluate_processor_version(request=request)
    print(f"Starting evaluation for processor version {version_id}...")

    outcome = pending.result()
    print("Evaluation completed successfully")
    return outcome
def list_evaluations(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str
) -> list["Evaluation"]:
    """
    List all evaluations for a processor version.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID

    Returns:
        list[Evaluation]: Every evaluation of the version, collected across
            all result pages
    """
    # The section's imports do not include the list request type, so it is
    # imported here to avoid a NameError at call time.
    from google.cloud.documentai.types import ListEvaluationsRequest

    client = DocumentProcessorServiceClient()

    # Build processor version path as parent
    parent = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # Create request
    request = ListEvaluationsRequest(parent=parent)

    # list_evaluations returns a pager. Iterate the pager itself so later
    # pages are fetched; `.evaluations` on the pager would only yield the
    # first page.
    return list(client.list_evaluations(request=request))
def get_evaluation_details(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str,
    evaluation_id: str
) -> "Evaluation":
    """
    Get detailed evaluation results.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID
        evaluation_id: Evaluation ID

    Returns:
        Evaluation: Detailed evaluation results
    """
    # The section's imports do not include the get request type, so it is
    # imported here to avoid a NameError at call time.
    from google.cloud.documentai.types import GetEvaluationRequest

    client = DocumentProcessorServiceClient()

    # Build evaluation name
    name = client.evaluation_path(
        project_id, location, processor_id, version_id, evaluation_id
    )

    # Create request
    request = GetEvaluationRequest(name=name)

    # Get evaluation
    evaluation = client.get_evaluation(request=request)
    return evaluation


def complete_processor_management_example():
    """
    Complete example demonstrating processor lifecycle management.

    Lists existing processors, creates an invoice processor, enables it,
    lists its versions and prints its state and default version. The project
    ID and location are hard-coded placeholders.
    """
    project_id = "my-project"
    location = "us"

    # Each helper below builds its own client, so none is created here.

    # 1. List existing processors
    print("=== LISTING PROCESSORS ===")
    processors = list_processors(project_id, location)
    display_processor_info(processors)

    # 2. Create a new processor if needed
    print("\n=== CREATING PROCESSOR ===")
    processor = create_processor(
        project_id=project_id,
        location=location,
        display_name="My Custom Invoice Processor",
        processor_type="INVOICE_PROCESSOR"
    )
    # The processor ID is the last segment of the resource name.
    processor_id = processor.name.split('/')[-1]

    # 3. Enable the processor
    print("\n=== ENABLING PROCESSOR ===")
    enable_processor(project_id, location, processor_id)

    # 4. List processor versions
    print("\n=== LISTING VERSIONS ===")
    versions = list_processor_versions(project_id, location, processor_id)
    display_processor_versions(versions)

    # 5. Get processor details
    print("\n=== PROCESSOR DETAILS ===")
    processor_details = get_processor(project_id, location, processor_id)
    print(f"Processor State: {processor_details.state}")
    print(f"Default Version: {processor_details.default_processor_version}")

    # 6. Evaluate processor (if evaluation data available)
    # evaluation_gcs_path = "gs://my-bucket/evaluation-docs/"
    # evaluation = evaluate_processor_version(
    #     project_id, location, processor_id, version_id, evaluation_gcs_path
    # )

    print("\nProcessor management example completed!")
if __name__ == "__main__":
    complete_processor_management_example()

This comprehensive guide covers all aspects of processor management in Google Cloud Document AI, from basic operations to advanced training and evaluation workflows.
Install with Tessl CLI
npx tessl i tessl/pypi-google-cloud-documentai