CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-google-cloud-documentai

Google Cloud Document AI client library for extracting structured information from documents using machine learning

Pending
Overview
Eval results
Files

docs/processor-management.md

Processor Management

This guide covers comprehensive processor lifecycle management including creation, configuration, deployment, training, and monitoring of document processors.

Processor Lifecycle Overview

Document processors follow this lifecycle:

  1. Create - Initialize a new processor instance
  2. Configure - Set up processor parameters and options
  3. Train - Train custom processors (for custom types)
  4. Deploy - Make processor versions available for processing
  5. Monitor - Track performance and usage
  6. Update - Deploy new versions and manage defaults
  7. Cleanup - Disable and delete unused processors

List and Discover Processors

List Available Processors

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ListProcessorsRequest

def list_processors(project_id: str, location: str) -> list["Processor"]:
    """
    List all processors in a project location.

    Args:
        project_id: Google Cloud project ID
        location: Processor location (e.g., 'us', 'eu')

    Returns:
        list[Processor]: Every processor in the location, across all result pages
    """
    client = DocumentProcessorServiceClient()

    # Build parent path
    parent = client.common_location_path(project_id, location)

    # Create request
    request = ListProcessorsRequest(parent=parent)

    # list_processors returns a pager. Iterating the pager itself fetches
    # follow-up pages on demand, whereas reading `.processors` from it only
    # exposes the items of the first page.
    pager = client.list_processors(request=request)

    return list(pager)

def display_processor_info(processors: list["Processor"]) -> None:
    """
    Print a human-readable summary of each processor.

    Args:
        processors: Processor objects to summarise
    """
    divider = "-" * 80

    print(f"Found {len(processors)} processors:")
    print(divider)

    for entry in processors:
        # The short ID is the final path segment of the resource name
        rows = (
            ("Name", entry.display_name),
            ("ID", entry.name.split('/')[-1]),
            ("Type", entry.type_),
            ("State", entry.state),
            ("Default Version", entry.default_processor_version),
            ("Created", entry.create_time),
        )
        for label, value in rows:
            print(f"{label}: {value}")
        print(divider)

Fetch Processor Types

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import FetchProcessorTypesRequest

def fetch_processor_types(project_id: str, location: str) -> list["ProcessorType"]:
    """
    Retrieve the processor types offered in a location.

    Args:
        project_id: Google Cloud project ID
        location: Processor location

    Returns:
        list[ProcessorType]: Available processor types
    """
    client = DocumentProcessorServiceClient()

    # The location resource path is the parent of the fetch call
    location_path = client.common_location_path(project_id, location)

    result = client.fetch_processor_types(
        request=FetchProcessorTypesRequest(parent=location_path)
    )

    # Copy the response field into a plain Python list
    return list(result.processor_types)

List Processor Types

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ListProcessorTypesRequest

def list_processor_types(project_id: str, location: str) -> list["ProcessorType"]:
    """
    List available processor types for creation.

    Args:
        project_id: Google Cloud project ID
        location: Processor location

    Returns:
        list[ProcessorType]: Every available processor type, across all result pages
    """
    client = DocumentProcessorServiceClient()

    # Build parent path
    parent = client.common_location_path(project_id, location)

    # Create request
    request = ListProcessorTypesRequest(parent=parent)

    # list_processor_types returns a pager. Iterate the pager itself so that
    # follow-up pages are fetched; `.processor_types` on the pager only holds
    # the first page.
    pager = client.list_processor_types(request=request)

    return list(pager)

def display_processor_types(processor_types: list["ProcessorType"]) -> None:
    """
    Print the available processor types, grouped by category.

    Args:
        processor_types: List of ProcessorType objects
    """
    print(f"Available processor types ({len(processor_types)}):")
    print("-" * 60)

    # Bucket the types per category; dict insertion order keeps the
    # categories in first-seen order.
    grouped = {}
    for entry in processor_types:
        grouped.setdefault(entry.category, []).append(entry)

    for label in grouped:
        print(f"\n{label}:")
        for entry in grouped[label]:
            print(f"  - {entry.display_name}")
            print(f"    Type: {entry.type_}")
            if entry.allow_creation:
                print("    ✓ Available for creation")
            print()

Get Specific Processor

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import GetProcessorRequest

def get_processor(project_id: str, location: str, processor_id: str) -> "Processor":
    """
    Look up a single processor by its ID.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID

    Returns:
        Processor: Processor details
    """
    client = DocumentProcessorServiceClient()

    # Full resource name of the processor to fetch
    resource_name = client.processor_path(project_id, location, processor_id)

    return client.get_processor(request=GetProcessorRequest(name=resource_name))

Create Processors

Create New Processor

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import CreateProcessorRequest, Processor

def create_processor(
    project_id: str,
    location: str,
    display_name: str,
    processor_type: str
) -> "Processor":
    """
    Create a document processor of the given type.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        display_name: Human-readable name for the processor
        processor_type: Type of processor to create (e.g., 'OCR_PROCESSOR')

    Returns:
        Processor: Created processor instance
    """
    client = DocumentProcessorServiceClient()

    # The new processor is created underneath the location resource
    location_path = client.common_location_path(project_id, location)

    request = CreateProcessorRequest(
        parent=location_path,
        processor=Processor(display_name=display_name, type_=processor_type),
    )

    result = client.create_processor(request=request)

    # The processor ID is the final path segment of the resource name
    new_id = result.name.split('/')[-1]
    print(f"Created processor: {result.display_name}")
    print(f"Processor ID: {new_id}")

    return result

def create_common_processors(project_id: str, location: str) -> dict[str, "Processor"]:
    """
    Create one processor for each commonly used processor type.

    Failures are reported on stdout and skipped, so the result may hold
    fewer entries than the number of attempted types.

    Args:
        project_id: Google Cloud project ID
        location: Processor location

    Returns:
        dict[str, Processor]: Created processors by type
    """
    # Processor type -> display name, in creation order
    wanted = {
        "OCR_PROCESSOR": "General OCR Processor",
        "FORM_PARSER_PROCESSOR": "Form Parser",
        "INVOICE_PROCESSOR": "Invoice Processor",
        "RECEIPT_PROCESSOR": "Receipt Processor",
    }

    created = {}
    for type_name, label in wanted.items():
        try:
            result = create_processor(
                project_id=project_id,
                location=location,
                display_name=label,
                processor_type=type_name,
            )
        except Exception as e:
            # Best effort: report the failure and continue with the next type
            print(f"Failed to create {type_name}: {e}")
        else:
            created[type_name] = result

    return created

Processor State Management

Enable/Disable Processors

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import (
    EnableProcessorRequest,
    DisableProcessorRequest
)

def enable_processor(project_id: str, location: str, processor_id: str) -> "EnableProcessorResponse":
    """
    Enable a processor and block until the operation finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID to enable

    Returns:
        EnableProcessorResponse: Operation response
    """
    client = DocumentProcessorServiceClient()

    resource_name = client.processor_path(project_id, location, processor_id)

    # Enabling is a long-running operation; the call returns a handle to it
    pending = client.enable_processor(
        request=EnableProcessorRequest(name=resource_name)
    )
    print(f"Enabling processor {processor_id}...")

    # Block until the operation has finished
    outcome = pending.result()
    print(f"Processor {processor_id} enabled successfully")

    return outcome

def disable_processor(project_id: str, location: str, processor_id: str) -> "DisableProcessorResponse":
    """
    Disable a processor and block until the operation finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID to disable

    Returns:
        DisableProcessorResponse: Operation response
    """
    client = DocumentProcessorServiceClient()

    resource_name = client.processor_path(project_id, location, processor_id)

    # Disabling is a long-running operation; the call returns a handle to it
    pending = client.disable_processor(
        request=DisableProcessorRequest(name=resource_name)
    )
    print(f"Disabling processor {processor_id}...")

    # Block until the operation has finished
    outcome = pending.result()
    print(f"Processor {processor_id} disabled successfully")

    return outcome

Delete Processors

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import DeleteProcessorRequest

def delete_processor(project_id: str, location: str, processor_id: str) -> None:
    """
    Delete a processor permanently.

    The processor's state is checked first and, if it is enabled, it is
    disabled before the delete request is sent. A failure during that
    pre-check is reported as a warning and the delete is still attempted.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID to delete

    Note:
        This operation is irreversible. Ensure the processor is disabled first.
    """
    client = DocumentProcessorServiceClient()

    # Build processor name
    name = client.processor_path(project_id, location, processor_id)

    # First, ensure processor is disabled
    try:
        processor = get_processor(project_id, location, processor_id)
        # `state` is an enum value, so it never compares equal to the bare
        # string "ENABLED"; compare on the enum member's name instead. The
        # getattr fallback covers a state that is not an enum member.
        state_name = getattr(processor.state, "name", processor.state)
        if state_name == "ENABLED":
            print("Processor is enabled. Disabling first...")
            disable_processor(project_id, location, processor_id)
    except Exception as e:
        # Best effort: the delete below is still attempted
        print(f"Warning: Could not check processor state: {e}")

    # Create delete request
    request = DeleteProcessorRequest(name=name)

    # Delete processor (this is a long-running operation)
    operation = client.delete_processor(request=request)

    print(f"Deleting processor {processor_id}...")

    # Wait for operation to complete
    operation.result()

    print(f"Processor {processor_id} deleted successfully")

Processor Version Management

List Processor Versions

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ListProcessorVersionsRequest

def list_processor_versions(
    project_id: str, 
    location: str, 
    processor_id: str
) -> list["ProcessorVersion"]:
    """
    List all versions of a processor.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID

    Returns:
        list[ProcessorVersion]: Every version of the processor, across all result pages
    """
    client = DocumentProcessorServiceClient()

    # Build processor path as parent
    parent = client.processor_path(project_id, location, processor_id)

    # Create request
    request = ListProcessorVersionsRequest(parent=parent)

    # list_processor_versions returns a pager. Iterate the pager itself so
    # that follow-up pages are fetched; `.processor_versions` on the pager
    # only holds the first page.
    pager = client.list_processor_versions(request=request)

    return list(pager)

def display_processor_versions(versions: list["ProcessorVersion"]) -> None:
    """
    Print a human-readable summary of each processor version.

    Args:
        versions: List of ProcessorVersion objects
    """
    divider = "-" * 70

    print(f"Found {len(versions)} processor versions:")
    print(divider)

    for item in versions:
        # The version ID is the final path segment of the resource name
        print(f"Version ID: {item.name.split('/')[-1]}")
        print(f"Display Name: {item.display_name}")
        print(f"State: {item.state}")
        print(f"Created: {item.create_time}")

        # These two lines are only shown when the field holds a truthy value
        optional_rows = (
            ("Model Type", item.model_type),
            ("Latest Evaluation", item.latest_evaluation),
        )
        for label, value in optional_rows:
            if value:
                print(f"{label}: {value}")

        print(divider)

Deploy Processor Versions

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import DeployProcessorVersionRequest

def deploy_processor_version(
    project_id: str,
    location: str, 
    processor_id: str,
    version_id: str
) -> "DeployProcessorVersionResponse":
    """
    Deploy a processor version and block until the operation finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID to deploy

    Returns:
        DeployProcessorVersionResponse: Deployment response
    """
    client = DocumentProcessorServiceClient()

    # Full resource name of the version to deploy
    version_name = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # Deployment is a long-running operation; the call returns a handle to it
    pending = client.deploy_processor_version(
        request=DeployProcessorVersionRequest(name=version_name)
    )
    print(f"Deploying processor version {version_id}...")

    # Block until the operation has finished
    outcome = pending.result()
    print(f"Processor version {version_id} deployed successfully")

    return outcome

def undeploy_processor_version(
    project_id: str,
    location: str,
    processor_id: str, 
    version_id: str
) -> "UndeployProcessorVersionResponse":
    """
    Undeploy a processor version from serving.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID to undeploy

    Returns:
        UndeployProcessorVersionResponse: Undeploy response
    """
    # The surrounding imports only bring in DeployProcessorVersionRequest, so
    # the undeploy request type is imported here to avoid a NameError.
    from google.cloud.documentai.types import UndeployProcessorVersionRequest

    client = DocumentProcessorServiceClient()

    # Build processor version name
    name = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # Create request
    request = UndeployProcessorVersionRequest(name=name)

    # Undeploy version (this is a long-running operation)
    operation = client.undeploy_processor_version(request=request)

    print(f"Undeploying processor version {version_id}...")

    # Wait for operation to complete
    response = operation.result()

    print(f"Processor version {version_id} undeployed successfully")
    return response

Set Default Processor Version

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import SetDefaultProcessorVersionRequest

def set_default_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str
) -> "SetDefaultProcessorVersionResponse":
    """
    Make a version the processor's default and block until the operation finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID to set as default

    Returns:
        SetDefaultProcessorVersionResponse: Response with updated processor
    """
    client = DocumentProcessorServiceClient()

    # The request needs both the processor and the version resource names
    target_processor = client.processor_path(project_id, location, processor_id)
    target_version = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # Changing the default is a long-running operation
    pending = client.set_default_processor_version(
        request=SetDefaultProcessorVersionRequest(
            processor=target_processor,
            default_processor_version=target_version,
        )
    )
    print(f"Setting default version to {version_id}...")

    # Block until the operation has finished
    outcome = pending.result()
    print(f"Default version set to {version_id} successfully")

    return outcome

Custom Processor Training

Train Processor Version

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import (
    TrainProcessorVersionRequest,
    DocumentSchema
)

def train_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_display_name: str,
    training_dataset: str,
    test_dataset: "str | None" = None,
    document_schema: "DocumentSchema | None" = None
) -> "Operation":
    """
    Start training a new version of a custom processor.

    The function does not wait for training to finish; it returns the
    long-running operation handle as soon as the request is accepted.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID to train
        version_display_name: Display name for new version
        training_dataset: GCS URI prefix of the training documents
        test_dataset: Optional GCS URI prefix of the test documents
        document_schema: Optional document schema for training

    Returns:
        Operation: The long-running training operation. Call ``.result()``
        on it to wait for the training response, or pass
        ``operation.operation.name`` to a status check later.
    """
    client = DocumentProcessorServiceClient()

    # Build processor path as parent
    parent = client.processor_path(project_id, location, processor_id)

    # Create processor version configuration
    processor_version = {
        "display_name": version_display_name
    }

    # Add document schema if provided
    # NOTE(review): TrainProcessorVersionRequest also has its own
    # `document_schema` field; confirm which one the service honours.
    if document_schema:
        processor_version["document_schema"] = document_schema

    # Create training input configuration
    input_data = {
        "training_documents": {
            "gcs_prefix": {"gcs_uri_prefix": training_dataset}
        }
    }

    # Add test dataset if provided
    if test_dataset:
        input_data["test_documents"] = {
            "gcs_prefix": {"gcs_uri_prefix": test_dataset}
        }

    # Create request
    request = TrainProcessorVersionRequest(
        parent=parent,
        processor_version=processor_version,
        input_data=input_data
    )

    # Start training (this is a long-running operation)
    operation = client.train_processor_version(request=request)

    print(f"Starting training for processor version: {version_display_name}")
    print("This operation may take several hours to complete...")

    # For production, you'd typically not wait for completion here
    # Instead, you'd check the operation status periodically
    print(f"Training operation name: {operation.operation.name}")

    return operation

def check_training_progress(operation_name: str) -> dict:
    """
    Check the progress of a training operation.

    Args:
        operation_name: Name of the training operation

    Returns:
        dict: Operation status with the keys ``name``, ``done``, ``metadata``,
        ``result`` and ``error``. ``metadata`` is filled when the operation
        carries metadata; once the operation is done, exactly one of
        ``error`` or ``result`` is filled.
    """
    # operations_v1.OperationsClient expects a gRPC channel, not credentials,
    # so it cannot be built from `google.auth.default()` directly. Reuse the
    # operations client that the Document AI transport already exposes.
    # NOTE(review): a default client is used here; operations in a non-default
    # region may need a client built with a regional api_endpoint - confirm.
    client = DocumentProcessorServiceClient()
    operations_client = client.transport.operations_client

    # Get operation status
    operation = operations_client.get_operation(operation_name)

    status_info = {
        "name": operation.name,
        "done": operation.done,
        "metadata": None,
        "result": None,
        "error": None
    }

    # Use HasField for presence checks: truthiness of a protobuf message
    # field is not a reliable "is it set" test.
    if operation.HasField("metadata"):
        status_info["metadata"] = operation.metadata

    if operation.done:
        if operation.HasField("error"):
            status_info["error"] = operation.error
        else:
            status_info["result"] = operation.response

    return status_info

Processor Evaluation

Evaluate Processor Performance

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import (
    EvaluateProcessorVersionRequest,
    EvaluationReference
)

def evaluate_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str,
    evaluation_documents: str
) -> "EvaluateProcessorVersionResponse":
    """
    Run an evaluation of a processor version and block until it finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID to evaluate
        evaluation_documents: GCS path to evaluation documents

    Returns:
        EvaluateProcessorVersionResponse: Evaluation response
    """
    client = DocumentProcessorServiceClient()

    # Full resource name of the version under evaluation
    version_name = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # The evaluation documents are addressed by a GCS URI prefix
    request = EvaluateProcessorVersionRequest(
        processor_version=version_name,
        evaluation_documents={
            "gcs_prefix": {"gcs_uri_prefix": evaluation_documents}
        },
    )

    # Evaluation is a long-running operation; the call returns a handle to it
    pending = client.evaluate_processor_version(request=request)
    print(f"Starting evaluation for processor version {version_id}...")

    # Block until the operation has finished
    outcome = pending.result()
    print("Evaluation completed successfully")

    return outcome

def list_evaluations(
    project_id: str,
    location: str, 
    processor_id: str,
    version_id: str
) -> list["Evaluation"]:
    """
    List all evaluations for a processor version.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID

    Returns:
        list[Evaluation]: Every evaluation of the version, across all result pages
    """
    # The surrounding imports do not include ListEvaluationsRequest, so it is
    # imported here to avoid a NameError.
    from google.cloud.documentai.types import ListEvaluationsRequest

    client = DocumentProcessorServiceClient()

    # Build processor version path as parent
    parent = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # Create request
    request = ListEvaluationsRequest(parent=parent)

    # list_evaluations returns a pager. Iterate the pager itself so that
    # follow-up pages are fetched; `.evaluations` on the pager only holds the
    # first page.
    pager = client.list_evaluations(request=request)

    return list(pager)

def get_evaluation_details(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str, 
    evaluation_id: str
) -> "Evaluation":
    """
    Get detailed evaluation results.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID
        evaluation_id: Evaluation ID

    Returns:
        Evaluation: Detailed evaluation results
    """
    # The surrounding imports do not include GetEvaluationRequest, so it is
    # imported here to avoid a NameError.
    from google.cloud.documentai.types import GetEvaluationRequest

    client = DocumentProcessorServiceClient()

    # Build evaluation name
    name = client.evaluation_path(
        project_id, location, processor_id, version_id, evaluation_id
    )

    # Create request
    request = GetEvaluationRequest(name=name)

    # Get evaluation
    evaluation = client.get_evaluation(request=request)

    return evaluation

Complete Processor Management Example

def complete_processor_management_example():
    """
    Complete example demonstrating processor lifecycle management.

    Lists the existing processors, creates an invoice processor, makes sure
    it is enabled, lists its versions and prints its details. The project ID
    and location are placeholder values to replace before running.
    """
    project_id = "my-project"
    location = "us"

    # 1. List existing processors
    print("=== LISTING PROCESSORS ===")
    processors = list_processors(project_id, location)
    display_processor_info(processors)

    # 2. Create a new processor if needed
    print("\n=== CREATING PROCESSOR ===")
    processor = create_processor(
        project_id=project_id,
        location=location,
        display_name="My Custom Invoice Processor",
        processor_type="INVOICE_PROCESSOR"
    )
    processor_id = processor.name.split('/')[-1]

    # 3. Enable the processor
    print("\n=== ENABLING PROCESSOR ===")
    # NOTE(review): a freshly created processor is presumably already enabled,
    # and enabling an enabled processor is expected to be rejected by the
    # service - confirm. Only send the request when it is actually needed.
    if getattr(processor.state, "name", None) != "ENABLED":
        enable_processor(project_id, location, processor_id)
    else:
        print(f"Processor {processor_id} is already enabled")

    # 4. List processor versions
    print("\n=== LISTING VERSIONS ===")
    versions = list_processor_versions(project_id, location, processor_id)
    display_processor_versions(versions)

    # 5. Get processor details
    print("\n=== PROCESSOR DETAILS ===")
    processor_details = get_processor(project_id, location, processor_id)
    print(f"Processor State: {processor_details.state}")
    print(f"Default Version: {processor_details.default_processor_version}")

    # 6. Evaluate processor (if evaluation data available)
    # evaluation_gcs_path = "gs://my-bucket/evaluation-docs/"
    # evaluation = evaluate_processor_version(
    #     project_id, location, processor_id, version_id, evaluation_gcs_path
    # )

    print("\nProcessor management example completed!")

# Run the example only when the file is executed as a script
if __name__ == "__main__":
    complete_processor_management_example()

This comprehensive guide covers all aspects of processor management in Google Cloud Document AI, from basic operations to advanced training and evaluation workflows.

Install with Tessl CLI

npx tessl i tessl/pypi-google-cloud-documentai

docs

batch-operations.md

beta-features.md

document-processing.md

document-types.md

index.md

processor-management.md

tile.json