Google Cloud Document AI client library for extracting structured information from documents using machine learning
—
This guide covers beta features available in the v1beta3 API of Google Cloud Document AI, including dataset management, enhanced document processing, and experimental capabilities.
⚠️ Beta Notice: These features are in beta and may change or be removed in future versions. Use with caution in production environments.
The v1beta3 API includes all v1 functionality plus additional experimental features:
# V1 (Stable) - Production ready
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import Document, ProcessRequest
# V1beta3 (Beta) - Includes experimental features
from google.cloud.documentai_v1beta3 import DocumentProcessorServiceClient as DocumentProcessorServiceClientBeta
from google.cloud.documentai_v1beta3 import DocumentServiceClient # Beta-only service
from google.cloud.documentai_v1beta3.types import Dataset, DatasetSchema  # Beta-only types

# Beta document processing (enhanced features)
from google.cloud.documentai_v1beta3 import (
DocumentProcessorServiceClient,
DocumentProcessorServiceAsyncClient
)
# Beta dataset management (exclusive to v1beta3)
from google.cloud.documentai_v1beta3 import (
DocumentServiceClient,
DocumentServiceAsyncClient
)
# Beta-specific types
from google.cloud.documentai_v1beta3.types import (
# Dataset types
Dataset,
DatasetSchema,
DocumentMetadata,
DocumentId,
BatchDatasetDocuments,
DocumentPageRange,
# Enhanced document types
RevisionRef,
# Schema enhancements
EntityTypeMetadata,
FieldExtractionMetadata,
PropertyMetadata,
SummaryOptions,
# Additional beta request/response types
ImportProcessorVersionRequest,
ImportProcessorVersionResponse,
ImportProcessorVersionMetadata,
BatchDeleteDocumentsRequest,
BatchDeleteDocumentsResponse,
BatchDeleteDocumentsMetadata,
UpdateDatasetRequest,
UpdateDatasetSchemaRequest,
GetDatasetSchemaRequest,
GetDocumentRequest,
GetDocumentResponse,
ImportDocumentsRequest,
ImportDocumentsResponse,
ImportDocumentsMetadata,
UpdateDatasetOperationMetadata,
# Enums
DatasetSplitType,
DocumentLabelingState
)

The DocumentServiceClient is exclusive to v1beta3 and provides dataset management capabilities for custom processor training.
from google.cloud.documentai_v1beta3 import DocumentServiceClient
from google.cloud.documentai_v1beta3.types import (
Dataset,
DatasetSchema,
UpdateDatasetRequest,
ImportDocumentsRequest,
GetDocumentRequest,
ListDocumentsRequest,
BatchDeleteDocumentsRequest
)
class DocumentServiceClient:
    """Dataset and document management service (available in v1beta3 only).

    Manages the training dataset attached to a processor: dataset
    configuration, document import/listing/deletion, and the dataset
    schema. The method bodies here are interface stubs for documentation
    purposes; the generated GAPIC client supplies the real implementations.
    """

    def update_dataset(
        self,
        request: UpdateDatasetRequest,
        **kwargs
    ) -> Dataset:
        """Apply a field-masked update to a dataset.

        Args:
            request: Update request carrying the dataset and field mask.

        Returns:
            Dataset: The updated dataset.
        """
        pass

    def import_documents(
        self,
        request: ImportDocumentsRequest,
        **kwargs
    ) -> "Operation":
        """Start importing documents into a dataset for training.

        Args:
            request: Import documents request.

        Returns:
            Operation: Long-running operation tracking the import.
        """
        pass

    def get_document(
        self,
        request: GetDocumentRequest,
        **kwargs
    ) -> "Document":
        """Fetch a single document's metadata and content from a dataset.

        Args:
            request: Request naming the document to fetch.

        Returns:
            Document: The requested document with metadata.
        """
        pass

    def list_documents(
        self,
        request: ListDocumentsRequest,
        **kwargs
    ) -> "ListDocumentsResponse":
        """List the documents stored in a dataset.

        Args:
            request: Request naming the parent dataset.

        Returns:
            ListDocumentsResponse: Paginated document listing.
        """
        pass

    def batch_delete_documents(
        self,
        request: BatchDeleteDocumentsRequest,
        **kwargs
    ) -> "Operation":
        """Start deletion of multiple documents from a dataset.

        Args:
            request: Request listing the document names to delete.

        Returns:
            Operation: Long-running operation tracking the deletion.
        """
        pass

    def get_dataset_schema(
        self,
        request: "GetDatasetSchemaRequest",
        **kwargs
    ) -> DatasetSchema:
        """Fetch the schema definition of a dataset.

        Args:
            request: Get-schema request.

        Returns:
            DatasetSchema: The dataset's schema definition.
        """
        pass

    def update_dataset_schema(
        self,
        request: "UpdateDatasetSchemaRequest",
        **kwargs
    ) -> DatasetSchema:
        """Replace or patch the schema definition of a dataset.

        Args:
            request: Update-schema request.

        Returns:
            DatasetSchema: The updated schema definition.
        """
        pass
# Example usage
client = DocumentServiceClient()

# List documents in a dataset; the dataset is a singleton child of the processor.
parent = "projects/my-project/locations/us/processors/abc123/dataset"
request = ListDocumentsRequest(parent=parent)
response = client.list_documents(request=request)

for document_metadata in response.document_metadata:
    print(f"Document: {document_metadata.document_id.document_id}")
    print(f"State: {document_metadata.labeling_state}")

from google.cloud.documentai_v1beta3.types import Dataset
class Dataset:
    """A collection of documents used to train a custom processor.

    Only ``display_name`` and ``description`` are populated by this
    constructor; the remaining fields documented by the API
    (``name``, ``document_count``, ``satisfies_pzs``, ``satisfies_pzi``)
    are managed by the service.
    """

    def __init__(
        self,
        display_name: str,
        description: str = ""
    ):
        """Create a dataset configuration.

        Args:
            display_name: Human-readable dataset name.
            description: Optional free-form description.
        """
        self.display_name = display_name
        self.description = description
# Example usage
def create_training_dataset_config(
    display_name: str,
    description: str
) -> Dataset:
    """Build a Dataset configuration for custom processor training.

    Args:
        display_name: Dataset name shown in the console.
        description: Dataset description.

    Returns:
        Dataset: Dataset configuration object.
    """
    return Dataset(
        display_name=display_name,
        description=description
    )

from google.cloud.documentai_v1beta3.types import DatasetSchema
class DatasetSchema:
    """Schema describing the entity types a dataset's documents carry.

    Attributes:
        display_name: Human-readable schema name.
        entity_types: Entity type definitions
            (``DocumentSchema.EntityType`` instances).
        description: Free-form schema description.
    """

    def __init__(
        self,
        display_name: str,
        entity_types: list["DocumentSchema.EntityType"],
        description: str = ""
    ):
        """Store the schema fields.

        Args:
            display_name: Schema name.
            entity_types: Entity type definitions for the schema.
            description: Optional description (empty by default).
        """
        self.description = description
        self.display_name = display_name
        self.entity_types = entity_types
def create_custom_schema(
    schema_name: str,
    entity_definitions: list[dict]
) -> DatasetSchema:
    """Build a DatasetSchema from plain-dict entity definitions.

    Args:
        schema_name: Name for the schema.
        entity_definitions: Entity type definitions as dicts with "name",
            "display_name" and an optional "properties" list.

    Returns:
        DatasetSchema: Custom schema definition.
    """
    from google.cloud.documentai_v1beta3.types import DocumentSchema

    def _to_property(spec: dict) -> "DocumentSchema.EntityType.Property":
        # Missing occurrence_type falls back to OPTIONAL_ONCE.
        return DocumentSchema.EntityType.Property(
            name=spec["name"],
            display_name=spec["display_name"],
            value_type=spec["value_type"],
            occurrence_type=spec.get("occurrence_type", "OPTIONAL_ONCE")
        )

    entity_types = [
        DocumentSchema.EntityType(
            name=definition["name"],
            display_name=definition["display_name"],
            properties=[_to_property(p) for p in definition.get("properties", [])]
        )
        for definition in entity_definitions
    ]

    return DatasetSchema(
        display_name=schema_name,
        entity_types=entity_types,
        description=f"Custom schema: {schema_name}"
    )
# Example usage: two entity types for a contract-analysis schema
entity_definitions = [
    {
        "name": "contract_date",
        "display_name": "Contract Date",
        "properties": [{
            "name": "date_value",
            "display_name": "Date Value",
            "value_type": "date",
            "occurrence_type": "REQUIRED_ONCE",
        }],
    },
    {
        "name": "contract_parties",
        "display_name": "Contract Parties",
        "properties": [{
            "name": "party_name",
            "display_name": "Party Name",
            "value_type": "text",
            "occurrence_type": "REQUIRED_MULTIPLE",
        }],
    },
]
# Build the example schema from the definitions above.
schema = create_custom_schema("Contract Analysis Schema", entity_definitions)

from google.cloud.documentai_v1beta3.types import (
    DocumentMetadata,
    DocumentId,
    DocumentLabelingState
)
class DocumentMetadata:
    """Metadata describing a single document inside a dataset.

    In the real API this carries ``document_id`` (DocumentId),
    ``page_count`` (int), ``dataset_type`` (DatasetSplitType),
    ``labeling_state`` (DocumentLabelingState) and ``display_name`` (str).
    This placeholder defines no fields of its own.
    """
    pass
class DocumentId:
    """Identifier for a document within a dataset.

    In the real API this carries ``gcs_managed_doc_id`` (str),
    ``unmanaged_doc_id`` (str) and ``revision_ref`` (RevisionRef).
    This placeholder defines no fields of its own.
    """
    pass
class DocumentLabelingState:
    """Integer constants describing a document's labeling status.

    DOCUMENT_LABELING_STATE_UNSPECIFIED (0), DOCUMENT_LABELED (1),
    DOCUMENT_UNLABELED (2), DOCUMENT_AUTO_LABELED (3).
    """

    DOCUMENT_LABELING_STATE_UNSPECIFIED = 0
    DOCUMENT_LABELED = 1
    DOCUMENT_UNLABELED = 2
    DOCUMENT_AUTO_LABELED = 3
# Example usage
def list_dataset_documents(
    client: DocumentServiceClient,
    project_id: str,
    location: str,
    processor_id: str
) -> list[DocumentMetadata]:
    """List all documents in a processor's dataset.

    Args:
        client: DocumentServiceClient instance.
        project_id: Google Cloud project ID.
        location: Processor location.
        processor_id: Processor ID.

    Returns:
        list[DocumentMetadata]: Metadata for every listed document.
    """
    from google.cloud.documentai_v1beta3.types import ListDocumentsRequest

    # The dataset is a singleton child resource of the processor.
    parent = f"projects/{project_id}/locations/{location}/processors/{processor_id}/dataset"
    request = ListDocumentsRequest(parent=parent)
    response = client.list_documents(request=request)
    # Materialize the response iterable directly instead of an
    # element-by-element append loop.
    return list(response.document_metadata)
def filter_labeled_documents(
    document_metadata_list: list[DocumentMetadata]
) -> list[DocumentMetadata]:
    """Keep only documents whose labeling is complete (training-ready).

    Args:
        document_metadata_list: Document metadata entries to filter.

    Returns:
        list[DocumentMetadata]: Entries whose state is DOCUMENT_LABELED.
    """
    return [
        doc for doc in document_metadata_list
        if doc.labeling_state == DocumentLabelingState.DOCUMENT_LABELED
    ]

from google.cloud.documentai_v1beta3 import DocumentProcessorServiceClient
from google.cloud.documentai_v1beta3.types import ImportProcessorVersionRequest
def import_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    source_processor_version: str
) -> "Operation":
    """Import a processor version from another location or project (beta).

    Args:
        project_id: Target project ID.
        location: Target location.
        processor_id: Target processor ID.
        source_processor_version: Full resource name of the source
            processor version to import.

    Returns:
        Operation: Long-running operation tracking the import.
    """
    client = DocumentProcessorServiceClient()

    # Target processor that will receive the imported version.
    parent = client.processor_path(project_id, location, processor_id)

    request = ImportProcessorVersionRequest(
        parent=parent,
        processor_version_source=source_processor_version
    )

    # Start import operation (no f-prefix needed: no placeholders).
    operation = client.import_processor_version(request=request)
    print("Importing processor version...")
    print(f"Operation: {operation.operation.name}")
    return operation
# Example usage
operation = import_processor_version(
    project_id="target-project",
    location="us",
    processor_id="target-processor-id",
    source_processor_version="projects/source-project/locations/eu/processors/source-id/processorVersions/version-id"
)

# Monitor import progress
result = operation.result()  # Wait for completion
print(f"Import completed: {result}")

from google.cloud.documentai_v1beta3.types import EntityTypeMetadata
class EntityTypeMetadata:
    """Metadata attached to an entity type in a document schema (beta).

    Attributes:
        description (str): Description of the entity type.
        inactive (bool): Whether the entity type is inactive.
    """

    def __init__(self, description: str = "", inactive: bool = False):
        """Initialize entity type metadata.

        Args:
            description: Entity type description.
            inactive: Whether the entity type is inactive.
        """
        self.description = description
        self.inactive = inactive

from google.cloud.documentai_v1beta3.types import SummaryOptions
class SummaryOptions:
    """Options controlling document summarization output (beta).

    Attributes:
        length (SummaryOptions.Length): Summary length preference.
        format_ (SummaryOptions.Format): Summary format preference.
    """

    class Length:
        """Summary length options."""
        BRIEF = "BRIEF"
        MODERATE = "MODERATE"
        COMPREHENSIVE = "COMPREHENSIVE"

    class Format:
        """Summary format options."""
        PARAGRAPH = "PARAGRAPH"
        BULLETS = "BULLETS"
        STRUCTURED = "STRUCTURED"

    def __init__(
        self,
        length: str = "MODERATE",
        format_: str = "PARAGRAPH"
    ):
        """Initialize summary options.

        Args:
            length: Summary length preference.
            format_: Summary format preference (trailing underscore avoids
                shadowing the ``format`` builtin).
        """
        self.length = length
        self.format_ = format_

from google.cloud.documentai_v1beta3.types import DatasetSplitType
class DatasetSplitType:
    """Integer constants for the train/test split of dataset documents (beta).

    DATASET_SPLIT_TYPE_UNSPECIFIED (0), DATASET_SPLIT_TRAIN (1),
    DATASET_SPLIT_TEST (2), DATASET_SPLIT_UNASSIGNED (3).
    """

    DATASET_SPLIT_TYPE_UNSPECIFIED = 0
    DATASET_SPLIT_TRAIN = 1
    DATASET_SPLIT_TEST = 2
    DATASET_SPLIT_UNASSIGNED = 3
def categorize_documents_by_split(
    document_metadata_list: list[DocumentMetadata]
) -> dict[str, list[DocumentMetadata]]:
    """Group documents by their dataset split type.

    Args:
        document_metadata_list: Document metadata entries.

    Returns:
        dict: Keys "train", "test", "unassigned" and "unspecified", each
        mapping to the documents in that split.
    """
    # Dispatch table replaces the if/elif chain; unknown values fall
    # through to "unspecified".
    bucket_for = {
        DatasetSplitType.DATASET_SPLIT_TRAIN: "train",
        DatasetSplitType.DATASET_SPLIT_TEST: "test",
        DatasetSplitType.DATASET_SPLIT_UNASSIGNED: "unassigned",
    }
    categorized = {
        "train": [],
        "test": [],
        "unassigned": [],
        "unspecified": []
    }
    for doc in document_metadata_list:
        categorized[bucket_for.get(doc.dataset_type, "unspecified")].append(doc)
    return categorized

from google.cloud.documentai_v1beta3 import (
    DocumentServiceClient,
    DocumentProcessorServiceClient
)
from google.cloud.documentai_v1beta3.types import (
Dataset,
DatasetSchema,
ImportDocumentsRequest,
TrainProcessorVersionRequest,
ListDocumentsRequest
)
def complete_custom_training_workflow(
    project_id: str,
    location: str,
    processor_type: str = "CUSTOM_EXTRACTION_PROCESSOR"
):
    """Run the full beta workflow for training a custom processor.

    Creates a processor, defines a dataset schema, imports training
    documents from Cloud Storage, and starts a training run when at
    least 10 labeled documents are available.

    Args:
        project_id: Google Cloud project ID.
        location: Processing location.
        processor_type: Type of custom processor to train.

    Returns:
        dict: Summary with "processor_id", "dataset_parent",
        "total_documents" and "labeled_documents".
    """
    # Initialize clients
    doc_service = DocumentServiceClient()
    processor_service = DocumentProcessorServiceClient()

    print("=== CUSTOM PROCESSOR TRAINING WORKFLOW ===")

    # Step 1: Create processor for training
    print("1. Creating custom processor...")
    from google.cloud.documentai_v1beta3.types import CreateProcessorRequest, Processor

    parent = processor_service.common_location_path(project_id, location)
    processor = Processor(
        display_name="Custom Contract Processor",
        type_=processor_type  # "type_" because "type" is reserved in proto-plus
    )
    create_request = CreateProcessorRequest(
        parent=parent,
        processor=processor
    )
    created_processor = processor_service.create_processor(request=create_request)
    # The processor ID is the final segment of the resource name.
    processor_id = created_processor.name.split('/')[-1]
    print(f"Created processor: {processor_id}")

    # Step 2: Setup dataset schema
    print("2. Creating dataset schema...")
    entity_definitions = [
        {
            "name": "contract_date",
            "display_name": "Contract Date",
            "properties": [
                {
                    "name": "date_value",
                    "display_name": "Date Value",
                    "value_type": "date",
                    "occurrence_type": "REQUIRED_ONCE"
                }
            ]
        },
        {
            "name": "contract_value",
            "display_name": "Contract Value",
            "properties": [
                {
                    "name": "money_value",
                    "display_name": "Money Value",
                    "value_type": "money",
                    "occurrence_type": "REQUIRED_ONCE"
                }
            ]
        },
        {
            "name": "party_names",
            "display_name": "Party Names",
            "properties": [
                {
                    "name": "text_value",
                    "display_name": "Text Value",
                    "value_type": "text",
                    "occurrence_type": "REQUIRED_MULTIPLE"
                }
            ]
        }
    ]
    schema = create_custom_schema("Contract Schema", entity_definitions)

    # Step 3: Import training documents
    print("3. Importing training documents...")
    dataset_parent = f"projects/{project_id}/locations/{location}/processors/{processor_id}/dataset"

    # Configure document import from Cloud Storage
    batch_documents_input_config = {
        "gcs_prefix": {"gcs_uri_prefix": "gs://my-training-bucket/contracts/"}
    }
    import_request = ImportDocumentsRequest(
        dataset=dataset_parent,
        batch_documents_input_config=batch_documents_input_config
    )
    import_operation = doc_service.import_documents(request=import_request)
    print("Importing documents...")
    import_operation.result()  # Block until the import completes
    print("Documents imported successfully")

    # Step 4: Check dataset status
    print("4. Checking dataset status...")
    list_request = ListDocumentsRequest(parent=dataset_parent)
    list_response = doc_service.list_documents(request=list_request)

    total_docs = len(list_response.document_metadata)
    labeled_docs = len(filter_labeled_documents(list_response.document_metadata))
    print(f"Total documents: {total_docs}")
    print(f"Labeled documents: {labeled_docs}")

    # Step 5: Train processor version (if sufficient labeled data)
    if labeled_docs >= 10:  # Minimum for training
        print("5. Starting processor training...")
        processor_parent = processor_service.processor_path(
            project_id, location, processor_id
        )
        train_request = TrainProcessorVersionRequest(
            parent=processor_parent,
            processor_version={
                "display_name": "Contract Processor v1.0",
                "document_schema": schema
            },
            input_data={
                "training_documents": {
                    "gcs_prefix": {"gcs_uri_prefix": "gs://my-training-bucket/contracts/labeled/"}
                },
                "test_documents": {
                    "gcs_prefix": {"gcs_uri_prefix": "gs://my-training-bucket/contracts/test/"}
                }
            }
        )
        train_operation = processor_service.train_processor_version(request=train_request)
        print(f"Training started: {train_operation.operation.name}")
        print("Training typically takes several hours. Monitor progress using the operation name.")
    else:
        print(f"Insufficient labeled documents ({labeled_docs}). Need at least 10 for training.")

    return {
        "processor_id": processor_id,
        "dataset_parent": dataset_parent,
        "total_documents": total_docs,
        "labeled_documents": labeled_docs
    }

def monitor_beta_operations(
    project_id: str,
    location: str
) -> dict:
    """Collect the status of long-running beta operations in a location.

    Args:
        project_id: Google Cloud project ID.
        location: Processing location.

    Returns:
        dict: Operation summaries grouped by kind ("import_documents",
        "train_processor", "import_processor_version", "other").
    """
    from google.api_core import operations_v1
    from google.auth import default

    credentials, _ = default()
    operations_client = operations_v1.OperationsClient(credentials=credentials)

    # List all operations for the location
    name = f"projects/{project_id}/locations/{location}"

    beta_operations = {
        "import_documents": [],
        "train_processor": [],
        "import_processor_version": [],
        "other": []
    }
    for operation in operations_client.list_operations(name=name):
        op_info = {
            "name": operation.name,
            "done": operation.done,
            # NOTE(review): assumes a falsy `error` field means "no error" —
            # confirm against google.longrunning Operation semantics.
            "error": operation.error.message if operation.error else None
        }
        # Categorize by the RPC name embedded in the operation resource name.
        if "importDocuments" in operation.name:
            beta_operations["import_documents"].append(op_info)
        elif "trainProcessorVersion" in operation.name:
            beta_operations["train_processor"].append(op_info)
        elif "importProcessorVersion" in operation.name:
            beta_operations["import_processor_version"].append(op_info)
        else:
            beta_operations["other"].append(op_info)
    return beta_operations
def print_beta_status(project_id: str, location: str):
    """Pretty-print the status of beta operations for a location."""
    status = monitor_beta_operations(project_id, location)

    print("=== BETA OPERATIONS STATUS ===")
    for op_type, operations in status.items():
        heading = op_type.replace('_', ' ').title()
        print(f"\n{heading} Operations ({len(operations)}):")
        for op in operations:
            status_text = "✓ Complete" if op["done"] else "⏳ Running"
            error_text = f" (Error: {op['error']})" if op["error"] else ""
            short_name = op['name'].split('/')[-1]
            print(f" - {short_name}: {status_text}{error_text}")
# Example usage
if __name__ == "__main__":
    # Run custom training workflow
    result = complete_custom_training_workflow(
        project_id="my-project",
        location="us"
    )

    # Plain string: no placeholders, so no f-prefix needed.
    print("\nWorkflow completed:")
    print(f"Processor ID: {result['processor_id']}")
    print(f"Dataset: {result['dataset_parent']}")
    print(f"Documents: {result['labeled_documents']}/{result['total_documents']} labeled")

    # Monitor operations
    print_beta_status("my-project", "us")

# V1 API (stable) - continues to work
from google.cloud.documentai import DocumentProcessorServiceClient as V1Client
from google.cloud.documentai.types import ProcessRequest as V1ProcessRequest
# V1beta3 API (beta) - includes all v1 functionality + beta features
from google.cloud.documentai_v1beta3 import DocumentProcessorServiceClient as V1Beta3Client
from google.cloud.documentai_v1beta3.types import ProcessRequest as V1Beta3ProcessRequest
def migrate_to_beta_client():
    """Show how v1 code migrates to the v1beta3 client.

    The v1beta3 client is backward compatible with v1 API calls, so the
    same process request works against both; only v1beta3 exposes the
    beta-only RPCs.
    """
    # V1 approach (still works)
    v1_client = V1Client()

    # V1beta3 approach (recommended for new features)
    v1beta3_client = V1Beta3Client()

    # Both clients support the same core operations
    processor_name = "projects/my-project/locations/us/processors/abc123"

    # Same request works with both clients
    from google.cloud.documentai_v1beta3.types import RawDocument
    raw_doc = RawDocument(content=b"document content", mime_type="application/pdf")
    request = V1Beta3ProcessRequest(name=processor_name, raw_document=raw_doc)

    # Both calls work identically
    v1_result = v1_client.process_document(request=request)
    v1beta3_result = v1beta3_client.process_document(request=request)

    # Only the v1beta3 client exposes beta RPCs. Probe for the method
    # instead of invoking it with an empty placeholder request: the
    # original call raised TypeError (missing request), which the
    # `except AttributeError` handler could never catch.
    if hasattr(v1beta3_client, "import_processor_version"):
        print("Beta feature available")
    else:
        print("Beta feature not available in v1 client")
# Best practice: Use v1beta3 for new projects to access all features
def recommended_client_usage():
"""Recommended pattern for using v1beta3 client."""
# Use v1beta3 client for all operations
client = V1Beta3Client()
# Standard operations work normally
# Beta operations are available when needed
return clientThis comprehensive guide covers all beta features available in Google Cloud Document AI v1beta3, including dataset management, enhanced processing capabilities, and migration strategies from the stable v1 API.
Install with Tessl CLI
npx tessl i tessl/pypi-google-cloud-documentai