# Beta Features (v1beta3)

This guide covers beta features available in the v1beta3 API of Google Cloud Document AI, including dataset management, enhanced document processing, and experimental capabilities.

**⚠️ Beta Notice**: These features are in beta and may change or be removed in future versions. Use with caution in production environments.

## API Versions Overview

### V1beta3 vs V1 API

The v1beta3 API includes all v1 functionality plus additional experimental features:
```python { .api }
# V1 (Stable) - Production ready
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import Document, ProcessRequest

# V1beta3 (Beta) - Includes experimental features
from google.cloud.documentai_v1beta3 import DocumentProcessorServiceClient as DocumentProcessorServiceClientBeta
from google.cloud.documentai_v1beta3 import DocumentServiceClient  # Beta-only service
from google.cloud.documentai_v1beta3.types import Dataset, DatasetSchema  # Beta-only types
```
22
23
### Import Patterns
24
25
```python { .api }
# Beta document processing (enhanced features)
from google.cloud.documentai_v1beta3 import (
    DocumentProcessorServiceClient,
    DocumentProcessorServiceAsyncClient,
)

# Beta dataset management (exclusive to v1beta3)
from google.cloud.documentai_v1beta3 import (
    DocumentServiceClient,
    DocumentServiceAsyncClient,
)

# Beta-specific types
from google.cloud.documentai_v1beta3.types import (
    # Dataset types
    Dataset,
    DatasetSchema,
    DocumentMetadata,
    DocumentId,
    BatchDatasetDocuments,
    DocumentPageRange,

    # Enhanced document types
    RevisionRef,

    # Schema enhancements
    EntityTypeMetadata,
    FieldExtractionMetadata,
    PropertyMetadata,
    SummaryOptions,

    # Additional beta request/response types
    ImportProcessorVersionRequest,
    ImportProcessorVersionResponse,
    ImportProcessorVersionMetadata,
    BatchDeleteDocumentsRequest,
    BatchDeleteDocumentsResponse,
    BatchDeleteDocumentsMetadata,
    UpdateDatasetRequest,
    UpdateDatasetSchemaRequest,
    GetDatasetSchemaRequest,
    GetDocumentRequest,
    GetDocumentResponse,
    ImportDocumentsRequest,
    ImportDocumentsResponse,
    ImportDocumentsMetadata,
    UpdateDatasetOperationMetadata,

    # Enums
    DatasetSplitType,
    DocumentLabelingState,
)
```

## Dataset Management

### DocumentServiceClient

The `DocumentServiceClient` is exclusive to v1beta3 and provides dataset management capabilities for custom processor training.

```python { .api }
from google.cloud.documentai_v1beta3 import DocumentServiceClient
from google.cloud.documentai_v1beta3.types import (
    Dataset,
    DatasetSchema,
    UpdateDatasetRequest,
    ImportDocumentsRequest,
    GetDocumentRequest,
    ListDocumentsRequest,
    BatchDeleteDocumentsRequest,
)


class DocumentServiceClient:
    """
    Service for managing datasets and documents for training custom processors.

    This service is only available in the v1beta3 API.
    """

    def update_dataset(
        self,
        request: UpdateDatasetRequest,
        **kwargs
    ) -> Dataset:
        """
        Update dataset configuration.

        Args:
            request: Update dataset request with dataset and field mask

        Returns:
            Dataset: Updated dataset object
        """
        pass

    def import_documents(
        self,
        request: ImportDocumentsRequest,
        **kwargs
    ) -> "Operation":
        """
        Import documents into a dataset for training.

        Args:
            request: Import documents request

        Returns:
            Operation: Long-running operation for the import process
        """
        pass

    def get_document(
        self,
        request: GetDocumentRequest,
        **kwargs
    ) -> "Document":
        """
        Get document metadata and content from a dataset.

        Args:
            request: Get document request with document name

        Returns:
            Document: Document object with metadata
        """
        pass

    def list_documents(
        self,
        request: ListDocumentsRequest,
        **kwargs
    ) -> "ListDocumentsResponse":
        """
        List documents in a dataset.

        Args:
            request: List documents request with parent dataset

        Returns:
            ListDocumentsResponse: Paginated list of documents
        """
        pass

    def batch_delete_documents(
        self,
        request: BatchDeleteDocumentsRequest,
        **kwargs
    ) -> "Operation":
        """
        Delete multiple documents from a dataset.

        Args:
            request: Batch delete request with document names

        Returns:
            Operation: Long-running operation for the deletion
        """
        pass

    def get_dataset_schema(
        self,
        request: "GetDatasetSchemaRequest",
        **kwargs
    ) -> DatasetSchema:
        """
        Get the dataset schema definition.

        Args:
            request: Get schema request

        Returns:
            DatasetSchema: Schema definition for the dataset
        """
        pass

    def update_dataset_schema(
        self,
        request: "UpdateDatasetSchemaRequest",
        **kwargs
    ) -> DatasetSchema:
        """
        Update the dataset schema definition.

        Args:
            request: Update schema request

        Returns:
            DatasetSchema: Updated schema definition
        """
        pass


# Example usage
client = DocumentServiceClient()

# List documents in a dataset
parent = "projects/my-project/locations/us/processors/abc123/dataset"
request = ListDocumentsRequest(parent=parent)
response = client.list_documents(request=request)

for document_metadata in response.document_metadata:
    print(f"Document: {document_metadata.document_id.document_id}")
    print(f"State: {document_metadata.labeling_state}")
```

### Dataset Types

#### Dataset

```python { .api }
from google.cloud.documentai_v1beta3.types import Dataset


class Dataset:
    """
    A dataset of documents for training custom processors.

    Attributes:
        name (str): Dataset resource name
        display_name (str): Human-readable name
        description (str): Dataset description
        document_count (int): Number of documents in dataset
        satisfies_pzs (bool): Reserved for future use
        satisfies_pzi (bool): Reserved for future use
    """

    def __init__(
        self,
        display_name: str,
        description: str = ""
    ):
        """
        Initialize dataset.

        Args:
            display_name: Human-readable dataset name
            description: Optional description
        """
        self.display_name = display_name
        self.description = description


# Example usage
def create_training_dataset_config(
    display_name: str,
    description: str
) -> Dataset:
    """
    Create dataset configuration for custom processor training.

    Args:
        display_name: Dataset name
        description: Dataset description

    Returns:
        Dataset: Dataset configuration
    """
    return Dataset(
        display_name=display_name,
        description=description
    )
```

#### DatasetSchema

```python { .api }
from google.cloud.documentai_v1beta3.types import DatasetSchema


class DatasetSchema:
    """
    Schema definition for a dataset, specifying entity types and structure.

    Attributes:
        name (str): Schema resource name
        display_name (str): Human-readable schema name
        description (str): Schema description
        entity_types (Sequence[DocumentSchema.EntityType]): Entity types in schema
    """

    def __init__(
        self,
        display_name: str,
        entity_types: list["DocumentSchema.EntityType"],
        description: str = ""
    ):
        """
        Initialize dataset schema.

        Args:
            display_name: Schema name
            entity_types: List of entity type definitions
            description: Optional description
        """
        self.display_name = display_name
        self.entity_types = entity_types
        self.description = description


def create_custom_schema(
    schema_name: str,
    entity_definitions: list[dict]
) -> DatasetSchema:
    """
    Create a custom dataset schema for training.

    Args:
        schema_name: Name for the schema
        entity_definitions: List of entity type definitions; each dict has
            "name", "display_name", and an optional "properties" list

    Returns:
        DatasetSchema: Custom schema definition
    """
    from google.cloud.documentai_v1beta3.types import DocumentSchema

    entity_types = []

    for entity_def in entity_definitions:
        properties = []

        for prop_def in entity_def.get("properties", []):
            prop = DocumentSchema.EntityType.Property(
                name=prop_def["name"],
                display_name=prop_def["display_name"],
                value_type=prop_def["value_type"],
                occurrence_type=prop_def.get("occurrence_type", "OPTIONAL_ONCE")
            )
            properties.append(prop)

        entity_type = DocumentSchema.EntityType(
            name=entity_def["name"],
            display_name=entity_def["display_name"],
            properties=properties
        )
        entity_types.append(entity_type)

    return DatasetSchema(
        display_name=schema_name,
        entity_types=entity_types,
        description=f"Custom schema: {schema_name}"
    )


# Example usage
entity_definitions = [
    {
        "name": "contract_date",
        "display_name": "Contract Date",
        "properties": [
            {
                "name": "date_value",
                "display_name": "Date Value",
                "value_type": "date",
                "occurrence_type": "REQUIRED_ONCE"
            }
        ]
    },
    {
        "name": "contract_parties",
        "display_name": "Contract Parties",
        "properties": [
            {
                "name": "party_name",
                "display_name": "Party Name",
                "value_type": "text",
                "occurrence_type": "REQUIRED_MULTIPLE"
            }
        ]
    }
]

schema = create_custom_schema("Contract Analysis Schema", entity_definitions)
```

### Document Management

#### DocumentMetadata

```python { .api }
from google.cloud.documentai_v1beta3.types import (
    DocumentMetadata,
    DocumentId,
    DocumentLabelingState,
)


class DocumentMetadata:
    """
    Metadata for documents in a dataset.

    Attributes:
        document_id (DocumentId): Document identifier
        page_count (int): Number of pages in document
        dataset_type (DatasetSplitType): Dataset split type (TRAIN, TEST, etc.)
        labeling_state (DocumentLabelingState): Document labeling status
        display_name (str): Human-readable document name
    """
    pass


class DocumentId:
    """
    Identifier for a document within a dataset.

    Attributes:
        gcs_managed_doc_id (str): Cloud Storage managed document ID
        unmanaged_doc_id (str): User-managed document ID
        revision_ref (RevisionRef): Reference to document revision
    """
    pass


class DocumentLabelingState:
    """
    Enum describing document labeling status.

    Values:
        DOCUMENT_LABELING_STATE_UNSPECIFIED: Unspecified state
        DOCUMENT_LABELED: Document is labeled
        DOCUMENT_UNLABELED: Document is not labeled
        DOCUMENT_AUTO_LABELED: Document is auto-labeled
    """
    DOCUMENT_LABELING_STATE_UNSPECIFIED = 0
    DOCUMENT_LABELED = 1
    DOCUMENT_UNLABELED = 2
    DOCUMENT_AUTO_LABELED = 3


# Example usage
def list_dataset_documents(
    client: "DocumentServiceClient",  # forward ref: defined in the DocumentServiceClient snippet above
    project_id: str,
    location: str,
    processor_id: str
) -> list[DocumentMetadata]:
    """
    List all documents in a dataset with metadata.

    Args:
        client: DocumentServiceClient instance
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID

    Returns:
        list[DocumentMetadata]: List of document metadata
    """
    from google.cloud.documentai_v1beta3.types import ListDocumentsRequest

    # Build dataset parent path
    parent = f"projects/{project_id}/locations/{location}/processors/{processor_id}/dataset"

    request = ListDocumentsRequest(parent=parent)
    response = client.list_documents(request=request)

    # Materialize the (possibly paginated) response into a plain list
    return list(response.document_metadata)


def filter_labeled_documents(
    document_metadata_list: list[DocumentMetadata]
) -> list[DocumentMetadata]:
    """
    Filter documents that are labeled and ready for training.

    Args:
        document_metadata_list: List of document metadata

    Returns:
        list[DocumentMetadata]: Filtered labeled documents
    """
    return [
        doc for doc in document_metadata_list
        if doc.labeling_state == DocumentLabelingState.DOCUMENT_LABELED
    ]
```

## Enhanced Document Processing

### Import Processor Version (Beta)

```python { .api }
from google.cloud.documentai_v1beta3 import DocumentProcessorServiceClient
from google.cloud.documentai_v1beta3.types import ImportProcessorVersionRequest


def import_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    source_processor_version: str
) -> "Operation":
    """
    Import a processor version from another location or project (beta feature).

    Args:
        project_id: Target project ID
        location: Target location
        processor_id: Target processor ID
        source_processor_version: Source processor version to import

    Returns:
        Operation: Long-running operation for the import
    """
    client = DocumentProcessorServiceClient()

    # Build parent processor path
    parent = client.processor_path(project_id, location, processor_id)

    # Create import request
    request = ImportProcessorVersionRequest(
        parent=parent,
        processor_version_source=source_processor_version
    )

    # Start import operation
    operation = client.import_processor_version(request=request)

    print("Importing processor version...")
    print(f"Operation: {operation.operation.name}")

    return operation


# Example usage
operation = import_processor_version(
    project_id="target-project",
    location="us",
    processor_id="target-processor-id",
    source_processor_version="projects/source-project/locations/eu/processors/source-id/processorVersions/version-id"
)

# Monitor import progress
result = operation.result()  # Wait for completion
print(f"Import completed: {result}")
```

### Enhanced Schema Types

#### EntityTypeMetadata

```python { .api }
from google.cloud.documentai_v1beta3.types import EntityTypeMetadata


class EntityTypeMetadata:
    """
    Metadata for entity types in document schema (beta feature).

    Attributes:
        inactive (bool): Whether entity type is inactive
        description (str): Description of the entity type
    """

    def __init__(self, description: str = "", inactive: bool = False):
        """
        Initialize entity type metadata.

        Args:
            description: Entity type description
            inactive: Whether entity type is inactive
        """
        self.description = description
        self.inactive = inactive
```

#### SummaryOptions

```python { .api }
from google.cloud.documentai_v1beta3.types import SummaryOptions


class SummaryOptions:
    """
    Options for document summarization (beta feature).

    Attributes:
        length (SummaryOptions.Length): Summary length preference
        format_ (SummaryOptions.Format): Summary format preference
    """

    class Length:
        """Summary length options."""
        BRIEF = "BRIEF"
        MODERATE = "MODERATE"
        COMPREHENSIVE = "COMPREHENSIVE"

    class Format:
        """Summary format options."""
        PARAGRAPH = "PARAGRAPH"
        BULLETS = "BULLETS"
        STRUCTURED = "STRUCTURED"

    def __init__(
        self,
        length: str = "MODERATE",
        format_: str = "PARAGRAPH"
    ):
        """
        Initialize summary options.

        Args:
            length: Summary length preference
            format_: Summary format preference
        """
        self.length = length
        self.format_ = format_
```

## Beta Enums and Constants

### DatasetSplitType

```python { .api }
from google.cloud.documentai_v1beta3.types import DatasetSplitType


class DatasetSplitType:
    """
    Enum for dataset split types used in training (beta feature).

    Values:
        DATASET_SPLIT_TYPE_UNSPECIFIED: Unspecified split type
        DATASET_SPLIT_TRAIN: Training dataset
        DATASET_SPLIT_TEST: Test dataset
        DATASET_SPLIT_UNASSIGNED: Unassigned documents
    """
    DATASET_SPLIT_TYPE_UNSPECIFIED = 0
    DATASET_SPLIT_TRAIN = 1
    DATASET_SPLIT_TEST = 2
    DATASET_SPLIT_UNASSIGNED = 3


def categorize_documents_by_split(
    document_metadata_list: list["DocumentMetadata"]  # forward ref: see DocumentMetadata snippet
) -> dict[str, list["DocumentMetadata"]]:
    """
    Categorize documents by their dataset split type.

    Args:
        document_metadata_list: List of document metadata

    Returns:
        dict: Documents organized by split type ("train", "test",
            "unassigned", "unspecified")
    """
    categorized = {
        "train": [],
        "test": [],
        "unassigned": [],
        "unspecified": []
    }

    for doc in document_metadata_list:
        if doc.dataset_type == DatasetSplitType.DATASET_SPLIT_TRAIN:
            categorized["train"].append(doc)
        elif doc.dataset_type == DatasetSplitType.DATASET_SPLIT_TEST:
            categorized["test"].append(doc)
        elif doc.dataset_type == DatasetSplitType.DATASET_SPLIT_UNASSIGNED:
            categorized["unassigned"].append(doc)
        else:
            categorized["unspecified"].append(doc)

    return categorized
```

## Complete Beta Feature Example

### Custom Processor Training Workflow

```python { .api }
from google.cloud.documentai_v1beta3 import (
    DocumentServiceClient,
    DocumentProcessorServiceClient,
)
from google.cloud.documentai_v1beta3.types import (
    Dataset,
    DatasetSchema,
    ImportDocumentsRequest,
    TrainProcessorVersionRequest,
    ListDocumentsRequest,
)


def complete_custom_training_workflow(
    project_id: str,
    location: str,
    processor_type: str = "CUSTOM_EXTRACTION_PROCESSOR"
):
    """
    Complete workflow for training a custom processor using beta features.

    Uses ``create_custom_schema`` and ``filter_labeled_documents`` defined
    earlier in this guide.

    Args:
        project_id: Google Cloud project ID
        location: Processing location
        processor_type: Type of custom processor to train

    Returns:
        dict: Summary with processor ID, dataset path, and document counts
    """

    # Initialize clients
    doc_service = DocumentServiceClient()
    processor_service = DocumentProcessorServiceClient()

    print("=== CUSTOM PROCESSOR TRAINING WORKFLOW ===")

    # Step 1: Create processor for training
    print("1. Creating custom processor...")

    from google.cloud.documentai_v1beta3.types import CreateProcessorRequest, Processor

    parent = processor_service.common_location_path(project_id, location)
    processor = Processor(
        display_name="Custom Contract Processor",
        type_=processor_type
    )

    create_request = CreateProcessorRequest(
        parent=parent,
        processor=processor
    )

    created_processor = processor_service.create_processor(request=create_request)
    processor_id = created_processor.name.split('/')[-1]

    print(f"Created processor: {processor_id}")

    # Step 2: Setup dataset schema
    print("2. Creating dataset schema...")

    entity_definitions = [
        {
            "name": "contract_date",
            "display_name": "Contract Date",
            "properties": [
                {
                    "name": "date_value",
                    "display_name": "Date Value",
                    "value_type": "date",
                    "occurrence_type": "REQUIRED_ONCE"
                }
            ]
        },
        {
            "name": "contract_value",
            "display_name": "Contract Value",
            "properties": [
                {
                    "name": "money_value",
                    "display_name": "Money Value",
                    "value_type": "money",
                    "occurrence_type": "REQUIRED_ONCE"
                }
            ]
        },
        {
            "name": "party_names",
            "display_name": "Party Names",
            "properties": [
                {
                    "name": "text_value",
                    "display_name": "Text Value",
                    "value_type": "text",
                    "occurrence_type": "REQUIRED_MULTIPLE"
                }
            ]
        }
    ]

    schema = create_custom_schema("Contract Schema", entity_definitions)

    # Step 3: Import training documents
    print("3. Importing training documents...")

    dataset_parent = f"projects/{project_id}/locations/{location}/processors/{processor_id}/dataset"

    # Configure document import from Cloud Storage
    batch_documents_input_config = {
        "gcs_prefix": {"gcs_uri_prefix": "gs://my-training-bucket/contracts/"}
    }

    import_request = ImportDocumentsRequest(
        dataset=dataset_parent,
        batch_documents_input_config=batch_documents_input_config
    )

    import_operation = doc_service.import_documents(request=import_request)

    print("Importing documents...")
    import_result = import_operation.result()  # Wait for completion
    print("Documents imported successfully")

    # Step 4: Check dataset status
    print("4. Checking dataset status...")

    list_request = ListDocumentsRequest(parent=dataset_parent)
    list_response = doc_service.list_documents(request=list_request)

    total_docs = len(list_response.document_metadata)
    labeled_docs = len(filter_labeled_documents(list_response.document_metadata))

    print(f"Total documents: {total_docs}")
    print(f"Labeled documents: {labeled_docs}")

    # Step 5: Train processor version (if sufficient labeled data)
    if labeled_docs >= 10:  # Minimum for training
        print("5. Starting processor training...")

        processor_parent = processor_service.processor_path(
            project_id, location, processor_id
        )

        train_request = TrainProcessorVersionRequest(
            parent=processor_parent,
            processor_version={
                "display_name": "Contract Processor v1.0",
                "document_schema": schema
            },
            input_data={
                "training_documents": {
                    "gcs_prefix": {"gcs_uri_prefix": "gs://my-training-bucket/contracts/labeled/"}
                },
                "test_documents": {
                    "gcs_prefix": {"gcs_uri_prefix": "gs://my-training-bucket/contracts/test/"}
                }
            }
        )

        train_operation = processor_service.train_processor_version(request=train_request)

        print(f"Training started: {train_operation.operation.name}")
        print("Training typically takes several hours. Monitor progress using the operation name.")

    else:
        print(f"Insufficient labeled documents ({labeled_docs}). Need at least 10 for training.")

    return {
        "processor_id": processor_id,
        "dataset_parent": dataset_parent,
        "total_documents": total_docs,
        "labeled_documents": labeled_docs
    }
```

### Beta Feature Monitoring

```python { .api }
def monitor_beta_operations(
    project_id: str,
    location: str
) -> dict:
    """
    Monitor various beta operations and provide status.

    Args:
        project_id: Google Cloud project ID
        location: Processing location

    Returns:
        dict: Status of beta operations, keyed by operation category
    """
    from google.api_core import operations_v1
    from google.auth import default

    credentials, _ = default()
    operations_client = operations_v1.OperationsClient(credentials=credentials)

    # List all operations for the location
    name = f"projects/{project_id}/locations/{location}"

    beta_operations = {
        "import_documents": [],
        "train_processor": [],
        "import_processor_version": [],
        "other": []
    }

    for operation in operations_client.list_operations(name=name):
        op_info = {
            "name": operation.name,
            "done": operation.done,
            "error": operation.error.message if operation.error else None
        }

        # Categorize by operation type
        if "importDocuments" in operation.name:
            beta_operations["import_documents"].append(op_info)
        elif "trainProcessorVersion" in operation.name:
            beta_operations["train_processor"].append(op_info)
        elif "importProcessorVersion" in operation.name:
            beta_operations["import_processor_version"].append(op_info)
        else:
            beta_operations["other"].append(op_info)

    return beta_operations


def print_beta_status(project_id: str, location: str):
    """Print status of beta operations."""
    status = monitor_beta_operations(project_id, location)

    print("=== BETA OPERATIONS STATUS ===")

    for op_type, operations in status.items():
        print(f"\n{op_type.replace('_', ' ').title()} Operations ({len(operations)}):")

        for op in operations:
            status_text = "✓ Complete" if op["done"] else "⏳ Running"
            error_text = f" (Error: {op['error']})" if op["error"] else ""
            print(f"  - {op['name'].split('/')[-1]}: {status_text}{error_text}")


# Example usage
if __name__ == "__main__":
    # Run custom training workflow
    result = complete_custom_training_workflow(
        project_id="my-project",
        location="us"
    )

    print("\nWorkflow completed:")
    print(f"Processor ID: {result['processor_id']}")
    print(f"Dataset: {result['dataset_parent']}")
    print(f"Documents: {result['labeled_documents']}/{result['total_documents']} labeled")

    # Monitor operations
    print_beta_status("my-project", "us")
```

## Migration from V1 to V1beta3

### Compatibility Notes

```python { .api }
# V1 API (stable) - continues to work
from google.cloud.documentai import DocumentProcessorServiceClient as V1Client
from google.cloud.documentai.types import ProcessRequest as V1ProcessRequest

# V1beta3 API (beta) - includes all v1 functionality + beta features
from google.cloud.documentai_v1beta3 import DocumentProcessorServiceClient as V1Beta3Client
from google.cloud.documentai_v1beta3.types import ProcessRequest as V1Beta3ProcessRequest


def migrate_to_beta_client():
    """
    Example showing migration from v1 to v1beta3 client.

    V1beta3 client is backward compatible with v1 API calls.
    """

    # V1 approach (still works)
    v1_client = V1Client()

    # V1beta3 approach (recommended for new features)
    v1beta3_client = V1Beta3Client()

    # Both clients support the same core operations
    processor_name = "projects/my-project/locations/us/processors/abc123"

    # Same request works with both clients
    from google.cloud.documentai_v1beta3.types import RawDocument

    raw_doc = RawDocument(content=b"document content", mime_type="application/pdf")
    request = V1Beta3ProcessRequest(name=processor_name, raw_document=raw_doc)

    # Both calls work identically
    v1_result = v1_client.process_document(request=request)
    v1beta3_result = v1beta3_client.process_document(request=request)

    # But only v1beta3 client supports beta features
    try:
        # This only works with v1beta3 client
        import_operation = v1beta3_client.import_processor_version(
            # import request
        )
        print("Beta feature available")
    except AttributeError:
        print("Beta feature not available in v1 client")


# Best practice: Use v1beta3 for new projects to access all features
def recommended_client_usage():
    """Recommended pattern for using v1beta3 client."""

    # Use v1beta3 client for all operations
    client = V1Beta3Client()

    # Standard operations work normally
    # Beta operations are available when needed

    return client
```

This comprehensive guide covers all beta features available in Google Cloud Document AI v1beta3, including dataset management, enhanced processing capabilities, and migration strategies from the stable v1 API.