0
# Processor Management
1
2
This guide covers comprehensive processor lifecycle management including creation, configuration, deployment, training, and monitoring of document processors.
3
4
## Processor Lifecycle Overview
5
6
Document processors follow this lifecycle:
7
1. **Create** - Initialize a new processor instance
8
2. **Configure** - Set up processor parameters and options
9
3. **Train** - Train custom processors (for custom types)
10
4. **Deploy** - Make processor versions available for processing
11
5. **Monitor** - Track performance and usage
12
6. **Update** - Deploy new versions and manage defaults
13
7. **Cleanup** - Disable and delete unused processors
14
15
## List and Discover Processors
16
17
### List Available Processors
18
19
```python { .api }
20
from google.cloud.documentai import DocumentProcessorServiceClient
21
from google.cloud.documentai.types import ListProcessorsRequest
22
23
def list_processors(project_id: str, location: str) -> list["Processor"]:
    """
    List all processors in a project location.

    Args:
        project_id: Google Cloud project ID
        location: Processor location (e.g., 'us', 'eu')

    Returns:
        list[Processor]: Every processor in the location, across all
            result pages
    """
    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build parent path
    parent = client.common_location_path(project_id, location)

    # Create request
    request = ListProcessorsRequest(parent=parent)

    # list_processors returns a pager. Iterating the pager itself fetches
    # follow-up pages on demand; reading `.processors` from it would only
    # expose the page that is currently loaded.
    pager = client.list_processors(request=request)

    return list(pager)
50
51
def display_processor_info(processors: list["Processor"]) -> None:
    """
    Print a summary header followed by one report per processor.

    Args:
        processors: Processor objects to describe; each must expose
            display_name, name, type_, state, default_processor_version
            and create_time
    """
    divider = "-" * 80

    print(f"Found {len(processors)} processors:")
    print(divider)

    for entry in processors:
        report = (
            f"Name: {entry.display_name}",
            # The processor ID is the last segment of the resource name
            f"ID: {entry.name.split('/')[-1]}",
            f"Type: {entry.type_}",
            f"State: {entry.state}",
            f"Default Version: {entry.default_processor_version}",
            f"Created: {entry.create_time}",
            divider,
        )
        print(*report, sep="\n")
69
```
70
71
### Fetch Processor Types
72
73
```python { .api }
74
from google.cloud.documentai import DocumentProcessorServiceClient
75
from google.cloud.documentai.types import FetchProcessorTypesRequest
76
77
def fetch_processor_types(project_id: str, location: str) -> list["ProcessorType"]:
    """
    Fetch available processor types for a location.

    Args:
        project_id: Google Cloud project ID
        location: Processor location

    Returns:
        list[ProcessorType]: Available processor types
    """
    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build parent path
    parent = client.common_location_path(project_id, location)

    # Create request
    request = FetchProcessorTypesRequest(parent=parent)

    # Fetch processor types
    response = client.fetch_processor_types(request=request)

    # Copy the repeated field into a plain list for the caller
    return list(response.processor_types)
104
```
105
106
### List Processor Types
107
108
```python { .api }
109
from google.cloud.documentai import DocumentProcessorServiceClient
110
from google.cloud.documentai.types import ListProcessorTypesRequest
111
112
def list_processor_types(project_id: str, location: str) -> list["ProcessorType"]:
    """
    List available processor types for creation.

    Args:
        project_id: Google Cloud project ID
        location: Processor location

    Returns:
        list[ProcessorType]: Available processor types, across all result
            pages
    """
    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build parent path
    parent = client.common_location_path(project_id, location)

    # Create request
    request = ListProcessorTypesRequest(parent=parent)

    # The call returns a pager; iterate the pager itself so follow-up
    # pages are fetched instead of stopping after the first page.
    pager = client.list_processor_types(request=request)

    return list(pager)
139
140
def display_processor_types(processor_types: list["ProcessorType"]) -> None:
    """
    Print the processor types grouped under their category.

    Args:
        processor_types: ProcessorType objects to describe; each must
            expose category, display_name, type_ and allow_creation
    """
    print(f"Available processor types ({len(processor_types)}):")
    print("-" * 60)

    # Bucket the types per category, keeping first-seen category order
    grouped = {}
    for item in processor_types:
        grouped.setdefault(item.category, []).append(item)

    for heading in grouped:
        print(f"\n{heading}:")
        for item in grouped[heading]:
            print(f" - {item.display_name}")
            print(f" Type: {item.type_}")
            if item.allow_creation:
                print(" ✓ Available for creation")
            # Blank line between entries
            print()
166
```
167
168
### Get Specific Processor
169
170
```python { .api }
171
from google.cloud.documentai import DocumentProcessorServiceClient
172
from google.cloud.documentai.types import GetProcessorRequest
173
174
def get_processor(project_id: str, location: str, processor_id: str) -> "Processor":
    """
    Get details of a specific processor.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID

    Returns:
        Processor: Processor details
    """
    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build processor name
    name = client.processor_path(project_id, location, processor_id)

    # Create request
    request = GetProcessorRequest(name=name)

    # Get processor
    return client.get_processor(request=request)
198
```
199
200
## Create Processors
201
202
### Create New Processor
203
204
```python { .api }
205
from google.cloud.documentai import DocumentProcessorServiceClient
206
from google.cloud.documentai.types import CreateProcessorRequest, Processor
207
208
def create_processor(
    project_id: str,
    location: str,
    display_name: str,
    processor_type: str
) -> "Processor":
    """
    Create a new document processor.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        display_name: Human-readable name for the processor
        processor_type: Type of processor to create (e.g., 'OCR_PROCESSOR')

    Returns:
        Processor: Created processor instance
    """
    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build parent path
    parent = client.common_location_path(project_id, location)

    # Create processor object (the field is named `type_` in Python
    # because `type` is a builtin)
    processor = Processor(
        display_name=display_name,
        type_=processor_type
    )

    # Create request
    request = CreateProcessorRequest(
        parent=parent,
        processor=processor
    )

    # Create processor
    created_processor = client.create_processor(request=request)

    print(f"Created processor: {created_processor.display_name}")
    # The processor ID is the last segment of the resource name
    print(f"Processor ID: {created_processor.name.split('/')[-1]}")

    return created_processor
250
251
def create_common_processors(project_id: str, location: str) -> dict[str, "Processor"]:
    """
    Create one processor for each commonly used processor type.

    A failure for one type is reported on stdout and does not stop the
    remaining types from being attempted.

    Args:
        project_id: Google Cloud project ID
        location: Processor location

    Returns:
        dict[str, Processor]: Successfully created processors keyed by
            processor type
    """
    # (processor type, display name) pairs to create
    wanted = (
        ("OCR_PROCESSOR", "General OCR Processor"),
        ("FORM_PARSER_PROCESSOR", "Form Parser"),
        ("INVOICE_PROCESSOR", "Invoice Processor"),
        ("RECEIPT_PROCESSOR", "Receipt Processor"),
    )

    created = {}
    for type_code, label in wanted:
        try:
            created[type_code] = create_processor(
                project_id=project_id,
                location=location,
                display_name=label,
                processor_type=type_code,
            )
        except Exception as err:
            # Best effort: report and carry on with the next type
            print(f"Failed to create {type_code}: {err}")

    return created
285
```
286
287
## Processor State Management
288
289
### Enable/Disable Processors
290
291
```python { .api }
292
from google.cloud.documentai import DocumentProcessorServiceClient
293
from google.cloud.documentai.types import (
294
EnableProcessorRequest,
295
DisableProcessorRequest
296
)
297
298
def enable_processor(project_id: str, location: str, processor_id: str) -> "EnableProcessorResponse":
    """
    Enable a disabled processor.

    Blocks until the long-running operation finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID to enable

    Returns:
        EnableProcessorResponse: Operation response
    """
    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build processor name
    name = client.processor_path(project_id, location, processor_id)

    # Create request
    request = EnableProcessorRequest(name=name)

    # Enable processor (this is a long-running operation)
    operation = client.enable_processor(request=request)

    print(f"Enabling processor {processor_id}...")

    # Wait for operation to complete
    response = operation.result()

    print(f"Processor {processor_id} enabled successfully")
    return response
328
329
def disable_processor(project_id: str, location: str, processor_id: str) -> "DisableProcessorResponse":
    """
    Disable an active processor.

    Blocks until the long-running operation finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID to disable

    Returns:
        DisableProcessorResponse: Operation response
    """
    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build processor name
    name = client.processor_path(project_id, location, processor_id)

    # Create request
    request = DisableProcessorRequest(name=name)

    # Disable processor (this is a long-running operation)
    operation = client.disable_processor(request=request)

    print(f"Disabling processor {processor_id}...")

    # Wait for operation to complete
    response = operation.result()

    print(f"Processor {processor_id} disabled successfully")
    return response
359
```
360
361
### Delete Processors
362
363
```python { .api }
364
from google.cloud.documentai import DocumentProcessorServiceClient
365
from google.cloud.documentai.types import DeleteProcessorRequest
366
367
def delete_processor(project_id: str, location: str, processor_id: str) -> None:
    """
    Delete a processor permanently.

    If the processor is currently enabled it is disabled first. A failure
    while checking or disabling is reported as a warning and the delete is
    still attempted.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID to delete

    Note:
        This operation is irreversible.
    """
    # Imported here so the enum is available even when this snippet is
    # used without the "Create Processors" imports.
    from google.cloud.documentai import Processor

    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build processor name
    name = client.processor_path(project_id, location, processor_id)

    # First, ensure processor is disabled
    try:
        processor = get_processor(project_id, location, processor_id)
        # `state` is a Processor.State enum member, so it must be compared
        # with the enum; comparing with the string "ENABLED" never matches.
        if processor.state == Processor.State.ENABLED:
            print("Processor is enabled. Disabling first...")
            disable_processor(project_id, location, processor_id)
    except Exception as e:
        print(f"Warning: Could not check processor state: {e}")

    # Create delete request
    request = DeleteProcessorRequest(name=name)

    # Delete processor (this is a long-running operation)
    operation = client.delete_processor(request=request)

    print(f"Deleting processor {processor_id}...")

    # Wait for operation to complete
    operation.result()

    print(f"Processor {processor_id} deleted successfully")
405
```
406
407
## Processor Version Management
408
409
### List Processor Versions
410
411
```python { .api }
412
from google.cloud.documentai import DocumentProcessorServiceClient
413
from google.cloud.documentai.types import ListProcessorVersionsRequest
414
415
def list_processor_versions(
    project_id: str,
    location: str,
    processor_id: str
) -> list["ProcessorVersion"]:
    """
    List all versions of a processor.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID

    Returns:
        list[ProcessorVersion]: Every version of the processor, across all
            result pages
    """
    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build processor path as parent
    parent = client.processor_path(project_id, location, processor_id)

    # Create request
    request = ListProcessorVersionsRequest(parent=parent)

    # The call returns a pager; iterate the pager itself so follow-up
    # pages are fetched instead of stopping after the first page.
    pager = client.list_processor_versions(request=request)

    return list(pager)
447
448
def display_processor_versions(versions: list["ProcessorVersion"]) -> None:
    """
    Print a summary header followed by one report per processor version.

    Model type and latest evaluation are only printed when they are set.

    Args:
        versions: ProcessorVersion objects to describe
    """
    divider = "-" * 70

    print(f"Found {len(versions)} processor versions:")
    print(divider)

    for item in versions:
        # The version ID is the last segment of the resource name
        short_id = item.name.split('/')[-1]
        print(f"Version ID: {short_id}")
        print(f"Display Name: {item.display_name}")
        print(f"State: {item.state}")
        print(f"Created: {item.create_time}")

        model_type = item.model_type
        if model_type:
            print(f"Model Type: {model_type}")

        latest = item.latest_evaluation
        if latest:
            print(f"Latest Evaluation: {latest}")

        print(divider)
472
```
473
474
### Deploy Processor Versions
475
476
```python { .api }
477
from google.cloud.documentai import DocumentProcessorServiceClient
478
from google.cloud.documentai.types import DeployProcessorVersionRequest
479
480
def deploy_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str
) -> "DeployProcessorVersionResponse":
    """
    Deploy a processor version for serving.

    Blocks until the long-running operation finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID to deploy

    Returns:
        DeployProcessorVersionResponse: Deployment response
    """
    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build processor version name
    name = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # Create request
    request = DeployProcessorVersionRequest(name=name)

    # Deploy version (this is a long-running operation)
    operation = client.deploy_processor_version(request=request)

    print(f"Deploying processor version {version_id}...")

    # Wait for operation to complete
    response = operation.result()

    print(f"Processor version {version_id} deployed successfully")
    return response
518
519
def undeploy_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str
) -> "UndeployProcessorVersionResponse":
    """
    Undeploy a processor version from serving.

    Blocks until the long-running operation finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID to undeploy

    Returns:
        UndeployProcessorVersionResponse: Undeploy response
    """
    # The snippet's import block only brings in the Deploy request type,
    # so import the Undeploy request type here.
    from google.cloud.documentai import UndeployProcessorVersionRequest

    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build processor version name
    name = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # Create request
    request = UndeployProcessorVersionRequest(name=name)

    # Undeploy version (this is a long-running operation)
    operation = client.undeploy_processor_version(request=request)

    print(f"Undeploying processor version {version_id}...")

    # Wait for operation to complete
    response = operation.result()

    print(f"Processor version {version_id} undeployed successfully")
    return response
557
```
558
559
### Set Default Processor Version
560
561
```python { .api }
562
from google.cloud.documentai import DocumentProcessorServiceClient
563
from google.cloud.documentai.types import SetDefaultProcessorVersionRequest
564
565
def set_default_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str
) -> "SetDefaultProcessorVersionResponse":
    """
    Set the default version for a processor.

    Blocks until the long-running operation finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID to set as default

    Returns:
        SetDefaultProcessorVersionResponse: Operation response
    """
    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build processor path
    processor_name = client.processor_path(project_id, location, processor_id)

    # Build version path
    version_name = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # Create request
    request = SetDefaultProcessorVersionRequest(
        processor=processor_name,
        default_processor_version=version_name
    )

    # Set default version (this is a long-running operation)
    operation = client.set_default_processor_version(request=request)

    print(f"Setting default version to {version_id}...")

    # Wait for operation to complete
    response = operation.result()

    print(f"Default version set to {version_id} successfully")
    return response
609
```
610
611
## Custom Processor Training
612
613
### Train Processor Version
614
615
```python { .api }
616
from google.cloud.documentai import DocumentProcessorServiceClient
617
from google.cloud.documentai.types import (
618
TrainProcessorVersionRequest,
619
DocumentSchema
620
)
621
622
def train_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_display_name: str,
    training_dataset: str,
    test_dataset: "str | None" = None,
    document_schema: "DocumentSchema | None" = None
) -> "Operation":
    """
    Start training a new version of a custom processor.

    The function does not wait for training to finish; it returns the
    long-running operation so the caller can poll it.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID to train
        version_display_name: Display name for new version
        training_dataset: GCS URI prefix of the training documents
        test_dataset: Optional GCS URI prefix of the test documents
        document_schema: Optional document schema for training

    Returns:
        Operation: The long-running training operation. Call `.result()`
            on it to block until the TrainProcessorVersionResponse is
            available, or pass `operation.operation.name` to a status
            check.
    """
    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build processor path as parent
    parent = client.processor_path(project_id, location, processor_id)

    # Create processor version configuration
    processor_version = {
        "display_name": version_display_name
    }

    # Create training input configuration
    input_data = {
        "training_documents": {
            "gcs_prefix": {"gcs_uri_prefix": training_dataset}
        }
    }

    # Add test dataset if provided
    if test_dataset:
        input_data["test_documents"] = {
            "gcs_prefix": {"gcs_uri_prefix": test_dataset}
        }

    # Create request
    request = TrainProcessorVersionRequest(
        parent=parent,
        processor_version=processor_version,
        input_data=input_data
    )

    # The training schema is a field of the request itself, not of the
    # processor version being created.
    if document_schema is not None:
        request.document_schema = document_schema

    # Start training (this is a long-running operation)
    operation = client.train_processor_version(request=request)

    print(f"Starting training for processor version: {version_display_name}")
    print("This operation may take several hours to complete...")

    # For production, you'd typically not wait for completion here
    # Instead, you'd check the operation status periodically
    print(f"Training operation name: {operation.operation.name}")

    return operation
691
692
def check_training_progress(operation_name: str) -> dict:
    """
    Check the progress of a training operation.

    Args:
        operation_name: Name of the training operation, e.g.
            'projects/{project}/locations/{location}/operations/{id}'

    Returns:
        dict: Operation status with the keys 'name', 'done', 'metadata',
            'result' and 'error'. 'metadata' is set when the operation
            carries metadata; once the operation is done, exactly one of
            'error' or 'result' is set. Unset entries are None.
    """
    # Pick the regional endpoint from the operation name when it contains
    # a location segment; otherwise fall back to the default endpoint.
    client_options = None
    segments = operation_name.split("/")
    if "locations" in segments[:-1]:
        location = segments[segments.index("locations") + 1]
        client_options = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    # Reuse the Document AI client's own operations client: it is already
    # wired to the right channel and credentials.
    client = DocumentProcessorServiceClient(client_options=client_options)
    operation = client.transport.operations_client.get_operation(operation_name)

    status_info = {
        "name": operation.name,
        "done": operation.done,
        "metadata": None,
        "result": None,
        "error": None
    }

    # `operation` is a raw protobuf message. Its sub-messages are always
    # truthy, so presence has to be tested with HasField.
    if operation.HasField("metadata"):
        # Packed metadata; unpack it for progress information
        status_info["metadata"] = operation.metadata

    if operation.done:
        if operation.HasField("error"):
            status_info["error"] = operation.error
        else:
            status_info["result"] = operation.response

    return status_info
731
```
732
733
## Processor Evaluation
734
735
### Evaluate Processor Performance
736
737
```python { .api }
738
from google.cloud.documentai import DocumentProcessorServiceClient
739
from google.cloud.documentai.types import (
740
EvaluateProcessorVersionRequest,
741
EvaluationReference
742
)
743
744
def evaluate_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str,
    evaluation_documents: str
) -> "EvaluateProcessorVersionResponse":
    """
    Evaluate the performance of a processor version.

    Blocks until the long-running evaluation finishes.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID to evaluate
        evaluation_documents: GCS URI prefix of the evaluation documents

    Returns:
        EvaluateProcessorVersionResponse: Evaluation response
    """
    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build processor version name
    processor_version = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # Create evaluation documents configuration
    evaluation_documents_config = {
        "gcs_prefix": {"gcs_uri_prefix": evaluation_documents}
    }

    # Create request
    request = EvaluateProcessorVersionRequest(
        processor_version=processor_version,
        evaluation_documents=evaluation_documents_config
    )

    # Start evaluation (this is a long-running operation)
    operation = client.evaluate_processor_version(request=request)

    print(f"Starting evaluation for processor version {version_id}...")

    # Wait for evaluation to complete
    response = operation.result()

    print("Evaluation completed successfully")
    return response
792
793
def list_evaluations(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str
) -> list["Evaluation"]:
    """
    List all evaluations for a processor version.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID

    Returns:
        list[Evaluation]: Every evaluation of the version, across all
            result pages
    """
    # The snippet's import block does not bring in this request type,
    # so import it here.
    from google.cloud.documentai import ListEvaluationsRequest

    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build processor version path as parent
    parent = client.processor_version_path(
        project_id, location, processor_id, version_id
    )

    # Create request
    request = ListEvaluationsRequest(parent=parent)

    # The call returns a pager; iterate the pager itself so follow-up
    # pages are fetched instead of stopping after the first page.
    pager = client.list_evaluations(request=request)

    return list(pager)
829
830
def get_evaluation_details(
    project_id: str,
    location: str,
    processor_id: str,
    version_id: str,
    evaluation_id: str
) -> "Evaluation":
    """
    Get detailed evaluation results.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        version_id: Version ID
        evaluation_id: Evaluation ID

    Returns:
        Evaluation: Detailed evaluation results
    """
    # The snippet's import block does not bring in this request type,
    # so import it here.
    from google.cloud.documentai import GetEvaluationRequest

    # Each location is served from its own endpoint; without this,
    # requests for locations other than 'us' are rejected.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Build evaluation name
    name = client.evaluation_path(
        project_id, location, processor_id, version_id, evaluation_id
    )

    # Create request
    request = GetEvaluationRequest(name=name)

    # Get evaluation
    return client.get_evaluation(request=request)
864
```
865
866
## Complete Processor Management Example
867
868
```python { .api }
869
def complete_processor_management_example():
    """
    Complete example demonstrating processor lifecycle management.

    Lists the existing processors, creates an invoice processor, enables
    it, lists its versions and prints its details. Relies on the helper
    functions defined in the earlier sections of this guide; each helper
    creates its own client, so none is needed here.
    """
    project_id = "my-project"
    location = "us"

    # 1. List existing processors
    print("=== LISTING PROCESSORS ===")
    processors = list_processors(project_id, location)
    display_processor_info(processors)

    # 2. Create a new processor if needed
    print("\n=== CREATING PROCESSOR ===")
    processor = create_processor(
        project_id=project_id,
        location=location,
        display_name="My Custom Invoice Processor",
        processor_type="INVOICE_PROCESSOR"
    )
    # The processor ID is the last segment of the resource name
    processor_id = processor.name.split('/')[-1]

    # 3. Enable the processor
    print("\n=== ENABLING PROCESSOR ===")
    enable_processor(project_id, location, processor_id)

    # 4. List processor versions
    print("\n=== LISTING VERSIONS ===")
    versions = list_processor_versions(project_id, location, processor_id)
    display_processor_versions(versions)

    # 5. Get processor details
    print("\n=== PROCESSOR DETAILS ===")
    processor_details = get_processor(project_id, location, processor_id)
    print(f"Processor State: {processor_details.state}")
    print(f"Default Version: {processor_details.default_processor_version}")

    # 6. Evaluate processor (if evaluation data available)
    # evaluation_gcs_path = "gs://my-bucket/evaluation-docs/"
    # evaluation = evaluate_processor_version(
    #     project_id, location, processor_id, version_id, evaluation_gcs_path
    # )

    print("\nProcessor management example completed!")


if __name__ == "__main__":
    complete_processor_management_example()
918
```
919
920
This comprehensive guide covers all aspects of processor management in Google Cloud Document AI, from basic operations to advanced training and evaluation workflows.