# Document Processing Operations

This guide covers core document processing operations using Google Cloud Document AI, including synchronous processing, handling different document formats, and extracting structured data.

## Process Single Document

### Basic Document Processing

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, RawDocument


def process_document_from_file(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str
) -> "Document":
    """
    Process a document file using Document AI.

    Args:
        project_id: Google Cloud project ID
        location: Processor location (e.g., 'us', 'eu')
        processor_id: Document processor ID
        file_path: Path to the document file
        mime_type: MIME type of the document

    Returns:
        Document: Processed document with extracted data
    """
    client = DocumentProcessorServiceClient()

    # Build the full processor resource name from its components
    name = client.processor_path(project_id, location, processor_id)

    # Read the document bytes from disk
    with open(file_path, "rb") as document_file:
        document_content = document_file.read()

    # Wrap the raw bytes for inline (synchronous) processing
    raw_document = RawDocument(
        content=document_content,
        mime_type=mime_type
    )

    # Configure the process request
    request = ProcessRequest(
        name=name,
        raw_document=raw_document
    )

    # Process the document
    result = client.process_document(request=request)

    return result.document
```

### Process Cloud Storage Document

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, GcsDocument


def process_gcs_document(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_uri: str,
    mime_type: str
) -> "Document":
    """
    Process a document stored in Google Cloud Storage.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Document processor ID
        gcs_uri: Cloud Storage URI (gs://bucket/path/file.pdf)
        mime_type: MIME type of the document

    Returns:
        Document: Processed document with extracted data
    """
    client = DocumentProcessorServiceClient()

    # Build the full processor resource name
    name = client.processor_path(project_id, location, processor_id)

    # Reference the document in place — no local download needed
    gcs_document = GcsDocument(
        gcs_uri=gcs_uri,
        mime_type=mime_type
    )

    # Configure the process request
    request = ProcessRequest(
        name=name,
        gcs_document=gcs_document
    )

    # Process the document
    result = client.process_document(request=request)

    return result.document
```

## Processing Options

### OCR Configuration

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, OcrConfig, ProcessOptions


def process_with_ocr_options(
    client: DocumentProcessorServiceClient,
    processor_name: str,
    raw_document: "RawDocument",
    enable_native_pdf_parsing: bool = True,
    enable_image_quality_scores: bool = False,
    enable_symbol: bool = False
) -> "Document":
    """
    Process document with specific OCR configuration.

    Args:
        client: DocumentProcessorServiceClient instance
        processor_name: Full processor resource name
        raw_document: Raw document to process
        enable_native_pdf_parsing: Use native PDF parsing when possible
        enable_image_quality_scores: Include image quality scores
        enable_symbol: Enable symbol detection

    Returns:
        Document: Processed document
    """
    # Configure OCR options
    ocr_config = OcrConfig(
        enable_native_pdf_parsing=enable_native_pdf_parsing,
        enable_image_quality_scores=enable_image_quality_scores,
        enable_symbol=enable_symbol
    )

    # Wrap the OCR config in process options
    process_options = ProcessOptions(ocr_config=ocr_config)

    # Create request with options
    request = ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
        process_options=process_options
    )

    # Process document
    result = client.process_document(request=request)
    return result.document
```

### Field Mask Processing

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest
from google.protobuf.field_mask_pb2 import FieldMask


def process_with_field_mask(
    client: DocumentProcessorServiceClient,
    processor_name: str,
    raw_document: "RawDocument",
    fields: list[str]
) -> "Document":
    """
    Process document returning only specified fields.

    Args:
        client: DocumentProcessorServiceClient instance
        processor_name: Full processor resource name
        raw_document: Raw document to process
        fields: List of field paths to return (e.g., ['text', 'pages.blocks'])

    Returns:
        Document: Processed document with only requested fields
    """
    # Create field mask restricting the response payload
    field_mask = FieldMask(paths=fields)

    # Create request with field mask
    request = ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
        field_mask=field_mask
    )

    # Process document
    result = client.process_document(request=request)
    return result.document
```

## Document Analysis

### Extract Text and Layout

```python { .api }
from google.cloud.documentai.types import Document


def analyze_document_text(document: Document) -> dict:
    """
    Analyze text content and layout from processed document.

    Args:
        document: Processed Document object

    Returns:
        dict: Analysis results including text statistics and layout info
    """
    analysis = {
        "total_text": document.text,
        "text_length": len(document.text),
        "pages": [],
        "text_segments": []
    }

    # Analyze each page
    for page_idx, page in enumerate(document.pages):
        page_info = {
            "page_number": page_idx + 1,  # 1-based for human readers
            "dimensions": {
                "width": page.dimension.width,
                "height": page.dimension.height,
                "unit": page.dimension.unit
            },
            "blocks": len(page.blocks),
            "paragraphs": len(page.paragraphs),
            "lines": len(page.lines),
            "tokens": len(page.tokens)
        }

        # Extract the text of each block via its text anchor
        for block in page.blocks:
            if block.layout and block.layout.text_anchor:
                text_segment = extract_text_from_anchor(
                    document.text,
                    block.layout.text_anchor
                )
                analysis["text_segments"].append({
                    "type": "block",
                    "page": page_idx + 1,
                    "text": text_segment,
                    "confidence": block.layout.confidence
                })

        analysis["pages"].append(page_info)

    return analysis


def extract_text_from_anchor(full_text: str, text_anchor: "Document.TextAnchor") -> str:
    """
    Extract text segment using TextAnchor.

    Args:
        full_text: Full document text
        text_anchor: TextAnchor specifying text location

    Returns:
        str: Extracted text segment
    """
    text_segments = []

    for segment in text_anchor.text_segments:
        # Unset (falsy) indices default to the start / end of the document
        start_index = int(segment.start_index) if segment.start_index else 0
        end_index = int(segment.end_index) if segment.end_index else len(full_text)
        text_segments.append(full_text[start_index:end_index])

    return "".join(text_segments)
```

### Extract Entities

```python { .api }
from google.cloud.documentai.types import Document


def extract_entities(document: Document) -> dict:
    """
    Extract and organize entities from processed document.

    Args:
        document: Processed Document object

    Returns:
        dict: Organized entities by type with confidence scores
    """
    entities_by_type = {}

    for entity in document.entities:
        entity_type = entity.type_

        if entity_type not in entities_by_type:
            entities_by_type[entity_type] = []

        # Extract entity information
        entity_info = {
            "text": entity.mention_text,
            "confidence": entity.confidence,
            "page_refs": []
        }

        # Add page references if available
        if entity.page_anchor:
            for page_ref in entity.page_anchor.page_refs:
                entity_info["page_refs"].append({
                    "page": page_ref.page + 1,  # Convert to 1-based
                    "layout_type": page_ref.layout_type,
                    "layout_id": page_ref.layout_id
                })

        # Add text anchor information
        if entity.text_anchor:
            entity_info["text_segments"] = []
            for segment in entity.text_anchor.text_segments:
                entity_info["text_segments"].append({
                    "start_index": int(segment.start_index or 0),
                    "end_index": int(segment.end_index or 0)
                })

        # Add nested properties (sub-entities) if available
        if entity.properties:
            entity_info["properties"] = []
            for prop in entity.properties:
                prop_info = {
                    "type": prop.type_,
                    "text": prop.mention_text,
                    "confidence": prop.confidence
                }
                entity_info["properties"].append(prop_info)

        entities_by_type[entity_type].append(entity_info)

    return entities_by_type
```

### Extract Tables

```python { .api }
from google.cloud.documentai.types import Document


def extract_tables(document: Document) -> list[dict]:
    """
    Extract table data from processed document.

    Args:
        document: Processed Document object

    Returns:
        list[dict]: List of tables with structured data
    """
    tables = []

    for page_idx, page in enumerate(document.pages):
        for table_idx, table in enumerate(page.tables):
            table_data = {
                "page": page_idx + 1,
                "table_index": table_idx,
                "rows": [],          # header + body rows, in document order
                "header_rows": [],
                "body_rows": []
            }

            # Process header rows first, mirroring visual order
            for row in table.header_rows:
                header_row = extract_table_row(document.text, row)
                table_data["header_rows"].append(header_row)
                table_data["rows"].append(header_row)

            for row in table.body_rows:
                body_row = extract_table_row(document.text, row)
                table_data["body_rows"].append(body_row)
                table_data["rows"].append(body_row)

            tables.append(table_data)

    return tables


def extract_table_row(full_text: str, row: "Document.Page.Table.TableRow") -> list[dict]:
    """
    Extract data from a table row.

    Uses extract_text_from_anchor (defined in the "Extract Text and Layout"
    section) to resolve cell text against the full document text.

    Args:
        full_text: Full document text
        row: Table row object

    Returns:
        list[dict]: List of cell data
    """
    cells = []

    for cell in row.cells:
        cell_data = {
            "text": "",
            "row_span": cell.row_span,
            "col_span": cell.col_span
        }

        # Extract cell text; cells without a text anchor stay empty
        if cell.layout and cell.layout.text_anchor:
            cell_data["text"] = extract_text_from_anchor(
                full_text,
                cell.layout.text_anchor
            ).strip()

        cells.append(cell_data)

    return cells
```

### Extract Form Fields

```python { .api }
from google.cloud.documentai.types import Document


def extract_form_fields(document: Document) -> dict:
    """
    Extract form fields (key-value pairs) from processed document.

    Uses extract_text_from_anchor (defined in the "Extract Text and Layout"
    section) to resolve field text against the full document text.

    Args:
        document: Processed Document object

    Returns:
        dict: Form fields organized as key-value pairs
    """
    form_fields = {}

    for page in document.pages:
        for form_field in page.form_fields:
            # Extract field name (key)
            field_name = ""
            if form_field.field_name and form_field.field_name.text_anchor:
                field_name = extract_text_from_anchor(
                    document.text,
                    form_field.field_name.text_anchor
                ).strip()

            # Extract field value
            field_value = ""
            if form_field.field_value and form_field.field_value.text_anchor:
                field_value = extract_text_from_anchor(
                    document.text,
                    form_field.field_value.text_anchor
                ).strip()

            # Store form field with confidence; fields without a
            # resolvable name are skipped
            if field_name:
                form_fields[field_name] = {
                    "value": field_value,
                    "name_confidence": form_field.field_name.confidence if form_field.field_name else 0.0,
                    "value_confidence": form_field.field_value.confidence if form_field.field_value else 0.0
                }

    return form_fields
```

## Async Document Processing

### Async Client Usage

```python { .api }
import asyncio
from google.cloud.documentai import DocumentProcessorServiceAsyncClient
from google.cloud.documentai.types import ProcessRequest, RawDocument


async def process_document_async(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str
) -> "Document":
    """
    Process document asynchronously.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Document processor ID
        file_path: Path to document file
        mime_type: MIME type of document

    Returns:
        Document: Processed document
    """
    client = DocumentProcessorServiceAsyncClient()
    try:
        # Build processor name
        name = client.processor_path(project_id, location, processor_id)

        # Read document
        with open(file_path, "rb") as document_file:
            document_content = document_file.read()

        # Create request
        raw_document = RawDocument(content=document_content, mime_type=mime_type)
        request = ProcessRequest(name=name, raw_document=raw_document)

        # Process asynchronously
        result = await client.process_document(request=request)
    finally:
        # Close the client even if processing raised
        await client.close()

    return result.document

# Example usage
async def main():
    document = await process_document_async(
        project_id="my-project",
        location="us",
        processor_id="abc123",
        file_path="document.pdf",
        mime_type="application/pdf"
    )
    print(f"Processed document: {len(document.text)} characters")

# Run async function
asyncio.run(main())
```

## Supported Document Types

### MIME Types

```python { .api }
# Supported MIME types for document processing
SUPPORTED_MIME_TYPES = {
    # PDF Documents
    "application/pdf": "PDF documents",

    # Image formats
    "image/jpeg": "JPEG images",
    "image/jpg": "JPG images",
    "image/png": "PNG images",
    "image/bmp": "BMP images",
    "image/tiff": "TIFF images",
    "image/tif": "TIF images",
    "image/gif": "GIF images (first frame only)",
    "image/webp": "WebP images",

    # Office documents (with OCR)
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "Word documents",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PowerPoint files",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "Excel files"
}

def validate_mime_type(mime_type: str) -> bool:
    """
    Check if MIME type is supported.

    Args:
        mime_type: MIME type to validate

    Returns:
        bool: True if supported, False otherwise
    """
    return mime_type in SUPPORTED_MIME_TYPES
```

### Document Size Limits

```python { .api }
import os

# Document processing limits
PROCESSING_LIMITS = {
    "max_file_size_bytes": 20 * 1024 * 1024,  # 20 MB
    "max_pages_per_document": 2000,
    "max_image_dimensions": {
        "width": 10000,
        "height": 10000
    },
    "timeout_seconds": 300  # 5 minutes
}

def validate_document_size(file_path: str) -> tuple[bool, str]:
    """
    Validate document meets size requirements.

    Args:
        file_path: Path to document file

    Returns:
        tuple[bool, str]: (is_valid, error_message); error_message is
        empty when the document is within limits
    """
    file_size = os.path.getsize(file_path)

    if file_size > PROCESSING_LIMITS["max_file_size_bytes"]:
        return False, f"File size ({file_size} bytes) exceeds limit ({PROCESSING_LIMITS['max_file_size_bytes']} bytes)"

    return True, ""
```

## Error Handling

### Common Processing Errors

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.api_core.exceptions import (
    NotFound,
    InvalidArgument,
    ResourceExhausted,
    DeadlineExceeded
)
from google.cloud.exceptions import GoogleCloudError

import time


def robust_process_document(
    client: DocumentProcessorServiceClient,
    request: "ProcessRequest",
    max_retries: int = 3
) -> "ProcessResponse":
    """
    Process document with error handling and retries.

    Args:
        client: DocumentProcessorServiceClient instance
        request: Process request
        max_retries: Maximum number of retry attempts

    Returns:
        ProcessResponse: Processing result

    Raises:
        Exception: If processing fails after all retries
    """
    for attempt in range(max_retries + 1):
        try:
            return client.process_document(request=request)

        except NotFound as e:
            # Processor not found - retrying cannot help
            raise Exception(f"Processor not found: {e}") from e

        except InvalidArgument as e:
            # Invalid request - retrying cannot help
            raise Exception(f"Invalid request: {e}") from e

        except ResourceExhausted as e:
            # Rate limit exceeded - wait and retry
            if attempt < max_retries:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Rate limit exceeded, waiting {wait_time}s (attempt {attempt + 1})")
                time.sleep(wait_time)
                continue
            raise Exception(f"Rate limit exceeded after {max_retries} retries: {e}") from e

        except DeadlineExceeded as e:
            # Timeout - retry immediately
            if attempt < max_retries:
                print(f"Request timeout, retrying (attempt {attempt + 1})")
                continue
            raise Exception(f"Request timeout after {max_retries} retries: {e}") from e

        except GoogleCloudError as e:
            # Other Google Cloud errors - back off and retry
            if attempt < max_retries:
                wait_time = 2 ** attempt
                print(f"Google Cloud error, retrying in {wait_time}s: {e}")
                time.sleep(wait_time)
                continue
            raise Exception(f"Google Cloud error after {max_retries} retries: {e}") from e

        except Exception as e:
            # Unexpected errors - don't retry
            raise Exception(f"Unexpected error: {e}") from e

    raise Exception("Maximum retries exceeded")
```

## Human Review Workflow

### Submit Document for Review

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ReviewDocumentRequest, Document


def submit_document_for_review(
    project_id: str,
    location: str,
    processor_id: str,
    document: Document,
    enable_schema_validation: bool = True
) -> "Operation":
    """
    Submit a processed document for human review.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        document: Processed document to review
        enable_schema_validation: Enable schema validation during review

    Returns:
        Operation: Long-running operation for review process
    """
    client = DocumentProcessorServiceClient()

    # Build human review config path (child resource of the processor)
    human_review_config = f"projects/{project_id}/locations/{location}/processors/{processor_id}/humanReviewConfig"

    # Create review request with the document embedded inline
    request = ReviewDocumentRequest(
        human_review_config=human_review_config,
        inline_document=document,
        enable_schema_validation=enable_schema_validation
    )

    # Submit for review
    operation = client.review_document(request=request)

    print("Document submitted for human review")
    print(f"Operation: {operation.operation.name}")

    return operation

def check_review_status(operation: "Operation") -> dict:
    """
    Check the status of a human review operation.

    Args:
        operation: Review operation object

    Returns:
        dict: Review status information
    """
    if operation.done():
        if operation.exception():
            return {
                "status": "failed",
                "error": str(operation.exception())
            }
        else:
            result = operation.result()
            return {
                "status": "completed",
                "gcs_destination": result.gcs_destination,
                "rejection_reason": result.rejection_reason
            }
    else:
        return {"status": "in_progress"}
```

## Complete Processing Example

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, RawDocument


def complete_document_processing_example():
    """Complete example of document processing with analysis.

    Relies on the helper functions defined in the earlier sections of
    this guide (process_document_from_file, analyze_document_text,
    extract_entities, extract_tables, extract_form_fields).
    """
    # Configuration
    project_id = "my-project"
    location = "us"
    processor_id = "abc123def456"
    file_path = "sample_invoice.pdf"

    # Process document (the helper constructs its own client)
    document = process_document_from_file(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
        mime_type="application/pdf"
    )

    # Analyze results
    print("=== DOCUMENT ANALYSIS ===")

    # 1. Basic text analysis
    text_analysis = analyze_document_text(document)
    print(f"Total text length: {text_analysis['text_length']} characters")
    print(f"Number of pages: {len(text_analysis['pages'])}")

    # 2. Extract entities
    entities = extract_entities(document)
    print(f"\nFound {len(entities)} entity types:")
    for entity_type, entity_list in entities.items():
        print(f"  {entity_type}: {len(entity_list)} instances")
        for entity in entity_list[:3]:  # Show first 3
            print(f"    - {entity['text']} (confidence: {entity['confidence']:.2f})")

    # 3. Extract tables
    tables = extract_tables(document)
    print(f"\nFound {len(tables)} tables:")
    for table in tables:
        print(f"  Table on page {table['page']}: {len(table['rows'])} rows")

    # 4. Extract form fields
    form_fields = extract_form_fields(document)
    print(f"\nFound {len(form_fields)} form fields:")
    for field_name, field_info in form_fields.items():
        print(f"  {field_name}: {field_info['value']}")

if __name__ == "__main__":
    complete_document_processing_example()
```

This comprehensive guide covers all aspects of document processing with Google Cloud Document AI, from basic operations to advanced analysis and error handling.