# Document Processing Operations

This guide covers core document processing operations using Google Cloud Document AI, including synchronous processing, handling different document formats, and extracting structured data.

## Process Single Document

### Basic Document Processing

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, RawDocument


def process_document_from_file(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str
) -> "Document":
    """
    Process a document file using Document AI.

    Args:
        project_id: Google Cloud project ID
        location: Processor location (e.g., 'us', 'eu')
        processor_id: Document processor ID
        file_path: Path to the document file
        mime_type: MIME type of the document

    Returns:
        Document: Processed document with extracted data
    """
    client = DocumentProcessorServiceClient()

    # Build the full processor resource name from its components
    name = client.processor_path(project_id, location, processor_id)

    # Read the document bytes from disk
    with open(file_path, "rb") as document_file:
        document_content = document_file.read()

    # Wrap the raw bytes for inline (synchronous) processing
    raw_document = RawDocument(
        content=document_content,
        mime_type=mime_type
    )

    # Configure the process request
    request = ProcessRequest(
        name=name,
        raw_document=raw_document
    )

    # Process the document
    result = client.process_document(request=request)

    return result.document
```

### Process Cloud Storage Document

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, GcsDocument


def process_gcs_document(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_uri: str,
    mime_type: str
) -> "Document":
    """
    Process a document stored in Google Cloud Storage.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Document processor ID
        gcs_uri: Cloud Storage URI (gs://bucket/path/file.pdf)
        mime_type: MIME type of the document

    Returns:
        Document: Processed document with extracted data
    """
    client = DocumentProcessorServiceClient()

    # Build the full processor resource name
    name = client.processor_path(project_id, location, processor_id)

    # Reference the document in place — no local download needed
    gcs_document = GcsDocument(
        gcs_uri=gcs_uri,
        mime_type=mime_type
    )

    # Configure the process request
    request = ProcessRequest(
        name=name,
        gcs_document=gcs_document
    )

    # Process the document
    result = client.process_document(request=request)

    return result.document
```

## Processing Options

### OCR Configuration

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, OcrConfig, ProcessOptions


def process_with_ocr_options(
    client: DocumentProcessorServiceClient,
    processor_name: str,
    raw_document: "RawDocument",
    enable_native_pdf_parsing: bool = True,
    enable_image_quality_scores: bool = False,
    enable_symbol: bool = False
) -> "Document":
    """
    Process document with specific OCR configuration.

    Args:
        client: DocumentProcessorServiceClient instance
        processor_name: Full processor resource name
        raw_document: Raw document to process
        enable_native_pdf_parsing: Use native PDF parsing when possible
        enable_image_quality_scores: Include image quality scores
        enable_symbol: Enable symbol detection

    Returns:
        Document: Processed document
    """
    # Configure OCR options
    ocr_config = OcrConfig(
        enable_native_pdf_parsing=enable_native_pdf_parsing,
        enable_image_quality_scores=enable_image_quality_scores,
        enable_symbol=enable_symbol
    )

    # Wrap the OCR config in process options
    process_options = ProcessOptions(ocr_config=ocr_config)

    # Create request with options
    request = ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
        process_options=process_options
    )

    # Process document
    result = client.process_document(request=request)
    return result.document
```

### Field Mask Processing

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest
from google.protobuf.field_mask_pb2 import FieldMask


def process_with_field_mask(
    client: DocumentProcessorServiceClient,
    processor_name: str,
    raw_document: "RawDocument",
    fields: list[str]
) -> "Document":
    """
    Process document returning only specified fields.

    Args:
        client: DocumentProcessorServiceClient instance
        processor_name: Full processor resource name
        raw_document: Raw document to process
        fields: List of field paths to return (e.g., ['text', 'pages.blocks'])

    Returns:
        Document: Processed document with only requested fields
    """
    # Create field mask restricting the response payload
    field_mask = FieldMask(paths=fields)

    # Create request with field mask
    request = ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
        field_mask=field_mask
    )

    # Process document
    result = client.process_document(request=request)
    return result.document
```

## Document Analysis

### Extract Text and Layout

```python { .api }
from google.cloud.documentai.types import Document


def analyze_document_text(document: Document) -> dict:
    """
    Analyze text content and layout from processed document.

    Args:
        document: Processed Document object

    Returns:
        dict: Analysis results including text statistics and layout info
    """
    analysis = {
        "total_text": document.text,
        "text_length": len(document.text),
        "pages": [],
        "text_segments": []
    }

    # Analyze each page
    for page_idx, page in enumerate(document.pages):
        page_info = {
            "page_number": page_idx + 1,  # 1-based for human readers
            "dimensions": {
                "width": page.dimension.width,
                "height": page.dimension.height,
                "unit": page.dimension.unit
            },
            "blocks": len(page.blocks),
            "paragraphs": len(page.paragraphs),
            "lines": len(page.lines),
            "tokens": len(page.tokens)
        }

        # Extract the text of each block via its text anchor
        for block in page.blocks:
            if block.layout and block.layout.text_anchor:
                text_segment = extract_text_from_anchor(
                    document.text,
                    block.layout.text_anchor
                )
                analysis["text_segments"].append({
                    "type": "block",
                    "page": page_idx + 1,
                    "text": text_segment,
                    "confidence": block.layout.confidence
                })

        analysis["pages"].append(page_info)

    return analysis


def extract_text_from_anchor(full_text: str, text_anchor: "Document.TextAnchor") -> str:
    """
    Extract text segment using TextAnchor.

    Args:
        full_text: Full document text
        text_anchor: TextAnchor specifying text location

    Returns:
        str: Extracted text segment
    """
    text_segments = []

    for segment in text_anchor.text_segments:
        # Unset (falsy) indices default to the start / end of the document
        start_index = int(segment.start_index) if segment.start_index else 0
        end_index = int(segment.end_index) if segment.end_index else len(full_text)
        text_segments.append(full_text[start_index:end_index])

    return "".join(text_segments)
```

### Extract Entities

```python { .api }
from google.cloud.documentai.types import Document


def extract_entities(document: Document) -> dict:
    """
    Extract and organize entities from processed document.

    Args:
        document: Processed Document object

    Returns:
        dict: Organized entities by type with confidence scores
    """
    entities_by_type = {}

    for entity in document.entities:
        entity_type = entity.type_

        if entity_type not in entities_by_type:
            entities_by_type[entity_type] = []

        # Extract entity information
        entity_info = {
            "text": entity.mention_text,
            "confidence": entity.confidence,
            "page_refs": []
        }

        # Add page references if available
        if entity.page_anchor:
            for page_ref in entity.page_anchor.page_refs:
                entity_info["page_refs"].append({
                    "page": page_ref.page + 1,  # Convert to 1-based
                    "layout_type": page_ref.layout_type,
                    "layout_id": page_ref.layout_id
                })

        # Add text anchor information
        if entity.text_anchor:
            entity_info["text_segments"] = []
            for segment in entity.text_anchor.text_segments:
                entity_info["text_segments"].append({
                    "start_index": int(segment.start_index or 0),
                    "end_index": int(segment.end_index or 0)
                })

        # Add nested properties (sub-entities) if available
        if entity.properties:
            entity_info["properties"] = []
            for prop in entity.properties:
                prop_info = {
                    "type": prop.type_,
                    "text": prop.mention_text,
                    "confidence": prop.confidence
                }
                entity_info["properties"].append(prop_info)

        entities_by_type[entity_type].append(entity_info)

    return entities_by_type
```

### Extract Tables

```python { .api }
from google.cloud.documentai.types import Document


def extract_tables(document: Document) -> list[dict]:
    """
    Extract table data from processed document.

    Args:
        document: Processed Document object

    Returns:
        list[dict]: List of tables with structured data
    """
    tables = []

    for page_idx, page in enumerate(document.pages):
        for table_idx, table in enumerate(page.tables):
            table_data = {
                "page": page_idx + 1,
                "table_index": table_idx,
                "rows": [],          # header + body rows, in document order
                "header_rows": [],
                "body_rows": []
            }

            # Process header rows first, mirroring visual order
            for row in table.header_rows:
                header_row = extract_table_row(document.text, row)
                table_data["header_rows"].append(header_row)
                table_data["rows"].append(header_row)

            for row in table.body_rows:
                body_row = extract_table_row(document.text, row)
                table_data["body_rows"].append(body_row)
                table_data["rows"].append(body_row)

            tables.append(table_data)

    return tables


def extract_table_row(full_text: str, row: "Document.Page.Table.TableRow") -> list[dict]:
    """
    Extract data from a table row.

    Uses extract_text_from_anchor (defined in the "Extract Text and Layout"
    section) to resolve cell text against the full document text.

    Args:
        full_text: Full document text
        row: Table row object

    Returns:
        list[dict]: List of cell data
    """
    cells = []

    for cell in row.cells:
        cell_data = {
            "text": "",
            "row_span": cell.row_span,
            "col_span": cell.col_span
        }

        # Extract cell text; cells without a text anchor stay empty
        if cell.layout and cell.layout.text_anchor:
            cell_data["text"] = extract_text_from_anchor(
                full_text,
                cell.layout.text_anchor
            ).strip()

        cells.append(cell_data)

    return cells
```

### Extract Form Fields

```python { .api }
from google.cloud.documentai.types import Document


def extract_form_fields(document: Document) -> dict:
    """
    Extract form fields (key-value pairs) from processed document.

    Uses extract_text_from_anchor (defined in the "Extract Text and Layout"
    section) to resolve field text against the full document text.

    Args:
        document: Processed Document object

    Returns:
        dict: Form fields organized as key-value pairs
    """
    form_fields = {}

    for page in document.pages:
        for form_field in page.form_fields:
            # Extract field name (key)
            field_name = ""
            if form_field.field_name and form_field.field_name.text_anchor:
                field_name = extract_text_from_anchor(
                    document.text,
                    form_field.field_name.text_anchor
                ).strip()

            # Extract field value
            field_value = ""
            if form_field.field_value and form_field.field_value.text_anchor:
                field_value = extract_text_from_anchor(
                    document.text,
                    form_field.field_value.text_anchor
                ).strip()

            # Store form field with confidence; fields without a
            # resolvable name are skipped
            if field_name:
                form_fields[field_name] = {
                    "value": field_value,
                    "name_confidence": form_field.field_name.confidence if form_field.field_name else 0.0,
                    "value_confidence": form_field.field_value.confidence if form_field.field_value else 0.0
                }

    return form_fields
```

## Async Document Processing

### Async Client Usage

```python { .api }
import asyncio
from google.cloud.documentai import DocumentProcessorServiceAsyncClient
from google.cloud.documentai.types import ProcessRequest, RawDocument


async def process_document_async(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str
) -> "Document":
    """
    Process document asynchronously.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Document processor ID
        file_path: Path to document file
        mime_type: MIME type of document

    Returns:
        Document: Processed document
    """
    client = DocumentProcessorServiceAsyncClient()
    try:
        # Build processor name
        name = client.processor_path(project_id, location, processor_id)

        # Read document
        with open(file_path, "rb") as document_file:
            document_content = document_file.read()

        # Create request
        raw_document = RawDocument(content=document_content, mime_type=mime_type)
        request = ProcessRequest(name=name, raw_document=raw_document)

        # Process asynchronously
        result = await client.process_document(request=request)
    finally:
        # Close the client even if processing raised
        await client.close()

    return result.document

# Example usage
async def main():
    document = await process_document_async(
        project_id="my-project",
        location="us",
        processor_id="abc123",
        file_path="document.pdf",
        mime_type="application/pdf"
    )
    print(f"Processed document: {len(document.text)} characters")

# Run async function
asyncio.run(main())
```

## Supported Document Types

### MIME Types

```python { .api }
# Supported MIME types for document processing
SUPPORTED_MIME_TYPES = {
    # PDF Documents
    "application/pdf": "PDF documents",

    # Image formats
    "image/jpeg": "JPEG images",
    "image/jpg": "JPG images",
    "image/png": "PNG images",
    "image/bmp": "BMP images",
    "image/tiff": "TIFF images",
    "image/tif": "TIF images",
    "image/gif": "GIF images (first frame only)",
    "image/webp": "WebP images",

    # Office documents (with OCR)
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "Word documents",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PowerPoint files",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "Excel files"
}

def validate_mime_type(mime_type: str) -> bool:
    """
    Check if MIME type is supported.

    Args:
        mime_type: MIME type to validate

    Returns:
        bool: True if supported, False otherwise
    """
    return mime_type in SUPPORTED_MIME_TYPES
```

### Document Size Limits

```python { .api }
import os

# Document processing limits
PROCESSING_LIMITS = {
    "max_file_size_bytes": 20 * 1024 * 1024,  # 20 MB
    "max_pages_per_document": 2000,
    "max_image_dimensions": {
        "width": 10000,
        "height": 10000
    },
    "timeout_seconds": 300  # 5 minutes
}

def validate_document_size(file_path: str) -> tuple[bool, str]:
    """
    Validate document meets size requirements.

    Args:
        file_path: Path to document file

    Returns:
        tuple[bool, str]: (is_valid, error_message); error_message is
        empty when the document is within limits
    """
    file_size = os.path.getsize(file_path)

    if file_size > PROCESSING_LIMITS["max_file_size_bytes"]:
        return False, f"File size ({file_size} bytes) exceeds limit ({PROCESSING_LIMITS['max_file_size_bytes']} bytes)"

    return True, ""
```

## Error Handling

### Common Processing Errors

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.api_core.exceptions import (
    NotFound,
    InvalidArgument,
    ResourceExhausted,
    DeadlineExceeded
)
from google.cloud.exceptions import GoogleCloudError

import time


def robust_process_document(
    client: DocumentProcessorServiceClient,
    request: "ProcessRequest",
    max_retries: int = 3
) -> "ProcessResponse":
    """
    Process document with error handling and retries.

    Args:
        client: DocumentProcessorServiceClient instance
        request: Process request
        max_retries: Maximum number of retry attempts

    Returns:
        ProcessResponse: Processing result

    Raises:
        Exception: If processing fails after all retries
    """
    for attempt in range(max_retries + 1):
        try:
            return client.process_document(request=request)

        except NotFound as e:
            # Processor not found - retrying cannot help
            raise Exception(f"Processor not found: {e}") from e

        except InvalidArgument as e:
            # Invalid request - retrying cannot help
            raise Exception(f"Invalid request: {e}") from e

        except ResourceExhausted as e:
            # Rate limit exceeded - wait and retry
            if attempt < max_retries:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Rate limit exceeded, waiting {wait_time}s (attempt {attempt + 1})")
                time.sleep(wait_time)
                continue
            raise Exception(f"Rate limit exceeded after {max_retries} retries: {e}") from e

        except DeadlineExceeded as e:
            # Timeout - retry immediately
            if attempt < max_retries:
                print(f"Request timeout, retrying (attempt {attempt + 1})")
                continue
            raise Exception(f"Request timeout after {max_retries} retries: {e}") from e

        except GoogleCloudError as e:
            # Other Google Cloud errors - back off and retry
            if attempt < max_retries:
                wait_time = 2 ** attempt
                print(f"Google Cloud error, retrying in {wait_time}s: {e}")
                time.sleep(wait_time)
                continue
            raise Exception(f"Google Cloud error after {max_retries} retries: {e}") from e

        except Exception as e:
            # Unexpected errors - don't retry
            raise Exception(f"Unexpected error: {e}") from e

    raise Exception("Maximum retries exceeded")
```

## Human Review Workflow

### Submit Document for Review

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ReviewDocumentRequest, Document


def submit_document_for_review(
    project_id: str,
    location: str,
    processor_id: str,
    document: Document,
    enable_schema_validation: bool = True
) -> "Operation":
    """
    Submit a processed document for human review.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        document: Processed document to review
        enable_schema_validation: Enable schema validation during review

    Returns:
        Operation: Long-running operation for review process
    """
    client = DocumentProcessorServiceClient()

    # Build human review config path (child resource of the processor)
    human_review_config = f"projects/{project_id}/locations/{location}/processors/{processor_id}/humanReviewConfig"

    # Create review request with the document embedded inline
    request = ReviewDocumentRequest(
        human_review_config=human_review_config,
        inline_document=document,
        enable_schema_validation=enable_schema_validation
    )

    # Submit for review
    operation = client.review_document(request=request)

    print("Document submitted for human review")
    print(f"Operation: {operation.operation.name}")

    return operation

def check_review_status(operation: "Operation") -> dict:
    """
    Check the status of a human review operation.

    Args:
        operation: Review operation object

    Returns:
        dict: Review status information
    """
    if operation.done():
        if operation.exception():
            return {
                "status": "failed",
                "error": str(operation.exception())
            }
        else:
            result = operation.result()
            return {
                "status": "completed",
                "gcs_destination": result.gcs_destination,
                "rejection_reason": result.rejection_reason
            }
    else:
        return {"status": "in_progress"}
```

## Complete Processing Example

```python { .api }
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, RawDocument


def complete_document_processing_example():
    """Complete example of document processing with analysis.

    Relies on the helper functions defined in the earlier sections of
    this guide (process_document_from_file, analyze_document_text,
    extract_entities, extract_tables, extract_form_fields).
    """
    # Configuration
    project_id = "my-project"
    location = "us"
    processor_id = "abc123def456"
    file_path = "sample_invoice.pdf"

    # Process document (the helper constructs its own client)
    document = process_document_from_file(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
        mime_type="application/pdf"
    )

    # Analyze results
    print("=== DOCUMENT ANALYSIS ===")

    # 1. Basic text analysis
    text_analysis = analyze_document_text(document)
    print(f"Total text length: {text_analysis['text_length']} characters")
    print(f"Number of pages: {len(text_analysis['pages'])}")

    # 2. Extract entities
    entities = extract_entities(document)
    print(f"\nFound {len(entities)} entity types:")
    for entity_type, entity_list in entities.items():
        print(f"  {entity_type}: {len(entity_list)} instances")
        for entity in entity_list[:3]:  # Show first 3
            print(f"    - {entity['text']} (confidence: {entity['confidence']:.2f})")

    # 3. Extract tables
    tables = extract_tables(document)
    print(f"\nFound {len(tables)} tables:")
    for table in tables:
        print(f"  Table on page {table['page']}: {len(table['rows'])} rows")

    # 4. Extract form fields
    form_fields = extract_form_fields(document)
    print(f"\nFound {len(form_fields)} form fields:")
    for field_name, field_info in form_fields.items():
        print(f"  {field_name}: {field_info['value']}")

if __name__ == "__main__":
    complete_document_processing_example()
```

This comprehensive guide covers all aspects of document processing with Google Cloud Document AI, from basic operations to advanced analysis and error handling.