CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-google-cloud-documentai

Google Cloud Document AI client library for extracting structured information from documents using machine learning

Pending
Overview
Eval results
Files

docs/document-processing.md

Document Processing Operations

This guide covers core document processing operations using Google Cloud Document AI, including synchronous processing, handling different document formats, and extracting structured data.

Process Single Document

Basic Document Processing

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, RawDocument

def process_document_from_file(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str
) -> "Document":
    """
    Run a local file through a Document AI processor synchronously.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location (e.g., 'us', 'eu').
        processor_id: Document processor ID.
        file_path: Path to the document file on disk.
        mime_type: MIME type of the document.

    Returns:
        Document: Processed document with extracted data.
    """
    client = DocumentProcessorServiceClient()

    # Fully-qualified processor resource name.
    processor_name = client.processor_path(project_id, location, processor_id)

    # Load the raw bytes so they can be sent inline with the request.
    with open(file_path, "rb") as handle:
        payload = handle.read()

    request = ProcessRequest(
        name=processor_name,
        raw_document=RawDocument(content=payload, mime_type=mime_type),
    )

    # Synchronous call; the response carries the parsed document.
    return client.process_document(request=request).document

Process Cloud Storage Document

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, GcsDocument

def process_gcs_document(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_uri: str,
    mime_type: str
) -> "Document":
    """
    Process a document that already lives in Google Cloud Storage.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location.
        processor_id: Document processor ID.
        gcs_uri: Cloud Storage URI (gs://bucket/path/file.pdf).
        mime_type: MIME type of the document.

    Returns:
        Document: Processed document with extracted data.
    """
    client = DocumentProcessorServiceClient()

    # Fully-qualified processor resource name.
    processor_name = client.processor_path(project_id, location, processor_id)

    # Reference the object in place rather than uploading bytes inline.
    request = ProcessRequest(
        name=processor_name,
        gcs_document=GcsDocument(gcs_uri=gcs_uri, mime_type=mime_type),
    )

    return client.process_document(request=request).document

Processing Options

OCR Configuration

from google.cloud.documentai.types import ProcessRequest, OcrConfig, ProcessOptions

def process_with_ocr_options(
    client: DocumentProcessorServiceClient,
    processor_name: str,
    raw_document: "RawDocument",
    enable_native_pdf_parsing: bool = True,
    enable_image_quality_scores: bool = False,
    enable_symbol: bool = False
) -> "Document":
    """
    Process a document with an explicit OCR configuration.

    Args:
        client: DocumentProcessorServiceClient instance.
        processor_name: Full processor resource name.
        raw_document: Raw document to process.
        enable_native_pdf_parsing: Use native PDF parsing when possible.
        enable_image_quality_scores: Include image quality scores.
        enable_symbol: Enable symbol detection.

    Returns:
        Document: Processed document.
    """
    # Bundle the OCR switches into the request's process options.
    options = ProcessOptions(
        ocr_config=OcrConfig(
            enable_native_pdf_parsing=enable_native_pdf_parsing,
            enable_image_quality_scores=enable_image_quality_scores,
            enable_symbol=enable_symbol,
        )
    )

    response = client.process_document(
        request=ProcessRequest(
            name=processor_name,
            raw_document=raw_document,
            process_options=options,
        )
    )
    return response.document

Field Mask Processing

from google.cloud.documentai.types import ProcessRequest
from google.protobuf.field_mask_pb2 import FieldMask

def process_with_field_mask(
    client: DocumentProcessorServiceClient,
    processor_name: str,
    raw_document: "RawDocument",
    fields: list[str]
) -> "Document":
    """
    Process a document, asking the API to return only selected fields.

    Args:
        client: DocumentProcessorServiceClient instance.
        processor_name: Full processor resource name.
        raw_document: Raw document to process.
        fields: Field paths to return (e.g., ['text', 'pages.blocks']).

    Returns:
        Document: Processed document populated only with requested fields.
    """
    # Restricting returned fields keeps the response payload small.
    response = client.process_document(
        request=ProcessRequest(
            name=processor_name,
            raw_document=raw_document,
            field_mask=FieldMask(paths=fields),
        )
    )
    return response.document

Document Analysis

Extract Text and Layout

from google.cloud.documentai.types import Document

def analyze_document_text(document: Document) -> dict:
    """
    Summarize text content and layout of a processed document.

    Args:
        document: Processed Document object.

    Returns:
        dict: Full text, its length, per-page layout statistics, and the
            text (with confidence) of every anchored block.
    """
    analysis = {
        "total_text": document.text,
        "text_length": len(document.text),
        "pages": [],
        "text_segments": []
    }

    for page_number, page in enumerate(document.pages, start=1):
        # Record the text behind each block that carries a text anchor.
        for block in page.blocks:
            layout = block.layout
            if layout and layout.text_anchor:
                analysis["text_segments"].append({
                    "type": "block",
                    "page": page_number,
                    "text": extract_text_from_anchor(document.text, layout.text_anchor),
                    "confidence": layout.confidence
                })

        # Per-page structural statistics.
        analysis["pages"].append({
            "page_number": page_number,
            "dimensions": {
                "width": page.dimension.width,
                "height": page.dimension.height,
                "unit": page.dimension.unit
            },
            "blocks": len(page.blocks),
            "paragraphs": len(page.paragraphs),
            "lines": len(page.lines),
            "tokens": len(page.tokens)
        })

    return analysis

def extract_text_from_anchor(full_text: str, text_anchor: "Document.TextAnchor") -> str:
    """
    Extract the text covered by a TextAnchor from the full document text.

    An anchor may reference several disjoint segments; they are
    concatenated in order.

    Args:
        full_text: Full document text
        text_anchor: TextAnchor specifying text location

    Returns:
        str: Extracted text segment (empty string for an empty anchor)
    """
    pieces = []

    for segment in text_anchor.text_segments:
        # Proto3 int fields default to 0 when unset, so a plain `or 0`
        # is safe for start_index. end_index must only fall back to the
        # end of the document when genuinely missing (None): an explicit
        # end_index of 0 denotes an EMPTY segment, and the previous
        # truthiness check wrongly expanded it to the whole document.
        start_index = int(segment.start_index or 0)
        end_index = len(full_text) if segment.end_index is None else int(segment.end_index)
        pieces.append(full_text[start_index:end_index])

    return "".join(pieces)

Extract Entities

from google.cloud.documentai.types import Document

def extract_entities(document: Document) -> dict:
    """
    Extract and organize entities from processed document.
    
    Args:
        document: Processed Document object
        
    Returns:
        dict: Organized entities by type with confidence scores
    """
    entities_by_type = {}
    
    for entity in document.entities:
        entity_type = entity.type_
        
        if entity_type not in entities_by_type:
            entities_by_type[entity_type] = []
        
        # Extract entity information
        entity_info = {
            "text": entity.mention_text,
            "confidence": entity.confidence,
            "page_refs": []
        }
        
        # Add page references if available
        if entity.page_anchor:
            for page_ref in entity.page_anchor.page_refs:
                entity_info["page_refs"].append({
                    "page": page_ref.page + 1,  # Convert to 1-based
                    "layout_type": page_ref.layout_type,
                    "layout_id": page_ref.layout_id
                })
        
        # Add text anchor information
        if entity.text_anchor:
            entity_info["text_segments"] = []
            for segment in entity.text_anchor.text_segments:
                entity_info["text_segments"].append({
                    "start_index": int(segment.start_index or 0),
                    "end_index": int(segment.end_index or 0)
                })
        
        # Add properties if available
        if entity.properties:
            entity_info["properties"] = []
            for prop in entity.properties:
                prop_info = {
                    "type": prop.type_,
                    "text": prop.mention_text,
                    "confidence": prop.confidence
                }
                entity_info["properties"].append(prop_info)
        
        entities_by_type[entity_type].append(entity_info)
    
    return entities_by_type

Extract Tables

from google.cloud.documentai.types import Document

def extract_tables(document: Document) -> list[dict]:
    """
    Pull structured table data out of a processed document.

    Args:
        document: Processed Document object.

    Returns:
        list[dict]: One entry per table, with header/body rows split out
            and a combined "rows" list in header-then-body order.
    """
    extracted = []

    for page_number, page in enumerate(document.pages, start=1):
        for table_index, table in enumerate(page.tables):
            header_rows = [
                extract_table_row(document.text, row) for row in table.header_rows
            ]
            body_rows = [
                extract_table_row(document.text, row) for row in table.body_rows
            ]

            extracted.append({
                "page": page_number,
                "table_index": table_index,
                "rows": header_rows + body_rows,
                "header_rows": header_rows,
                "body_rows": body_rows
            })

    return extracted

def extract_table_row(full_text: str, row: "Document.Page.Table.TableRow") -> list[dict]:
    """
    Convert one table row into a list of cell dictionaries.

    Args:
        full_text: Full document text.
        row: Table row object.

    Returns:
        list[dict]: Per-cell stripped text plus row/column span counts.
    """
    extracted = []

    for cell in row.cells:
        # Cells without a layout or text anchor contribute empty text.
        text = ""
        layout = cell.layout
        if layout and layout.text_anchor:
            text = extract_text_from_anchor(full_text, layout.text_anchor).strip()

        extracted.append({
            "text": text,
            "row_span": cell.row_span,
            "col_span": cell.col_span
        })

    return extracted

Extract Form Fields

from google.cloud.documentai.types import Document

def extract_form_fields(document: Document) -> dict:
    """
    Collect form key/value pairs detected across all pages.

    Args:
        document: Processed Document object.

    Returns:
        dict: Mapping of field name -> {"value", "name_confidence",
            "value_confidence"}. Fields whose name could not be
            resolved are skipped.
    """

    def _anchored_text(layout) -> str:
        # Resolve the text behind a form-field name/value layout, if any.
        if layout and layout.text_anchor:
            return extract_text_from_anchor(document.text, layout.text_anchor).strip()
        return ""

    fields = {}

    for page in document.pages:
        for form_field in page.form_fields:
            key = _anchored_text(form_field.field_name)
            if not key:
                continue  # unnamed fields cannot be keyed

            fields[key] = {
                "value": _anchored_text(form_field.field_value),
                "name_confidence": form_field.field_name.confidence if form_field.field_name else 0.0,
                "value_confidence": form_field.field_value.confidence if form_field.field_value else 0.0
            }

    return fields

Async Document Processing

Async Client Usage

import asyncio
from google.cloud.documentai import DocumentProcessorServiceAsyncClient
from google.cloud.documentai.types import ProcessRequest, RawDocument

async def process_document_async(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str
) -> "Document":
    """
    Process a local document with the async Document AI client.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Document processor ID
        file_path: Path to document file
        mime_type: MIME type of document

    Returns:
        Document: Processed document
    """
    client = DocumentProcessorServiceAsyncClient()

    # Build processor name
    name = client.processor_path(project_id, location, processor_id)

    # File I/O stays synchronous here; only the RPC is awaited.
    with open(file_path, "rb") as document_file:
        document_content = document_file.read()

    # Create request
    raw_document = RawDocument(content=document_content, mime_type=mime_type)
    request = ProcessRequest(name=name, raw_document=raw_document)

    # Process asynchronously; always close the client afterwards so the
    # underlying gRPC channel is released even when the RPC raises
    # (previously a failure leaked the channel).
    try:
        result = await client.process_document(request=request)
    finally:
        await client.close()

    return result.document

# Example usage
async def main():
    """Drive a sample async processing run and report the text length."""
    result = await process_document_async(
        project_id="my-project",
        location="us",
        processor_id="abc123",
        file_path="document.pdf",
        mime_type="application/pdf"
    )
    print(f"Processed document: {len(result.text)} characters")

# Run async function
asyncio.run(main())  # script entry point: blocks until the sample coroutine completes

Supported Document Types

MIME Types

# Supported MIME types for document processing
SUPPORTED_MIME_TYPES = {
    # PDF Documents
    "application/pdf": "PDF documents",
    
    # Image formats
    "image/jpeg": "JPEG images", 
    "image/jpg": "JPG images",  # NOTE(review): non-standard alias of image/jpeg — confirm the API accepts it
    "image/png": "PNG images",
    "image/bmp": "BMP images",
    "image/tiff": "TIFF images",
    "image/tif": "TIF images",  # NOTE(review): non-standard alias of image/tiff — confirm the API accepts it
    "image/gif": "GIF images (first frame only)",
    "image/webp": "WebP images",
    
    # Office documents (with OCR)
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "Word documents",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PowerPoint files",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "Excel files"
}

def validate_mime_type(mime_type: str) -> bool:
    """
    Report whether a MIME type can be submitted for processing.

    Args:
        mime_type: MIME type to validate.

    Returns:
        bool: True if supported, False otherwise.
    """
    # Membership in the supported-type registry is the whole check.
    return mime_type in SUPPORTED_MIME_TYPES

Document Size Limits

# Document processing limits
# NOTE(review): these values look like the documented synchronous-processing
# quotas — verify against the current Document AI quota documentation.
PROCESSING_LIMITS = {
    "max_file_size_bytes": 20 * 1024 * 1024,  # 20 MB
    "max_pages_per_document": 2000,
    "max_image_dimensions": {
        "width": 10000,
        "height": 10000
    },
    "timeout_seconds": 300  # 5 minutes
}

def validate_document_size(file_path: str) -> tuple[bool, str]:
    """
    Check that a document file is within the configured size limit.

    Args:
        file_path: Path to document file.

    Returns:
        tuple[bool, str]: (is_valid, error_message); the message is empty
            when the file passes validation.
    """
    import os

    limit = PROCESSING_LIMITS["max_file_size_bytes"]
    size = os.path.getsize(file_path)

    if size <= limit:
        return True, ""

    return False, f"File size ({size} bytes) exceeds limit ({limit} bytes)"

Error Handling

Common Processing Errors

from google.cloud.documentai import DocumentProcessorServiceClient
from google.api_core.exceptions import (
    NotFound,
    InvalidArgument, 
    ResourceExhausted,
    DeadlineExceeded
)
from google.cloud.exceptions import GoogleCloudError

def robust_process_document(
    client: DocumentProcessorServiceClient,
    request: ProcessRequest,
    max_retries: int = 3
) -> "ProcessResponse":
    """
    Process document with error handling and retries.

    Transient failures (rate limiting, timeouts, other Google Cloud
    errors) are retried with exponential backoff; permanent failures
    (missing processor, invalid request) are raised immediately.

    Args:
        client: DocumentProcessorServiceClient instance
        request: Process request
        max_retries: Maximum number of retry attempts

    Returns:
        ProcessResponse: Processing result

    Raises:
        Exception: If processing fails after all retries, or immediately
            for non-retryable errors. The original error is attached as
            __cause__ via exception chaining (``raise ... from e``).
    """
    import time

    for attempt in range(max_retries + 1):
        try:
            return client.process_document(request=request)

        except NotFound as e:
            # Processor not found - don't retry
            raise Exception(f"Processor not found: {e}") from e

        except InvalidArgument as e:
            # Invalid request - don't retry
            raise Exception(f"Invalid request: {e}") from e

        except ResourceExhausted as e:
            # Rate limit exceeded - wait and retry
            if attempt < max_retries:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Rate limit exceeded, waiting {wait_time}s (attempt {attempt + 1})")
                time.sleep(wait_time)
                continue
            raise Exception(f"Rate limit exceeded after {max_retries} retries: {e}") from e

        except DeadlineExceeded as e:
            # Timeout - retry immediately. (The per-call deadline is set by
            # the client configuration; it is NOT extended here, contrary
            # to what a previous comment claimed.)
            if attempt < max_retries:
                print(f"Request timeout, retrying (attempt {attempt + 1})")
                continue
            raise Exception(f"Request timeout after {max_retries} retries: {e}") from e

        except GoogleCloudError as e:
            # Other Google Cloud errors - backoff then retry
            if attempt < max_retries:
                wait_time = 2 ** attempt
                print(f"Google Cloud error, retrying in {wait_time}s: {e}")
                time.sleep(wait_time)
                continue
            raise Exception(f"Google Cloud error after {max_retries} retries: {e}") from e

        except Exception as e:
            # Unexpected errors - don't retry
            raise Exception(f"Unexpected error: {e}") from e

    # Unreachable in practice: every loop iteration returns or raises.
    raise Exception("Maximum retries exceeded")

Human Review Workflow

Submit Document for Review

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ReviewDocumentRequest, Document

def submit_document_for_review(
    project_id: str,
    location: str,
    processor_id: str,
    document: Document,
    enable_schema_validation: bool = True
) -> "Operation":
    """
    Queue a processed document for human review.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location.
        processor_id: Processor ID.
        document: Processed document to review.
        enable_schema_validation: Enable schema validation during review.

    Returns:
        Operation: Long-running operation tracking the review process.
    """
    client = DocumentProcessorServiceClient()

    # The review config is a fixed child resource of the processor.
    review_config = (
        f"projects/{project_id}/locations/{location}"
        f"/processors/{processor_id}/humanReviewConfig"
    )

    operation = client.review_document(
        request=ReviewDocumentRequest(
            human_review_config=review_config,
            inline_document=document,
            enable_schema_validation=enable_schema_validation,
        )
    )

    print("Document submitted for human review")
    print(f"Operation: {operation.operation.name}")

    return operation

def check_review_status(operation: "Operation") -> dict:
    """
    Summarize the state of a long-running human review operation.

    Args:
        operation: Review operation object.

    Returns:
        dict: {"status": "in_progress"} while running; on completion
            either a "failed" payload with the error text or a
            "completed" payload with destination and rejection reason.
    """
    # Guard clause: still running.
    if not operation.done():
        return {"status": "in_progress"}

    error = operation.exception()
    if error:
        return {
            "status": "failed",
            "error": str(error)
        }

    outcome = operation.result()
    return {
        "status": "completed",
        "gcs_destination": outcome.gcs_destination,
        "rejection_reason": outcome.rejection_reason
    }

Complete Processing Example

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest, RawDocument

def complete_document_processing_example():
    """End-to-end demo: process an invoice and print an analysis summary."""

    # Client and processor configuration.
    client = DocumentProcessorServiceClient()
    project_id = "my-project"
    location = "us"
    processor_id = "abc123def456"
    file_path = "sample_invoice.pdf"

    # Run the document through the processor.
    document = process_document_from_file(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
        mime_type="application/pdf"
    )

    print("=== DOCUMENT ANALYSIS ===")

    # Text and layout statistics.
    analysis = analyze_document_text(document)
    print(f"Total text length: {analysis['text_length']} characters")
    print(f"Number of pages: {len(analysis['pages'])}")

    # Entities, grouped by type (show up to three examples per type).
    grouped_entities = extract_entities(document)
    print(f"\nFound {len(grouped_entities)} entity types:")
    for entity_type, instances in grouped_entities.items():
        print(f"  {entity_type}: {len(instances)} instances")
        for item in instances[:3]:
            print(f"    - {item['text']} (confidence: {item['confidence']:.2f})")

    # Tables.
    tables = extract_tables(document)
    print(f"\nFound {len(tables)} tables:")
    for table in tables:
        print(f"  Table on page {table['page']}: {len(table['rows'])} rows")

    # Form fields (key/value pairs).
    fields = extract_form_fields(document)
    print(f"\nFound {len(fields)} form fields:")
    for name, info in fields.items():
        print(f"  {name}: {info['value']}")

if __name__ == "__main__":
    complete_document_processing_example()

This comprehensive guide covers all aspects of document processing with Google Cloud Document AI, from basic operations to advanced analysis and error handling.

Install with Tessl CLI

npx tessl i tessl/pypi-google-cloud-documentai

docs

batch-operations.md

beta-features.md

document-processing.md

document-types.md

index.md

processor-management.md

tile.json