Google Cloud Document AI client library for extracting structured information from documents using machine learning
npx @tessl/cli install tessl/pypi-google-cloud-documentai@3.6.0

Google Cloud Document AI is a machine learning service that extracts structured data from documents using pre-trained and custom document processors. The service can process various document types including invoices, receipts, forms, contracts, and other business documents.
Package Name: google-cloud-documentai
Version: 3.6.0
Documentation: Google Cloud Document AI Documentation
pip install google-cloud-documentai

This package requires Google Cloud authentication. Set up authentication using one of these methods:
Application Default Credentials (Recommended):

gcloud auth application-default login

Service Account Key:

export GOOGLE_APPLICATION_CREDENTIALS="path/to/service-account-key.json"

Environment Variables:
export GOOGLE_CLOUD_PROJECT="your-project-id"
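If you prefer not to rely on the environment, the client also accepts explicit service account credentials. A minimal sketch (the key path is a placeholder):

from google.oauth2 import service_account
from google.cloud.documentai import DocumentProcessorServiceClient

# Load credentials from a downloaded service account key file (placeholder path)
credentials = service_account.Credentials.from_service_account_file(
    "path/to/service-account-key.json"
)

# Pass the credentials explicitly instead of using Application Default Credentials
client = DocumentProcessorServiceClient(credentials=credentials)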
# Main module - exports v1 (stable) API
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai import Document, ProcessRequest, ProcessResponse
# Alternative import pattern
from google.cloud import documentai
# For async operations
from google.cloud.documentai import DocumentProcessorServiceAsyncClient
# Core types for document processing
from google.cloud.documentai.types import (
    RawDocument,
    GcsDocument,
    Processor,
    ProcessorType,
    BoundingPoly,
    Vertex,
)

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import RawDocument, ProcessRequest
def process_document(project_id: str, location: str, processor_id: str, file_path: str, mime_type: str):
    """
    Process a document using Google Cloud Document AI.

    Args:
        project_id: Google Cloud project ID
        location: Processor location (e.g., 'us' or 'eu')
        processor_id: ID of the document processor to use
        file_path: Path to the document file
        mime_type: MIME type of the document (e.g., 'application/pdf')

    Returns:
        Document: Processed document with extracted data
    """
    # Initialize the client
    client = DocumentProcessorServiceClient()

    # The full resource name of the processor
    name = client.processor_path(project_id, location, processor_id)

    # Read the document file
    with open(file_path, "rb") as document_file:
        document_content = document_file.read()

    # Create raw document
    raw_document = RawDocument(content=document_content, mime_type=mime_type)

    # Configure the process request
    request = ProcessRequest(name=name, raw_document=raw_document)

    # Process the document
    result = client.process_document(request=request)

    # Access processed document
    document = result.document
    print(f"Document text: {document.text}")
    print(f"Number of pages: {len(document.pages)}")

    # Extract entities
    for entity in document.entities:
        print(f"Entity: {entity.type_} = {entity.mention_text}")

    return document

# Example usage
document = process_document(
    project_id="my-project",
    location="us",
    processor_id="abc123def456",
    file_path="invoice.pdf",
    mime_type="application/pdf"
)

Google Cloud Document AI follows this processing workflow: select or create a processor, send it the document (inline bytes or a Cloud Storage reference), and read the extracted data from the returned Document.

Processors are AI models that extract data from specific document types such as invoices, receipts, forms, and contracts.

The Document type represents processed documents with the full extracted text, per-page layout (blocks, paragraphs, lines, and tokens), and the detected entities.

Processors are deployed in specific regions:

us: United States (Iowa)
eu: Europe (Belgium)
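Processors outside the default us location generally require the matching regional API endpoint. A minimal sketch using client_options (the endpoint string is assumed to follow the location name):

from google.api_core.client_options import ClientOptions
from google.cloud.documentai import DocumentProcessorServiceClient

location = "eu"  # must match the processor's location

# Point the client at the regional endpoint for that location
client = DocumentProcessorServiceClient(
    client_options=ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
)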
Core functionality for processing individual and batch documents.

# Process single document
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ProcessRequest
client = DocumentProcessorServiceClient()
# Attach a document payload (e.g., raw_document) as shown in the quickstart above
request = ProcessRequest(name="projects/my-project/locations/us/processors/abc123")
result = client.process_document(request=request)

→ Document Processing Operations
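The DocumentProcessorServiceAsyncClient shown in the imports follows the same request and response shapes. A minimal async sketch (the processor name and document bytes are placeholders):

import asyncio

from google.cloud.documentai import DocumentProcessorServiceAsyncClient
from google.cloud.documentai.types import ProcessRequest, RawDocument

async def process_async(name: str, content: bytes) -> str:
    client = DocumentProcessorServiceAsyncClient()
    request = ProcessRequest(
        name=name,
        raw_document=RawDocument(content=content, mime_type="application/pdf"),
    )
    # Same process_document call as the sync client, but awaitable
    result = await client.process_document(request=request)
    return result.document.text

# asyncio.run(process_async("projects/my-project/locations/us/processors/abc123", pdf_bytes))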
Manage processor lifecycle including creation, deployment, and training.
# List available processors
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import ListProcessorsRequest
client = DocumentProcessorServiceClient()
request = ListProcessorsRequest(parent="projects/my-project/locations/us")
response = client.list_processors(request=request)
for processor in response.processors:
print(f"Processor: {processor.display_name} ({processor.name})")Work with document structures, entities, and type definitions.
Work with document structures, entities, and type definitions.

# Access document structure
from google.cloud.documentai.types import Document
def analyze_document_structure(document: Document):
    """Analyze the structure of a processed document."""
    print(f"Total text length: {len(document.text)}")

    # Analyze pages
    for i, page in enumerate(document.pages):
        print(f"Page {i+1}: {len(page.blocks)} blocks, {len(page.paragraphs)} paragraphs")

    # Analyze entities by type
    entity_types = {}
    for entity in document.entities:
        entity_type = entity.type_
        if entity_type not in entity_types:
            entity_types[entity_type] = []
        entity_types[entity_type].append(entity.mention_text)

    for entity_type, mentions in entity_types.items():
print(f"{entity_type}: {len(mentions)} instances")Process multiple documents asynchronously for high-volume workflows.
Process multiple documents asynchronously for high-volume workflows.

# Batch process documents
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai.types import BatchProcessRequest, BatchDocumentsInputConfig, GcsDocuments
client = DocumentProcessorServiceClient()
# Configure batch request
gcs_documents = GcsDocuments(documents=[
    {"gcs_uri": "gs://my-bucket/doc1.pdf", "mime_type": "application/pdf"},
    {"gcs_uri": "gs://my-bucket/doc2.pdf", "mime_type": "application/pdf"}
])

request = BatchProcessRequest(
    name="projects/my-project/locations/us/processors/abc123",
    # input_documents expects a BatchDocumentsInputConfig wrapping the GCS documents
    input_documents=BatchDocumentsInputConfig(gcs_documents=gcs_documents),
    document_output_config={
        "gcs_output_config": {"gcs_uri": "gs://my-bucket/output/"}
    }
)
operation = client.batch_process_documents(request=request)
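batch_process_documents returns a long-running operation; the processed documents are written to the Cloud Storage prefix in document_output_config rather than returned inline. A minimal sketch of waiting for completion (the timeout value is arbitrary):

from google.cloud.documentai.types import BatchProcessMetadata

# Block until the batch finishes (raises on failure or timeout)
operation.result(timeout=600)

# Inspect per-document status from the operation metadata
metadata = BatchProcessMetadata(operation.metadata)
print(f"Batch state: {metadata.state.name}")
for status in metadata.individual_process_statuses:
    print(f"{status.input_gcs_source} -> {status.output_gcs_destination}")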
Access experimental features including dataset management and enhanced document processing.

# Beta features - DocumentService for dataset management
from google.cloud.documentai_v1beta3 import DocumentServiceClient
from google.cloud.documentai_v1beta3.types import Dataset
client = DocumentServiceClient()
# List documents in a dataset
request = {"parent": "projects/my-project/locations/us/processors/abc123/dataset"}
response = client.list_documents(request=request)

The main google.cloud.documentai module exports the stable v1 API:
google.cloud.documentai
google.cloud.documentai_v1

Extended API with additional features:
google.cloud.documentai_v1beta3

Handle API errors using the standard Google Cloud exception classes:

from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.exceptions import GoogleCloudError
from google.api_core.exceptions import NotFound, InvalidArgument
client = DocumentProcessorServiceClient()
try:
    # Process document
    result = client.process_document(request=request)
except NotFound as e:
    print(f"Processor not found: {e}")
except InvalidArgument as e:
    print(f"Invalid request: {e}")
except GoogleCloudError as e:
    print(f"Google Cloud error: {e}")
except Exception as e:
print(f"Unexpected error: {e}")Google Cloud Document AI uses hierarchical resource names:
from google.cloud.documentai import DocumentProcessorServiceClient
client = DocumentProcessorServiceClient()
# Build resource names using helper methods
processor_path = client.processor_path("my-project", "us", "processor-id")
# Result: "projects/my-project/locations/us/processors/processor-id"
processor_version_path = client.processor_version_path(
    "my-project", "us", "processor-id", "version-id"
)
# Result: "projects/my-project/locations/us/processors/processor-id/processorVersions/version-id"
location_path = client.common_location_path("my-project", "us")
# Result: "projects/my-project/locations/us"