or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

batch-operations.mdbeta-features.mddocument-processing.mddocument-types.mdindex.mdprocessor-management.md

document-processing.mddocs/

0

# Document Processing Operations

1

2

This guide covers core document processing operations using Google Cloud Document AI, including synchronous processing, handling different document formats, and extracting structured data.

3

4

## Process Single Document

5

6

### Basic Document Processing

7

8

```python { .api }

9

from google.cloud.documentai import DocumentProcessorServiceClient

10

from google.cloud.documentai.types import ProcessRequest, RawDocument

11

12

def process_document_from_file(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str
) -> "Document":
    """Run a local file through a Document AI processor.

    Args:
        project_id: Google Cloud project ID
        location: Processor location (e.g., 'us', 'eu')
        processor_id: Document processor ID
        file_path: Path to the document file on disk
        mime_type: MIME type of the document

    Returns:
        Document: Processed document with extracted data
    """
    client = DocumentProcessorServiceClient()

    # Fully-qualified resource name of the processor.
    processor_name = client.processor_path(project_id, location, processor_id)

    # Load the raw bytes of the document.
    with open(file_path, "rb") as fh:
        payload = fh.read()

    # Wrap the bytes and submit a synchronous processing request.
    request = ProcessRequest(
        name=processor_name,
        raw_document=RawDocument(content=payload, mime_type=mime_type),
    )
    return client.process_document(request=request).document

57

```

58

59

### Process Cloud Storage Document

60

61

```python { .api }

62

from google.cloud.documentai import DocumentProcessorServiceClient

63

from google.cloud.documentai.types import ProcessRequest, GcsDocument

64

65

def process_gcs_document(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_uri: str,
    mime_type: str
) -> "Document":
    """Process a document that already lives in Google Cloud Storage.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Document processor ID
        gcs_uri: Cloud Storage URI (gs://bucket/path/file.pdf)
        mime_type: MIME type of the document

    Returns:
        Document: Processed document with extracted data
    """
    client = DocumentProcessorServiceClient()

    # Fully-qualified resource name of the processor.
    processor_name = client.processor_path(project_id, location, processor_id)

    # Reference the document in place — no bytes are downloaded client-side.
    source = GcsDocument(gcs_uri=gcs_uri, mime_type=mime_type)

    request = ProcessRequest(name=processor_name, gcs_document=source)
    return client.process_document(request=request).document

106

```

107

108

## Processing Options

109

110

### OCR Configuration

111

112

```python { .api }

113

from google.cloud.documentai.types import ProcessRequest, OcrConfig, ProcessOptions

114

115

def process_with_ocr_options(
    client: DocumentProcessorServiceClient,
    processor_name: str,
    raw_document: "RawDocument",
    enable_native_pdf_parsing: bool = True,
    enable_image_quality_scores: bool = False,
    enable_symbol: bool = False
) -> "Document":
    """Process a raw document with explicit OCR settings.

    Args:
        client: DocumentProcessorServiceClient instance
        processor_name: Full processor resource name
        raw_document: Raw document to process
        enable_native_pdf_parsing: Use native PDF parsing when possible
        enable_image_quality_scores: Include image quality scores
        enable_symbol: Enable symbol detection

    Returns:
        Document: Processed document
    """
    # Bundle the OCR switches into per-request process options.
    options = ProcessOptions(
        ocr_config=OcrConfig(
            enable_native_pdf_parsing=enable_native_pdf_parsing,
            enable_image_quality_scores=enable_image_quality_scores,
            enable_symbol=enable_symbol,
        )
    )

    request = ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
        process_options=options,
    )
    return client.process_document(request=request).document

157

```

158

159

### Field Mask Processing

160

161

```python { .api }

162

from google.cloud.documentai.types import ProcessRequest

163

from google.protobuf.field_mask_pb2 import FieldMask

164

165

def process_with_field_mask(
    client: DocumentProcessorServiceClient,
    processor_name: str,
    raw_document: "RawDocument",
    fields: list[str]
) -> "Document":
    """Process a document, asking the service to return only selected fields.

    Args:
        client: DocumentProcessorServiceClient instance
        processor_name: Full processor resource name
        raw_document: Raw document to process
        fields: List of field paths to return (e.g., ['text', 'pages.blocks'])

    Returns:
        Document: Processed document with only requested fields
    """
    # A FieldMask restricts the response payload server-side.
    request = ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
        field_mask=FieldMask(paths=fields),
    )
    return client.process_document(request=request).document

196

```

197

198

## Document Analysis

199

200

### Extract Text and Layout

201

202

```python { .api }

203

from google.cloud.documentai.types import Document

204

205

def analyze_document_text(document: "Document") -> dict:
    """Summarize text content and per-page layout statistics.

    The annotation is quoted to match the forward-reference style used by
    the other helpers in this guide ("Document", "RawDocument").

    Args:
        document: Processed Document object

    Returns:
        dict: Analysis results including text statistics and layout info
            ('total_text', 'text_length', 'pages', 'text_segments')
    """
    analysis = {
        "total_text": document.text,
        "text_length": len(document.text),
        "pages": [],
        "text_segments": [],
    }

    for page_number, page in enumerate(document.pages, start=1):
        # Collect a text segment for every block that carries a text anchor.
        for block in page.blocks:
            if block.layout and block.layout.text_anchor:
                analysis["text_segments"].append({
                    "type": "block",
                    "page": page_number,
                    "text": extract_text_from_anchor(
                        document.text, block.layout.text_anchor
                    ),
                    "confidence": block.layout.confidence,
                })

        # Per-page layout statistics (counts only; geometry stays on the page).
        analysis["pages"].append({
            "page_number": page_number,
            "dimensions": {
                "width": page.dimension.width,
                "height": page.dimension.height,
                "unit": page.dimension.unit,
            },
            "blocks": len(page.blocks),
            "paragraphs": len(page.paragraphs),
            "lines": len(page.lines),
            "tokens": len(page.tokens),
        })

    return analysis

254

255

def extract_text_from_anchor(full_text: str, text_anchor: "Document.TextAnchor") -> str:
    """Resolve a TextAnchor into the substring(s) it points at.

    Args:
        full_text: Full document text
        text_anchor: TextAnchor specifying text location

    Returns:
        str: Extracted text segment (all referenced spans concatenated)
    """
    pieces = []
    for seg in text_anchor.text_segments:
        # Proto int fields default to falsy 0/unset: treat a missing start
        # as the beginning of the text and a missing end as its length.
        start = int(seg.start_index or 0)
        end = int(seg.end_index or len(full_text))
        pieces.append(full_text[start:end])
    return "".join(pieces)

274

```

275

276

### Extract Entities

277

278

```python { .api }

279

from google.cloud.documentai.types import Document

280

281

def extract_entities(document: "Document") -> dict:
    """Group document entities by type, keeping confidence and location info.

    The annotation is quoted to match the forward-reference style used
    elsewhere in this guide.

    Args:
        document: Processed Document object

    Returns:
        dict: Mapping of entity type -> list of entity records with
            'text', 'confidence', 'page_refs', and optionally
            'text_segments' and 'properties'.
    """
    entities_by_type = {}

    for entity in document.entities:
        record = {
            "text": entity.mention_text,
            "confidence": entity.confidence,
            "page_refs": [],
        }

        # Page references, when the entity is anchored to specific pages.
        if entity.page_anchor:
            record["page_refs"] = [
                {
                    "page": ref.page + 1,  # Convert to 1-based
                    "layout_type": ref.layout_type,
                    "layout_id": ref.layout_id,
                }
                for ref in entity.page_anchor.page_refs
            ]

        # Character offsets into the full document text.
        if entity.text_anchor:
            record["text_segments"] = [
                {
                    "start_index": int(seg.start_index or 0),
                    "end_index": int(seg.end_index or 0),
                }
                for seg in entity.text_anchor.text_segments
            ]

        # Nested sub-entities (e.g., the line items of an invoice entity).
        if entity.properties:
            record["properties"] = [
                {
                    "type": prop.type_,
                    "text": prop.mention_text,
                    "confidence": prop.confidence,
                }
                for prop in entity.properties
            ]

        # setdefault replaces the manual "if type not in dict" initialization.
        entities_by_type.setdefault(entity.type_, []).append(record)

    return entities_by_type

338

```

339

340

### Extract Tables

341

342

```python { .api }

343

from google.cloud.documentai.types import Document

344

345

def extract_tables(document: "Document") -> list[dict]:
    """Extract every table on every page into plain Python structures.

    The annotation is quoted to match the forward-reference style used
    elsewhere in this guide.

    Args:
        document: Processed Document object

    Returns:
        list[dict]: One record per table with 'page' (1-based),
            'table_index', 'rows' (header rows followed by body rows),
            'header_rows', and 'body_rows'.
    """
    tables = []

    for page_number, page in enumerate(document.pages, start=1):
        for table_index, table in enumerate(page.tables):
            # Resolve each row once; 'rows' is simply headers followed by body.
            header_rows = [
                extract_table_row(document.text, row) for row in table.header_rows
            ]
            body_rows = [
                extract_table_row(document.text, row) for row in table.body_rows
            ]

            tables.append({
                "page": page_number,
                "table_index": table_index,
                "rows": header_rows + body_rows,
                "header_rows": header_rows,
                "body_rows": body_rows,
            })

    return tables

381

382

def extract_table_row(full_text: str, row: "Document.Page.Table.TableRow") -> list[dict]:
    """Convert one table row into a list of plain cell records.

    Args:
        full_text: Full document text
        row: Table row object

    Returns:
        list[dict]: One record per cell with 'text', 'row_span', 'col_span'
    """
    cells = []

    for cell in row.cells:
        # Cells without a layout/anchor contribute an empty text value.
        text = ""
        if cell.layout and cell.layout.text_anchor:
            text = extract_text_from_anchor(full_text, cell.layout.text_anchor).strip()

        cells.append({
            "text": text,
            "row_span": cell.row_span,
            "col_span": cell.col_span,
        })

    return cells

412

```

413

414

### Extract Form Fields

415

416

```python { .api }

417

from google.cloud.documentai.types import Document

418

419

def extract_form_fields(document: "Document") -> dict:
    """Extract form fields (key-value pairs) from a processed document.

    The annotation is quoted to match the forward-reference style used
    elsewhere in this guide.

    Args:
        document: Processed Document object

    Returns:
        dict: Mapping of field name -> {'value', 'name_confidence',
            'value_confidence'}. Fields whose name resolves to an empty
            string are skipped.
    """
    form_fields = {}

    for page in document.pages:
        for field in page.form_fields:
            # Resolve the key text, if the field carries a name anchor.
            name = ""
            if field.field_name and field.field_name.text_anchor:
                name = extract_text_from_anchor(
                    document.text, field.field_name.text_anchor
                ).strip()

            # Resolve the value text the same way.
            value = ""
            if field.field_value and field.field_value.text_anchor:
                value = extract_text_from_anchor(
                    document.text, field.field_value.text_anchor
                ).strip()

            # Only keep fields with a non-empty key; later pages overwrite
            # earlier duplicates of the same key.
            if name:
                form_fields[name] = {
                    "value": value,
                    "name_confidence": field.field_name.confidence if field.field_name else 0.0,
                    "value_confidence": field.field_value.confidence if field.field_value else 0.0,
                }

    return form_fields

458

```

459

460

## Async Document Processing

461

462

### Async Client Usage

463

464

```python { .api }

465

import asyncio

466

from google.cloud.documentai import DocumentProcessorServiceAsyncClient

467

from google.cloud.documentai.types import ProcessRequest, RawDocument

468

469

async def process_document_async(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str
) -> "Document":
    """Process a local document using the asynchronous client.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Document processor ID
        file_path: Path to document file
        mime_type: MIME type of document

    Returns:
        Document: Processed document
    """
    client = DocumentProcessorServiceAsyncClient()

    # Fully-qualified resource name of the processor.
    processor_name = client.processor_path(project_id, location, processor_id)

    # NOTE(review): the file read is synchronous and will block the event
    # loop for large files — acceptable for an example.
    with open(file_path, "rb") as fh:
        payload = fh.read()

    request = ProcessRequest(
        name=processor_name,
        raw_document=RawDocument(content=payload, mime_type=mime_type),
    )

    # Await the RPC, then release the client's transport resources.
    result = await client.process_document(request=request)
    await client.close()
    return result.document

507

508

# Example usage

509

async def main():
    """Example driver: process one PDF and report its text length."""
    document = await process_document_async(
        project_id="my-project",
        location="us",
        processor_id="abc123",
        file_path="document.pdf",
        mime_type="application/pdf",
    )
    print(f"Processed document: {len(document.text)} characters")

# Run async function
asyncio.run(main())

521

```

522

523

## Supported Document Types

524

525

### MIME Types

526

527

```python { .api }

528

# MIME types accepted for document processing, mapped to short descriptions.
SUPPORTED_MIME_TYPES = {
    # PDF Documents
    "application/pdf": "PDF documents",

    # Image formats
    "image/jpeg": "JPEG images",
    "image/jpg": "JPG images",
    "image/png": "PNG images",
    "image/bmp": "BMP images",
    "image/tiff": "TIFF images",
    "image/tif": "TIF images",
    "image/gif": "GIF images (first frame only)",
    "image/webp": "WebP images",

    # Office documents (with OCR)
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "Word documents",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PowerPoint files",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "Excel files",
}

def validate_mime_type(mime_type: str) -> bool:
    """Report whether *mime_type* is accepted for document processing.

    Args:
        mime_type: MIME type to validate

    Returns:
        bool: True if supported, False otherwise
    """
    return mime_type in SUPPORTED_MIME_TYPES

560

```

561

562

### Document Size Limits

563

564

```python { .api }

565

# Service-side limits applied to synchronous document processing.
PROCESSING_LIMITS = {
    "max_file_size_bytes": 20 * 1024 * 1024,  # 20 MB
    "max_pages_per_document": 2000,
    "max_image_dimensions": {
        "width": 10000,
        "height": 10000,
    },
    "timeout_seconds": 300,  # 5 minutes
}

def validate_document_size(file_path: str) -> tuple[bool, str]:
    """Check that the file at *file_path* is within the size limit.

    Args:
        file_path: Path to document file

    Returns:
        tuple[bool, str]: (is_valid, error_message); the message is empty
            when the file is acceptable.
    """
    import os

    file_size = os.path.getsize(file_path)
    limit = PROCESSING_LIMITS["max_file_size_bytes"]

    if file_size > limit:
        return False, f"File size ({file_size} bytes) exceeds limit ({PROCESSING_LIMITS['max_file_size_bytes']} bytes)"

    return True, ""

594

```

595

596

## Error Handling

597

598

### Common Processing Errors

599

600

```python { .api }

601

from google.cloud.documentai import DocumentProcessorServiceClient

602

from google.api_core.exceptions import (

603

NotFound,

604

InvalidArgument,

605

ResourceExhausted,

606

DeadlineExceeded

607

)

608

from google.cloud.exceptions import GoogleCloudError

609

610

def robust_process_document(
    client: "DocumentProcessorServiceClient",
    request: "ProcessRequest",
    max_retries: int = 3
) -> "ProcessResponse":
    """
    Process document with error handling and retries.

    Non-retryable failures (unknown processor, malformed request) are
    raised immediately. Rate limits and other transient Google Cloud
    errors are retried with exponential backoff; timeouts are retried
    right away. Annotations are quoted to match the forward-reference
    style used elsewhere in this guide, and the original exception is
    chained (``raise ... from e``) so the real cause stays visible.

    Args:
        client: DocumentProcessorServiceClient instance
        request: Process request
        max_retries: Maximum number of retry attempts

    Returns:
        ProcessResponse: Processing result

    Raises:
        Exception: If processing fails after all retries
    """
    import time

    for attempt in range(max_retries + 1):
        try:
            return client.process_document(request=request)

        except NotFound as e:
            # Processor not found - don't retry
            raise Exception(f"Processor not found: {e}") from e

        except InvalidArgument as e:
            # Invalid request - don't retry
            raise Exception(f"Invalid request: {e}") from e

        except ResourceExhausted as e:
            # Rate limit exceeded - wait and retry
            if attempt < max_retries:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Rate limit exceeded, waiting {wait_time}s (attempt {attempt + 1})")
                time.sleep(wait_time)
                continue
            raise Exception(f"Rate limit exceeded after {max_retries} retries: {e}") from e

        except DeadlineExceeded as e:
            # Timeout - retry immediately
            if attempt < max_retries:
                print(f"Request timeout, retrying (attempt {attempt + 1})")
                continue
            raise Exception(f"Request timeout after {max_retries} retries: {e}") from e

        except GoogleCloudError as e:
            # Other Google Cloud errors - backoff and retry
            if attempt < max_retries:
                wait_time = 2 ** attempt
                print(f"Google Cloud error, retrying in {wait_time}s: {e}")
                time.sleep(wait_time)
                continue
            raise Exception(f"Google Cloud error after {max_retries} retries: {e}") from e

        except Exception as e:
            # Unexpected errors - don't retry
            raise Exception(f"Unexpected error: {e}") from e

    # Defensive: the loop always returns or raises before falling through.
    raise Exception("Maximum retries exceeded")

673

```

674

675

## Human Review Workflow

676

677

### Submit Document for Review

678

679

```python { .api }

680

from google.cloud.documentai import DocumentProcessorServiceClient

681

from google.cloud.documentai.types import ReviewDocumentRequest, Document

682

683

def submit_document_for_review(
    project_id: str,
    location: str,
    processor_id: str,
    document: Document,
    enable_schema_validation: bool = True
) -> "Operation":
    """Queue a processed document for human review.

    Args:
        project_id: Google Cloud project ID
        location: Processor location
        processor_id: Processor ID
        document: Processed document to review
        enable_schema_validation: Enable schema validation during review

    Returns:
        Operation: Long-running operation for review process
    """
    client = DocumentProcessorServiceClient()

    # Review requests target the processor's humanReviewConfig sub-resource.
    human_review_config = f"projects/{project_id}/locations/{location}/processors/{processor_id}/humanReviewConfig"

    # The document is embedded inline rather than referenced from GCS.
    request = ReviewDocumentRequest(
        human_review_config=human_review_config,
        inline_document=document,
        enable_schema_validation=enable_schema_validation,
    )

    operation = client.review_document(request=request)

    print(f"Document submitted for human review")
    print(f"Operation: {operation.operation.name}")

    return operation

722

723

def check_review_status(operation: "Operation") -> dict:
    """Report the current state of a human review operation.

    Args:
        operation: Review operation object

    Returns:
        dict: {'status': 'in_progress'} while pending; on completion either
            {'status': 'failed', 'error': ...} or
            {'status': 'completed', 'gcs_destination': ..., 'rejection_reason': ...}
    """
    # Guard clause: nothing to report until the operation finishes.
    if not operation.done():
        return {"status": "in_progress"}

    error = operation.exception()
    if error:
        return {
            "status": "failed",
            "error": str(error),
        }

    result = operation.result()
    return {
        "status": "completed",
        "gcs_destination": result.gcs_destination,
        "rejection_reason": result.rejection_reason,
    }

748

```

749

750

## Complete Processing Example

751

752

```python { .api }

753

from google.cloud.documentai import DocumentProcessorServiceClient

754

from google.cloud.documentai.types import ProcessRequest, RawDocument

755

756

def complete_document_processing_example():
    """Complete example of document processing with analysis.

    Processes a sample invoice and prints text statistics, entities,
    tables, and form fields. The original version constructed an unused
    DocumentProcessorServiceClient here even though
    process_document_from_file creates its own client; that dead local
    has been removed.
    """
    # Configuration for the sample run.
    project_id = "my-project"
    location = "us"
    processor_id = "abc123def456"
    file_path = "sample_invoice.pdf"

    # Process document (the helper builds its own client internally).
    document = process_document_from_file(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
        mime_type="application/pdf",
    )

    # Analyze results
    print("=== DOCUMENT ANALYSIS ===")

    # 1. Basic text analysis
    text_analysis = analyze_document_text(document)
    print(f"Total text length: {text_analysis['text_length']} characters")
    print(f"Number of pages: {len(text_analysis['pages'])}")

    # 2. Extract entities
    entities = extract_entities(document)
    print(f"\nFound {len(entities)} entity types:")
    for entity_type, entity_list in entities.items():
        print(f" {entity_type}: {len(entity_list)} instances")
        for entity in entity_list[:3]:  # Show first 3
            print(f" - {entity['text']} (confidence: {entity['confidence']:.2f})")

    # 3. Extract tables
    tables = extract_tables(document)
    print(f"\nFound {len(tables)} tables:")
    for table in tables:
        print(f" Table on page {table['page']}: {len(table['rows'])} rows")

    # 4. Extract form fields
    form_fields = extract_form_fields(document)
    print(f"\nFound {len(form_fields)} form fields:")
    for field_name, field_info in form_fields.items():
        print(f" {field_name}: {field_info['value']}")

if __name__ == "__main__":
    complete_document_processing_example()

807

```

808

809

This comprehensive guide covers all aspects of document processing with Google Cloud Document AI, from basic operations to advanced analysis and error handling.