Microsoft Azure Cognitive Services Computer Vision Client Library for Python providing state-of-the-art algorithms to process images and return information including mature content detection, face detection, color analysis, image categorization, description generation, and thumbnail creation.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No evaluation scenarios have been run yet.
Extract text from images using both synchronous OCR for printed text and asynchronous Read API for comprehensive text recognition including handwritten text. The service supports multiple languages and provides detailed text layout information.
Performs immediate, synchronous text extraction from images containing printed text, with language detection and orientation analysis.
def recognize_printed_text(detect_orientation, url, language=None, custom_headers=None, raw=False, **operation_config):
    """Perform OCR on printed text in an image located at a public URL.

    Args:
        detect_orientation (bool): Whether to detect and correct text
            orientation before recognition.
        url (str): Publicly reachable URL of an image.
        language (str, optional): OCR language code. If not specified,
            auto-detect is used. Supported languages include: en, zh-Hans,
            zh-Hant, cs, da, nl, fi, fr, de, el, hu, it, ja, ko, nb, pl,
            pt, ru, es, sv, tr, ar, ro, sr-Cyrl, sr-Latn, sk.
        custom_headers (dict, optional): Custom HTTP headers sent with the
            request.
        raw (bool, optional): Return the raw HTTP response. Default: False.
        **operation_config: Per-operation configuration overrides.

    Returns:
        OcrResult: OCR results with text regions, lines, and words.

    Raises:
        ComputerVisionOcrErrorException: On an OCR operation error.
    """
def recognize_printed_text_in_stream(detect_orientation, image, language=None, custom_headers=None, raw=False, **operation_config):
    """Perform OCR on printed text supplied as a binary stream.

    Args:
        detect_orientation (bool): Whether to detect text orientation.
        image (Generator): Binary image data stream.
        language (str, optional): OCR language code; auto-detected when
            omitted.
        custom_headers (dict, optional): Custom HTTP headers sent with the
            request.
        raw (bool, optional): Return the raw HTTP response. Default: False.
        **operation_config: Per-operation configuration overrides.

    Returns:
        OcrResult: OCR results with text layout information.
    """

# Advanced text recognition supporting both printed and handwritten text with
# high accuracy. This is a two-step process requiring operation polling.
def read(url, language=None, pages=None, model_version="latest", reading_order=None, custom_headers=None, raw=False, **operation_config):
    """Start an asynchronous text reading (Read API) operation.

    Args:
        url (str): Publicly reachable URL of an image or PDF.
        language (str, optional): BCP-47 language code for text recognition.
            An extensive list of languages is supported for OCR detection.
        pages (list[int], optional): Page numbers to process (for
            multi-page documents).
        model_version (str, optional): Model version. Default: "latest".
        reading_order (str, optional): Reading order algorithm
            ('basic' or 'natural').
        custom_headers (dict, optional): Custom HTTP headers sent with the
            request.
        raw (bool, optional): Return the raw HTTP response. Default: False.
        **operation_config: Per-operation configuration overrides.

    Returns:
        str: Operation location URL for polling status.

    Note:
        This starts an asynchronous operation. Use get_read_result() to
        retrieve the results.
    """
def read_in_stream(image, language=None, pages=None, model_version="latest", reading_order=None, custom_headers=None, raw=False, **operation_config):
    """Start an asynchronous text reading operation from a binary stream.

    Args:
        image (Generator): Binary image data stream.
        language (str, optional): Text language for recognition.
        pages (list[int], optional): Page numbers to process.
        model_version (str, optional): Model version. Default: "latest".
        reading_order (str, optional): Reading order algorithm
            ('basic' or 'natural').
        custom_headers (dict, optional): Custom HTTP headers sent with the
            request.
        raw (bool, optional): Return the raw HTTP response. Default: False.
        **operation_config: Per-operation configuration overrides.

    Returns:
        str: Operation location URL for polling.
    """
def get_read_result(operation_id, custom_headers=None, raw=False, **operation_config):
    """Get the result of an asynchronous read operation.

    Args:
        operation_id (str): Operation ID extracted from the read operation
            location URL.
        custom_headers (dict, optional): Custom HTTP headers sent with the
            request.
        raw (bool, optional): Return the raw HTTP response. Default: False.
        **operation_config: Per-operation configuration overrides.

    Returns:
        ReadOperationResult: Text recognition results with status.

    Note:
        Poll this endpoint until status is 'succeeded' or 'failed'.
        Status values: notStarted, running, succeeded, failed.
    """

from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials

# Initialize client with an API key and the resource's regional endpoint.
credentials = CognitiveServicesCredentials("your-api-key")
client = ComputerVisionClient("https://your-endpoint.cognitiveservices.azure.com/", credentials)

# Perform OCR on printed text
image_url = "https://example.com/document.jpg"
ocr_result = client.recognize_printed_text(detect_orientation=True, url=image_url)
print(f"Language: {ocr_result.language}")
print(f"Text angle: {ocr_result.text_angle}")
print(f"Orientation: {ocr_result.orientation}")

# Extract text by walking the result hierarchy: regions -> lines -> words.
for region in ocr_result.regions:
    for line in region.lines:
        line_text = " ".join([word.text for word in line.words])
        print(f"Line: {line_text}")
        # Individual word details
        for word in line.words:
            print(f" Word: '{word.text}' at {word.bounding_box}")

import time
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes

# Start read operation; raw=True exposes the HTTP response headers.
image_url = "https://example.com/handwritten-note.jpg"
read_response = client.read(image_url, raw=True)

# Extract operation ID from location header
operation_location = read_response.headers["Operation-Location"]
operation_id = operation_location.split("/")[-1]

# Poll for completion
while True:
    read_result = client.get_read_result(operation_id)
    if read_result.status == OperationStatusCodes.succeeded:
        break
    elif read_result.status == OperationStatusCodes.failed:
        print("Text recognition failed")
        break
    time.sleep(1)

# Process results
for page in read_result.analyze_result.read_results:
    print(f"Page {page.page}:")
    for line in page.lines:
        print(f" Line: '{line.text}'")
        print(f" Bounding box: {line.bounding_box}")
        # Check for handwriting
        if line.appearance and line.appearance.style:
            if line.appearance.style.name == "handwriting":
                print(f" Style: Handwriting (confidence: {line.appearance.style.confidence})")
        # Individual words
        for word in line.words:
            print(f" Word: '{word.text}' (confidence: {word.confidence})")

# OCR from local file
# OCR a local file by streaming its binary contents.
with open("local_document.jpg", "rb") as image_stream:
    ocr_result = client.recognize_printed_text_in_stream(
        detect_orientation=True,
        image=image_stream,
        language="en"
    )

# Extract all text
all_text = []
for region in ocr_result.regions:
    for line in region.lines:
        line_text = " ".join([word.text for word in line.words])
        all_text.append(line_text)
print("\n".join(all_text))

# Process specific pages of a multi-page document
# Read selected pages of a multi-page PDF document.
pdf_url = "https://example.com/multi-page-document.pdf"
pages_to_process = [1, 3, 5]  # Process pages 1, 3, and 5
read_response = client.read(pdf_url, pages=pages_to_process, raw=True)
operation_id = read_response.headers["Operation-Location"].split("/")[-1]

# Poll and get results (same as above)
# ... polling code ...

# Results will contain only the specified pages
for page in read_result.analyze_result.read_results:
    print(f"Processing page {page.page}")
# ... process page content ...

class OcrResult:
    """OCR operation result for printed text.

    Attributes:
        language (str): Detected or specified language code.
        text_angle (float): Text angle in degrees (-180 to 180).
        orientation (str): Text orientation (Up, Down, Left, Right).
        regions (list[OcrRegion]): Text regions in the image.
    """

class OcrRegion:
    """OCR text region containing multiple lines.

    Attributes:
        bounding_box (str): Comma-separated bounding box coordinates
            (left,top,width,height).
        lines (list[OcrLine]): Text lines within the region.
    """

class OcrLine:
    """OCR text line containing multiple words.

    Attributes:
        bounding_box (str): Comma-separated bounding box coordinates.
        words (list[OcrWord]): Words within the line.
    """

class OcrWord:
    """Individual OCR word result.

    Attributes:
        bounding_box (str): Comma-separated bounding box coordinates.
        text (str): Recognized word text.
    """

class ReadOperationResult:
    """Result of an asynchronous read operation.

    Attributes:
        status (OperationStatusCodes): Operation status (notStarted,
            running, succeeded, failed).
        created_date_time (datetime): Operation creation timestamp.
        last_updated_date_time (datetime): Last update timestamp.
        analyze_result (AnalyzeResults): Text analysis results
            (present when succeeded).
    """

class AnalyzeResults:
    """Text analysis results from a read operation.

    Attributes:
        version (str): Schema version.
        model_version (str): OCR model version used.
        read_results (list[ReadResult]): Text extraction results per page.
    """

class ReadResult:
    """Text reading result for a single page.

    Attributes:
        page (int): Page number (1-indexed).
        language (str): Detected language.
        angle (float): Text angle in degrees.
        width (float): Page width.
        height (float): Page height.
        unit (TextRecognitionResultDimensionUnit): Dimension unit
            (pixel, inch).
        lines (list[Line]): Extracted text lines.
    """

class Line:
    """Text line with layout and style information.

    Attributes:
        language (str): Line language.
        bounding_box (list[float]): Bounding box coordinates
            [x1,y1,x2,y2,x3,y3,x4,y4].
        appearance (Appearance): Style information (handwriting detection).
        text (str): Combined text of all words in the line.
        words (list[Word]): Individual words within the line.
    """

class Word:
    """Individual word with position and confidence.

    Attributes:
        bounding_box (list[float]): Word bounding box coordinates.
        text (str): Recognized word text.
        confidence (float): Recognition confidence score (0.0 to 1.0).
    """

class Appearance:
    """Text appearance and style information.

    Attributes:
        style (Style): Text style classification.
    """

class Style:
    """Text style classification.

    Attributes:
        name (TextStyle): Style type (other, handwriting).
        confidence (float): Style detection confidence (0.0 to 1.0).
    """

class OperationStatusCodes(str, Enum):
    """Asynchronous operation status codes."""
    not_started = "notStarted"
    running = "running"
    failed = "failed"
    succeeded = "succeeded"

class TextStyle(str, Enum):
    """Text style classification values."""
    other = "other"
    handwriting = "handwriting"

class TextRecognitionResultDimensionUnit(str, Enum):
    """Dimension units for text recognition results."""
    pixel = "pixel"
    inch = "inch"

# Install with Tessl CLI
npx tessl i tessl/pypi-azure-cognitiveservices-vision-computervision