CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-azure-cognitiveservices-vision-computervision

Microsoft Azure Cognitive Services Computer Vision Client Library for Python providing state-of-the-art algorithms to process images and return information including mature content detection, face detection, color analysis, image categorization, description generation, and thumbnail creation.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/ocr-text-recognition.md

OCR and Text Recognition

Extract text from images using either the synchronous OCR API (printed text only) or the asynchronous Read API (comprehensive text recognition, including handwritten text). The service supports multiple languages and provides detailed text layout information.

Capabilities

Synchronous OCR (Printed Text)

Immediate text extraction from images containing printed text with language detection and orientation analysis.

def recognize_printed_text(detect_orientation, url, language=None, custom_headers=None, raw=False, **operation_config):
    """
    Perform synchronous OCR on printed text in an image referenced by URL.

    Args:
        detect_orientation (bool): Whether to detect and correct text orientation
        url (str): Publicly reachable URL of an image
        language (str, optional): OCR language code. If not specified, auto-detect is used.
            Supported languages include: en, zh-Hans, zh-Hant, cs, da, nl, fi, fr, de,
            el, hu, it, ja, ko, nb, pl, pt, ru, es, sv, tr, ar, ro, sr-Cyrl, sr-Latn, sk
        custom_headers (dict, optional): Custom HTTP headers
        raw (bool, optional): Return raw response. Default: False
        **operation_config: Additional per-call keyword options.
            NOTE(review): presumably msrest operation configuration overrides - confirm.

    Returns:
        OcrResult: OCR results with text regions, lines, and words

    Raises:
        ComputerVisionOcrErrorException: OCR operation error
    """

def recognize_printed_text_in_stream(detect_orientation, image, language=None, custom_headers=None, raw=False, **operation_config):
    """
    Perform synchronous OCR on printed text from a binary image stream.

    Args:
        detect_orientation (bool): Whether to detect text orientation
        image (Generator): Binary image data stream (for example, a local file
            opened in "rb" mode)
        language (str, optional): OCR language code
        custom_headers (dict, optional): Custom HTTP headers
        raw (bool, optional): Return raw response. Default: False
        **operation_config: Additional per-call keyword options.
            NOTE(review): presumably msrest operation configuration overrides - confirm.

    Returns:
        OcrResult: OCR results with text layout information

    Raises:
        ComputerVisionOcrErrorException: OCR operation error.
            NOTE(review): assumed to match recognize_printed_text - confirm.
    """

Asynchronous Text Reading

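Advanced text recognition supporting both printed and handwritten text with high accuracy. This is a two-step process: first start the operation with read() or read_in_stream(), then poll get_read_result() with the operation ID until the status is succeeded or failed.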

def read(url, language=None, pages=None, model_version="latest", reading_order=None, custom_headers=None, raw=False, **operation_config):
    """
    Start asynchronous text reading operation.

    Args:
        url (str): Publicly reachable URL of an image or PDF
        language (str, optional): BCP-47 language code for text recognition.
            Supported languages include extensive list for OCR detection
        pages (list[int], optional): Page numbers to process (for multi-page documents)
            NOTE(review): the SDK reference appears to type this as list[str] and to
            accept ranges such as "1-3" - confirm.
        model_version (str, optional): Model version. Default: "latest"
        reading_order (str, optional): Reading order algorithm ('basic' or 'natural')
        custom_headers (dict, optional): Custom HTTP headers
        raw (bool, optional): Return raw response. Default: False. Pass True to
            get access to the response headers (see Returns).
        **operation_config: Additional per-call keyword options.
            NOTE(review): presumably msrest operation configuration overrides - confirm.

    Returns:
        With raw=True, the raw response; the operation location URL used for
        polling is in its headers["Operation-Location"], and the last path
        segment of that URL is the operation ID.
        NOTE(review): with raw=False the SDK appears to return None rather than
        the URL string - confirm.

    Note:
        This starts an asynchronous operation. Use get_read_result() to retrieve results.
    """

def read_in_stream(image, language=None, pages=None, model_version="latest", reading_order=None, custom_headers=None, raw=False, **operation_config):
    """
    Start asynchronous text reading operation from a binary stream.

    Args:
        image (Generator): Binary image data stream
        language (str, optional): Text language for recognition
        pages (list[int], optional): Page numbers to process
        model_version (str, optional): Model version. Default: "latest"
        reading_order (str, optional): Reading order algorithm ('basic' or 'natural')
        custom_headers (dict, optional): Custom HTTP headers
        raw (bool, optional): Return raw response. Default: False
        **operation_config: Additional per-call keyword options.
            NOTE(review): presumably msrest operation configuration overrides - confirm.

    Returns:
        NOTE(review): presumably the same as read() - with raw=True, the raw
        response whose headers["Operation-Location"] holds the polling URL;
        confirm what is returned when raw=False.

    Note:
        Use get_read_result() with the operation ID to retrieve results.
    """

def get_read_result(operation_id, custom_headers=None, raw=False, **operation_config):
    """
    Get result of asynchronous read operation.

    Args:
        operation_id (str): Operation ID extracted from read operation location URL
            (the last path segment of the "Operation-Location" response header)
        custom_headers (dict, optional): Custom HTTP headers
        raw (bool, optional): Return raw response. Default: False
        **operation_config: Additional per-call keyword options.
            NOTE(review): presumably msrest operation configuration overrides - confirm.

    Returns:
        ReadOperationResult: Text recognition results with status

    Note:
        Poll this endpoint until status is 'succeeded' or 'failed'.
        Status values: notStarted, running, succeeded, failed
    """

Usage Examples

Basic OCR (Printed Text)

from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials

# Build the client from an API key and the resource endpoint
credentials = CognitiveServicesCredentials("your-api-key")
client = ComputerVisionClient("https://your-endpoint.cognitiveservices.azure.com/", credentials)

# Run printed-text OCR against a publicly reachable image URL
image_url = "https://example.com/document.jpg"
ocr_result = client.recognize_printed_text(detect_orientation=True, url=image_url)

# Report the image-level metadata
print(f"Language: {ocr_result.language}")
print(f"Text angle: {ocr_result.text_angle}")
print(f"Orientation: {ocr_result.orientation}")

# Walk regions -> lines -> words; OCR lines carry no text of their own,
# so each line's text is rebuilt by joining its words
for ocr_region in ocr_result.regions:
    for ocr_line in ocr_region.lines:
        joined = " ".join(ocr_word.text for ocr_word in ocr_line.words)
        print(f"Line: {joined}")

        # Per-word text and position
        for ocr_word in ocr_line.words:
            print(f"  Word: '{ocr_word.text}' at {ocr_word.bounding_box}")

Advanced Text Reading (Async)

import time
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes

# Start read operation; raw=True returns the raw HTTP response so that the
# Operation-Location header (which carries the operation ID) is accessible
image_url = "https://example.com/handwritten-note.jpg"
read_response = client.read(image_url, raw=True)

# Extract operation ID from location header (last path segment of the URL)
operation_location = read_response.headers["Operation-Location"]
operation_id = operation_location.split("/")[-1]

# Poll until the operation reaches a terminal state (succeeded or failed)
terminal_statuses = (OperationStatusCodes.succeeded, OperationStatusCodes.failed)
while True:
    read_result = client.get_read_result(operation_id)

    if read_result.status in terminal_statuses:
        break

    time.sleep(1)

# Process results only on success: analyze_result is populated only when the
# operation succeeded, so it must not be dereferenced after a failure
if read_result.status == OperationStatusCodes.failed:
    print("Text recognition failed")
else:
    for page in read_result.analyze_result.read_results:
        print(f"Page {page.page}:")

        for line in page.lines:
            print(f"  Line: '{line.text}'")
            print(f"    Bounding box: {line.bounding_box}")

            # Check for handwriting (appearance/style may be absent)
            if line.appearance and line.appearance.style:
                if line.appearance.style.name == "handwriting":
                    print(f"    Style: Handwriting (confidence: {line.appearance.style.confidence})")

            # Individual words
            for word in line.words:
                print(f"    Word: '{word.text}' (confidence: {word.confidence})")

Local File OCR

# Run printed-text OCR against an image stored on disk
with open("local_document.jpg", "rb") as image_stream:
    ocr_result = client.recognize_printed_text_in_stream(
        detect_orientation=True,
        image=image_stream,
        language="en",
    )

    # One joined string per recognized line, across every region
    all_text = [
        " ".join(ocr_word.text for ocr_word in ocr_line.words)
        for ocr_region in ocr_result.regions
        for ocr_line in ocr_region.lines
    ]

    print("\n".join(all_text))

Multi-page Document Processing

# Process specific pages of a multi-page document
pdf_url = "https://example.com/multi-page-document.pdf"
pages_to_process = [1, 3, 5]  # Process pages 1, 3, and 5

# raw=True is needed so the Operation-Location response header can be read
read_response = client.read(pdf_url, pages=pages_to_process, raw=True)
operation_id = read_response.headers["Operation-Location"].split("/")[-1]

# Poll and get results (same as above)
# NOTE: the elided polling code must assign `read_result` by calling
# client.get_read_result(operation_id) until the status is succeeded or failed;
# without it, `read_result` below is undefined in this snippet.
# ... polling code ...

# Results will contain only the specified pages
# (only access analyze_result once the status is succeeded)
for page in read_result.analyze_result.read_results:
    print(f"Processing page {page.page}")
    # ... process page content ...

Response Data Types

OcrResult

class OcrResult:
    """
    OCR operation result for printed text.

    Returned by recognize_printed_text() and recognize_printed_text_in_stream().

    Attributes:
        language (str): Detected or specified language code
        text_angle (float): Text angle in degrees (-180 to 180)
            NOTE(review): the SDK model reference appears to describe this
            angle in radians - confirm the unit and range.
        orientation (str): Text orientation (Up, Down, Left, Right)
        regions (list[OcrRegion]): Text regions in the image
    """

OcrRegion

class OcrRegion:
    """
    OCR text region containing multiple lines.

    Attributes:
        bounding_box (str): Comma-separated bounding box coordinates (left,top,width,height).
            This is a single string, not a list of numbers.
        lines (list[OcrLine]): Text lines within the region
    """

OcrLine

class OcrLine:
    """
    OCR text line containing multiple words.

    No line-level text attribute is listed here; the usage examples build a
    line's text by joining the text of its words with spaces.

    Attributes:
        bounding_box (str): Comma-separated bounding box coordinates
            (presumably left,top,width,height, as for OcrRegion - confirm)
        words (list[OcrWord]): Words within the line
    """

OcrWord

class OcrWord:
    """
    Individual OCR word result.

    Attributes:
        bounding_box (str): Comma-separated bounding box coordinates
            (presumably left,top,width,height, as for OcrRegion - confirm)
        text (str): Recognized word text
    """

ReadOperationResult

class ReadOperationResult:
    """
    Result of asynchronous read operation.

    Returned by get_read_result(). Check `status` before reading
    `analyze_result`, which is only documented as available on success.

    Attributes:
        status (OperationStatusCodes): Operation status (notStarted, running, succeeded, failed)
        created_date_time (datetime): Operation creation timestamp
        last_updated_date_time (datetime): Last update timestamp
        analyze_result (AnalyzeResults): Text analysis results (when succeeded)
    """

AnalyzeResults

class AnalyzeResults:
    """
    Text analysis results from read operation.

    Exposed as ReadOperationResult.analyze_result.

    Attributes:
        version (str): Schema version
        model_version (str): OCR model version used
        read_results (list[ReadResult]): Text extraction results, one entry per page
    """

ReadResult

class ReadResult:
    """
    Text reading result for a single page.

    Attributes:
        page (int): Page number (1-indexed)
        language (str): Detected language
        angle (float): Text angle in degrees
        width (float): Page width (presumably expressed in `unit` - confirm)
        height (float): Page height (presumably expressed in `unit` - confirm)
        unit (TextRecognitionResultDimensionUnit): Dimension unit (pixel, inch)
        lines (list[Line]): Extracted text lines
    """

Line

class Line:
    """
    Text line with layout and style information.

    Attributes:
        language (str): Line language
        bounding_box (list[float]): Bounding box coordinates [x1,y1,x2,y2,x3,y3,x4,y4],
            i.e. eight numbers forming four (x, y) points
        appearance (Appearance): Style information (handwriting detection).
            The usage example checks this for a falsy value before use, so
            treat it as optional.
        text (str): Combined text of all words in the line
        words (list[Word]): Individual words within the line
    """

Word

class Word:
    """
    Individual word with position and confidence.

    Attributes:
        bounding_box (list[float]): Word bounding box coordinates
            (presumably the same [x1,y1,x2,y2,x3,y3,x4,y4] layout as Line - confirm)
        text (str): Recognized word text
        confidence (float): Recognition confidence score (0.0 to 1.0)
    """

Appearance

class Appearance:
    """
    Text appearance and style information.

    Exposed as Line.appearance.

    Attributes:
        style (Style): Text style classification. The usage example checks
            this for a falsy value before use, so treat it as optional.
    """

Style

class Style:
    """
    Text style classification.

    Attributes:
        name (TextStyle): Style type (other, handwriting). The usage example
            compares it directly with the string "handwriting".
        confidence (float): Style detection confidence (0.0 to 1.0)
    """

Enumerations

OperationStatusCodes

class OperationStatusCodes(str, Enum):
    """Asynchronous operation status codes.

    Values of ReadOperationResult.status. Polling should continue until one
    of the terminal states (succeeded or failed) is reached.
    """
    
    not_started = "notStarted"  # non-terminal: keep polling
    running = "running"  # non-terminal: keep polling
    failed = "failed"  # terminal: stop polling
    succeeded = "succeeded"  # terminal: stop polling, results available

TextStyle

class TextStyle(str, Enum):
    """Text style classification values.

    Values of Style.name.
    """
    
    other = "other"  # any style not classified as handwriting
    handwriting = "handwriting"  # value the usage example tests for

TextRecognitionResultDimensionUnit

class TextRecognitionResultDimensionUnit(str, Enum):
    """Dimension units for text recognition results.

    Values of ReadResult.unit.
    """
    
    pixel = "pixel"  # NOTE(review): presumably used for image inputs - confirm
    inch = "inch"  # NOTE(review): presumably used for PDF inputs - confirm

Install with Tessl CLI

npx tessl i tessl/pypi-azure-cognitiveservices-vision-computervision

docs

area-of-interest.md

domain-analysis.md

image-analysis.md

image-description.md

image-tagging.md

index.md

object-detection.md

ocr-text-recognition.md

thumbnail-generation.md

tile.json