Microsoft Azure Cognitive Services Computer Vision Client Library for Python providing state-of-the-art algorithms to process images and return information including mature content detection, face detection, color analysis, image categorization, description generation, and thumbnail creation.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Generate human-readable descriptions of image content in complete English sentences. The service analyzes visual content and creates natural language descriptions that capture the main elements, activities, and context within images.
Create natural language descriptions of image content with confidence scores and multiple description candidates.
def describe_image(
    url,
    max_candidates=None,
    language="en",
    description_exclude=None,
    model_version="latest",
    custom_headers=None,
    raw=False,
    **operation_config,
):
    """
    Generate human-readable description of image content.

    Args:
        url (str): Publicly reachable URL of an image
        max_candidates (int, optional): Maximum number of description
            candidates to return. The signature default is None; the
            documented default number of candidates is 1.
        language (str, optional): Output language for descriptions.
            Supported: "en", "es", "ja", "pt", "zh". Default: "en"
        description_exclude (list[DescriptionExclude], optional): Domain
            models to exclude. Available values: Celebrities, Landmarks
        model_version (str, optional): AI model version. Default: "latest"
        custom_headers (dict, optional): Custom HTTP headers
        raw (bool, optional): Return raw response. Default: False
        **operation_config: Additional keyword arguments accepted by the
            operation (not detailed in this reference)

    Returns:
        ImageDescription: Generated descriptions with confidence scores and tags

    Raises:
        ComputerVisionErrorResponseException: API error occurred
    """
def describe_image_in_stream(
    image,
    max_candidates=None,
    language="en",
    description_exclude=None,
    model_version="latest",
    custom_headers=None,
    raw=False,
    **operation_config,
):
    """
    Generate description from binary image stream.

    Takes the same options as describe_image, but the image is supplied
    as binary data instead of a URL.

    Args:
        image (Generator): Binary image data stream
        max_candidates (int, optional): Maximum description candidates
        language (str, optional): Output language. Default: "en"
        description_exclude (list[DescriptionExclude], optional): Domain
            models to exclude
        model_version (str, optional): AI model version. Default: "latest"
        custom_headers (dict, optional): Custom HTTP headers
        raw (bool, optional): Return raw response. Default: False
        **operation_config: Additional keyword arguments accepted by the
            operation (not detailed in this reference)

    Returns:
        ImageDescription: Generated descriptions and metadata

    Raises:
        ComputerVisionErrorResponseException: API error occurred
            (as documented for describe_image)
    """


from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials

# Initialize client (replace the placeholder key and endpoint with real values)
credentials = CognitiveServicesCredentials("your-api-key")
client = ComputerVisionClient("https://your-endpoint.cognitiveservices.azure.com/", credentials)

# Generate description for image
image_url = "https://example.com/park-scene.jpg"
description_result = client.describe_image(image_url)

# Get the best description; guard against an empty caption list
if description_result.captions:
    best_caption = description_result.captions[0]
    print(f"Description: {best_caption.text}")
    print(f"Confidence: {best_caption.confidence:.3f}")

# Show related tags
print("\nRelated tags:")
for tag in description_result.tags:
    print(f" - {tag}")


# Get multiple description candidates
image_url = "https://example.com/complex-scene.jpg"
description_result = client.describe_image(
    image_url,
    max_candidates=3,  # Get up to 3 different descriptions
)

# Treat a missing caption list like an empty one so the code below is safe
captions = description_result.captions or []

print("Description candidates:")
for i, caption in enumerate(captions, 1):
    print(f"{i}. {caption.text} (confidence: {caption.confidence:.3f})")

# Choose description with highest confidence.
# max() raises ValueError on an empty sequence, so only pick when there
# is at least one candidate.
if captions:
    best_caption = max(captions, key=lambda c: c.confidence)
    print(f"\nBest description: {best_caption.text}")


# Generate descriptions in different languages
image_url = "https://example.com/street-scene.jpg"
languages = ["en", "es", "ja"]
descriptions = {}  # language code -> first caption text

for lang in languages:
    try:
        result = client.describe_image(image_url, language=lang)
        if result.captions:
            descriptions[lang] = result.captions[0].text
    except Exception as e:
        # Best-effort: a failure for one language must not stop the others
        print(f"Failed to get description in {lang}: {e}")

# Display results
for lang, description in descriptions.items():
    print(f"{lang}: {description}")


# Generate description from local image
# Open in binary mode; the file is only needed for the duration of the call
with open("vacation_photo.jpg", "rb") as image_stream:
    description_result = client.describe_image_in_stream(
        image_stream,
        max_candidates=2,
    )

print("Descriptions:")
for caption in description_result.captions:
    print(f" {caption.text} (confidence: {caption.confidence:.3f})")

print("\nDetected elements:")
for tag in description_result.tags:
    print(f" - {tag}")


from azure.cognitiveservices.vision.computervision.models import DescriptionExclude
# Generate description excluding celebrity and landmark information
image_url = "https://example.com/tourist-photo.jpg"
description_result = client.describe_image(
    image_url,
    description_exclude=[DescriptionExclude.celebrities, DescriptionExclude.landmarks],
)

# This will focus on general scene description rather than identifying specific people or places
for caption in description_result.captions:
    print(f"General description: {caption.text}")


# Process multiple images for descriptions
image_urls = [
"https://example.com/image1.jpg",
"https://example.com/image2.jpg",
"https://example.com/image3.jpg"
]
descriptions = []
for i, url in enumerate(image_urls):
try:
result = client.describe_image(url)
if result.captions:
descriptions.append({
'url': url,
'description': result.captions[0].text,
'confidence': result.captions[0].confidence,
'tags': result.tags
})
print(f"Processed image {i+1}/{len(image_urls)}")
except Exception as e:
print(f"Error processing {url}: {e}")
# Display results
for desc in descriptions:
print(f"\nImage: {desc['url']}")
print(f"Description: {desc['description']}")
print(f"Confidence: {desc['confidence']:.3f}")
print(f"Tags: {', '.join(desc['tags'][:5])}") # Show first 5 tagsclass ImageDescription:
"""
Image description generation result.
Attributes:
tags (list[str]): Descriptive tags related to image content
captions (list[ImageCaption]): Generated description candidates with confidence scores
description_details (ImageDescriptionDetails): Additional description metadata
request_id (str): Request identifier
metadata (ImageMetadata): Image metadata (dimensions, format)
model_version (str): AI model version used
"""class ImageCaption:
"""
Generated image caption with confidence score.
Attributes:
text (str): Natural language description of the image
confidence (float): Confidence score for the description (0.0 to 1.0)
"""class ImageDescriptionDetails:
"""
Additional details about the description generation process.
Attributes:
tags (list[str]): Extended list of descriptive tags
celebrities (list): Celebrity information (if applicable)
landmarks (list): Landmark information (if applicable)
"""class ImageMetadata:
"""
Image metadata information.
Attributes:
height (int): Image height in pixels
width (int): Image width in pixels
format (str): Image format (e.g., "Jpeg", "Png")
"""The description service supports multiple languages for output:
English typically provides the most detailed and accurate descriptions, while other languages may have varying levels of detail and accuracy.
The service typically includes the main elements, activities, and context of the image in its descriptions.
Request multiple description candidates (max_candidates > 1) for important applications.

Install with Tessl CLI
npx tessl i tessl/pypi-azure-cognitiveservices-vision-computervision