Microsoft Azure AI Search Client Library for Python providing comprehensive search, indexing, and AI-powered document processing capabilities.
Quality: Pending — Does it follow best practices?
Impact: Pending — No eval scenarios have been run.
The SearchIndexerClient manages automated data ingestion through indexers that connect to various data sources, with optional AI-powered content enrichment through skillsets. This enables knowledge mining, document cracking, and cognitive enrichment of content during the indexing process.
Create a SearchIndexerClient to manage indexers, data sources, and skillsets.
class SearchIndexerClient:
    """Client for managing indexers, data source connections, and skillsets
    on an Azure AI Search service.
    """

    def __init__(
        self,
        endpoint: str,
        credential: Union[AzureKeyCredential, TokenCredential],
        **kwargs,
    ) -> None:
        """
        Initialize SearchIndexerClient for indexer management.
        Parameters:
        - endpoint (str): The URL endpoint of an Azure search service
        - credential: A credential to authorize requests
        - api_version (str, optional): The Search API version to use
        - audience (str, optional): AAD audience for authentication
        """

    def close(self) -> None:
        """Close the session."""

    def __enter__(self) -> "SearchIndexerClient": ...

    def __exit__(self, *args) -> None: ...

    # Create, configure, and manage indexers for automated data ingestion.

    def create_indexer(self, indexer: SearchIndexer, **kwargs) -> SearchIndexer:
        """
        Create a new indexer.
        Parameters:
        - indexer (SearchIndexer): The indexer definition
        Returns:
        SearchIndexer: The created indexer
        """

    def create_or_update_indexer(
        self,
        indexer: SearchIndexer,
        *,
        if_match: Optional[str] = None,
        if_none_match: Optional[str] = None,
        cache_reprocessing_change_detection_disabled: Optional[bool] = None,
        cache_reset_requirements_ignored: Optional[bool] = None,
        **kwargs,
    ) -> SearchIndexer:
        """
        Create a new indexer or update an existing one.
        Parameters:
        - indexer (SearchIndexer): The indexer definition
        - if_match (str): ETag for conditional updates
        - if_none_match (str): ETag for conditional creation
        - cache_reprocessing_change_detection_disabled (bool): Disable change detection
        - cache_reset_requirements_ignored (bool): Ignore cache reset requirements
        Returns:
        SearchIndexer: The created or updated indexer
        """

    def get_indexer(self, name: str, **kwargs) -> SearchIndexer:
        """
        Retrieve an indexer definition.
        Parameters:
        - name (str): Name of the indexer
        Returns:
        SearchIndexer: The indexer definition
        """

    def get_indexers(
        self,
        *,
        select: Optional[List[str]] = None,
        **kwargs,
    ) -> Sequence[SearchIndexer]:
        """
        List all indexers in the search service.
        Parameters:
        - select (List[str], optional): Fields to include in results
        Returns:
        Sequence[SearchIndexer]: List of indexers
        """

    def get_indexer_names(self, **kwargs) -> Sequence[str]:
        """
        List all indexer names.
        Returns:
        Sequence[str]: List of indexer names
        """

    def delete_indexer(
        self,
        indexer: Union[str, SearchIndexer],
        *,
        if_match: Optional[str] = None,
        if_none_match: Optional[str] = None,
        **kwargs,
    ) -> None:
        """
        Delete an indexer.
        Parameters:
        - indexer: Indexer name or SearchIndexer object
        - if_match (str): ETag for conditional deletion
        - if_none_match (str): ETag for conditional deletion
        """

    # Run, reset, and monitor indexer execution.

    def run_indexer(self, name: str, **kwargs) -> None:
        """
        Run an indexer manually.
        Parameters:
        - name (str): Name of the indexer to run
        """

    def reset_indexer(self, name: str, **kwargs) -> None:
        """
        Reset an indexer's execution state.
        Parameters:
        - name (str): Name of the indexer to reset
        """

    def get_indexer_status(self, name: str, **kwargs) -> SearchIndexerStatus:
        """
        Get the execution status and history of an indexer.
        Parameters:
        - name (str): Name of the indexer
        Returns:
        SearchIndexerStatus: Indexer execution status and history
        """

    # Configure connections to external data sources.

    def create_data_source_connection(
        self,
        data_source: SearchIndexerDataSourceConnection,
        **kwargs,
    ) -> SearchIndexerDataSourceConnection:
        """
        Create a new data source connection.
        Parameters:
        - data_source (SearchIndexerDataSourceConnection): Data source definition
        Returns:
        SearchIndexerDataSourceConnection: The created data source
        """

    def create_or_update_data_source_connection(
        self,
        data_source: SearchIndexerDataSourceConnection,
        *,
        if_match: Optional[str] = None,
        if_none_match: Optional[str] = None,
        cache_reset_requirements_ignored: Optional[bool] = None,
        **kwargs,
    ) -> SearchIndexerDataSourceConnection:
        """
        Create or update a data source connection.
        Parameters:
        - data_source (SearchIndexerDataSourceConnection): Data source definition
        - if_match (str): ETag for conditional updates
        - if_none_match (str): ETag for conditional creation
        - cache_reset_requirements_ignored (bool): Ignore cache reset requirements
        Returns:
        SearchIndexerDataSourceConnection: The created or updated data source
        """

    def get_data_source_connection(
        self,
        name: str,
        **kwargs,
    ) -> SearchIndexerDataSourceConnection:
        """
        Retrieve a data source connection.
        Parameters:
        - name (str): Name of the data source
        Returns:
        SearchIndexerDataSourceConnection: The data source definition
        """

    def get_data_source_connections(
        self,
        *,
        select: Optional[List[str]] = None,
        **kwargs,
    ) -> Sequence[SearchIndexerDataSourceConnection]:
        """
        List all data source connections.
        Parameters:
        - select (List[str], optional): Fields to include in results
        Returns:
        Sequence[SearchIndexerDataSourceConnection]: List of data sources
        """

    def get_data_source_connection_names(self, **kwargs) -> Sequence[str]:
        """
        List all data source connection names.
        Returns:
        Sequence[str]: List of data source names
        """

    def delete_data_source_connection(
        self,
        data_source: Union[str, SearchIndexerDataSourceConnection],
        *,
        if_match: Optional[str] = None,
        if_none_match: Optional[str] = None,
        **kwargs,
    ) -> None:
        """
        Delete a data source connection.
        Parameters:
        - data_source: Data source name or object
        - if_match (str): ETag for conditional deletion
        - if_none_match (str): ETag for conditional deletion
        """

    # Define and manage AI enrichment skillsets for cognitive processing.

    def create_skillset(self, skillset: SearchIndexerSkillset, **kwargs) -> SearchIndexerSkillset:
        """
        Create a new skillset.
        Parameters:
        - skillset (SearchIndexerSkillset): The skillset definition
        Returns:
        SearchIndexerSkillset: The created skillset
        """

    def create_or_update_skillset(
        self,
        skillset: SearchIndexerSkillset,
        *,
        if_match: Optional[str] = None,
        if_none_match: Optional[str] = None,
        cache_reset_requirements_ignored: Optional[bool] = None,
        **kwargs,
    ) -> SearchIndexerSkillset:
        """
        Create or update a skillset.
        Parameters:
        - skillset (SearchIndexerSkillset): The skillset definition
        - if_match (str): ETag for conditional updates
        - if_none_match (str): ETag for conditional creation
        - cache_reset_requirements_ignored (bool): Ignore cache reset requirements
        Returns:
        SearchIndexerSkillset: The created or updated skillset
        """

    def get_skillset(self, name: str, **kwargs) -> SearchIndexerSkillset:
        """
        Retrieve a skillset definition.
        Parameters:
        - name (str): Name of the skillset
        Returns:
        SearchIndexerSkillset: The skillset definition
        """

    def get_skillsets(
        self,
        *,
        select: Optional[List[str]] = None,
        **kwargs,
    ) -> Sequence[SearchIndexerSkillset]:
        """
        List all skillsets.
        Parameters:
        - select (List[str], optional): Fields to include in results
        Returns:
        Sequence[SearchIndexerSkillset]: List of skillsets
        """

    def get_skillset_names(self, **kwargs) -> Sequence[str]:
        """
        List all skillset names.
        Returns:
        Sequence[str]: List of skillset names
        """

    def delete_skillset(
        self,
        skillset: Union[str, SearchIndexerSkillset],
        *,
        if_match: Optional[str] = None,
        if_none_match: Optional[str] = None,
        **kwargs,
    ) -> None:
        """
        Delete a skillset.
        Parameters:
        - skillset: Skillset name or object
        - if_match (str): ETag for conditional deletion
        - if_none_match (str): ETag for conditional deletion
        """
# Example: create a Blob Storage data source and a scheduled indexer.
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexer, SearchIndexerDataSourceConnection, SearchIndexerDataContainer,
    BlobIndexerParsingMode, IndexingSchedule
)
from azure.core.credentials import AzureKeyCredential

client = SearchIndexerClient(
    endpoint="https://service.search.windows.net",
    credential=AzureKeyCredential("admin-key")
)

# Create data source for Blob Storage
data_source = SearchIndexerDataSourceConnection(
    name="blob-datasource",
    type="azureblob",
    connection_string="DefaultEndpointsProtocol=https;AccountName=account;AccountKey=key;EndpointSuffix=core.windows.net",
    container=SearchIndexerDataContainer(name="documents")
)
client.create_data_source_connection(data_source)

# Create indexer with scheduling
indexer = SearchIndexer(
    name="blob-indexer",
    data_source_name="blob-datasource",
    target_index_name="documents-index",
    schedule=IndexingSchedule(interval="PT2H"),  # Run every 2 hours
    parameters={
        "batchSize": 1000,
        "maxFailedItems": 10,
        "maxFailedItemsPerBatch": 5,
        "configuration": {
            "parsingMode": BlobIndexerParsingMode.TEXT,
            "excludedFileNameExtensions": ".png,.jpeg,.jpg"
        }
    }
)
client.create_indexer(indexer)

# Example: build an AI enrichment skillset with cognitive skills.
from azure.search.documents.indexes.models import (
    SearchIndexerSkillset, EntityRecognitionSkill, KeyPhraseExtractionSkill,
    LanguageDetectionSkill, MergeSkill, OcrSkill, ImageAnalysisSkill,
    InputFieldMappingEntry, OutputFieldMappingEntry
)

# Create skillset with cognitive skills
skillset = SearchIndexerSkillset(
    name="ai-skillset",
    description="Extract entities, key phrases, and analyze images",
    skills=[
        # OCR skill for image text extraction
        OcrSkill(
            inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
            outputs=[OutputFieldMappingEntry(name="text", target_name="myText")]
        ),
        # Language detection
        LanguageDetectionSkill(
            inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
            outputs=[OutputFieldMappingEntry(name="languageCode", target_name="languageCode")]
        ),
        # Key phrase extraction
        KeyPhraseExtractionSkill(
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/content"),
                InputFieldMappingEntry(name="languageCode", source="/document/languageCode")
            ],
            outputs=[OutputFieldMappingEntry(name="keyPhrases", target_name="keyPhrases")]
        ),
        # Entity recognition
        EntityRecognitionSkill(
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/content"),
                InputFieldMappingEntry(name="languageCode", source="/document/languageCode")
            ],
            outputs=[
                OutputFieldMappingEntry(name="persons", target_name="persons"),
                OutputFieldMappingEntry(name="organizations", target_name="organizations"),
                OutputFieldMappingEntry(name="locations", target_name="locations")
            ]
        )
    ]
)
client.create_skillset(skillset)

# Create indexer that uses the skillset
indexer = SearchIndexer(
    name="ai-enriched-indexer",
    data_source_name="blob-datasource",
    target_index_name="enriched-documents",
    skillset_name="ai-skillset",
    field_mappings=[
        {"sourceFieldName": "metadata_storage_path", "targetFieldName": "id"},
        {"sourceFieldName": "metadata_storage_name", "targetFieldName": "filename"}
    ],
    output_field_mappings=[
        {"sourceFieldName": "/document/keyPhrases", "targetFieldName": "keyPhrases"},
        {"sourceFieldName": "/document/persons", "targetFieldName": "persons"},
        {"sourceFieldName": "/document/organizations", "targetFieldName": "organizations"}
    ]
)
client.create_indexer(indexer)

# Example: wrap an external ML endpoint as a custom skill.
from azure.search.documents.indexes.models import WebApiSkill
# Custom skill that calls external API
custom_skill = WebApiSkill(
    name="CustomTextClassifier",
    description="Classifies text using custom ML model",
    uri="https://your-api.com/classify",
    http_method="POST",
    http_headers={"Content-Type": "application/json"},
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/content")
    ],
    outputs=[
        OutputFieldMappingEntry(name="category", target_name="category"),
        OutputFieldMappingEntry(name="confidence", target_name="confidence")
    ]
)

skillset = SearchIndexerSkillset(
    name="custom-skillset",
    skills=[custom_skill]
)
client.create_skillset(skillset)

# Run indexer and monitor status
client.run_indexer("my-indexer")

# Get execution status
status = client.get_indexer_status("my-indexer")
print(f"Status: {status.status}")
print(f"Last result: {status.last_result.status}")

# Check execution history
if status.execution_history:
    for execution in status.execution_history:
        print(f"Start: {execution.start_time}, Status: {execution.status}")
        if execution.errors:
            for error in execution.errors:
                print(f"Error: {error.error_message}")

# SQL database data source
# Example: Azure SQL data source with high-water-mark change detection.
sql_data_source = SearchIndexerDataSourceConnection(
    name="sql-datasource",
    type="azuresql",
    connection_string="Server=server.database.windows.net;Database=mydb;User ID=user;Password=pass;",
    container=SearchIndexerDataContainer(
        name="Products",
        query="SELECT ProductId, ProductName, Description, ModifiedDate FROM Products WHERE ModifiedDate > @HighWaterMark ORDER BY ModifiedDate"
    ),
    data_change_detection_policy={
        "@odata.type": "#Microsoft.Azure.Search.HighWaterMarkChangeDetectionPolicy",
        "highWaterMarkColumnName": "ModifiedDate"
    }
)
client.create_data_source_connection(sql_data_source)

# Indexer definition
class SearchIndexer:
    """Indexer definition: pulls data from a data source into a target index,
    optionally applying a skillset during indexing."""
    name: str
    description: Optional[str] = None
    data_source_name: str
    skillset_name: Optional[str] = None
    target_index_name: str
    schedule: Optional[IndexingSchedule] = None
    parameters: Optional[IndexingParameters] = None
    field_mappings: Optional[List[FieldMapping]] = None
    output_field_mappings: Optional[List[FieldMapping]] = None
    is_disabled: Optional[bool] = False
    e_tag: Optional[str] = None
    encryption_key: Optional[SearchResourceEncryptionKey] = None
# Data source connection
class SearchIndexerDataSourceConnection:
    """Data source connection definition used by indexers to read source data."""
    name: str
    description: Optional[str] = None
    type: str
    connection_string: str
    container: SearchIndexerDataContainer
    data_change_detection_policy: Optional[DataChangeDetectionPolicy] = None
    data_deletion_detection_policy: Optional[DataDeletionDetectionPolicy] = None
    e_tag: Optional[str] = None
    encryption_key: Optional[SearchResourceEncryptionKey] = None
# Skillset definition
class SearchIndexerSkillset:
    """Skillset definition: an ordered list of skills applied during indexing."""
    name: str
    description: Optional[str] = None
    skills: List[SearchIndexerSkill]
    cognitive_services_account: Optional[CognitiveServicesAccount] = None
    knowledge_store: Optional[SearchIndexerKnowledgeStore] = None
    e_tag: Optional[str] = None
    encryption_key: Optional[SearchResourceEncryptionKey] = None
# Indexer status
class SearchIndexerStatus:
    """Current status and execution history of an indexer."""
    status: str
    last_result: Optional[IndexerExecutionResult] = None
    execution_history: Optional[List[IndexerExecutionResult]] = None
    limits: Optional[SearchIndexerLimits] = None
# Execution result
class IndexerExecutionResult:
    """Result of a single indexer execution, including errors and warnings."""
    status: str
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None
    error_message: Optional[str] = None
    errors: Optional[List[SearchIndexerError]] = None
    warnings: Optional[List[SearchIndexerWarning]] = None
    item_count: Optional[int] = None
    failed_item_count: Optional[int] = None

# Install with Tessl CLI
npx tessl i tessl/pypi-azure-search-documents