CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-azure-search-documents

Microsoft Azure AI Search Client Library for Python providing comprehensive search, indexing, and AI-powered document processing capabilities.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/indexer-management.md

Data Ingestion and AI Enrichment

The SearchIndexerClient manages automated data ingestion through indexers that connect to various data sources, with optional AI-powered content enrichment through skillsets. This enables knowledge mining, document cracking, and cognitive enrichment of content during the indexing process.

Capabilities

Client Initialization

Create a SearchIndexerClient to manage indexers, data sources, and skillsets.

class SearchIndexerClient:
    """
    Client for managing indexers, data source connections, and skillsets
    on an Azure AI Search service.

    Supports use as a context manager (``with SearchIndexerClient(...) as
    client:``), which closes the underlying session on exit.
    """

    def __init__(
        self,
        endpoint: str,
        credential: Union[AzureKeyCredential, TokenCredential], 
        **kwargs
    ) -> None:
        """
        Initialize SearchIndexerClient for indexer management.
        
        Parameters:
        - endpoint (str): The URL endpoint of an Azure search service
        - credential: A credential to authorize requests — either an
          AzureKeyCredential (admin API key) or a TokenCredential (AAD)
        - api_version (str, optional): The Search API version to use
          (passed via **kwargs)
        - audience (str, optional): AAD audience for authentication
          (passed via **kwargs)
        """
    
    def close(self) -> None:
        """Close the session."""
    
    # Context-manager protocol: __enter__ returns the client itself;
    # __exit__ closes the session (equivalent to calling close()).
    def __enter__(self) -> "SearchIndexerClient": ...
    def __exit__(self, *args) -> None: ...

Indexer Management

Create, configure, and manage indexers for automated data ingestion.

def create_indexer(self, indexer: SearchIndexer, **kwargs) -> SearchIndexer:
    """
    Create a new indexer.

    To update an indexer that may already exist, use
    create_or_update_indexer instead.
    
    Parameters:
    - indexer (SearchIndexer): The indexer definition to create
    
    Returns:
    SearchIndexer: The created indexer, as returned by the service
    """

def create_or_update_indexer(
    self,
    indexer: SearchIndexer,
    *,
    if_match: Optional[str] = None,
    if_none_match: Optional[str] = None,
    cache_reprocessing_change_detection_disabled: Optional[bool] = None,
    cache_reset_requirements_ignored: Optional[bool] = None,
    **kwargs
) -> SearchIndexer:
    """
    Create a new indexer or update an existing one (upsert).

    All optional arguments are keyword-only.
    
    Parameters:
    - indexer (SearchIndexer): The indexer definition
    - if_match (str, optional): ETag; apply the update only if the
      server-side resource matches this ETag (optimistic concurrency)
    - if_none_match (str, optional): ETag; apply only if the server-side
      resource does not match this ETag (conditional creation)
    - cache_reprocessing_change_detection_disabled (bool, optional):
      Disable cache reprocessing change detection
    - cache_reset_requirements_ignored (bool, optional): Ignore cache
      reset requirements
    
    Returns:
    SearchIndexer: The created or updated indexer
    """

def get_indexer(self, name: str, **kwargs) -> SearchIndexer:
    """
    Retrieve an indexer definition by name.
    
    Parameters:
    - name (str): Name of the indexer to fetch
    
    Returns:
    SearchIndexer: The indexer definition as stored on the service
    """

def get_indexers(
    self, 
    *, 
    select: Optional[List[str]] = None, 
    **kwargs
) -> Sequence[SearchIndexer]:
    """
    List all indexers in the search service.

    Use get_indexer_names when only the names are needed.
    
    Parameters:
    - select (List[str], optional, keyword-only): Restrict which
      top-level fields of each indexer are included in the results
    
    Returns:
    Sequence[SearchIndexer]: List of indexers
    """

def get_indexer_names(self, **kwargs) -> Sequence[str]:
    """
    List the names of all indexers in the search service.

    Lighter-weight alternative to get_indexers when the full
    definitions are not required.
    
    Returns:
    Sequence[str]: List of indexer names
    """

def delete_indexer(
    self,
    indexer: Union[str, SearchIndexer],
    *,
    if_match: Optional[str] = None,
    if_none_match: Optional[str] = None,
    **kwargs
) -> None:
    """
    Delete an indexer. Returns None.
    
    Parameters:
    - indexer: Indexer name (str) or a SearchIndexer object identifying
      the indexer to delete
    - if_match (str, optional, keyword-only): ETag for conditional
      deletion (delete only if the server's ETag matches)
    - if_none_match (str, optional, keyword-only): ETag for conditional
      deletion (delete only if the server's ETag does not match)
    """

Indexer Execution Control

Run, reset, and monitor indexer execution.

def run_indexer(self, name: str, **kwargs) -> None:
    """
    Run an indexer on demand. Returns None.

    Use get_indexer_status to observe execution progress and results.
    
    Parameters:
    - name (str): Name of the indexer to run
    """

def reset_indexer(self, name: str, **kwargs) -> None:
    """
    Reset an indexer's execution (change-tracking) state. Returns None.
    
    Parameters:
    - name (str): Name of the indexer to reset
    """

def get_indexer_status(self, name: str, **kwargs) -> SearchIndexerStatus:
    """
    Get the execution status and history of an indexer.

    The returned status's ``last_result`` and ``execution_history`` are
    optional and may be None (e.g. before the first run) — guard before
    dereferencing them.
    
    Parameters:
    - name (str): Name of the indexer
    
    Returns:
    SearchIndexerStatus: Indexer execution status and history
    """

Data Source Management

Configure connections to external data sources.

def create_data_source_connection(
    self,
    data_source: SearchIndexerDataSourceConnection,
    **kwargs
) -> SearchIndexerDataSourceConnection:
    """
    Create a new data source connection.

    To update a connection that may already exist, use
    create_or_update_data_source_connection instead.
    
    Parameters:
    - data_source (SearchIndexerDataSourceConnection): Data source
      definition to create
    
    Returns:
    SearchIndexerDataSourceConnection: The created data source
    """

def create_or_update_data_source_connection(
    self,
    data_source: SearchIndexerDataSourceConnection,
    *,
    if_match: Optional[str] = None,
    if_none_match: Optional[str] = None,
    cache_reset_requirements_ignored: Optional[bool] = None,
    **kwargs
) -> SearchIndexerDataSourceConnection:
    """
    Create a data source connection or update an existing one (upsert).

    All optional arguments are keyword-only.
    
    Parameters:
    - data_source (SearchIndexerDataSourceConnection): Data source definition
    - if_match (str, optional): ETag; apply the update only if the
      server-side resource matches this ETag (optimistic concurrency)
    - if_none_match (str, optional): ETag; apply only if the server-side
      resource does not match this ETag (conditional creation)
    - cache_reset_requirements_ignored (bool, optional): Ignore cache
      reset requirements
    
    Returns:
    SearchIndexerDataSourceConnection: The created or updated data source
    """

def get_data_source_connection(
    self, 
    name: str, 
    **kwargs
) -> SearchIndexerDataSourceConnection:
    """
    Retrieve a data source connection definition by name.
    
    Parameters:
    - name (str): Name of the data source connection to fetch
    
    Returns:
    SearchIndexerDataSourceConnection: The data source definition
    """

def get_data_source_connections(
    self,
    *,
    select: Optional[List[str]] = None,
    **kwargs
) -> Sequence[SearchIndexerDataSourceConnection]:
    """
    List all data source connections in the search service.

    Use get_data_source_connection_names when only names are needed.
    
    Parameters:
    - select (List[str], optional, keyword-only): Restrict which
      top-level fields of each connection are included in the results
    
    Returns:
    Sequence[SearchIndexerDataSourceConnection]: List of data sources
    """

def get_data_source_connection_names(self, **kwargs) -> Sequence[str]:
    """
    List the names of all data source connections.

    Lighter-weight alternative to get_data_source_connections when the
    full definitions are not required.
    
    Returns:
    Sequence[str]: List of data source names
    """

def delete_data_source_connection(
    self,
    data_source: Union[str, SearchIndexerDataSourceConnection],
    *,
    if_match: Optional[str] = None,
    if_none_match: Optional[str] = None,
    **kwargs
) -> None:
    """
    Delete a data source connection. Returns None.
    
    Parameters:
    - data_source: Data source name (str) or a
      SearchIndexerDataSourceConnection object identifying it
    - if_match (str, optional, keyword-only): ETag for conditional
      deletion (delete only if the server's ETag matches)
    - if_none_match (str, optional, keyword-only): ETag for conditional
      deletion (delete only if the server's ETag does not match)
    """

Skillset Management

Define and manage AI enrichment skillsets for cognitive processing.

def create_skillset(self, skillset: SearchIndexerSkillset, **kwargs) -> SearchIndexerSkillset:
    """
    Create a new skillset.

    To update a skillset that may already exist, use
    create_or_update_skillset instead.
    
    Parameters:
    - skillset (SearchIndexerSkillset): The skillset definition to create
    
    Returns:
    SearchIndexerSkillset: The created skillset
    """

def create_or_update_skillset(
    self,
    skillset: SearchIndexerSkillset,
    *,
    if_match: Optional[str] = None,
    if_none_match: Optional[str] = None,
    cache_reset_requirements_ignored: Optional[bool] = None,
    **kwargs
) -> SearchIndexerSkillset:
    """
    Create a skillset or update an existing one (upsert).

    All optional arguments are keyword-only.
    
    Parameters:
    - skillset (SearchIndexerSkillset): The skillset definition
    - if_match (str, optional): ETag; apply the update only if the
      server-side resource matches this ETag (optimistic concurrency)
    - if_none_match (str, optional): ETag; apply only if the server-side
      resource does not match this ETag (conditional creation)
    - cache_reset_requirements_ignored (bool, optional): Ignore cache
      reset requirements
    
    Returns:
    SearchIndexerSkillset: The created or updated skillset
    """

def get_skillset(self, name: str, **kwargs) -> SearchIndexerSkillset:
    """
    Retrieve a skillset definition by name.
    
    Parameters:
    - name (str): Name of the skillset to fetch
    
    Returns:
    SearchIndexerSkillset: The skillset definition
    """

def get_skillsets(
    self, 
    *, 
    select: Optional[List[str]] = None, 
    **kwargs
) -> Sequence[SearchIndexerSkillset]:
    """
    List all skillsets in the search service.

    Use get_skillset_names when only the names are needed.
    
    Parameters:
    - select (List[str], optional, keyword-only): Restrict which
      top-level fields of each skillset are included in the results
    
    Returns:
    Sequence[SearchIndexerSkillset]: List of skillsets
    """

def get_skillset_names(self, **kwargs) -> Sequence[str]:
    """
    List the names of all skillsets.

    Lighter-weight alternative to get_skillsets when the full
    definitions are not required.
    
    Returns:
    Sequence[str]: List of skillset names
    """

def delete_skillset(
    self,
    skillset: Union[str, SearchIndexerSkillset],
    *,
    if_match: Optional[str] = None,
    if_none_match: Optional[str] = None,
    **kwargs
) -> None:
    """
    Delete a skillset. Returns None.
    
    Parameters:
    - skillset: Skillset name (str) or a SearchIndexerSkillset object
      identifying the skillset to delete
    - if_match (str, optional, keyword-only): ETag for conditional
      deletion (delete only if the server's ETag matches)
    - if_none_match (str, optional, keyword-only): ETag for conditional
      deletion (delete only if the server's ETag does not match)
    """

Usage Examples

Azure Blob Storage Indexer

# Example: index documents from an Azure Blob Storage container.
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexer, SearchIndexerDataSourceConnection, SearchIndexerDataContainer,
    BlobIndexerParsingMode, IndexingSchedule
)
from azure.core.credentials import AzureKeyCredential

# Indexer management requires an admin API key (query keys are not enough).
client = SearchIndexerClient(
    endpoint="https://service.search.windows.net",
    credential=AzureKeyCredential("admin-key")
)

# Create data source for Blob Storage.
# SECURITY: avoid embedding account keys in source; prefer a secret store
# or managed-identity connection in real deployments.
data_source = SearchIndexerDataSourceConnection(
    name="blob-datasource",
    type="azureblob",
    connection_string="DefaultEndpointsProtocol=https;AccountName=account;AccountKey=key;EndpointSuffix=core.windows.net",
    container=SearchIndexerDataContainer(name="documents")
)
client.create_data_source_connection(data_source)

# Create indexer with scheduling. The schedule interval is an ISO 8601
# duration. The indexer's parameters are passed here as a plain dict;
# NOTE(review): presumably this is serialized like an IndexingParameters
# model — confirm against the SDK docs.
indexer = SearchIndexer(
    name="blob-indexer",
    data_source_name="blob-datasource",
    target_index_name="documents-index",
    schedule=IndexingSchedule(interval="PT2H"),  # Run every 2 hours
    parameters={
        "batchSize": 1000,
        "maxFailedItems": 10,
        "maxFailedItemsPerBatch": 5,
        "configuration": {
            "parsingMode": BlobIndexerParsingMode.TEXT,
            "excludedFileNameExtensions": ".png,.jpeg,.jpg"
        }
    }
)
client.create_indexer(indexer)

AI Enrichment with Skillset

# Example: AI enrichment pipeline — a skillset of cognitive skills plus an
# indexer that routes skill outputs into index fields.
# NOTE(review): MergeSkill and ImageAnalysisSkill are imported but not used
# below; they are presumably listed to show other available skills.
from azure.search.documents.indexes.models import (
    SearchIndexerSkillset, EntityRecognitionSkill, KeyPhraseExtractionSkill,
    LanguageDetectionSkill, MergeSkill, OcrSkill, ImageAnalysisSkill,
    InputFieldMappingEntry, OutputFieldMappingEntry
)

# Create skillset with cognitive skills. Skill inputs/outputs are wired
# together through paths in the enriched-document tree ("/document/...").
skillset = SearchIndexerSkillset(
    name="ai-skillset",
    description="Extract entities, key phrases, and analyze images",
    skills=[
        # OCR skill for image text extraction
        OcrSkill(
            inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
            outputs=[OutputFieldMappingEntry(name="text", target_name="myText")]
        ),
        
        # Language detection; its output feeds the two skills below.
        LanguageDetectionSkill(
            inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
            outputs=[OutputFieldMappingEntry(name="languageCode", target_name="languageCode")]
        ),
        
        # Key phrase extraction
        KeyPhraseExtractionSkill(
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/content"),
                InputFieldMappingEntry(name="languageCode", source="/document/languageCode")
            ],
            outputs=[OutputFieldMappingEntry(name="keyPhrases", target_name="keyPhrases")]
        ),
        
        # Entity recognition
        EntityRecognitionSkill(
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/content"),
                InputFieldMappingEntry(name="languageCode", source="/document/languageCode")
            ],
            outputs=[
                OutputFieldMappingEntry(name="persons", target_name="persons"),
                OutputFieldMappingEntry(name="organizations", target_name="organizations"),
                OutputFieldMappingEntry(name="locations", target_name="locations")
            ]
        )
    ]
)
client.create_skillset(skillset)

# Create indexer that uses the skillset. field_mappings map source fields
# into index fields; output_field_mappings map enrichment outputs
# ("/document/...") produced by the skillset into index fields.
indexer = SearchIndexer(
    name="ai-enriched-indexer",
    data_source_name="blob-datasource",
    target_index_name="enriched-documents",
    skillset_name="ai-skillset",
    field_mappings=[
        {"sourceFieldName": "metadata_storage_path", "targetFieldName": "id"},
        {"sourceFieldName": "metadata_storage_name", "targetFieldName": "filename"}
    ],
    output_field_mappings=[
        {"sourceFieldName": "/document/keyPhrases", "targetFieldName": "keyPhrases"},
        {"sourceFieldName": "/document/persons", "targetFieldName": "persons"},
        {"sourceFieldName": "/document/organizations", "targetFieldName": "organizations"}
    ]
)
client.create_indexer(indexer)

Custom Web API Skill

# Example: custom skill backed by an external Web API. The service POSTs
# the declared inputs to `uri` and maps the response into the outputs.
from azure.search.documents.indexes.models import WebApiSkill

# Custom skill that calls external API
custom_skill = WebApiSkill(
    name="CustomTextClassifier",
    description="Classifies text using custom ML model",
    uri="https://your-api.com/classify",
    http_method="POST",
    http_headers={"Content-Type": "application/json"},
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/content")
    ],
    outputs=[
        OutputFieldMappingEntry(name="category", target_name="category"),
        OutputFieldMappingEntry(name="confidence", target_name="confidence")
    ]
)

skillset = SearchIndexerSkillset(
    name="custom-skillset",
    skills=[custom_skill]
)
client.create_skillset(skillset)

Monitor Indexer Execution

# Example: run an indexer on demand and inspect its status, last result,
# and execution history.
client.run_indexer("my-indexer")

# Get execution status
status = client.get_indexer_status("my-indexer")
print(f"Status: {status.status}")

# last_result is Optional (None until the indexer has completed a run),
# so guard before dereferencing — mirrors the execution_history guard below.
if status.last_result is not None:
    print(f"Last result: {status.last_result.status}")

# Check execution history (also Optional); each entry may carry errors.
if status.execution_history:
    for execution in status.execution_history:
        print(f"Start: {execution.start_time}, Status: {execution.status}")
        if execution.errors:
            for error in execution.errors:
                print(f"Error: {error.error_message}")

SQL Database Data Source

# Example: Azure SQL data source with high-water-mark change detection,
# so subsequent indexer runs only pick up rows modified since the last run.
# SECURITY: avoid plaintext passwords in connection strings; prefer a
# secret store or managed identity in real deployments.
sql_data_source = SearchIndexerDataSourceConnection(
    name="sql-datasource",
    type="azuresql",
    connection_string="Server=server.database.windows.net;Database=mydb;User ID=user;Password=pass;",
    container=SearchIndexerDataContainer(
        name="Products",
        # @HighWaterMark is substituted by the service with the last
        # observed value of the high-water-mark column.
        query="SELECT ProductId, ProductName, Description, ModifiedDate FROM Products WHERE ModifiedDate > @HighWaterMark ORDER BY ModifiedDate"
    ),
    data_change_detection_policy={
        "@odata.type": "#Microsoft.Azure.Search.HighWaterMarkChangeDetectionPolicy",
        "highWaterMarkColumnName": "ModifiedDate"
    }
)
client.create_data_source_connection(sql_data_source)

Common Types

# Indexer definition: connects a data source to a target index, with
# optional AI-enrichment skillset, schedule, and field mappings.
class SearchIndexer:
    """Definition of an indexer resource."""
    name: str                                            # unique indexer name
    description: Optional[str] = None
    data_source_name: str                                # data source connection to read from
    skillset_name: Optional[str] = None                  # skillset applied during indexing, if any
    target_index_name: str                               # index that receives the documents
    schedule: Optional[IndexingSchedule] = None          # None means run on demand only
    parameters: Optional[IndexingParameters] = None      # batching/error-tolerance configuration
    field_mappings: Optional[List[FieldMapping]] = None  # source field -> index field
    output_field_mappings: Optional[List[FieldMapping]] = None  # enrichment output -> index field
    is_disabled: Optional[bool] = False
    e_tag: Optional[str] = None                          # for optimistic concurrency (if_match)
    encryption_key: Optional[SearchResourceEncryptionKey] = None

# Data source connection: describes where and how an indexer reads data.
class SearchIndexerDataSourceConnection:
    """Definition of a data source connection resource."""
    name: str                   # unique data source name
    description: Optional[str] = None
    type: str                   # source kind, e.g. "azureblob", "azuresql"
    connection_string: str
    container: SearchIndexerDataContainer  # container/table (and optional query) to read
    data_change_detection_policy: Optional[DataChangeDetectionPolicy] = None   # incremental indexing
    data_deletion_detection_policy: Optional[DataDeletionDetectionPolicy] = None  # propagate deletes
    e_tag: Optional[str] = None            # for optimistic concurrency (if_match)
    encryption_key: Optional[SearchResourceEncryptionKey] = None

# Skillset definition: an ordered collection of enrichment skills wired
# together through the enriched-document tree.
class SearchIndexerSkillset:
    """Definition of a skillset resource."""
    name: str                         # unique skillset name
    description: Optional[str] = None
    skills: List[SearchIndexerSkill]  # the cognitive/custom skills to run
    cognitive_services_account: Optional[CognitiveServicesAccount] = None  # billing for built-in skills
    knowledge_store: Optional[SearchIndexerKnowledgeStore] = None  # optional enrichment projections
    e_tag: Optional[str] = None       # for optimistic concurrency (if_match)
    encryption_key: Optional[SearchResourceEncryptionKey] = None

# Indexer status, as returned by get_indexer_status. last_result and
# execution_history are Optional — guard before dereferencing.
class SearchIndexerStatus:
    """Current status and execution history of an indexer."""
    status: str                                          # overall indexer status
    last_result: Optional[IndexerExecutionResult] = None # most recent run, if any
    execution_history: Optional[List[IndexerExecutionResult]] = None
    limits: Optional[SearchIndexerLimits] = None

# Result of a single indexer execution (one entry of execution_history).
class IndexerExecutionResult:
    """Outcome of one indexer run."""
    status: str                              # run outcome status
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None
    error_message: Optional[str] = None      # run-level failure message, if any
    errors: Optional[List[SearchIndexerError]] = None      # per-item errors
    warnings: Optional[List[SearchIndexerWarning]] = None  # per-item warnings
    item_count: Optional[int] = None         # items processed in this run
    failed_item_count: Optional[int] = None  # items that failed in this run

Install with Tessl CLI

npx tessl i tessl/pypi-azure-search-documents

docs

async-clients.md

index-management.md

index.md

indexer-management.md

models.md

search-client.md

tile.json