CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-azure-search-documents

Microsoft Azure AI Search Client Library for Python providing comprehensive search, indexing, and AI-powered document processing capabilities.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/indexer-management.md

Data Ingestion and AI Enrichment

The SearchIndexerClient manages automated data ingestion through indexers that connect to various data sources, with optional AI-powered content enrichment through skillsets. This enables knowledge mining, document cracking, and cognitive enrichment of content during the indexing process.

Capabilities

Client Initialization

Create a SearchIndexerClient to manage indexers, data sources, and skillsets.

class SearchIndexerClient:
    """
    Client for managing indexers, data source connections, and skillsets
    on an Azure AI Search service.

    Supports use as a context manager (``with SearchIndexerClient(...) as
    client:``), which closes the underlying session on exit.
    """

    def __init__(
        self,
        endpoint: str,
        credential: Union[AzureKeyCredential, TokenCredential], 
        **kwargs
    ) -> None:
        """
        Initialize SearchIndexerClient for indexer management.
        
        Parameters:
        - endpoint (str): The URL endpoint of an Azure search service
        - credential: A credential to authorize requests — either an
          AzureKeyCredential (admin API key) or a TokenCredential (AAD)
        - api_version (str, optional): The Search API version to use
          (passed via **kwargs)
        - audience (str, optional): AAD audience for authentication
          (passed via **kwargs)
        """
    
    def close(self) -> None:
        """Close the session."""
    
    # Context-manager protocol: __enter__ returns the client itself;
    # __exit__ closes the session (equivalent to calling close()).
    def __enter__(self) -> "SearchIndexerClient": ...
    def __exit__(self, *args) -> None: ...

Indexer Management

Create, configure, and manage indexers for automated data ingestion.

def create_indexer(self, indexer: SearchIndexer, **kwargs) -> SearchIndexer:
    """
    Create a new indexer.

    To update an indexer that may already exist, use
    create_or_update_indexer instead.
    
    Parameters:
    - indexer (SearchIndexer): The indexer definition to create
    
    Returns:
    SearchIndexer: The created indexer, as returned by the service
    """

def create_or_update_indexer(
    self,
    indexer: SearchIndexer,
    *,
    if_match: Optional[str] = None,
    if_none_match: Optional[str] = None,
    cache_reprocessing_change_detection_disabled: Optional[bool] = None,
    cache_reset_requirements_ignored: Optional[bool] = None,
    **kwargs
) -> SearchIndexer:
    """
    Create a new indexer or update an existing one (upsert).

    All optional arguments are keyword-only.
    
    Parameters:
    - indexer (SearchIndexer): The indexer definition
    - if_match (str, optional): ETag; apply the update only if the
      server-side resource matches this ETag (optimistic concurrency)
    - if_none_match (str, optional): ETag; apply only if the server-side
      resource does not match this ETag (conditional creation)
    - cache_reprocessing_change_detection_disabled (bool, optional):
      Disable cache reprocessing change detection
    - cache_reset_requirements_ignored (bool, optional): Ignore cache
      reset requirements
    
    Returns:
    SearchIndexer: The created or updated indexer
    """

def get_indexer(self, name: str, **kwargs) -> SearchIndexer:
    """
    Retrieve an indexer definition by name.
    
    Parameters:
    - name (str): Name of the indexer to fetch
    
    Returns:
    SearchIndexer: The indexer definition as stored on the service
    """

def get_indexers(
    self, 
    *, 
    select: Optional[List[str]] = None, 
    **kwargs
) -> Sequence[SearchIndexer]:
    """
    List all indexers in the search service.

    Use get_indexer_names when only the names are needed.
    
    Parameters:
    - select (List[str], optional, keyword-only): Restrict which
      top-level fields of each indexer are included in the results
    
    Returns:
    Sequence[SearchIndexer]: List of indexers
    """

def get_indexer_names(self, **kwargs) -> Sequence[str]:
    """
    List the names of all indexers in the search service.

    Lighter-weight alternative to get_indexers when the full
    definitions are not required.
    
    Returns:
    Sequence[str]: List of indexer names
    """

def delete_indexer(
    self,
    indexer: Union[str, SearchIndexer],
    *,
    if_match: Optional[str] = None,
    if_none_match: Optional[str] = None,
    **kwargs
) -> None:
    """
    Delete an indexer. Returns None.
    
    Parameters:
    - indexer: Indexer name (str) or a SearchIndexer object identifying
      the indexer to delete
    - if_match (str, optional, keyword-only): ETag for conditional
      deletion (delete only if the server's ETag matches)
    - if_none_match (str, optional, keyword-only): ETag for conditional
      deletion (delete only if the server's ETag does not match)
    """

Indexer Execution Control

Run, reset, and monitor indexer execution.

def run_indexer(self, name: str, **kwargs) -> None:
    """
    Run an indexer on demand. Returns None.

    Use get_indexer_status to observe execution progress and results.
    
    Parameters:
    - name (str): Name of the indexer to run
    """

def reset_indexer(self, name: str, **kwargs) -> None:
    """
    Reset an indexer's execution (change-tracking) state. Returns None.
    
    Parameters:
    - name (str): Name of the indexer to reset
    """

def get_indexer_status(self, name: str, **kwargs) -> SearchIndexerStatus:
    """
    Get the execution status and history of an indexer.

    The returned status's ``last_result`` and ``execution_history`` are
    optional and may be None (e.g. before the first run) — guard before
    dereferencing them.
    
    Parameters:
    - name (str): Name of the indexer
    
    Returns:
    SearchIndexerStatus: Indexer execution status and history
    """

Data Source Management

Configure connections to external data sources.

def create_data_source_connection(
    self,
    data_source: SearchIndexerDataSourceConnection,
    **kwargs
) -> SearchIndexerDataSourceConnection:
    """
    Create a new data source connection.

    To update a connection that may already exist, use
    create_or_update_data_source_connection instead.
    
    Parameters:
    - data_source (SearchIndexerDataSourceConnection): Data source
      definition to create
    
    Returns:
    SearchIndexerDataSourceConnection: The created data source
    """

def create_or_update_data_source_connection(
    self,
    data_source: SearchIndexerDataSourceConnection,
    *,
    if_match: Optional[str] = None,
    if_none_match: Optional[str] = None,
    cache_reset_requirements_ignored: Optional[bool] = None,
    **kwargs
) -> SearchIndexerDataSourceConnection:
    """
    Create a data source connection or update an existing one (upsert).

    All optional arguments are keyword-only.
    
    Parameters:
    - data_source (SearchIndexerDataSourceConnection): Data source definition
    - if_match (str, optional): ETag; apply the update only if the
      server-side resource matches this ETag (optimistic concurrency)
    - if_none_match (str, optional): ETag; apply only if the server-side
      resource does not match this ETag (conditional creation)
    - cache_reset_requirements_ignored (bool, optional): Ignore cache
      reset requirements
    
    Returns:
    SearchIndexerDataSourceConnection: The created or updated data source
    """

def get_data_source_connection(
    self, 
    name: str, 
    **kwargs
) -> SearchIndexerDataSourceConnection:
    """
    Retrieve a data source connection definition by name.
    
    Parameters:
    - name (str): Name of the data source connection to fetch
    
    Returns:
    SearchIndexerDataSourceConnection: The data source definition
    """

def get_data_source_connections(
    self,
    *,
    select: Optional[List[str]] = None,
    **kwargs
) -> Sequence[SearchIndexerDataSourceConnection]:
    """
    List all data source connections in the search service.

    Use get_data_source_connection_names when only names are needed.
    
    Parameters:
    - select (List[str], optional, keyword-only): Restrict which
      top-level fields of each connection are included in the results
    
    Returns:
    Sequence[SearchIndexerDataSourceConnection]: List of data sources
    """

def get_data_source_connection_names(self, **kwargs) -> Sequence[str]:
    """
    List the names of all data source connections.

    Lighter-weight alternative to get_data_source_connections when the
    full definitions are not required.
    
    Returns:
    Sequence[str]: List of data source names
    """

def delete_data_source_connection(
    self,
    data_source: Union[str, SearchIndexerDataSourceConnection],
    *,
    if_match: Optional[str] = None,
    if_none_match: Optional[str] = None,
    **kwargs
) -> None:
    """
    Delete a data source connection. Returns None.
    
    Parameters:
    - data_source: Data source name (str) or a
      SearchIndexerDataSourceConnection object identifying it
    - if_match (str, optional, keyword-only): ETag for conditional
      deletion (delete only if the server's ETag matches)
    - if_none_match (str, optional, keyword-only): ETag for conditional
      deletion (delete only if the server's ETag does not match)
    """

Skillset Management

Define and manage AI enrichment skillsets for cognitive processing.

def create_skillset(self, skillset: SearchIndexerSkillset, **kwargs) -> SearchIndexerSkillset:
    """
    Create a new skillset.

    To update a skillset that may already exist, use
    create_or_update_skillset instead.
    
    Parameters:
    - skillset (SearchIndexerSkillset): The skillset definition to create
    
    Returns:
    SearchIndexerSkillset: The created skillset
    """

def create_or_update_skillset(
    self,
    skillset: SearchIndexerSkillset,
    *,
    if_match: Optional[str] = None,
    if_none_match: Optional[str] = None,
    cache_reset_requirements_ignored: Optional[bool] = None,
    **kwargs
) -> SearchIndexerSkillset:
    """
    Create a skillset or update an existing one (upsert).

    All optional arguments are keyword-only.
    
    Parameters:
    - skillset (SearchIndexerSkillset): The skillset definition
    - if_match (str, optional): ETag; apply the update only if the
      server-side resource matches this ETag (optimistic concurrency)
    - if_none_match (str, optional): ETag; apply only if the server-side
      resource does not match this ETag (conditional creation)
    - cache_reset_requirements_ignored (bool, optional): Ignore cache
      reset requirements
    
    Returns:
    SearchIndexerSkillset: The created or updated skillset
    """

def get_skillset(self, name: str, **kwargs) -> SearchIndexerSkillset:
    """
    Retrieve a skillset definition by name.
    
    Parameters:
    - name (str): Name of the skillset to fetch
    
    Returns:
    SearchIndexerSkillset: The skillset definition
    """

def get_skillsets(
    self, 
    *, 
    select: Optional[List[str]] = None, 
    **kwargs
) -> Sequence[SearchIndexerSkillset]:
    """
    List all skillsets in the search service.

    Use get_skillset_names when only the names are needed.
    
    Parameters:
    - select (List[str], optional, keyword-only): Restrict which
      top-level fields of each skillset are included in the results
    
    Returns:
    Sequence[SearchIndexerSkillset]: List of skillsets
    """

def get_skillset_names(self, **kwargs) -> Sequence[str]:
    """
    List the names of all skillsets.

    Lighter-weight alternative to get_skillsets when the full
    definitions are not required.
    
    Returns:
    Sequence[str]: List of skillset names
    """

def delete_skillset(
    self,
    skillset: Union[str, SearchIndexerSkillset],
    *,
    if_match: Optional[str] = None,
    if_none_match: Optional[str] = None,
    **kwargs
) -> None:
    """
    Delete a skillset. Returns None.
    
    Parameters:
    - skillset: Skillset name (str) or a SearchIndexerSkillset object
      identifying the skillset to delete
    - if_match (str, optional, keyword-only): ETag for conditional
      deletion (delete only if the server's ETag matches)
    - if_none_match (str, optional, keyword-only): ETag for conditional
      deletion (delete only if the server's ETag does not match)
    """

Usage Examples

Azure Blob Storage Indexer

# Example: index documents from an Azure Blob Storage container.
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexer, SearchIndexerDataSourceConnection, SearchIndexerDataContainer,
    BlobIndexerParsingMode, IndexingSchedule
)
from azure.core.credentials import AzureKeyCredential

# Indexer management requires an admin API key (query keys are not enough).
client = SearchIndexerClient(
    endpoint="https://service.search.windows.net",
    credential=AzureKeyCredential("admin-key")
)

# Create data source for Blob Storage.
# SECURITY: avoid embedding account keys in source; prefer a secret store
# or managed-identity connection in real deployments.
data_source = SearchIndexerDataSourceConnection(
    name="blob-datasource",
    type="azureblob",
    connection_string="DefaultEndpointsProtocol=https;AccountName=account;AccountKey=key;EndpointSuffix=core.windows.net",
    container=SearchIndexerDataContainer(name="documents")
)
client.create_data_source_connection(data_source)

# Create indexer with scheduling. The schedule interval is an ISO 8601
# duration. The indexer's parameters are passed here as a plain dict;
# NOTE(review): presumably this is serialized like an IndexingParameters
# model — confirm against the SDK docs.
indexer = SearchIndexer(
    name="blob-indexer",
    data_source_name="blob-datasource",
    target_index_name="documents-index",
    schedule=IndexingSchedule(interval="PT2H"),  # Run every 2 hours
    parameters={
        "batchSize": 1000,
        "maxFailedItems": 10,
        "maxFailedItemsPerBatch": 5,
        "configuration": {
            "parsingMode": BlobIndexerParsingMode.TEXT,
            "excludedFileNameExtensions": ".png,.jpeg,.jpg"
        }
    }
)
client.create_indexer(indexer)

AI Enrichment with Skillset

# Example: AI enrichment pipeline — a skillset of cognitive skills plus an
# indexer that routes skill outputs into index fields.
# NOTE(review): MergeSkill and ImageAnalysisSkill are imported but not used
# below; they are presumably listed to show other available skills.
from azure.search.documents.indexes.models import (
    SearchIndexerSkillset, EntityRecognitionSkill, KeyPhraseExtractionSkill,
    LanguageDetectionSkill, MergeSkill, OcrSkill, ImageAnalysisSkill,
    InputFieldMappingEntry, OutputFieldMappingEntry
)

# Create skillset with cognitive skills. Skill inputs/outputs are wired
# together through paths in the enriched-document tree ("/document/...").
skillset = SearchIndexerSkillset(
    name="ai-skillset",
    description="Extract entities, key phrases, and analyze images",
    skills=[
        # OCR skill for image text extraction
        OcrSkill(
            inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
            outputs=[OutputFieldMappingEntry(name="text", target_name="myText")]
        ),
        
        # Language detection; its output feeds the two skills below.
        LanguageDetectionSkill(
            inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
            outputs=[OutputFieldMappingEntry(name="languageCode", target_name="languageCode")]
        ),
        
        # Key phrase extraction
        KeyPhraseExtractionSkill(
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/content"),
                InputFieldMappingEntry(name="languageCode", source="/document/languageCode")
            ],
            outputs=[OutputFieldMappingEntry(name="keyPhrases", target_name="keyPhrases")]
        ),
        
        # Entity recognition
        EntityRecognitionSkill(
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/content"),
                InputFieldMappingEntry(name="languageCode", source="/document/languageCode")
            ],
            outputs=[
                OutputFieldMappingEntry(name="persons", target_name="persons"),
                OutputFieldMappingEntry(name="organizations", target_name="organizations"),
                OutputFieldMappingEntry(name="locations", target_name="locations")
            ]
        )
    ]
)
client.create_skillset(skillset)

# Create indexer that uses the skillset. field_mappings map source fields
# into index fields; output_field_mappings map enrichment outputs
# ("/document/...") produced by the skillset into index fields.
indexer = SearchIndexer(
    name="ai-enriched-indexer",
    data_source_name="blob-datasource",
    target_index_name="enriched-documents",
    skillset_name="ai-skillset",
    field_mappings=[
        {"sourceFieldName": "metadata_storage_path", "targetFieldName": "id"},
        {"sourceFieldName": "metadata_storage_name", "targetFieldName": "filename"}
    ],
    output_field_mappings=[
        {"sourceFieldName": "/document/keyPhrases", "targetFieldName": "keyPhrases"},
        {"sourceFieldName": "/document/persons", "targetFieldName": "persons"},
        {"sourceFieldName": "/document/organizations", "targetFieldName": "organizations"}
    ]
)
client.create_indexer(indexer)

Custom Web API Skill

# Example: custom skill backed by an external Web API. The service POSTs
# the declared inputs to `uri` and maps the response into the outputs.
from azure.search.documents.indexes.models import WebApiSkill

# Custom skill that calls external API
custom_skill = WebApiSkill(
    name="CustomTextClassifier",
    description="Classifies text using custom ML model",
    uri="https://your-api.com/classify",
    http_method="POST",
    http_headers={"Content-Type": "application/json"},
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/content")
    ],
    outputs=[
        OutputFieldMappingEntry(name="category", target_name="category"),
        OutputFieldMappingEntry(name="confidence", target_name="confidence")
    ]
)

skillset = SearchIndexerSkillset(
    name="custom-skillset",
    skills=[custom_skill]
)
client.create_skillset(skillset)

Monitor Indexer Execution

# Example: run an indexer on demand and inspect its status, last result,
# and execution history.
client.run_indexer("my-indexer")

# Get execution status
status = client.get_indexer_status("my-indexer")
print(f"Status: {status.status}")

# last_result is Optional (None until the indexer has completed a run),
# so guard before dereferencing — mirrors the execution_history guard below.
if status.last_result is not None:
    print(f"Last result: {status.last_result.status}")

# Check execution history (also Optional); each entry may carry errors.
if status.execution_history:
    for execution in status.execution_history:
        print(f"Start: {execution.start_time}, Status: {execution.status}")
        if execution.errors:
            for error in execution.errors:
                print(f"Error: {error.error_message}")

SQL Database Data Source

# Example: Azure SQL data source with high-water-mark change detection,
# so subsequent indexer runs only pick up rows modified since the last run.
# SECURITY: avoid plaintext passwords in connection strings; prefer a
# secret store or managed identity in real deployments.
sql_data_source = SearchIndexerDataSourceConnection(
    name="sql-datasource",
    type="azuresql",
    connection_string="Server=server.database.windows.net;Database=mydb;User ID=user;Password=pass;",
    container=SearchIndexerDataContainer(
        name="Products",
        # @HighWaterMark is substituted by the service with the last
        # observed value of the high-water-mark column.
        query="SELECT ProductId, ProductName, Description, ModifiedDate FROM Products WHERE ModifiedDate > @HighWaterMark ORDER BY ModifiedDate"
    ),
    data_change_detection_policy={
        "@odata.type": "#Microsoft.Azure.Search.HighWaterMarkChangeDetectionPolicy",
        "highWaterMarkColumnName": "ModifiedDate"
    }
)
client.create_data_source_connection(sql_data_source)

Common Types

# Indexer definition: connects a data source to a target index, with
# optional AI-enrichment skillset, schedule, and field mappings.
class SearchIndexer:
    """Definition of an indexer resource."""
    name: str                                            # unique indexer name
    description: Optional[str] = None
    data_source_name: str                                # data source connection to read from
    skillset_name: Optional[str] = None                  # skillset applied during indexing, if any
    target_index_name: str                               # index that receives the documents
    schedule: Optional[IndexingSchedule] = None          # None means run on demand only
    parameters: Optional[IndexingParameters] = None      # batching/error-tolerance configuration
    field_mappings: Optional[List[FieldMapping]] = None  # source field -> index field
    output_field_mappings: Optional[List[FieldMapping]] = None  # enrichment output -> index field
    is_disabled: Optional[bool] = False
    e_tag: Optional[str] = None                          # for optimistic concurrency (if_match)
    encryption_key: Optional[SearchResourceEncryptionKey] = None

# Data source connection: describes where and how an indexer reads data.
class SearchIndexerDataSourceConnection:
    """Definition of a data source connection resource."""
    name: str                   # unique data source name
    description: Optional[str] = None
    type: str                   # source kind, e.g. "azureblob", "azuresql"
    connection_string: str
    container: SearchIndexerDataContainer  # container/table (and optional query) to read
    data_change_detection_policy: Optional[DataChangeDetectionPolicy] = None   # incremental indexing
    data_deletion_detection_policy: Optional[DataDeletionDetectionPolicy] = None  # propagate deletes
    e_tag: Optional[str] = None            # for optimistic concurrency (if_match)
    encryption_key: Optional[SearchResourceEncryptionKey] = None

# Skillset definition: an ordered collection of enrichment skills wired
# together through the enriched-document tree.
class SearchIndexerSkillset:
    """Definition of a skillset resource."""
    name: str                         # unique skillset name
    description: Optional[str] = None
    skills: List[SearchIndexerSkill]  # the cognitive/custom skills to run
    cognitive_services_account: Optional[CognitiveServicesAccount] = None  # billing for built-in skills
    knowledge_store: Optional[SearchIndexerKnowledgeStore] = None  # optional enrichment projections
    e_tag: Optional[str] = None       # for optimistic concurrency (if_match)
    encryption_key: Optional[SearchResourceEncryptionKey] = None

# Indexer status, as returned by get_indexer_status. last_result and
# execution_history are Optional — guard before dereferencing.
class SearchIndexerStatus:
    """Current status and execution history of an indexer."""
    status: str                                          # overall indexer status
    last_result: Optional[IndexerExecutionResult] = None # most recent run, if any
    execution_history: Optional[List[IndexerExecutionResult]] = None
    limits: Optional[SearchIndexerLimits] = None

# Result of a single indexer execution (one entry of execution_history).
class IndexerExecutionResult:
    """Outcome of one indexer run."""
    status: str                              # run outcome status
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None
    error_message: Optional[str] = None      # run-level failure message, if any
    errors: Optional[List[SearchIndexerError]] = None      # per-item errors
    warnings: Optional[List[SearchIndexerWarning]] = None  # per-item warnings
    item_count: Optional[int] = None         # items processed in this run
    failed_item_count: Optional[int] = None  # items that failed in this run

Install with Tessl CLI

npx tessl i tessl/pypi-azure-search-documents

docs

async-clients.md

index-management.md

index.md

indexer-management.md

models.md

search-client.md

tile.json