0
# Data Ingestion and AI Enrichment
1
2
The SearchIndexerClient manages automated data ingestion through indexers that connect to various data sources, with optional AI-powered content enrichment through skillsets. This enables knowledge mining, document cracking, and cognitive enrichment of content during the indexing process.
3
4
## Capabilities
5
6
### Client Initialization
7
8
Create a SearchIndexerClient to manage indexers, data sources, and skillsets.
9
10
```python { .api }
11
class SearchIndexerClient:
12
def __init__(
13
self,
14
endpoint: str,
15
credential: Union[AzureKeyCredential, TokenCredential],
16
**kwargs
17
) -> None:
18
"""
19
Initialize SearchIndexerClient for indexer management.
20
21
Parameters:
22
- endpoint (str): The URL endpoint of an Azure search service
23
- credential: A credential to authorize requests
24
- api_version (str, optional): The Search API version to use
25
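- audience (str, optional): Azure Active Directory (AAD) audience to request tokens for; only relevant when `credential` is a TokenCredential, ignored for key-based authentication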
26
"""
27
28
def close(self) -> None:
29
"""Close the session."""
30
31
def __enter__(self) -> "SearchIndexerClient": ...
32
def __exit__(self, *args) -> None: ...
33
```
34
35
### Indexer Management
36
37
Create, configure, and manage indexers for automated data ingestion.
38
39
```python { .api }
40
def create_indexer(self, indexer: SearchIndexer, **kwargs) -> SearchIndexer:
41
"""
42
Create a new indexer.
43
44
Parameters:
45
- indexer (SearchIndexer): The indexer definition
46
47
Returns:
48
SearchIndexer: The created indexer
49
"""
50
51
def create_or_update_indexer(
52
self,
53
indexer: SearchIndexer,
54
*,
55
if_match: Optional[str] = None,
56
if_none_match: Optional[str] = None,
57
cache_reprocessing_change_detection_disabled: Optional[bool] = None,
58
cache_reset_requirements_ignored: Optional[bool] = None,
59
**kwargs
60
) -> SearchIndexer:
61
"""
62
Create a new indexer or update an existing one.
63
64
Parameters:
65
- indexer (SearchIndexer): The indexer definition
66
- if_match (str): ETag for conditional updates
67
- if_none_match (str): ETag for conditional creation
68
- cache_reprocessing_change_detection_disabled (bool): Disable change detection for cache reprocessing
69
- cache_reset_requirements_ignored (bool): Ignore cache reset requirements
70
71
Returns:
72
SearchIndexer: The created or updated indexer
73
"""
74
75
def get_indexer(self, name: str, **kwargs) -> SearchIndexer:
76
"""
77
Retrieve an indexer definition.
78
79
Parameters:
80
- name (str): Name of the indexer
81
82
Returns:
83
SearchIndexer: The indexer definition
84
"""
85
86
def get_indexers(
87
self,
88
*,
89
select: Optional[List[str]] = None,
90
**kwargs
91
) -> Sequence[SearchIndexer]:
92
"""
93
List all indexers in the search service.
94
95
Parameters:
96
- select (List[str], optional): Fields to include in results
97
98
Returns:
99
Sequence[SearchIndexer]: List of indexers
100
"""
101
102
def get_indexer_names(self, **kwargs) -> Sequence[str]:
103
"""
104
List all indexer names.
105
106
Returns:
107
Sequence[str]: List of indexer names
108
"""
109
110
def delete_indexer(
111
self,
112
indexer: Union[str, SearchIndexer],
113
*,
114
if_match: Optional[str] = None,
115
if_none_match: Optional[str] = None,
116
**kwargs
117
) -> None:
118
"""
119
Delete an indexer.
120
121
Parameters:
122
- indexer: Indexer name or SearchIndexer object
123
- if_match (str): Delete only if the indexer's current ETag matches this value
124
- if_none_match (str): Delete only if the indexer's current ETag does not match this value
125
"""
126
```
127
128
### Indexer Execution Control
129
130
Run, reset, and monitor indexer execution.
131
132
```python { .api }
133
def run_indexer(self, name: str, **kwargs) -> None:
134
"""
135
Run an indexer manually.
136
137
Parameters:
138
- name (str): Name of the indexer to run
139
"""
140
141
def reset_indexer(self, name: str, **kwargs) -> None:
142
"""
143
Reset an indexer's change-tracking state so that its next run reprocesses all documents from scratch.
144
145
Parameters:
146
- name (str): Name of the indexer to reset
147
"""
148
149
def get_indexer_status(self, name: str, **kwargs) -> SearchIndexerStatus:
150
"""
151
Get the execution status and history of an indexer.
152
153
Parameters:
154
- name (str): Name of the indexer
155
156
Returns:
157
SearchIndexerStatus: Indexer execution status and history
158
"""
159
```
160
161
### Data Source Management
162
163
Configure connections to external data sources.
164
165
```python { .api }
166
def create_data_source_connection(
167
self,
168
data_source: SearchIndexerDataSourceConnection,
169
**kwargs
170
) -> SearchIndexerDataSourceConnection:
171
"""
172
Create a new data source connection.
173
174
Parameters:
175
- data_source (SearchIndexerDataSourceConnection): Data source definition
176
177
Returns:
178
SearchIndexerDataSourceConnection: The created data source
179
"""
180
181
def create_or_update_data_source_connection(
182
self,
183
data_source: SearchIndexerDataSourceConnection,
184
*,
185
if_match: Optional[str] = None,
186
if_none_match: Optional[str] = None,
187
cache_reset_requirements_ignored: Optional[bool] = None,
188
**kwargs
189
) -> SearchIndexerDataSourceConnection:
190
"""
191
Create or update a data source connection.
192
193
Parameters:
194
- data_source (SearchIndexerDataSourceConnection): Data source definition
195
- if_match (str): ETag for conditional updates
196
- if_none_match (str): ETag for conditional creation
197
- cache_reset_requirements_ignored (bool): Ignore cache reset requirements
198
199
Returns:
200
SearchIndexerDataSourceConnection: The created or updated data source
201
"""
202
203
def get_data_source_connection(
204
self,
205
name: str,
206
**kwargs
207
) -> SearchIndexerDataSourceConnection:
208
"""
209
Retrieve a data source connection.
210
211
Parameters:
212
- name (str): Name of the data source
213
214
Returns:
215
SearchIndexerDataSourceConnection: The data source definition
216
"""
217
218
def get_data_source_connections(
219
self,
220
*,
221
select: Optional[List[str]] = None,
222
**kwargs
223
) -> Sequence[SearchIndexerDataSourceConnection]:
224
"""
225
List all data source connections.
226
227
Parameters:
228
- select (List[str], optional): Fields to include in results
229
230
Returns:
231
Sequence[SearchIndexerDataSourceConnection]: List of data sources
232
"""
233
234
def get_data_source_connection_names(self, **kwargs) -> Sequence[str]:
235
"""
236
List all data source connection names.
237
238
Returns:
239
Sequence[str]: List of data source names
240
"""
241
242
def delete_data_source_connection(
243
self,
244
data_source: Union[str, SearchIndexerDataSourceConnection],
245
*,
246
if_match: Optional[str] = None,
247
if_none_match: Optional[str] = None,
248
**kwargs
249
) -> None:
250
"""
251
Delete a data source connection.
252
253
Parameters:
254
- data_source: Data source name or object
255
- if_match (str): Delete only if the data source connection's current ETag matches this value
256
- if_none_match (str): Delete only if the data source connection's current ETag does not match this value
257
"""
258
```
259
260
### Skillset Management
261
262
Define and manage AI enrichment skillsets for cognitive processing.
263
264
```python { .api }
265
def create_skillset(self, skillset: SearchIndexerSkillset, **kwargs) -> SearchIndexerSkillset:
266
"""
267
Create a new skillset.
268
269
Parameters:
270
- skillset (SearchIndexerSkillset): The skillset definition
271
272
Returns:
273
SearchIndexerSkillset: The created skillset
274
"""
275
276
def create_or_update_skillset(
277
self,
278
skillset: SearchIndexerSkillset,
279
*,
280
if_match: Optional[str] = None,
281
if_none_match: Optional[str] = None,
282
cache_reset_requirements_ignored: Optional[bool] = None,
283
**kwargs
284
) -> SearchIndexerSkillset:
285
"""
286
Create or update a skillset.
287
288
Parameters:
289
- skillset (SearchIndexerSkillset): The skillset definition
290
- if_match (str): ETag for conditional updates
291
- if_none_match (str): ETag for conditional creation
292
- cache_reset_requirements_ignored (bool): Ignore cache reset requirements
293
294
Returns:
295
SearchIndexerSkillset: The created or updated skillset
296
"""
297
298
def get_skillset(self, name: str, **kwargs) -> SearchIndexerSkillset:
299
"""
300
Retrieve a skillset definition.
301
302
Parameters:
303
- name (str): Name of the skillset
304
305
Returns:
306
SearchIndexerSkillset: The skillset definition
307
"""
308
309
def get_skillsets(
310
self,
311
*,
312
select: Optional[List[str]] = None,
313
**kwargs
314
) -> Sequence[SearchIndexerSkillset]:
315
"""
316
List all skillsets.
317
318
Parameters:
319
- select (List[str], optional): Fields to include in results
320
321
Returns:
322
Sequence[SearchIndexerSkillset]: List of skillsets
323
"""
324
325
def get_skillset_names(self, **kwargs) -> Sequence[str]:
326
"""
327
List all skillset names.
328
329
Returns:
330
Sequence[str]: List of skillset names
331
"""
332
333
def delete_skillset(
334
self,
335
skillset: Union[str, SearchIndexerSkillset],
336
*,
337
if_match: Optional[str] = None,
338
if_none_match: Optional[str] = None,
339
**kwargs
340
) -> None:
341
"""
342
Delete a skillset.
343
344
Parameters:
345
- skillset: Skillset name or object
346
- if_match (str): Delete only if the skillset's current ETag matches this value
347
- if_none_match (str): Delete only if the skillset's current ETag does not match this value
348
"""
349
```
350
351
## Usage Examples
352
353
### Azure Blob Storage Indexer
354
355
```python
356
from azure.search.documents.indexes import SearchIndexerClient
357
from azure.search.documents.indexes.models import (
358
SearchIndexer, SearchIndexerDataSourceConnection, SearchIndexerDataContainer,
359
BlobIndexerParsingMode, IndexingSchedule
360
)
361
from azure.core.credentials import AzureKeyCredential
362
363
client = SearchIndexerClient(
364
endpoint="https://service.search.windows.net",
365
credential=AzureKeyCredential("admin-key")
366
)
367
368
# Create data source for Blob Storage
369
data_source = SearchIndexerDataSourceConnection(
370
name="blob-datasource",
371
type="azureblob",
372
connection_string="DefaultEndpointsProtocol=https;AccountName=account;AccountKey=key;EndpointSuffix=core.windows.net",
373
container=SearchIndexerDataContainer(name="documents")
374
)
375
client.create_data_source_connection(data_source)
376
377
# Create indexer with scheduling
378
indexer = SearchIndexer(
379
name="blob-indexer",
380
data_source_name="blob-datasource",
381
target_index_name="documents-index",
382
schedule=IndexingSchedule(interval="PT2H"), # Run every 2 hours
383
parameters={
384
"batchSize": 1000,
385
"maxFailedItems": 10,
386
"maxFailedItemsPerBatch": 5,
387
"configuration": {
388
"parsingMode": BlobIndexerParsingMode.TEXT,
389
"excludedFileNameExtensions": ".png,.jpeg,.jpg"
390
}
391
}
392
)
393
client.create_indexer(indexer)
394
```
395
396
### AI Enrichment with Skillset
397
398
```python
399
from azure.search.documents.indexes.models import (
400
SearchIndexerSkillset, EntityRecognitionSkill, KeyPhraseExtractionSkill,
401
LanguageDetectionSkill, MergeSkill, OcrSkill, ImageAnalysisSkill,
402
InputFieldMappingEntry, OutputFieldMappingEntry
403
)
404
405
# Create skillset with cognitive skills
406
skillset = SearchIndexerSkillset(
407
name="ai-skillset",
408
description="Extract entities, key phrases, and analyze images",
409
skills=[
410
# OCR skill for image text extraction
411
OcrSkill(
412
inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
413
outputs=[OutputFieldMappingEntry(name="text", target_name="myText")]
414
),
415
416
# Language detection
417
LanguageDetectionSkill(
418
inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
419
outputs=[OutputFieldMappingEntry(name="languageCode", target_name="languageCode")]
420
),
421
422
# Key phrase extraction
423
KeyPhraseExtractionSkill(
424
inputs=[
425
InputFieldMappingEntry(name="text", source="/document/content"),
426
InputFieldMappingEntry(name="languageCode", source="/document/languageCode")
427
],
428
outputs=[OutputFieldMappingEntry(name="keyPhrases", target_name="keyPhrases")]
429
),
430
431
# Entity recognition
432
EntityRecognitionSkill(
433
inputs=[
434
InputFieldMappingEntry(name="text", source="/document/content"),
435
InputFieldMappingEntry(name="languageCode", source="/document/languageCode")
436
],
437
outputs=[
438
OutputFieldMappingEntry(name="persons", target_name="persons"),
439
OutputFieldMappingEntry(name="organizations", target_name="organizations"),
440
OutputFieldMappingEntry(name="locations", target_name="locations")
441
]
442
)
443
]
444
)
445
client.create_skillset(skillset)
446
447
# Create indexer that uses the skillset
448
indexer = SearchIndexer(
449
name="ai-enriched-indexer",
450
data_source_name="blob-datasource",
451
target_index_name="enriched-documents",
452
skillset_name="ai-skillset",
453
field_mappings=[
454
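{"sourceFieldName": "metadata_storage_path", "targetFieldName": "id", "mappingFunction": {"name": "base64Encode"}},  # blob paths contain characters that are invalid in document keys, so encode them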
455
{"sourceFieldName": "metadata_storage_name", "targetFieldName": "filename"}
456
],
457
output_field_mappings=[
458
{"sourceFieldName": "/document/keyPhrases", "targetFieldName": "keyPhrases"},
459
{"sourceFieldName": "/document/persons", "targetFieldName": "persons"},
460
{"sourceFieldName": "/document/organizations", "targetFieldName": "organizations"}
461
]
462
)
463
client.create_indexer(indexer)
464
```
465
466
### Custom Web API Skill
467
468
```python
469
from azure.search.documents.indexes.models import WebApiSkill
470
471
# Custom skill that calls external API
472
custom_skill = WebApiSkill(
473
name="CustomTextClassifier",
474
description="Classifies text using custom ML model",
475
uri="https://your-api.com/classify",
476
http_method="POST",
477
http_headers={"Content-Type": "application/json"},
478
inputs=[
479
InputFieldMappingEntry(name="text", source="/document/content")
480
],
481
outputs=[
482
OutputFieldMappingEntry(name="category", target_name="category"),
483
OutputFieldMappingEntry(name="confidence", target_name="confidence")
484
]
485
)
486
487
skillset = SearchIndexerSkillset(
488
name="custom-skillset",
489
skills=[custom_skill]
490
)
491
client.create_skillset(skillset)
492
```
493
494
### Monitor Indexer Execution
495
496
```python
497
# Run indexer and monitor status
498
client.run_indexer("my-indexer")
499
500
# Get execution status (run_indexer only queues the run; poll get_indexer_status until the run completes)
501
status = client.get_indexer_status("my-indexer")
502
print(f"Status: {status.status}")
503
print(f"Last result: {status.last_result.status if status.last_result else 'never run'}")
504
505
# Check execution history
506
if status.execution_history:
507
for execution in status.execution_history:
508
print(f"Start: {execution.start_time}, Status: {execution.status}")
509
if execution.errors:
510
for error in execution.errors:
511
print(f"Error: {error.error_message}")
512
```
513
514
### SQL Database Data Source
515
516
```python
517
# SQL database data source
518
sql_data_source = SearchIndexerDataSourceConnection(
519
name="sql-datasource",
520
type="azuresql",
521
connection_string="Server=server.database.windows.net;Database=mydb;User ID=user;Password=pass;",
522
container=SearchIndexerDataContainer(
523
name="Products",
524
# No "query" here: Azure SQL data sources do not support container queries, so "name" must be a table or view; the high-water-mark policy below provides incremental indexing
525
),
526
data_change_detection_policy={
527
"@odata.type": "#Microsoft.Azure.Search.HighWaterMarkChangeDetectionPolicy",
528
"highWaterMarkColumnName": "ModifiedDate"
529
}
530
)
531
client.create_data_source_connection(sql_data_source)
532
```
533
534
## Common Types
535
536
```python { .api }
537
# Indexer definition
538
class SearchIndexer:
539
name: str
540
description: Optional[str] = None
541
data_source_name: str
542
skillset_name: Optional[str] = None
543
target_index_name: str
544
schedule: Optional[IndexingSchedule] = None
545
parameters: Optional[IndexingParameters] = None
546
field_mappings: Optional[List[FieldMapping]] = None
547
output_field_mappings: Optional[List[FieldMapping]] = None
548
is_disabled: Optional[bool] = False
549
e_tag: Optional[str] = None
550
encryption_key: Optional[SearchResourceEncryptionKey] = None
551
552
# Data source connection
553
class SearchIndexerDataSourceConnection:
554
name: str
555
description: Optional[str] = None
556
type: str
557
connection_string: str
558
container: SearchIndexerDataContainer
559
data_change_detection_policy: Optional[DataChangeDetectionPolicy] = None
560
data_deletion_detection_policy: Optional[DataDeletionDetectionPolicy] = None
561
e_tag: Optional[str] = None
562
encryption_key: Optional[SearchResourceEncryptionKey] = None
563
564
# Skillset definition
565
class SearchIndexerSkillset:
566
name: str
567
description: Optional[str] = None
568
skills: List[SearchIndexerSkill]
569
cognitive_services_account: Optional[CognitiveServicesAccount] = None
570
knowledge_store: Optional[SearchIndexerKnowledgeStore] = None
571
e_tag: Optional[str] = None
572
encryption_key: Optional[SearchResourceEncryptionKey] = None
573
574
# Indexer status
575
class SearchIndexerStatus:
576
status: str
577
last_result: Optional[IndexerExecutionResult] = None
578
execution_history: Optional[List[IndexerExecutionResult]] = None
579
limits: Optional[SearchIndexerLimits] = None
580
581
# Execution result
582
class IndexerExecutionResult:
583
status: str
584
start_time: Optional[datetime] = None
585
end_time: Optional[datetime] = None
586
error_message: Optional[str] = None
587
errors: Optional[List[SearchIndexerError]] = None
588
warnings: Optional[List[SearchIndexerWarning]] = None
589
item_count: Optional[int] = None
590
failed_item_count: Optional[int] = None
591
```