0
# Inference API
1
2
The Inference API exposes machine learning inference capabilities within Elasticsearch through a single, unified interface. It integrates with multiple AI providers and models to perform text embedding, sparse embedding, completion, and reranking tasks.
3
4
## Capabilities
5
6
### Inference Operations
7
8
Run inference tasks (text embedding, sparse embedding, reranking, and completion) against previously configured inference endpoints.
9
10
```python { .api }
11
def inference(
12
self,
13
*,
14
inference_id: str,
15
input: Optional[Union[str, List[str]]] = None,
16
query: Optional[str] = None,
17
task_settings: Optional[Any] = None,
18
timeout: Optional[Union[str, int]] = None,
19
**kwargs
20
) -> ObjectApiResponse[Any]:
21
"""
22
Perform general inference on a configured service.
23
24
Parameters:
25
- inference_id: The inference endpoint ID
26
- input: Input text(s) for inference
27
- query: Query text for reranking tasks
28
- task_settings: Task-specific settings
29
- timeout: Request timeout
30
31
Returns:
32
ObjectApiResponse with inference results
33
"""
34
35
def text_embedding(
36
self,
37
*,
38
inference_id: str,
39
input: Optional[Union[str, List[str]]] = None,
40
task_settings: Optional[Any] = None,
41
timeout: Optional[Union[str, int]] = None,
42
**kwargs
43
) -> ObjectApiResponse[Any]:
44
"""
45
Generate text embeddings using the specified inference service.
46
47
Parameters:
48
- inference_id: The embedding model inference ID
49
- input: Text or list of texts to embed
50
- task_settings: Model-specific embedding settings
51
- timeout: Request timeout
52
53
Returns:
54
ObjectApiResponse with embedding vectors
55
"""
56
57
def sparse_embedding(
58
self,
59
*,
60
inference_id: str,
61
input: Optional[Union[str, List[str]]] = None,
62
task_settings: Optional[Any] = None,
63
timeout: Optional[Union[str, int]] = None,
64
**kwargs
65
) -> ObjectApiResponse[Any]:
66
"""
67
Generate sparse embeddings (e.g., SPLADE) using the specified service.
68
69
Parameters:
70
- inference_id: The sparse embedding model inference ID
71
- input: Text or list of texts to embed
72
- task_settings: Model-specific settings
73
- timeout: Request timeout
74
75
Returns:
76
ObjectApiResponse with sparse embedding vectors
77
"""
78
79
def rerank(
80
self,
81
*,
82
inference_id: str,
83
input: Optional[List[str]] = None,
84
query: Optional[str] = None,
85
task_settings: Optional[Any] = None,
86
timeout: Optional[Union[str, int]] = None,
87
**kwargs
88
) -> ObjectApiResponse[Any]:
89
"""
90
Rerank documents using the specified reranking service.
91
92
Parameters:
93
- inference_id: The reranking model inference ID
94
- input: List of documents to rerank
95
- query: Query text for relevance-based reranking
96
- task_settings: Reranking-specific settings
97
- timeout: Request timeout
98
99
Returns:
100
ObjectApiResponse with reranked documents and scores
101
"""
102
103
def completion(
104
self,
105
*,
106
inference_id: str,
107
input: Optional[Union[str, List[str]]] = None,
108
task_settings: Optional[Any] = None,
109
timeout: Optional[Union[str, int]] = None,
110
**kwargs
111
) -> ObjectApiResponse[Any]:
112
"""
113
Generate text completions using the specified language model.
114
115
Parameters:
116
- inference_id: The completion model inference ID
117
- input: Prompt or list of prompts
118
- task_settings: Generation settings (temperature, max_tokens, etc.)
119
- timeout: Request timeout
120
121
Returns:
122
ObjectApiResponse with generated completions
123
"""
124
```
125
126
#### Usage Examples
127
128
```python
129
from elasticsearch import Elasticsearch
130
131
client = Elasticsearch(['http://localhost:9200'])
132
133
# Text embeddings for semantic search
134
embedding_response = client.inference.text_embedding(
135
inference_id="my-embedding-model",
136
input=["Hello world", "Machine learning is fascinating"]
137
)
138
embeddings = embedding_response.body['embeddings']
139
140
# Single text embedding
141
single_embedding = client.inference.text_embedding(
142
inference_id="sentence-transformers",
143
input="This is a sample document for embedding"
144
)
145
146
# Sparse embeddings for keyword-aware search
147
sparse_response = client.inference.sparse_embedding(
148
inference_id="splade-model",
149
input="Natural language processing with transformers"
150
)
151
152
# Document reranking for search relevance
153
rerank_response = client.inference.rerank(
154
inference_id="cross-encoder-model",
155
query="machine learning algorithms",
156
input=[
157
"Introduction to machine learning",
158
"Deep learning with neural networks",
159
"Statistical analysis methods",
160
"Reinforcement learning concepts"
161
]
162
)
163
ranked_docs = rerank_response.body['reranked']
164
165
# Text completion/generation
166
completion_response = client.inference.completion(
167
inference_id="gpt-model",
168
input="Explain quantum computing in simple terms:",
169
task_settings={
170
"max_tokens": 150,
171
"temperature": 0.7
172
}
173
)
174
generated_text = completion_response.body['completion']
175
```
176
177
### Inference Endpoint Management
178
179
Create, retrieve, update, and delete inference endpoints for various AI services.
180
181
```python { .api }
182
def put(
183
self,
184
*,
185
inference_id: str,
186
task_type: str,
187
inference_config: Dict[str, Any],
188
**kwargs
189
) -> ObjectApiResponse[Any]:
190
"""
191
Create or update a generic inference endpoint.
192
193
Parameters:
194
- inference_id: Unique identifier for the inference endpoint
195
- task_type: Type of task (text_embedding, completion, rerank, sparse_embedding)
196
- inference_config: Service-specific configuration
197
198
Returns:
199
ObjectApiResponse confirming endpoint creation
200
"""
201
202
def get(
203
self,
204
*,
205
inference_id: Optional[str] = None,
206
**kwargs
207
) -> ObjectApiResponse[Any]:
208
"""
209
Get inference endpoint configuration(s).
210
211
Parameters:
212
- inference_id: Specific endpoint ID (omit for all endpoints)
213
214
Returns:
215
ObjectApiResponse with endpoint configuration(s)
216
"""
217
218
def delete(
219
self,
220
*,
221
inference_id: str,
222
**kwargs
223
) -> ObjectApiResponse[Any]:
224
"""
225
Delete an inference endpoint.
226
227
Parameters:
228
- inference_id: The inference endpoint ID to delete
229
230
Returns:
231
ObjectApiResponse confirming deletion
232
"""
233
234
def update(
235
self,
236
*,
237
inference_id: str,
238
inference_config: Optional[Dict[str, Any]] = None,
239
**kwargs
240
) -> ObjectApiResponse[Any]:
241
"""
242
Update an existing inference endpoint configuration.
243
244
Parameters:
245
- inference_id: The inference endpoint ID to update
246
- inference_config: Updated configuration
247
248
Returns:
249
ObjectApiResponse confirming update
250
"""
251
```
252
253
### AI Service Provider Support
254
255
In addition to the generic `put` method, the Inference API provides provider-specific `put_*` methods that configure inference endpoints for popular AI services.
256
257
```python { .api }
258
def put_openai(
259
self,
260
*,
261
inference_id: str,
262
task_type: str,
263
api_key: Optional[str] = None,
264
model_id: Optional[str] = None,
265
organization_id: Optional[str] = None,
266
url: Optional[str] = None,
267
**kwargs
268
) -> ObjectApiResponse[Any]:
269
"""Configure OpenAI inference endpoint."""
270
271
def put_azureopenai(
272
self,
273
*,
274
inference_id: str,
275
task_type: str,
276
api_key: Optional[str] = None,
277
api_version: Optional[str] = None,
278
deployment_id: Optional[str] = None,
279
resource_name: Optional[str] = None,
280
**kwargs
281
) -> ObjectApiResponse[Any]:
282
"""Configure Azure OpenAI inference endpoint."""
283
284
def put_hugging_face(
285
self,
286
*,
287
inference_id: str,
288
task_type: str,
289
api_key: Optional[str] = None,
290
model_id: Optional[str] = None,
291
url: Optional[str] = None,
292
**kwargs
293
) -> ObjectApiResponse[Any]:
294
"""Configure Hugging Face inference endpoint."""
295
296
def put_cohere(
297
self,
298
*,
299
inference_id: str,
300
task_type: str,
301
api_key: Optional[str] = None,
302
model_id: Optional[str] = None,
303
**kwargs
304
) -> ObjectApiResponse[Any]:
305
"""Configure Cohere inference endpoint."""
306
307
def put_anthropic(
308
self,
309
*,
310
inference_id: str,
311
task_type: str,
312
api_key: Optional[str] = None,
313
model_id: Optional[str] = None,
314
**kwargs
315
) -> ObjectApiResponse[Any]:
316
"""Configure Anthropic inference endpoint."""
317
318
def put_amazonbedrock(
319
self,
320
*,
321
inference_id: str,
322
task_type: str,
323
access_key_id: Optional[str] = None,
324
secret_access_key: Optional[str] = None,
325
region: Optional[str] = None,
326
model_id: Optional[str] = None,
327
**kwargs
328
) -> ObjectApiResponse[Any]:
329
"""Configure Amazon Bedrock inference endpoint."""
330
331
def put_googlevertexai(
332
self,
333
*,
334
inference_id: str,
335
task_type: str,
336
service_account_json: Optional[str] = None,
337
project_id: Optional[str] = None,
338
location: Optional[str] = None,
339
model_id: Optional[str] = None,
340
**kwargs
341
) -> ObjectApiResponse[Any]:
342
"""Configure Google Vertex AI inference endpoint."""
343
344
def put_googleaistudio(
345
self,
346
*,
347
inference_id: str,
348
task_type: str,
349
api_key: Optional[str] = None,
350
model_id: Optional[str] = None,
351
**kwargs
352
) -> ObjectApiResponse[Any]:
353
"""Configure Google AI Studio inference endpoint."""
354
355
def put_elasticsearch(
356
self,
357
*,
358
inference_id: str,
359
task_type: str,
360
model_id: str,
361
num_allocations: Optional[int] = None,
362
num_threads: Optional[int] = None,
363
**kwargs
364
) -> ObjectApiResponse[Any]:
365
"""Configure Elasticsearch built-in model inference endpoint."""
366
367
def put_elser(
368
self,
369
*,
370
inference_id: str,
371
num_allocations: Optional[int] = None,
372
num_threads: Optional[int] = None,
373
**kwargs
374
) -> ObjectApiResponse[Any]:
375
"""Configure an Elastic Learned Sparse EncodeR (ELSER) endpoint."""
376
```
377
378
#### Service Configuration Examples
379
380
```python
381
# OpenAI embeddings
382
client.inference.put_openai(
383
inference_id="openai-embeddings",
384
task_type="text_embedding",
385
api_key="sk-...",
386
model_id="text-embedding-ada-002"
387
)
388
389
# Azure OpenAI completions
390
client.inference.put_azureopenai(
391
inference_id="azure-gpt4",
392
task_type="completion",
393
api_key="...",
394
api_version="2024-02-01",
395
resource_name="my-resource",
396
deployment_id="gpt-4-deployment"
397
)
398
399
# Hugging Face sentence transformers
400
client.inference.put_hugging_face(
401
inference_id="sentence-transformers",
402
task_type="text_embedding",
403
api_key="hf_...",
404
model_id="sentence-transformers/all-MiniLM-L6-v2"
405
)
406
407
# Cohere reranking
408
client.inference.put_cohere(
409
inference_id="cohere-rerank",
410
task_type="rerank",
411
api_key="...",
412
model_id="rerank-english-v2.0"
413
)
414
415
# Elasticsearch ELSER for sparse embeddings
416
client.inference.put_elser(
417
inference_id="elser-sparse",
418
num_allocations=1,
419
num_threads=2
420
)
421
422
# Amazon Bedrock
423
client.inference.put_amazonbedrock(
424
inference_id="bedrock-titan",
425
task_type="text_embedding",
426
access_key_id="AKIA...",
427
secret_access_key="...",
428
region="us-east-1",
429
model_id="amazon.titan-embed-text-v1"
430
)
431
432
# Google Vertex AI
433
client.inference.put_googlevertexai(
434
inference_id="vertex-palm",
435
task_type="completion",
436
service_account_json='{"type": "service_account", ...}',
437
project_id="my-project",
438
location="us-central1",
439
model_id="text-bison@001"
440
)
441
```
442
443
### Additional Provider Support
444
445
The following methods configure inference endpoints for additional AI service providers:
446
447
```python { .api }
448
def put_mistral(self, *, inference_id: str, task_type: str, api_key: str, model_id: str, **kwargs) -> ObjectApiResponse[Any]:
449
"""Configure Mistral AI inference endpoint."""
450
451
def put_voyageai(self, *, inference_id: str, task_type: str, api_key: str, model_id: str, **kwargs) -> ObjectApiResponse[Any]:
452
"""Configure VoyageAI inference endpoint."""
453
454
def put_jinaai(self, *, inference_id: str, task_type: str, api_key: str, model_id: str, **kwargs) -> ObjectApiResponse[Any]:
455
"""Configure Jina AI inference endpoint."""
456
457
def put_deepseek(self, *, inference_id: str, task_type: str, api_key: str, model_id: str, **kwargs) -> ObjectApiResponse[Any]:
458
"""Configure DeepSeek inference endpoint."""
459
460
def put_watsonx(self, *, inference_id: str, task_type: str, api_key: str, project_id: str, model_id: str, **kwargs) -> ObjectApiResponse[Any]:
461
"""Configure IBM watsonx inference endpoint."""
462
463
def put_azureaistudio(self, *, inference_id: str, task_type: str, api_key: str, target: str, **kwargs) -> ObjectApiResponse[Any]:
464
"""Configure Azure AI Studio inference endpoint."""
465
466
def put_alibabacloud(self, *, inference_id: str, task_type: str, api_key: str, model_id: str, **kwargs) -> ObjectApiResponse[Any]:
467
"""Configure Alibaba Cloud inference endpoint."""
468
469
def put_amazonsagemaker(self, *, inference_id: str, task_type: str, access_key_id: str, secret_access_key: str, region: str, endpoint_name: str, **kwargs) -> ObjectApiResponse[Any]:
470
"""Configure Amazon SageMaker inference endpoint."""
471
472
def put_custom(self, *, inference_id: str, task_type: str, url: str, **kwargs) -> ObjectApiResponse[Any]:
473
"""Configure custom inference endpoint."""
474
```
475
476
## Common Use Cases
477
478
### Semantic Search with Embeddings
479
480
```python
481
# 1. Configure embedding service
482
client.inference.put_openai(
483
inference_id="embeddings",
484
task_type="text_embedding",
485
api_key="sk-...",
486
model_id="text-embedding-ada-002"
487
)
488
489
# 2. Create index with dense vector field
490
client.indices.create(
491
index="documents",
492
mappings={
493
"properties": {
494
"content": {"type": "text"},
495
"embedding": {
496
"type": "dense_vector",
497
"dims": 1536,  # must equal the embedding model's output dimension (1536 for text-embedding-ada-002)
498
"index": True,
499
"similarity": "cosine"
500
}
501
}
502
}
503
)
504
505
# 3. Index documents with embeddings
506
doc = "Machine learning transforms data into insights"
507
embedding = client.inference.text_embedding(
508
inference_id="embeddings",
509
input=doc
510
)
511
512
client.index(
513
index="documents",
514
document={
515
"content": doc,
516
"embedding": embedding.body['embeddings'][0]['embedding']
517
}
518
)
519
520
# 4. Search with semantic similarity
521
query_embedding = client.inference.text_embedding(
522
inference_id="embeddings",
523
input="AI and data analysis"
524
)
525
526
results = client.search(
527
index="documents",
528
knn={
529
"field": "embedding",
530
"query_vector": query_embedding.body['embeddings'][0]['embedding'],
531
"k": 10,
532
"num_candidates": 100
533
}
534
)
535
```
536
537
### RAG (Retrieval-Augmented Generation)
538
539
```python
540
# 1. Retrieve relevant documents
541
query = "What is quantum computing?"
542
query_embedding = client.inference.text_embedding(
543
inference_id="embeddings",
544
input=query
545
)
546
547
search_results = client.search(
548
index="knowledge_base",
549
knn={
550
"field": "embedding",
551
"query_vector": query_embedding.body['embeddings'][0]['embedding'],
552
"k": 5
553
}
554
)
555
556
# 2. Rerank results for better relevance
557
documents = [hit['_source']['content'] for hit in search_results.body['hits']['hits']]
558
reranked = client.inference.rerank(
559
inference_id="cohere-rerank",
560
query=query,
561
input=documents
562
)
563
564
# 3. Generate response with context
565
top_docs = [documents[idx] for idx in reranked.body['reranked'][:3]]
566
context = "\n\n".join(top_docs)
567
prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
568
569
response = client.inference.completion(
570
inference_id="gpt-4",
571
input=prompt,
572
task_settings={"max_tokens": 200, "temperature": 0.3}
573
)
574
575
answer = response.body['completion']
576
```
577
578
## Types
579
580
```python { .api }
581
from typing import Any, Dict, List, Literal, Optional, Union
582
583
# Task types
584
TaskType = Literal["text_embedding", "sparse_embedding", "completion", "rerank"]
585
586
# Service configurations
587
class InferenceConfig:
588
service: str # Service provider name
589
service_settings: Dict[str, Any] # Provider-specific settings
590
task_settings: Dict[str, Any] # Task-specific settings
591
592
# Response types
593
class EmbeddingResponse:
594
embeddings: List[Dict[str, Any]] # Embedding vectors with metadata
595
596
class CompletionResponse:
597
completion: str # Generated text
598
usage: Optional[Dict[str, int]] # Token usage statistics
599
600
class RerankResponse:
601
reranked: List[int] # Reordered document indices
602
scores: List[float] # Relevance scores
603
604
class SparseEmbeddingResponse:
605
embeddings: List[Dict[str, Dict[str, float]]] # Sparse vector representations
606
```