0
# LLMs & Embeddings
1
2
Pluggable interfaces for language models and embedding systems, supporting both synchronous and asynchronous operations with extensive customization options. These interfaces enable integration with various LLM providers and embedding models while maintaining consistent APIs.
3
4
## Capabilities
5
6
### Base LLM Interface
7
8
Foundation interface for all language model implementations, providing standardized completion and chat methods.
9
10
```python { .api }
11
class LLM:
12
"""
13
Base language model interface with completion and chat capabilities.
14
15
Parameters:
16
- model_name: str, name identifier for the model
17
- context_window: int, maximum context window size in tokens
18
- max_new_tokens: Optional[int], maximum new tokens to generate
19
- system_prompt: Optional[str], default system prompt
20
- messages_to_prompt: Optional[Callable], function to convert messages to prompt
21
- completion_to_prompt: Optional[Callable], function to convert completion to prompt
22
- pydantic_program_mode: PydanticProgramMode, mode for Pydantic program execution
23
- output_parser: Optional[BaseOutputParser], parser for model output
24
"""
25
def __init__(
26
self,
27
model_name: str = "unknown",
28
context_window: int = 4096,
29
max_new_tokens: Optional[int] = None,
30
system_prompt: Optional[str] = None,
31
messages_to_prompt: Optional[Callable] = None,
32
completion_to_prompt: Optional[Callable] = None,
33
pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
34
output_parser: Optional[BaseOutputParser] = None,
35
**kwargs
36
): ...
37
38
def complete(
39
self,
40
prompt: str,
41
formatted: bool = False,
42
**kwargs
43
) -> CompletionResponse:
44
"""
45
Complete a text prompt.
46
47
Parameters:
48
- prompt: str, the text prompt to complete
49
- formatted: bool, whether prompt is already formatted
50
51
Returns:
52
- CompletionResponse, completion result with text and metadata
53
"""
54
55
def stream_complete(
56
self,
57
prompt: str,
58
formatted: bool = False,
59
**kwargs
60
) -> CompletionResponseGen:
61
"""
62
Stream completion results for a text prompt.
63
64
Parameters:
65
- prompt: str, the text prompt to complete
66
- formatted: bool, whether prompt is already formatted
67
68
Returns:
69
- CompletionResponseGen, streaming completion generator
70
"""
71
72
def chat(
73
self,
74
messages: Sequence[ChatMessage],
75
**kwargs
76
) -> ChatResponse:
77
"""
78
Generate chat response from message history.
79
80
Parameters:
81
- messages: Sequence[ChatMessage], conversation history
82
83
Returns:
84
- ChatResponse, chat response with message and metadata
85
"""
86
87
def stream_chat(
88
self,
89
messages: Sequence[ChatMessage],
90
**kwargs
91
) -> ChatResponseGen:
92
"""
93
Stream chat response from message history.
94
95
Parameters:
96
- messages: Sequence[ChatMessage], conversation history
97
98
Returns:
99
- ChatResponseGen, streaming chat response generator
100
"""
101
102
async def acomplete(
103
self,
104
prompt: str,
105
formatted: bool = False,
106
**kwargs
107
) -> CompletionResponse:
108
"""Async version of complete method."""
109
110
async def astream_complete(
111
self,
112
prompt: str,
113
formatted: bool = False,
114
**kwargs
115
) -> CompletionResponseAsyncGen:
116
"""Async version of stream_complete method."""
117
118
async def achat(
119
self,
120
messages: Sequence[ChatMessage],
121
**kwargs
122
) -> ChatResponse:
123
"""Async version of chat method."""
124
125
async def astream_chat(
126
self,
127
messages: Sequence[ChatMessage],
128
**kwargs
129
) -> ChatResponseAsyncGen:
130
"""Async version of stream_chat method."""
131
132
@property
133
def metadata(self) -> LLMMetadata:
134
"""Get LLM metadata including context window and token limits."""
135
136
def get_num_tokens(self, text: str) -> int:
137
"""Get token count for text."""
138
139
def get_num_tokens_from_messages(self, messages: Sequence[ChatMessage]) -> int:
140
"""Get token count for message sequence."""
141
```
142
143
### Custom LLM Implementation
144
145
Base class for implementing custom language models with standardized interfaces.
146
147
```python { .api }
148
class CustomLLM(LLM):
149
"""
150
Base class for custom LLM implementations.
151
152
Subclasses must implement _complete and _stream_complete; the chat hooks are optional:
153
- _complete: Core completion logic
154
- _stream_complete: Core streaming completion logic
155
- _chat: Core chat logic (optional, defaults to completion-based)
156
- _stream_chat: Core streaming chat logic (optional)
157
"""
158
159
def _complete(self, prompt: str, **kwargs) -> CompletionResponse:
160
"""Core completion implementation to be overridden."""
161
162
def _stream_complete(self, prompt: str, **kwargs) -> CompletionResponseGen:
163
"""Core streaming completion implementation to be overridden."""
164
165
def _chat(self, messages: Sequence[ChatMessage], **kwargs) -> ChatResponse:
166
"""Core chat implementation, defaults to completion-based."""
167
168
def _stream_chat(self, messages: Sequence[ChatMessage], **kwargs) -> ChatResponseGen:
169
"""Core streaming chat implementation, defaults to completion-based."""
170
```
171
172
### Mock LLM Implementation
173
174
Testing and development LLM that returns predictable responses without external API calls.
175
176
```python { .api }
177
class MockLLM(CustomLLM):
178
"""
179
Mock LLM for testing and development purposes.
180
181
Parameters:
182
- max_tokens: Optional[int], maximum tokens to return
183
- system_prompt: Optional[str], default system prompt
184
"""
185
def __init__(
186
self,
187
max_tokens: Optional[int] = None,
188
system_prompt: Optional[str] = None,
189
**kwargs
190
): ...
191
```
192
193
### LLM Response Types
194
195
Response structures for various LLM operations with rich metadata and content support.
196
197
```python { .api }
198
class CompletionResponse:
199
"""
200
Response from text completion operations.
201
202
Parameters:
203
- text: str, the completed text
204
- additional_kwargs: Optional[dict], additional response metadata
205
- raw: Optional[dict], raw response from the LLM provider
206
"""
207
def __init__(
208
self,
209
text: str,
210
additional_kwargs: Optional[dict] = None,
211
raw: Optional[dict] = None,
212
**kwargs
213
): ...
214
215
@property
216
def delta(self) -> Optional[str]:
217
"""Get response delta for streaming operations."""
218
219
class ChatResponse:
220
"""
221
Response from chat operations.
222
223
Parameters:
224
- message: ChatMessage, the response message
225
- raw: Optional[dict], raw response from the LLM provider
226
- additional_kwargs: Optional[dict], additional response metadata
227
"""
228
def __init__(
229
self,
230
message: ChatMessage,
231
raw: Optional[dict] = None,
232
additional_kwargs: Optional[dict] = None,
233
**kwargs
234
): ...
235
236
@property
237
def delta(self) -> Optional[str]:
238
"""Get response delta for streaming operations."""
239
240
# Type aliases for streaming responses
241
CompletionResponseGen = Generator[CompletionResponse, None, None]
242
CompletionResponseAsyncGen = AsyncGenerator[CompletionResponse, None]
243
ChatResponseGen = Generator[ChatResponse, None, None]
244
ChatResponseAsyncGen = AsyncGenerator[ChatResponse, None]
245
```
246
247
### Chat Messages & Roles
248
249
Structured message types for chat-based interactions with role-based organization.
250
251
```python { .api }
252
class ChatMessage:
253
"""
254
Individual message in a chat conversation.
255
256
Parameters:
257
- role: MessageRole, role of the message sender
258
- content: Union[str, List[ContentBlock]], message content
259
- additional_kwargs: Optional[dict], additional message metadata
260
- tool_calls: Optional[List[ToolCall]], tool calls in the message
261
- tool_call_id: Optional[str], identifier for tool call responses
262
"""
263
def __init__(
264
self,
265
role: MessageRole,
266
content: Union[str, List[ContentBlock]] = "",
267
additional_kwargs: Optional[dict] = None,
268
tool_calls: Optional[List[ToolCall]] = None,
269
tool_call_id: Optional[str] = None,
270
**kwargs
271
): ...
272
273
@classmethod
274
def from_str(
275
cls,
276
content: str,
277
role: Union[MessageRole, str] = MessageRole.USER,
278
**kwargs
279
) -> "ChatMessage":
280
"""Create ChatMessage from string content."""
281
282
class MessageRole(str, Enum):
283
"""Roles for chat message participants."""
284
SYSTEM = "system" # System instructions and context
285
USER = "user" # User input messages
286
ASSISTANT = "assistant" # Assistant/model responses
287
FUNCTION = "function" # Function call results (deprecated)
288
TOOL = "tool" # Tool execution results
289
```
290
291
### Content Block Types
292
293
Rich content support for multi-modal messages including text, images, and documents.
294
295
```python { .api }
296
class TextBlock:
297
"""
298
Text content block for messages.
299
300
Parameters:
301
- text: str, the text content
302
"""
303
def __init__(self, text: str): ...
304
305
class ImageBlock:
306
"""
307
Image content block for messages.
308
309
Parameters:
310
- image: str, base64 encoded image data
311
- image_url: Optional[str], URL to image resource
312
- image_mimetype: Optional[str], MIME type of the image
313
"""
314
def __init__(
315
self,
316
image: str,
317
image_url: Optional[str] = None,
318
image_mimetype: Optional[str] = None
319
): ...
320
321
class AudioBlock:
322
"""
323
Audio content block for messages.
324
325
Parameters:
326
- audio: str, base64 encoded audio data
327
- audio_url: Optional[str], URL to audio resource
328
- audio_mimetype: Optional[str], MIME type of the audio
329
"""
330
def __init__(
331
self,
332
audio: str,
333
audio_url: Optional[str] = None,
334
audio_mimetype: Optional[str] = None
335
): ...
336
337
class DocumentBlock:
338
"""
339
Document content block for messages.
340
341
Parameters:
342
- document: str, base64 encoded document data
343
- document_url: Optional[str], URL to document resource
344
- document_mimetype: Optional[str], MIME type of the document
345
"""
346
def __init__(
347
self,
348
document: str,
349
document_url: Optional[str] = None,
350
document_mimetype: Optional[str] = None
351
): ...
352
```
353
354
### LLM Metadata & Configuration
355
356
Metadata structures for describing LLM capabilities and constraints.
357
358
```python { .api }
359
class LLMMetadata:
360
"""
361
Metadata describing LLM capabilities and limitations.
362
363
Parameters:
364
- context_window: int, maximum context window size in tokens
365
- num_output: int, maximum output tokens per request
366
- is_chat_model: bool, whether model supports chat interface
367
- is_function_calling_model: bool, whether model supports function calling
368
- model_name: str, name identifier for the model
369
- system_role: MessageRole, role used for system messages
370
"""
371
def __init__(
372
self,
373
context_window: int = 4096,
374
num_output: int = 256,
375
is_chat_model: bool = False,
376
is_function_calling_model: bool = False,
377
model_name: str = "unknown",
378
system_role: MessageRole = MessageRole.SYSTEM,
379
**kwargs
380
): ...
381
```
382
383
### Cache Control & Optimization
384
385
Advanced caching mechanisms for optimizing LLM performance and reducing costs.
386
387
```python { .api }
388
class CacheControl:
389
"""
390
Cache control settings for LLM optimization.
391
392
Parameters:
393
- type: str, cache control type (ephemeral, session, etc.)
394
"""
395
def __init__(self, type: str): ...
396
397
class CachePoint:
398
"""
399
Cache point configuration for specific content blocks.
400
401
Parameters:
402
- type: str, cache point type
403
"""
404
def __init__(self, type: str): ...
405
```
406
407
### Base Embedding Interface
408
409
Foundation interface for all embedding model implementations with text and batch processing support.
410
411
```python { .api }
412
class BaseEmbedding:
413
"""
414
Base interface for embedding models.
415
416
Parameters:
417
- model_name: str, name identifier for the embedding model
418
- embed_batch_size: int, batch size for embedding operations
419
- callback_manager: Optional[CallbackManager], callback management system
420
- num_workers: Optional[int], number of worker threads for parallel processing
421
"""
422
def __init__(
423
self,
424
model_name: str = "unknown",
425
embed_batch_size: int = 10,
426
callback_manager: Optional[CallbackManager] = None,
427
num_workers: Optional[int] = None,
428
**kwargs
429
): ...
430
431
def get_text_embedding(self, text: str) -> List[float]:
432
"""
433
Get embedding for single text string.
434
435
Parameters:
436
- text: str, input text to embed
437
438
Returns:
439
- List[float], embedding vector
440
"""
441
442
def get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
443
"""
444
Get embeddings for multiple text strings.
445
446
Parameters:
447
- texts: List[str], list of input texts to embed
448
449
Returns:
450
- List[List[float]], list of embedding vectors
451
"""
452
453
async def aget_text_embedding(self, text: str) -> List[float]:
454
"""Async version of get_text_embedding."""
455
456
async def aget_text_embeddings(self, texts: List[str]) -> List[List[float]]:
457
"""Async version of get_text_embeddings."""
458
459
def get_query_embedding(self, query: str) -> List[float]:
460
"""
461
Get embedding for query text (may differ from document embedding).
462
463
Parameters:
464
- query: str, query text to embed
465
466
Returns:
467
- List[float], query embedding vector
468
"""
469
470
async def aget_query_embedding(self, query: str) -> List[float]:
471
"""Async version of get_query_embedding."""
472
473
def similarity(
474
self,
475
embedding1: List[float],
476
embedding2: List[float]
477
) -> float:
478
"""
479
Compute similarity between two embeddings.
480
481
Parameters:
482
- embedding1: List[float], first embedding vector
483
- embedding2: List[float], second embedding vector
484
485
Returns:
486
- float, similarity score
487
"""
488
```
489
490
### Mock Embedding Implementation
491
492
Testing and development embedding model that generates consistent vectors without external API calls.
493
494
```python { .api }
495
class MockEmbedding(BaseEmbedding):
496
"""
497
Mock embedding model for testing and development.
498
499
Parameters:
500
- embed_dim: int, dimensionality of embedding vectors
501
- deterministic: bool, whether to generate deterministic embeddings
502
"""
503
def __init__(
504
self,
505
embed_dim: int = 1536,
506
deterministic: bool = True,
507
**kwargs
508
): ...
509
```
510
511
### Multi-Modal Embedding Support
512
513
Extended embedding interface for handling multiple content modalities.
514
515
```python { .api }
516
class MultiModalEmbedding(BaseEmbedding):
517
"""
518
Multi-modal embedding interface supporting text, images, and other content types.
519
520
Parameters:
521
- model_name: str, name identifier for the multi-modal embedding model
522
- embed_batch_size: int, batch size for embedding operations
523
"""
524
def __init__(
525
self,
526
model_name: str = "unknown",
527
embed_batch_size: int = 10,
528
**kwargs
529
): ...
530
531
def get_image_embedding(self, img_file_path: str) -> List[float]:
532
"""
533
Get embedding for image file.
534
535
Parameters:
536
- img_file_path: str, path to image file
537
538
Returns:
539
- List[float], image embedding vector
540
"""
541
542
async def aget_image_embedding(self, img_file_path: str) -> List[float]:
543
"""Async version of get_image_embedding."""
544
```
545
546
### Embedding Utilities
547
548
Utility functions and classes for embedding model management and operations.
549
550
```python { .api }
551
class Pooling:
552
"""
553
Embedding pooling operations for combining token embeddings.
554
555
Parameters:
556
- pooling_type: str, type of pooling (mean, max, cls)
557
"""
558
def __init__(self, pooling_type: str = "mean"): ...
559
560
def pool(self, embeddings: List[List[float]]) -> List[float]:
561
"""
562
Pool multiple embeddings into single vector.
563
564
Parameters:
565
- embeddings: List[List[float]], embeddings to pool
566
567
Returns:
568
- List[float], pooled embedding vector
569
"""
570
571
def resolve_embed_model(embed_model: Union[str, BaseEmbedding]) -> BaseEmbedding:
572
"""
573
Resolve embedding model from string name or return existing instance.
574
575
Parameters:
576
- embed_model: Union[str, BaseEmbedding], model name or instance
577
578
Returns:
579
- BaseEmbedding, resolved embedding model instance
580
"""
581
```
582
583
### Multi-Modal LLM Interface
584
585
Language models with vision and multi-modal capabilities for processing images alongside text.
586
587
```python { .api }
588
class MultiModalLLM:
589
"""
590
Multi-modal language model interface for vision and text processing.
591
592
Parameters:
593
- model_name: str, name identifier for the model
594
- max_new_tokens: int, maximum new tokens to generate
595
- context_window: int, maximum context window size
596
"""
597
def __init__(
598
self,
599
model_name: str = "unknown",
600
max_new_tokens: int = 300,
601
context_window: int = 4096,
602
**kwargs
603
): ...
604
605
def complete(
606
self,
607
prompt: str,
608
image_documents: Sequence[ImageDocument],
609
**kwargs
610
) -> CompletionResponse:
611
"""
612
Complete prompt with image context.
613
614
Parameters:
615
- prompt: str, text prompt
616
- image_documents: Sequence[ImageDocument], images for context
617
618
Returns:
619
- CompletionResponse, completion with image understanding
620
"""
621
622
def stream_complete(
623
self,
624
prompt: str,
625
image_documents: Sequence[ImageDocument],
626
**kwargs
627
) -> CompletionResponseGen:
628
"""Stream completion with image context."""
629
630
async def acomplete(
631
self,
632
prompt: str,
633
image_documents: Sequence[ImageDocument],
634
**kwargs
635
) -> CompletionResponse:
636
"""Async completion with image context."""
637
638
@property
639
def metadata(self) -> MultiModalLLMMetadata:
640
"""Get multi-modal LLM metadata."""
641
642
class MultiModalLLMMetadata:
643
"""
644
Metadata for multi-modal LLM capabilities.
645
646
Parameters:
647
- num_output: int, maximum output tokens
648
- model_name: str, model identifier
649
"""
650
def __init__(
651
self,
652
num_output: int = 300,
653
model_name: str = "unknown"
654
): ...
655
```
656
657
## Usage Examples
658
659
### Basic LLM Usage
660
661
```python
662
from llama_index.core.llms import MockLLM
663
from llama_index.core.llms import ChatMessage, MessageRole
664
665
# Initialize mock LLM
666
llm = MockLLM(max_tokens=256)
667
668
# Text completion
669
response = llm.complete("Explain machine learning in simple terms:")
670
print(response.text)
671
672
# Chat conversation
673
messages = [
674
ChatMessage(role=MessageRole.SYSTEM, content="You are a helpful assistant."),
675
ChatMessage(role=MessageRole.USER, content="What is deep learning?")
676
]
677
678
chat_response = llm.chat(messages)
679
print(chat_response.message.content)
680
```
681
682
### Streaming Responses
683
684
```python
685
# Streaming completion
686
stream = llm.stream_complete("Write a short story about AI:")
687
for response in stream:
688
print(response.delta, end="", flush=True)
689
690
# Streaming chat
691
stream = llm.stream_chat(messages)
692
for response in stream:
693
print(response.delta, end="", flush=True)
694
```
695
696
### Basic Embedding Usage
697
698
```python
699
from llama_index.core.embeddings import MockEmbedding
700
701
# Initialize mock embedding
702
embed_model = MockEmbedding(embed_dim=384)
703
704
# Single text embedding
705
text = "Machine learning is a subset of artificial intelligence."
706
embedding = embed_model.get_text_embedding(text)
707
print(f"Embedding dimension: {len(embedding)}")
708
709
# Batch embeddings
710
texts = [
711
"Natural language processing helps computers understand text.",
712
"Computer vision enables machines to interpret images.",
713
"Reinforcement learning trains agents through rewards."
714
]
715
716
embeddings = embed_model.get_text_embeddings(texts)
717
print(f"Generated {len(embeddings)} embeddings")
718
719
# Query embedding (may differ from document embeddings)
720
query_embedding = embed_model.get_query_embedding("What is AI?")
721
722
# Compute similarity
723
similarity = embed_model.similarity(embedding, query_embedding)
724
print(f"Similarity: {similarity:.3f}")
725
```
726
727
### Custom LLM Implementation
728
729
```python
730
from llama_index.core.llms import CustomLLM
731
from llama_index.core.llms import CompletionResponse, LLMMetadata
732
733
class MyCustomLLM(CustomLLM):
734
"""Example custom LLM implementation."""
735
736
def __init__(self, model_path: str, **kwargs):
737
self.model_path = model_path
738
super().__init__(**kwargs)
739
740
@property
741
def metadata(self) -> LLMMetadata:
742
return LLMMetadata(
743
context_window=4096,
744
num_output=512,
745
model_name="my_custom_model"
746
)
747
748
def _complete(self, prompt: str, **kwargs) -> CompletionResponse:
749
# Custom completion logic here
750
generated_text = f"Generated response for: {prompt}"
751
return CompletionResponse(text=generated_text)
752
753
def _stream_complete(self, prompt: str, **kwargs):
754
# Custom streaming logic here
755
response = self._complete(prompt, **kwargs)
756
yield response
757
758
# Use custom LLM
759
custom_llm = MyCustomLLM(model_path="/path/to/model")
760
response = custom_llm.complete("Hello, world!")
761
```
762
763
### Multi-Modal Content
764
765
```python
766
from llama_index.core.llms import ChatMessage, ImageBlock, MessageRole, TextBlock
767
768
# Create message with image and text
769
message = ChatMessage(
770
role=MessageRole.USER,
771
content=[
772
TextBlock(text="What do you see in this image?"),
773
ImageBlock(image="base64_encoded_image_data")
774
]
775
)
776
777
# Use in chat (with compatible multi-modal LLM)
778
# response = multimodal_llm.chat([message])
779
```
780
781
## Types & Configuration
782
783
```python { .api }
784
# Response type unions
785
Response = Union[str, ChatResponse, CompletionResponse]
786
RESPONSE_TYPE = Union[Response, StreamingResponse]
787
788
# Content block union
789
ContentBlock = Union[TextBlock, ImageBlock, AudioBlock, DocumentBlock]
790
791
# Pydantic program modes
792
class PydanticProgramMode(str, Enum):
793
DEFAULT = "default"
794
OPENAI = "openai"
795
LLM = "llm"
796
GUIDANCE = "guidance"
797
LM_FORMAT_ENFORCER = "lm-format-enforcer"
798
```