0
# Vector Indexing and Storage
1
2
Core indexing functionality for creating searchable representations of documents using vector embeddings in LlamaIndex.TS.
3
4
## Import
5
6
```typescript
7
import { VectorStoreIndex, Document } from "llamaindex";
8
// Or from specific submodules
9
import { VectorStoreIndex } from "llamaindex/indices";
10
import { SimpleVectorStore } from "llamaindex/vector-store";
11
```
12
13
## Overview
14
15
Vector indexing in LlamaIndex.TS converts documents into searchable vector representations using embedding models. The system supports multi-modal data (text, images, audio), various vector stores, and provides sophisticated similarity search capabilities including hybrid search, MMR (Maximum Marginal Relevance), and advanced filtering for retrieval augmented generation (RAG) applications.
16
17
## VectorStoreIndex
18
19
The primary class for creating and managing vector-based indices.
20
21
```typescript { .api }
22
class VectorStoreIndex extends BaseIndex<IndexDict> {
23
static async init(options: VectorIndexOptions): Promise<VectorStoreIndex>;
24
25
static async fromDocuments(
26
documents: Document[],
27
args?: VectorIndexOptions
28
): Promise<VectorStoreIndex>;
29
30
static async fromVectorStore(
31
vectorStore: BaseVectorStore,
32
args?: Omit<VectorIndexOptions, "storageContext">
33
): Promise<VectorStoreIndex>;
34
35
asQueryEngine(args?: VectorIndexQueryEngineOptions): RetrieverQueryEngine;
36
37
asRetriever(args?: VectorIndexRetrieverOptions): VectorIndexRetriever;
38
39
asChatEngine(args?: VectorIndexChatEngineOptions): ContextChatEngine;
40
41
async insertNodes(nodes: BaseNode[], options?: { logProgress?: boolean }): Promise<void>;
42
async insert(document: Document, args?: InsertArgs): Promise<void>;
43
async deleteRefDoc(refDocId: string, deleteFromDocStore?: boolean): Promise<void>;
44
async deleteNode(nodeId: string): Promise<void>;
45
async update(document: Document): Promise<void>;
46
async refresh(documents: Document[], args?: RefreshArgs): Promise<void>;
47
48
// Multi-modal vector store support
49
vectorStores: VectorStoreByType;
50
indexStore: BaseIndexStore;
51
embedModel?: BaseEmbedding;
52
}
53
54
interface VectorIndexOptions {
55
nodes?: BaseNode[];
56
storageContext?: StorageContext;
57
embedModel?: BaseEmbedding;
58
insertBatchSize?: number;
59
showProgress?: boolean;
60
storeNodesOverride?: boolean;
61
}
62
63
type VectorStoreByType = {
64
[ModalityType.TEXT]?: BaseVectorStore;
65
[ModalityType.IMAGE]?: BaseVectorStore;
66
[ModalityType.AUDIO]?: BaseVectorStore;
67
};
68
```
69
70
## Vector Stores
71
72
### BaseVectorStore Interface
73
74
Base interface that all vector stores implement with support for multi-modal data and advanced querying.
75
76
```typescript { .api }
77
abstract class BaseVectorStore<Client = unknown, T = unknown> {
78
embedModel: BaseEmbedding;
79
abstract storesText: boolean;
80
abstract flatMetadata: boolean;
81
82
abstract client(): Client;
83
abstract add(embeddingResults: BaseNode[]): Promise<string[]>;
84
abstract delete(refDocId: string, deleteOptions?: object): Promise<void>;
85
abstract query(query: VectorStoreQuery<T>, options?: object): Promise<VectorStoreQueryResult>;
86
87
// Optional methods for enhanced functionality
88
async persist?(persistPath?: string, fsMap?: FileSystem): Promise<void>;
89
static async fromParams?(params: any): Promise<BaseVectorStore>;
90
}
91
92
interface VectorStoreConfig {
93
textKey?: string;
94
docIdKey?: string;
95
embeddingKey?: string;
96
metadataKey?: string;
97
chunkSize?: number;
98
addDataToDocStore?: boolean;
99
}
100
```
101
102
### SimpleVectorStore
103
104
In-memory vector store implementation, good for development and small datasets.
105
106
```typescript { .api }
107
class SimpleVectorStore implements BaseVectorStore {
108
constructor(data?: SimpleVectorStoreData);
109
110
add(nodes: BaseNode[]): Promise<string[]>;
111
delete(refDocId: string): Promise<void>;
112
query(query: VectorStoreQuery): Promise<VectorStoreQueryResult>;
113
persist(persistPath?: string): Promise<void>;
114
115
static fromPersistDir(
116
persistDir?: string,
117
filename?: string
118
): Promise<SimpleVectorStore>;
119
120
getData(): SimpleVectorStoreData;
121
setData(data: SimpleVectorStoreData): void;
122
123
  storesText: boolean;
  isEmbeddingQuery: boolean;
  flatMetadata: boolean;
126
}
127
```
128
129
## Vector Store Query and Results
130
131
### VectorStoreQuery
132
133
Query object for vector store operations with support for multiple query modes and multi-modal retrieval.
134
135
```typescript { .api }
136
class VectorStoreQuery<T = unknown> {
137
constructor(args: {
138
queryEmbedding?: number[];
139
similarityTopK?: number;
140
docIds?: string[];
141
nodeIds?: string[];
142
queryStr?: string;
143
mode?: VectorStoreQueryMode;
144
alpha?: number;
145
beta?: number;
146
filters?: MetadataFilters;
147
mmrThreshold?: number;
148
sparseTopK?: number;
149
hybridTopK?: number;
150
modalityTopK?: Partial<Record<ModalityType, number>>;
151
outputFields?: string[];
152
embeddingField?: string;
153
});
154
155
queryEmbedding?: number[];
156
similarityTopK: number;
157
docIds?: string[];
158
nodeIds?: string[];
159
queryStr?: string;
160
mode: VectorStoreQueryMode;
161
alpha?: number; // Weight for dense retrieval in hybrid search
162
beta?: number; // Weight for sparse retrieval in hybrid search
163
filters?: MetadataFilters;
164
mmrThreshold?: number; // MMR diversity threshold
165
sparseTopK?: number; // Top-K for sparse retrieval
166
hybridTopK?: number; // Top-K for hybrid retrieval
167
modalityTopK?: Partial<Record<ModalityType, number>>; // Per-modality top-K
168
outputFields?: string[];
169
embeddingField?: string;
170
}
171
172
enum VectorStoreQueryMode {
173
DEFAULT = "default",
174
SPARSE = "sparse",
175
HYBRID = "hybrid",
176
SEMANTIC_HYBRID = "semantic_hybrid",
177
MMR = "mmr", // Maximum Marginal Relevance
178
SVM = "svm",
179
LOGISTIC_REGRESSION = "logistic_regression",
180
LINEAR_REGRESSION = "linear_regression",
181
}
182
183
enum ModalityType {
184
TEXT = "text",
185
IMAGE = "image",
186
AUDIO = "audio",
187
}
188
```
189
190
### VectorStoreQueryResult
191
192
Result object from vector store queries with enhanced result metadata.
193
194
```typescript { .api }
195
class VectorStoreQueryResult {
196
constructor(args: {
197
nodes?: BaseNode[];
198
similarities?: number[];
199
ids?: string[];
200
metadata?: Record<string, any>[];
201
});
202
203
nodes?: BaseNode[];
204
similarities?: number[];
205
ids?: string[];
206
metadata?: Record<string, any>[];
207
}
208
```
209
210
## Metadata Filtering System
211
212
LlamaIndex.TS supports sophisticated metadata filtering with multiple operators and conditional logic.
213
214
### MetadataFilters
215
216
```typescript { .api }
217
interface MetadataFilters {
218
filters: MetadataFilter[];
219
condition?: FilterCondition;
220
}
221
222
interface MetadataFilter {
223
key: string;
224
value: FilterValue;
225
operator: FilterOperator;
226
}
227
228
type FilterValue = string | number | boolean | string[] | number[];
229
230
enum FilterOperator {
231
EQ = "==",
232
NE = "!=",
233
GT = ">",
234
GTE = ">=",
235
LT = "<",
236
LTE = "<=",
237
IN = "in",
238
NIN = "nin",
239
TEXT_MATCH = "text_match",
240
CONTAINS = "contains",
241
IS_EMPTY = "is_empty",
242
ANY = "any",
243
ALL = "all",
244
}
245
246
enum FilterCondition {
247
AND = "and",
248
OR = "or",
249
}
250
```
251
252
### Advanced Filtering Examples
253
254
```typescript { .api }
255
// Complex conditional filtering
256
const complexFilters: MetadataFilters = {
257
filters: [
258
{ key: "category", value: ["tech", "finance"], operator: FilterOperator.IN },
259
{ key: "year", value: 2024, operator: FilterOperator.GTE },
260
{ key: "status", value: "published", operator: FilterOperator.EQ },
261
],
262
condition: FilterCondition.AND,
263
};
264
265
// Text matching and containment
266
const textFilters: MetadataFilters = {
267
filters: [
268
{ key: "title", value: "AI", operator: FilterOperator.TEXT_MATCH },
269
{ key: "tags", value: "machine-learning", operator: FilterOperator.CONTAINS },
270
],
271
condition: FilterCondition.OR,
272
};
273
```
274
275
## Basic Usage
276
277
### Creating a Vector Index
278
279
```typescript
280
import { VectorStoreIndex, Document } from "llamaindex";
281
282
// Create documents
283
const documents = [
284
new Document({ text: "LlamaIndex is a data framework for LLM applications." }),
285
new Document({ text: "It helps connect custom data sources to large language models." }),
286
new Document({ text: "Vector databases enable semantic search over document collections." }),
287
];
288
289
// Create vector index (uses default embedding model from Settings)
290
const index = await VectorStoreIndex.fromDocuments(documents);
291
292
console.log("Index created successfully");
293
```
294
295
### Querying the Index
296
297
```typescript
298
// Create query engine
299
const queryEngine = index.asQueryEngine();
300
301
// Query the index
302
const response = await queryEngine.query("What is LlamaIndex?");
303
console.log("Response:", response.toString());
304
305
// Access source nodes
306
console.log("Sources:", response.sourceNodes?.map(node => node.id_));
307
```
308
309
### Using as Retriever
310
311
```typescript
312
// Create retriever for more control
313
const retriever = index.asRetriever({
314
similarityTopK: 5, // Return top 5 most similar nodes
315
});
316
317
// Retrieve relevant nodes
318
const retrievedNodes = await retriever.retrieve("semantic search");
319
320
retrievedNodes.forEach((node, i) => {
321
console.log(`Node ${i}:`, node.text);
322
console.log(`Score:`, node.score);
323
});
324
```
325
326
## Advanced Usage
327
328
### Custom Vector Store
329
330
```typescript
331
import { SimpleVectorStore, StorageContext } from "llamaindex";
332
333
// Create custom storage context with specific vector store
334
const vectorStore = new SimpleVectorStore();
335
const storageContext = StorageContext.fromDefaults({ vectorStore });
336
337
// Create index with custom storage
338
const index = await VectorStoreIndex.fromDocuments(documents, {
339
storageContext,
340
});
341
```
342
343
### Persistent Vector Store
344
345
```typescript
346
import { SimpleVectorStore } from "llamaindex/vector-store";
347
348
// Save vector store to disk
349
await vectorStore.persist("./vector_store");
350
351
// Load vector store from disk
352
const loadedVectorStore = await SimpleVectorStore.fromPersistDir("./vector_store");
353
354
// Create index from loaded vector store (fromVectorStore returns a Promise)
const loadedIndex = await VectorStoreIndex.fromVectorStore(loadedVectorStore);
356
```
357
358
### Custom Embedding Model
359
360
```typescript
361
import { VectorStoreIndex, OpenAIEmbedding } from "llamaindex";
362
363
// Create index with specific embedding model
364
const customEmbedding = new OpenAIEmbedding({
365
model: "text-embedding-3-large",
366
dimensions: 1536,
367
});
368
369
const index = await VectorStoreIndex.fromDocuments(documents, {
  embedModel: customEmbedding,
});
372
```
373
374
### Filtering with Metadata
375
376
```typescript
377
// Add documents with metadata
378
const documentsWithMeta = [
379
new Document({
380
text: "Financial report for Q1 2024",
381
metadata: { category: "finance", year: 2024, quarter: 1 }
382
}),
383
new Document({
384
text: "Technical documentation for API v2",
385
metadata: { category: "tech", version: "v2" }
386
}),
387
new Document({
388
text: "Marketing strategy for 2024",
389
metadata: { category: "marketing", year: 2024 }
390
}),
391
];
392
393
const index = await VectorStoreIndex.fromDocuments(documentsWithMeta);
394
395
// Query with metadata filters
396
const queryEngine = index.asQueryEngine({
  preFilters: {
    filters: [
      { key: "category", value: "finance", operator: FilterOperator.EQ },
      { key: "year", value: 2024, operator: FilterOperator.EQ },
    ],
  },
});
404
405
const response = await queryEngine.query("What was the revenue?");
406
```
407
408
## Index Management
409
410
### Adding Documents
411
412
```typescript
413
// Insert single document
414
const newDoc = new Document({ text: "New information to add to the index." });
415
await index.insert(newDoc);
416
417
// Insert multiple documents
418
const moreDocs = [
419
new Document({ text: "Document 1" }),
420
new Document({ text: "Document 2" }),
421
];
422
423
for (const doc of moreDocs) {
424
await index.insert(doc);
425
}
426
```
427
428
### Updating Documents
429
430
```typescript
431
// Update existing document (requires same ID)
432
const updatedDoc = new Document({
433
text: "Updated content",
434
id_: "existing-doc-id"
435
});
436
437
await index.update(updatedDoc);
438
```
439
440
### Deleting Documents
441
442
```typescript
443
// Delete all nodes derived from a source document
await index.deleteRefDoc("doc-id-to-delete");

// Delete specific nodes (one nodeId per call)
await index.deleteNode("node-id-1");
await index.deleteNode("node-id-2");
448
```
449
450
### Refreshing Index
451
452
```typescript
453
// Refresh index with updated documents
454
const refreshedDocs = [
455
new Document({ text: "Updated content 1", id_: "doc-1" }),
456
new Document({ text: "Updated content 2", id_: "doc-2" }),
457
];
458
459
await index.refresh(refreshedDocs);
460
```
461
462
## Storage Context
463
464
### Custom Storage Configuration
465
466
```typescript
467
import { StorageContext, SimpleDocumentStore, SimpleIndexStore, SimpleVectorStore } from "llamaindex";
468
469
// Configure all storage components
470
const storageContext = StorageContext.fromDefaults({
471
docStore: new SimpleDocumentStore(),
472
indexStore: new SimpleIndexStore(),
473
vectorStore: new SimpleVectorStore(),
474
});
475
476
const index = await VectorStoreIndex.fromDocuments(documents, {
477
storageContext,
478
});
479
```
480
481
### Persistent Storage
482
483
```typescript
484
// Create index with persistent storage
485
const persistDir = "./storage";
486
const storageContext = StorageContext.fromDefaults({
487
persistDir,
488
});
489
490
const index = await VectorStoreIndex.fromDocuments(documents, {
491
storageContext,
492
});
493
494
// Persist to disk
495
await storageContext.persist(persistDir);
496
497
// Load from disk
498
const loadedStorageContext = StorageContext.fromDefaults({
499
persistDir,
500
});
501
```
502
503
## Performance Optimization
504
505
### Batch Processing
506
507
```typescript
508
// Process large document collections in batches
const batchSize = 100;
const largeDocs: Document[] = [/* large array of documents */];
let index: VectorStoreIndex | undefined;

for (let i = 0; i < largeDocs.length; i += batchSize) {
  const batch = largeDocs.slice(i, i + batchSize);

  if (!index) {
    // Create index with first batch
    index = await VectorStoreIndex.fromDocuments(batch);
  } else {
    // Add subsequent batches
    for (const doc of batch) {
      await index.insert(doc);
    }
  }

  console.log(`Processed ${Math.min(i + batchSize, largeDocs.length)} documents`);
}
527
```
528
529
### Memory Management
530
531
```typescript
532
import { Settings } from "llamaindex";
533
534
// Configure for memory efficiency
535
Settings.chunkSize = 512; // Smaller chunks
536
Settings.chunkOverlap = 10; // Less overlap
537
538
// Use streaming for large operations
539
const queryEngine = index.asQueryEngine();
const stream = await queryEngine.query({ query: "query", stream: true });
541
542
for await (const chunk of stream) {
543
console.log(chunk.response);
544
}
545
```
546
547
## Integration Examples
548
549
### With Custom Retriever
550
551
```typescript
552
import { BaseRetriever } from "llamaindex";
553
554
class CustomRetriever implements BaseRetriever {
555
async retrieve(query: string): Promise<NodeWithScore[]> {
556
// Custom retrieval logic
557
const vectorRetriever = index.asRetriever({ similarityTopK: 3 });
558
const nodes = await vectorRetriever.retrieve(query);
559
560
// Apply custom scoring or filtering
561
return nodes.filter(node => node.score! > 0.7);
562
}
563
}
564
565
const customRetriever = new CustomRetriever();
566
const queryEngine = index.asQueryEngine({ retriever: customRetriever });
567
```
568
569
### With Response Synthesis
570
571
```typescript
572
import { ResponseSynthesizer } from "llamaindex";
573
574
const responseSynthesizer = new ResponseSynthesizer({
575
responseMode: "tree_summarize", // Use hierarchical summarization
576
});
577
578
const queryEngine = index.asQueryEngine({
579
responseSynthesizer,
580
});
581
582
const response = await queryEngine.query("Complex multi-part question");
583
```
584
585
## Best Practices
586
587
### Index Configuration
588
589
```typescript
590
// Configure for your use case
591
const productionIndex = await VectorStoreIndex.fromDocuments(documents, {
592
storageContext: StorageContext.fromDefaults({
593
persistDir: "./production_storage", // Persistent storage for production
594
}),
595
showProgress: true, // Show progress for large datasets
596
});
597
598
// Configure retrieval parameters
599
const queryEngine = productionIndex.asQueryEngine({
600
retriever: productionIndex.asRetriever({
601
similarityTopK: 5, // Adjust based on your needs
602
}),
603
});
604
```
605
606
### Error Handling
607
608
```typescript
609
try {
610
const index = await VectorStoreIndex.fromDocuments(documents);
611
const response = await index.asQueryEngine().query("test query");
612
console.log(response.toString());
613
} catch (error) {
  console.error("Indexing failed:", error);
  // Narrow first: catch variables are `unknown` under strict TS
  if (error instanceof Error && error.message.includes("embedding")) {
    console.error("Check your embedding model configuration");
  }
}
620
```
621
622
### Monitoring and Debugging
623
624
```typescript
625
import { Settings } from "llamaindex";
626
627
// Enable debug mode for detailed logging
628
Settings.debug = true;
629
630
// Monitor index size and performance
631
console.log("Vector store data:", vectorStore.getData());
// embeddingDict is a plain record keyed by node ID, so count its keys
console.log("Number of nodes:", Object.keys(vectorStore.getData().embeddingDict).length);
633
```