tessl/pypi-feast

Python SDK for Feast - an open source feature store for machine learning that manages features for both training and serving environments.
Vector Store

The FeastVectorStore class provides vector store functionality for RAG (Retrieval-Augmented Generation) applications and semantic search using Feast's feature store infrastructure. It enables efficient vector similarity search and document retrieval for AI applications.

Capabilities

Vector Store Initialization

Initialize a vector store instance with a Feast repository and RAG-enabled feature view.

class FeastVectorStore:
    def __init__(self, repo_path: str, rag_view: FeatureView, features: List[str]):
        """
        Initialize the Feast vector store.
        
        Parameters:
        - repo_path: Path to the Feast repository
        - rag_view: Feature view configured for RAG operations
        - features: List of feature names to retrieve in queries
        """

Vector Similarity Search

Query the vector store using vector embeddings or text queries for semantic similarity search.

def query(
    self,
    query_vector: Optional[np.ndarray] = None,
    query_string: Optional[str] = None, 
    top_k: int = 10
) -> OnlineResponse:
    """
    Query the Feast vector store for similar documents.
    
    Parameters:
    - query_vector: Vector embedding for similarity search
    - query_string: Text query for semantic search
    - top_k: Number of most similar results to return
    
    Returns:
    OnlineResponse containing the retrieved documents and features
    
    Note: Either query_vector or query_string must be provided
    """

Vector Store Properties

Access the underlying Feast store and configuration.

@property
def store(self) -> FeatureStore:
    """Access the underlying FeatureStore instance."""

Usage Examples

Basic Vector Search Setup

import numpy as np
from datetime import timedelta

from feast import Entity, FeatureView, Field, FileSource, ValueType
from feast.types import Array, Float32, String
from feast.vector_store import FeastVectorStore

# Define a RAG-enabled feature view with vector fields
documents_source = FileSource(
    path="data/document_embeddings.parquet",
    timestamp_field="created_timestamp"
)

# Define the entity that serves as the join key for documents
document = Entity(name="document_id", join_keys=["document_id"])

# Create feature view for document embeddings
document_embeddings_fv = FeatureView(
    name="document_embeddings",
    entities=[document],
    ttl=timedelta(days=365),
    schema=[
        Field(name="title", dtype=String),
        Field(name="content", dtype=String),
        Field(name="embedding", dtype=Array(Float32), vector_index=True),  # Vector field, indexed for similarity search
        Field(name="category", dtype=String)
    ],
    source=documents_source
)

# Initialize vector store
vector_store = FeastVectorStore(
    repo_path="./feast_repo",
    rag_view=document_embeddings_fv,
    features=[
        "document_embeddings:title",
        "document_embeddings:content",
        "document_embeddings:embedding",
        "document_embeddings:category"
    ]
)

Vector Similarity Search

# Create query vector (e.g., from a text embedding model)
query_embedding = np.array([0.1, 0.2, 0.3, 0.4, 0.5])  # Example only; dimension must match the stored embeddings

# Perform vector similarity search
results = vector_store.query(
    query_vector=query_embedding,
    top_k=5
)

# Access results
result_dict = results.to_dict()
print("Top 5 similar documents:")
for i in range(len(result_dict["document_id"])):
    print(f"Document: {result_dict['title'][i]}")
    print(f"Category: {result_dict['category'][i]}")
    print(f"Content: {result_dict['content'][i][:100]}...")
    print("---")

Text-Based Semantic Search

# Perform text-based semantic search (if supported by the vector store backend)
results = vector_store.query(
    query_string="machine learning algorithms",
    top_k=10
)

# Convert to DataFrame for analysis
df = results.to_df()
print(df[["title", "category", "content"]])

RAG Pipeline Integration

def rag_query(question: str, vector_store: FeastVectorStore, embedding_model, llm_model):
    """
    Complete RAG pipeline using FeastVectorStore.
    
    Args:
        question: User question
        vector_store: Configured FeastVectorStore instance
        embedding_model: Model to create embeddings
        llm_model: Language model for generation
    """
    # Generate embedding for the question
    question_embedding = embedding_model.encode(question)
    
    # Retrieve relevant documents
    context_results = vector_store.query(
        query_vector=question_embedding,
        top_k=5
    )
    
    # Format context from retrieved documents
    context_dict = context_results.to_dict()
    context_text = "\n\n".join([
        f"Title: {title}\nContent: {content}"
        for title, content in zip(context_dict["title"], context_dict["content"])
    ])
    
    # Generate answer using retrieved context
    prompt = f"""
    Context:
    {context_text}
    
    Question: {question}
    
    Answer based on the provided context:
    """
    
    answer = llm_model.generate(prompt)
    return answer, context_results

# Usage (assumes embedding_model and llm_model are initialized elsewhere)
question = "What are the benefits of feature stores?"
answer, sources = rag_query(question, vector_store, embedding_model, llm_model)
print(f"Answer: {answer}")
print(f"Sources: {len(sources.to_dict()['document_id'])} documents")

Advanced Vector Store Configuration

from feast import Entity

# Define document entity (join_keys replaces the deprecated value_type argument)
document_entity = Entity(
    name="document_id",
    join_keys=["document_id"],
    description="Unique document identifier"
)

# Create vector store with comprehensive configuration
vector_store = FeastVectorStore(
    repo_path="./feast_repo",
    rag_view=document_embeddings_fv,
    features=[
        "document_embeddings:title",
        "document_embeddings:content", 
        "document_embeddings:embedding",
        "document_embeddings:category",
        "document_embeddings:author",
        "document_embeddings:published_date",
        "document_embeddings:tags"
    ]
)

# Batch vector search for multiple queries
query_vectors = [
    np.random.rand(384),  # Example embedding dimension
    np.random.rand(384),
    np.random.rand(384)
]

batch_results = []
for i, query_vec in enumerate(query_vectors):
    result = vector_store.query(
        query_vector=query_vec,
        top_k=3
    )
    batch_results.append(result)
    print(f"Query {i+1}: Found {len(result.to_dict()['document_id'])} results")

Vector Store Backends

The FeastVectorStore works with various vector database backends supported by Feast's online stores:

  • PostgreSQL with pgvector: Vector similarity search in PostgreSQL
  • Elasticsearch: Text and vector search capabilities
  • Milvus: Specialized vector database for high-performance similarity search
  • Other vector-enabled online stores: as supported by Feast infrastructure

The specific vector search capabilities depend on the configured online store backend and its vector index configuration.
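
As a hedged illustration, a feature_store.yaml for a pgvector-backed online store might look like the following; exact keys vary across Feast versions, so check the online-store reference for your release before use:

```yaml
project: rag_demo
provider: local
registry: data/registry.db
online_store:
  type: postgres          # PostgreSQL with the pgvector extension
  host: localhost
  port: 5432
  database: feast
  user: feast
  password: feast
  vector_enabled: true    # turn on vector similarity search
  vector_len: 384         # must match the embedding dimension
```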

Install with Tessl CLI

npx tessl i tessl/pypi-feast
