Python bindings for llama.cpp, enabling efficient local language model inference without external API dependencies.
Vector embeddings functionality for semantic similarity and RAG applications. PyLLaMACpp supports generating embeddings for individual prompts and extracting embeddings from the current model context, enabling vector-based semantic search and retrieval-augmented generation workflows.
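A minimal end-to-end sketch of the prompt-embedding path (the model path is a placeholder; both calls are documented in detail below):

from pyllamacpp.model import Model

# embedding=True is required for any embedding call
model = Model(model_path="/path/to/model.ggml", embedding=True)
vector = model.get_prompt_embeddings("hello world")
print(len(vector))  # the model's embedding width, n_embd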
Extract embeddings from the current model context. This method returns the embeddings vector from the last processed input in the model's context.
def get_embeddings(self) -> List[float]:
    """
    Get embeddings from the current model context.

    Returns the last embeddings vector from the context.
    The model must be initialized with embedding=True.

    Returns:
        List[float]: 1-dimensional embeddings vector with shape [n_embd]

    Raises:
        AssertionError: If the model was not initialized with embedding=True
    """

Example usage:
from pyllamacpp.model import Model

# Initialize model in embedding mode
model = Model(
    model_path="/path/to/model.ggml",
    embedding=True,
    n_ctx=512
)

# Process some text to set context
list(model.generate("The quick brown fox", n_predict=1))

# Extract embeddings from current context
embeddings = model.get_embeddings()
print(f"Embedding dimensionality: {len(embeddings)}")
print(f"First few values: {embeddings[:5]}")

Generate embeddings for specific text prompts. This method processes the given prompt and returns its embeddings vector, resetting the context afterward.
def get_prompt_embeddings(
    self,
    prompt: str,
    n_threads: int = 4,
    n_batch: int = 512
) -> List[float]:
    """
    Generate embeddings for a specific prompt.

    This method resets the model context, processes the prompt,
    extracts the embeddings, and resets the context again.

    Parameters:
        prompt: str, text to generate embeddings for
        n_threads: int, number of CPU threads to use (default: 4)
        n_batch: int, batch size for processing (default: 512; must be >= 32 for BLAS)

    Returns:
        List[float]: Embeddings vector for the prompt

    Raises:
        AssertionError: If the model was not initialized with embedding=True
    """

Example usage:
from pyllamacpp.model import Model

# Initialize model in embedding mode
model = Model(
    model_path="/path/to/model.ggml",
    embedding=True
)

# Generate embeddings for different texts
texts = [
    "Machine learning is a subset of artificial intelligence",
    "Deep learning uses neural networks with multiple layers",
    "Natural language processing handles human language",
    "Computer vision analyzes and interprets visual information"
]

embeddings_list = []
for text in texts:
    embedding = model.get_prompt_embeddings(
        prompt=text,
        n_threads=8,
        n_batch=512
    )
    embeddings_list.append(embedding)
    print(f"Generated embeddings for: {text[:30]}...")

print(f"Generated {len(embeddings_list)} embedding vectors")
Compute semantic similarity between embeddings using cosine similarity or other distance metrics.

import numpy as np
from scipy.spatial.distance import cosine

def cosine_similarity(embedding1, embedding2):
    """Compute cosine similarity between two embeddings."""
    return 1 - cosine(embedding1, embedding2)

def euclidean_similarity(embedding1, embedding2):
    """Compute Euclidean similarity between two embeddings."""
    return 1 / (1 + np.linalg.norm(np.array(embedding1) - np.array(embedding2)))

# Example usage
model = Model(model_path="/path/to/model.ggml", embedding=True)

# Generate embeddings for comparison
text1 = "Artificial intelligence and machine learning"
text2 = "AI and ML technologies"
text3 = "Weather forecast for tomorrow"

embed1 = model.get_prompt_embeddings(text1)
embed2 = model.get_prompt_embeddings(text2)
embed3 = model.get_prompt_embeddings(text3)

# Compare similarities
sim_1_2 = cosine_similarity(embed1, embed2)
sim_1_3 = cosine_similarity(embed1, embed3)
print(f"Similarity between text1 and text2: {sim_1_2:.3f}")
print(f"Similarity between text1 and text3: {sim_1_3:.3f}")Build a simple document retrieval system using embeddings for semantic search.
Build a simple document retrieval system using embeddings for semantic search.

from typing import List, Tuple

class DocumentRetriever:
    def __init__(self, model_path: str):
        self.model = Model(model_path=model_path, embedding=True)
        self.documents = []
        self.embeddings = []

    def add_document(self, text: str):
        """Add a document to the retrieval system."""
        embedding = self.model.get_prompt_embeddings(text)
        self.documents.append(text)
        self.embeddings.append(embedding)

    def add_documents(self, texts: List[str]):
        """Add multiple documents to the retrieval system."""
        for text in texts:
            self.add_document(text)

    def search(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
        """Search for the documents most similar to the query."""
        query_embedding = self.model.get_prompt_embeddings(query)
        similarities = []
        for i, doc_embedding in enumerate(self.embeddings):
            similarity = cosine_similarity(query_embedding, doc_embedding)
            similarities.append((self.documents[i], similarity))
        # Sort by similarity (descending)
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_k]

# Example usage
retriever = DocumentRetriever("/path/to/model.ggml")

# Add documents
documents = [
    "Python is a high-level programming language known for its simplicity",
    "Machine learning algorithms can learn patterns from data",
    "Neural networks are inspired by biological neural systems",
    "Deep learning is a subset of machine learning using deep neural networks",
    "Natural language processing enables computers to understand human language"
]
retriever.add_documents(documents)

# Search for similar documents
query = "What is deep neural network learning?"
results = retriever.search(query, top_k=3)

print(f"Query: {query}")
print("Most similar documents:")
for doc, similarity in results:
print(f" Similarity: {similarity:.3f} - {doc}")Combine embeddings with text generation for retrieval-augmented generation workflows.
Combine embeddings with text generation for retrieval-augmented generation workflows.

class RAGSystem:
    def __init__(self, model_path: str):
        # The retriever holds the embedding model; a separate instance
        # (embedding=False) handles text generation.
        self.generation_model = Model(model_path=model_path, embedding=False)
        self.retriever = DocumentRetriever(model_path)

    def add_knowledge_base(self, documents: List[str]):
        """Add documents to the knowledge base."""
        self.retriever.add_documents(documents)

    def generate_with_context(self, query: str, top_k: int = 3) -> str:
        """Generate a response using retrieved context."""
        # Retrieve relevant documents
        relevant_docs = self.retriever.search(query, top_k=top_k)

        # Build context from the retrieved documents
        context = "\n\n".join(doc for doc, _ in relevant_docs)

        # Create a prompt that includes the context
        prompt = (
            f"Context:\n{context}\n\n"
            f"Question: {query}\n"
            "Answer based on the context:"
        )

        # Generate response
        response = self.generation_model.cpp_generate(
            prompt=prompt,
            n_predict=200,
            temp=0.3
        )
        return response

# Example usage
rag = RAGSystem("/path/to/model.ggml")

# Add knowledge base
knowledge_base = [
    "Python supports multiple programming paradigms including procedural, object-oriented, and functional programming.",
    "Machine learning models require training data to learn patterns and make predictions on new data.",
    "Transformers are a type of neural network architecture that uses attention mechanisms.",
    "Large language models are trained on vast amounts of text data to understand and generate human-like text.",
]
rag.add_knowledge_base(knowledge_base)

# Ask a question with context
question = "How do machine learning models work?"
answer = rag.generate_with_context(question)

print(f"Question: {question}")
print(f"Answer: {answer}")Process multiple texts efficiently for large-scale embedding generation.
Process multiple texts efficiently for large-scale embedding generation.

import time

def generate_embeddings_batch(model_path: str, texts: List[str], n_threads: int = 4) -> List[List[float]]:
    """Generate embeddings for multiple texts efficiently."""
    model = Model(model_path=model_path, embedding=True)
    embeddings = []
    start_time = time.time()
    for i, text in enumerate(texts):
        embedding = model.get_prompt_embeddings(text, n_threads=n_threads)
        embeddings.append(embedding)
        if (i + 1) % 10 == 0:
            elapsed = time.time() - start_time
            print(f"Processed {i + 1}/{len(texts)} texts in {elapsed:.2f}s")
    return embeddings

# Example usage
texts = [
    "First document about AI",
    "Second document about ML",
    "Third document about NLP",
    # ... more texts
]
embeddings = generate_embeddings_batch("/path/to/model.ggml", texts, n_threads=8)
print(f"Generated {len(embeddings)} embeddings")embedding=True when creating models for embedding tasksget_prompt_embeddings for clean embeddings, get_embeddings for context-aware embeddingsn_threads for faster processing on multi-core systemsInstall with Tessl CLI
Install with Tessl CLI

npx tessl i tessl/pypi-pyllamacpp