PostgreSQL pgvector extension support for Python, providing vector operations and similarity search across multiple database libraries.

Peewee ORM field types for vector operations in Peewee-based applications, with full vector type support and query integration.
Peewee model fields for storing the different vector types in PostgreSQL with the pgvector extension.
class VectorField(Field):
    """
    Peewee field for storing Vector (float32) data.

    Args:
        dimensions (int, optional): Fixed number of dimensions; omit for a
            variable-dimension column.
        **kwargs: Standard Peewee field parameters.
    """
class HalfVectorField(Field):
    """
    Peewee field for storing HalfVector (float16) data.

    Args:
        dimensions (int, optional): Fixed number of dimensions; omit for a
            variable-dimension column.
        **kwargs: Standard Peewee field parameters.
    """
class SparseVectorField(Field):
    """
    Peewee field for storing SparseVector data.

    Args:
        dimensions (int, optional): Fixed number of dimensions; omit for a
            variable-dimension column.
        **kwargs: Standard Peewee field parameters.
    """
class FixedBitField(Field):
"""
Peewee field for storing Bit vector data.
Args:
**kwargs: Standard Peewee field parameters
"""Usage Examples:
from peewee import Model, TextField, PostgresqlDatabase, IntegerField
from pgvector.peewee import VectorField, HalfVectorField, SparseVectorField, FixedBitField
# Database connection
db = PostgresqlDatabase(
'your_database',
user='user',
password='password',
host='localhost'
)
class Document(Model):
content = TextField()
embedding = VectorField(dimensions=1536) # OpenAI embeddings
title_embedding = HalfVectorField(dimensions=768) # Memory efficient
sparse_features = SparseVectorField(dimensions=10000) # High-dimensional sparse
binary_hash = FixedBitField() # Binary features
class Meta:
database = db
table_name = 'documents'
# Create tables
db.create_tables([Document])
# Insert data
from pgvector import Vector, HalfVector, SparseVector, Bit
doc = Document.create(
content="Sample document",
embedding=Vector([0.1, 0.2, 0.3] * 512), # 1536 dimensions
title_embedding=HalfVector([0.5, 0.6, 0.7] * 256), # 768 dimensions
sparse_features=SparseVector({0: 1.0, 500: 2.5}, 10000),
binary_hash=Bit("1010110")
)Using PostgreSQL distance operators in Peewee queries for similarity search.
Usage Examples:
from peewee import fn, SQL
from pgvector import Vector, Bit
query_vector = Vector([0.1, 0.2, 0.3] * 512) # 1536 dimensions
# L2 (Euclidean) distance using <-> operator
l2_results = (Document
.select(
Document.content,
SQL('embedding <-> %s', query_vector.to_text()).alias('distance')
)
.order_by(SQL('embedding <-> %s', query_vector.to_text()))
.limit(10))
for doc in l2_results:
print(f"Content: {doc.content}, Distance: {doc.distance}")
# Cosine distance using <=> operator
cosine_results = (Document
.select(
Document.content,
SQL('embedding <=> %s', query_vector.to_text()).alias('cosine_distance')
)
.order_by(SQL('embedding <=> %s', query_vector.to_text()))
.limit(10))
# Inner product distance using <#> operator
inner_product_results = (Document
.select(
Document.content,
SQL('embedding <#> %s', query_vector.to_text()).alias('inner_product')
)
.order_by(SQL('embedding <#> %s', query_vector.to_text()))
.limit(10))
# Filter by distance threshold
close_documents = (Document
.select()
.where(SQL('embedding <-> %s < 0.5', query_vector.to_text())))
# Hamming distance for bit vectors
query_bits = Bit("1010110" + "0" * 57) # Pad to required length
hamming_results = (Document
.select(
Document.content,
SQL('binary_hash <~> %s', query_bits.to_text()).alias('hamming_distance')
)
.order_by(SQL('binary_hash <~> %s', query_bits.to_text()))
.limit(10))
# Jaccard distance for bit vectors
jaccard_results = (Document
.select(
Document.content,
SQL('binary_hash <%> %s', query_bits.to_text()).alias('jaccard_distance')
)
.order_by(SQL('binary_hash <%> %s', query_bits.to_text()))
.limit(10))Creating vector indexes for improved query performance in Peewee.
Usage Examples:
from peewee import SQL
# Create HNSW index using raw SQL
def create_hnsw_index():
db.execute_sql("""
CREATE INDEX IF NOT EXISTS documents_embedding_hnsw_idx
ON documents
USING hnsw (embedding vector_l2_ops)
WITH (m = 16, ef_construction = 64)
""")
# Create IVFFlat index using raw SQL
def create_ivfflat_index():
db.execute_sql("""
CREATE INDEX IF NOT EXISTS documents_embedding_ivfflat_idx
ON documents
USING ivfflat (embedding vector_l2_ops)
WITH (lists = 100)
""")
# Create indexes after table creation
db.create_tables([Document])
create_hnsw_index()
create_ivfflat_index()
# Index for sparse vectors
def create_sparse_index():
db.execute_sql("""
CREATE INDEX IF NOT EXISTS documents_sparse_features_idx
ON documents
USING ivfflat (sparse_features sparsevec_l2_ops)
WITH (lists = 50)
""")
create_sparse_index()Complex similarity search patterns using Peewee with pgvector.
Usage Examples:
from peewee import Case, fn, Value, DateTimeField, SQL
from datetime import datetime, timedelta

class Article(Model):
    title = TextField()
    content = TextField()
    category = TextField()
    embedding = VectorField(dimensions=384)
    published_at = DateTimeField()

    class Meta:
        database = db

# Hybrid search: combine semantic similarity with metadata filtering.
# Note: peewee's SQL() takes bound parameters as a sequence (tuple).
def hybrid_search(query_embedding, category=None, days_ago=7, limit=10):
    base_query = Article.select(
        Article.title,
        Article.content,
        Article.category,
        SQL('embedding <=> %s', (query_embedding.to_text(),)).alias('similarity')
    )
    if category:
        base_query = base_query.where(Article.category == category)
    if days_ago:
        cutoff_date = datetime.now() - timedelta(days=days_ago)
        base_query = base_query.where(Article.published_at >= cutoff_date)
    return (base_query
            .order_by(SQL('embedding <=> %s', (query_embedding.to_text(),)))
            .limit(limit))

# Multi-vector search with weighted combination
def multi_vector_search(title_embedding, content_embedding, title_weight=0.3, content_weight=0.7):
    weighted_expr = '(%s * (embedding <=> %s) + %s * (title_embedding <=> %s))'
    params = (title_weight, title_embedding.to_text(),
              content_weight, content_embedding.to_text())
    return (Document
            .select(
                Document.content,
                SQL(weighted_expr, params).alias('weighted_similarity')
            )
            .order_by(SQL(weighted_expr, params))
            .limit(10))

# Similarity clustering
def find_similar_clusters(reference_embedding, threshold=0.3):
    """Find documents that are similar to each other and to reference."""
    return (Document
            .select(
                Document.id,
                Document.content,
                SQL('embedding <=> %s', (reference_embedding.to_text(),)).alias('ref_similarity')
            )
            .where(SQL('embedding <=> %s < %s', (reference_embedding.to_text(), threshold))))

# Vector aggregation
def get_category_centroids():
    """Calculate average embeddings by category."""
    # Note: Peewee doesn't have built-in vector avg, use raw SQL.
    # NOTE(review): this queries documents.category, but the Document model
    # above does not declare a category column — verify the schema.
    results = db.execute_sql("""
        SELECT category, AVG(embedding) as centroid_embedding
        FROM documents
        GROUP BY category
    """)
    return [(row[0], Vector.from_text(row[1])) for row in results]

from peewee import Model, TextField, DateTimeField, PostgresqlDatabase
from pgvector.peewee import VectorField
from pgvector import Vector
from datetime import datetime
# Database setup
db = PostgresqlDatabase(
'semantic_search_db',
user='user',
password='password',
host='localhost'
)
class NewsArticle(Model):
title = TextField()
content = TextField()
category = TextField()
embedding = VectorField(dimensions=384) # sentence-transformers
published_at = DateTimeField(default=datetime.now)
class Meta:
database = db
table_name = 'news_articles'
# Create table and indexes
db.create_tables([NewsArticle])
# Create vector index
db.execute_sql("""
CREATE INDEX IF NOT EXISTS news_embedding_hnsw_idx
ON news_articles
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64)
""")
# Semantic search service
class SemanticSearchService:
@staticmethod
def search_articles(query_embedding, category=None, limit=10):
query = (NewsArticle
.select(
NewsArticle.title,
NewsArticle.content,
NewsArticle.category,
NewsArticle.published_at,
SQL('embedding <=> %s', query_embedding.to_text()).alias('similarity')
))
if category:
query = query.where(NewsArticle.category == category)
return (query
.order_by(SQL('embedding <=> %s', query_embedding.to_text()))
.limit(limit))
@staticmethod
def find_related_articles(article_id, limit=5):
article = NewsArticle.get_by_id(article_id)
return (NewsArticle
.select(
NewsArticle.title,
SQL('embedding <=> %s', article.embedding.to_text()).alias('similarity')
)
.where(NewsArticle.id != article_id)
.order_by(SQL('embedding <=> %s', article.embedding.to_text()))
.limit(limit))
# Usage
service = SemanticSearchService()
# Search for articles
query_vector = Vector([0.1] * 384) # Your query embedding
results = service.search_articles(query_vector, category='technology', limit=5)
for article in results:
print(f"Title: {article.title}")
print(f"Similarity: {article.similarity}")
print("---")
# Find related articles
related = service.find_related_articles(article_id=1, limit=3)
for related_article in related:
print(f"Related: {related_article.title} (similarity: {related_article.similarity})")Install with Tessl CLI
npx tessl i tessl/pypi-pgvector