PostgreSQL pgvector extension support for Python with vector operations and similarity search across multiple database libraries
—
Native SQLAlchemy types and functions providing full vector support for SQLAlchemy applications with type safety and query builder integration.
SQLAlchemy column types for storing different vector types in PostgreSQL with pgvector extension.
class VECTOR(UserDefinedType):
"""
SQLAlchemy type for Vector (float32) columns.
Args:
dim (int, optional): Fixed number of dimensions
"""
def __init__(self, dim=None):
"""Initialize VECTOR type with optional dimension constraint."""
def get_col_spec(self, **kw) -> str:
"""Get column specification for DDL generation."""
def bind_processor(self, dialect):
"""Return a conversion function for processing bind values."""
def literal_processor(self, dialect):
"""Return a conversion function for processing literal values."""
def result_processor(self, dialect, coltype):
"""Return a conversion function for processing result values."""
class comparator_factory(UserDefinedType.Comparator):
"""Comparator for vector distance operations."""
def l2_distance(self, other):
"""L2 (Euclidean) distance operation."""
def max_inner_product(self, other):
"""Maximum inner product operation."""
def cosine_distance(self, other):
"""Cosine distance operation."""
def l1_distance(self, other):
"""L1 (Manhattan) distance operation."""
class HALFVEC(UserDefinedType):
"""
SQLAlchemy type for HalfVector (float16) columns.
Args:
dim (int, optional): Fixed number of dimensions
"""
def __init__(self, dim=None):
"""Initialize HALFVEC type with optional dimension constraint."""
def get_col_spec(self, **kw) -> str:
"""Get column specification for DDL generation."""
def bind_processor(self, dialect):
"""Return a conversion function for processing bind values."""
def literal_processor(self, dialect):
"""Return a conversion function for processing literal values."""
def result_processor(self, dialect, coltype):
"""Return a conversion function for processing result values."""
class comparator_factory(UserDefinedType.Comparator):
"""Comparator for half vector distance operations."""
def l2_distance(self, other):
"""L2 (Euclidean) distance operation."""
def max_inner_product(self, other):
"""Maximum inner product operation."""
def cosine_distance(self, other):
"""Cosine distance operation."""
def l1_distance(self, other):
"""L1 (Manhattan) distance operation."""
class SPARSEVEC(UserDefinedType):
"""
SQLAlchemy type for SparseVector columns.
Args:
dim (int, optional): Fixed number of dimensions
"""
def __init__(self, dim=None):
"""Initialize SPARSEVEC type with optional dimension constraint."""
def get_col_spec(self, **kw) -> str:
"""Get column specification for DDL generation."""
def bind_processor(self, dialect):
"""Return a conversion function for processing bind values."""
def literal_processor(self, dialect):
"""Return a conversion function for processing literal values."""
def result_processor(self, dialect, coltype):
"""Return a conversion function for processing result values."""
class comparator_factory(UserDefinedType.Comparator):
"""Comparator for sparse vector distance operations."""
def l2_distance(self, other):
"""L2 (Euclidean) distance operation."""
def max_inner_product(self, other):
"""Maximum inner product operation."""
def cosine_distance(self, other):
"""Cosine distance operation."""
def l1_distance(self, other):
"""L1 (Manhattan) distance operation."""
class BIT(UserDefinedType):
"""
SQLAlchemy type for Bit vector columns.
Args:
length (int, optional): Fixed bit length
"""
def __init__(self, length=None):
"""Initialize BIT type with optional length constraint."""
def get_col_spec(self, **kw) -> str:
"""Get column specification for DDL generation."""
class comparator_factory(UserDefinedType.Comparator):
"""Comparator for bit vector distance operations."""
def hamming_distance(self, other):
"""Hamming distance operation."""
def jaccard_distance(self, other):
"""Jaccard distance operation."""
# Type alias for compatibility
Vector = VECTORUsage Examples:
from sqlalchemy import Column, Integer, Text, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from pgvector.sqlalchemy import VECTOR, HALFVEC, SPARSEVEC, BIT, Vector
from pgvector import Vector as PgVector, HalfVector, SparseVector, Bit
Base = declarative_base()
class Document(Base):
__tablename__ = 'documents'
id = Column(Integer, primary_key=True)
content = Column(Text)
embedding = Column(VECTOR(1536)) # OpenAI embeddings
title_embedding = Column(HALFVEC(768)) # Memory efficient
sparse_features = Column(SPARSEVEC(10000)) # High-dimensional sparse
binary_hash = Column(BIT()) # Binary features
# Create engine and session
engine = create_engine('postgresql://user:pass@localhost/db')
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
# Create document with vectors
doc = Document(
content="Sample document",
embedding=PgVector([0.1, 0.2, 0.3] * 512),
title_embedding=HalfVector([0.5, 0.6, 0.7] * 256),
sparse_features=SparseVector({0: 1.0, 500: 2.5}, 10000),
binary_hash=Bit("1010110")
)
session.add(doc)
session.commit()Use the built-in distance methods for similarity queries:
from sqlalchemy import select
# L2 distance (Euclidean)
query_vector = PgVector([0.2, 0.3, 0.4] * 512)
nearest = session.scalars(
select(Document)
.order_by(Document.embedding.l2_distance(query_vector))
.limit(5)
).all()
# Cosine distance
similar = session.scalars(
select(Document)
.order_by(Document.embedding.cosine_distance(query_vector))
.limit(10)
).all()
# Maximum inner product (similarity)
products = session.scalars(
select(Document)
.order_by(Document.embedding.max_inner_product(query_vector))
.limit(10)
).all()
# Get distance values
distances = session.scalars(
select(Document.embedding.l2_distance(query_vector))
).all()
# Filter by distance threshold
nearby = session.scalars(
select(Document)
.filter(Document.embedding.l2_distance(query_vector) < 0.5)
).all()Built-in aggregate functions for vector operations:
def avg(*args):
"""
Calculate average vector across multiple rows.
Args:
*args: Vector columns or expressions
Returns:
Average vector
"""
def sum(*args):
"""
Calculate sum of vectors across multiple rows.
Args:
*args: Vector columns or expressions
Returns:
Sum vector
"""Usage Examples:
from pgvector.sqlalchemy import avg, sum
# Calculate average embedding
avg_embedding = session.scalars(
select(avg(Document.embedding))
).first()
# Calculate sum of embeddings
sum_embedding = session.scalars(
select(sum(Document.embedding))
).first()
# Group by category and average
from sqlalchemy import func
category_averages = session.execute(
select(Document.category, avg(Document.embedding))
.group_by(Document.category)
).all()Create approximate indexes for faster similarity search:
from sqlalchemy import Index
# HNSW index for L2 distance
hnsw_index = Index(
'embedding_hnsw_idx',
Document.embedding,
postgresql_using='hnsw',
postgresql_with={'m': 16, 'ef_construction': 64},
postgresql_ops={'embedding': 'vector_l2_ops'}
)
# IVFFlat index for L2 distance
ivfflat_index = Index(
'embedding_ivfflat_idx',
Document.embedding,
postgresql_using='ivfflat',
postgresql_with={'lists': 100},
postgresql_ops={'embedding': 'vector_l2_ops'}
)
# Create indexes
hnsw_index.create(engine)
ivfflat_index.create(engine)
# Use different operators for different distance types:
# - vector_l2_ops: L2 distance (default)
# - vector_ip_ops: Inner product
# - vector_cosine_ops: Cosine distanceIndex vectors at half-precision for memory efficiency:
from sqlalchemy.sql import func
# Create half-precision index
half_index = Index(
'embedding_half_idx',
func.cast(Document.embedding, HALFVEC(1536)).label('embedding'),
postgresql_using='hnsw',
postgresql_with={'m': 16, 'ef_construction': 64},
postgresql_ops={'embedding': 'halfvec_l2_ops'}
)
# Query with half-precision casting
query_vector = PgVector([0.1, 0.2, 0.3] * 512)
half_results = session.scalars(
select(Document)
.order_by(
func.cast(Document.embedding, HALFVEC(1536))
.l2_distance(query_vector)
)
.limit(5)
).all()Store arrays of vectors with proper type registration:
from sqlalchemy import ARRAY, event
from pgvector.psycopg import register_vector
class MultiVectorDocument(Base):
__tablename__ = 'multi_vector_docs'
id = Column(Integer, primary_key=True)
embeddings = Column(ARRAY(VECTOR(768))) # Array of vectors
# Register vector types with the database connection
@event.listens_for(engine, "connect")
def connect(dbapi_connection, connection_record):
register_vector(dbapi_connection)
# For async connections
from pgvector.psycopg import register_vector_async
@event.listens_for(engine.sync_engine, "connect")
def connect_async(dbapi_connection, connection_record):
dbapi_connection.run_async(register_vector_async)Install with Tessl CLI
npx tessl i tessl/pypi-pgvector