0
# SQLAlchemy Integration
1
2
Native SQLAlchemy types and functions providing full vector support for SQLAlchemy applications with type safety and query builder integration.
3
4
## Capabilities
5
6
### Vector Types
7
8
SQLAlchemy column types for storing different vector types in PostgreSQL with pgvector extension.
9
10
```python { .api }
11
class VECTOR(UserDefinedType):
12
"""
13
SQLAlchemy type for Vector (float32) columns.
14
15
Args:
16
dim (int, optional): Fixed number of dimensions
17
"""
18
19
def __init__(self, dim=None):
20
"""Initialize VECTOR type with optional dimension constraint."""
21
22
def get_col_spec(self, **kw) -> str:
23
"""Get column specification for DDL generation."""
24
25
def bind_processor(self, dialect):
26
"""Return a conversion function for processing bind values."""
27
28
def literal_processor(self, dialect):
29
"""Return a conversion function for processing literal values."""
30
31
def result_processor(self, dialect, coltype):
32
"""Return a conversion function for processing result values."""
33
34
class comparator_factory(UserDefinedType.Comparator):
35
"""Comparator for vector distance operations."""
36
37
def l2_distance(self, other):
38
"""L2 (Euclidean) distance operation."""
39
40
def max_inner_product(self, other):
41
"""Maximum inner product operation."""
42
43
def cosine_distance(self, other):
44
"""Cosine distance operation."""
45
46
def l1_distance(self, other):
47
"""L1 (Manhattan) distance operation."""
48
49
class HALFVEC(UserDefinedType):
50
"""
51
SQLAlchemy type for HalfVector (float16) columns.
52
53
Args:
54
dim (int, optional): Fixed number of dimensions
55
"""
56
57
def __init__(self, dim=None):
58
"""Initialize HALFVEC type with optional dimension constraint."""
59
60
def get_col_spec(self, **kw) -> str:
61
"""Get column specification for DDL generation."""
62
63
def bind_processor(self, dialect):
64
"""Return a conversion function for processing bind values."""
65
66
def literal_processor(self, dialect):
67
"""Return a conversion function for processing literal values."""
68
69
def result_processor(self, dialect, coltype):
70
"""Return a conversion function for processing result values."""
71
72
class comparator_factory(UserDefinedType.Comparator):
73
"""Comparator for half vector distance operations."""
74
75
def l2_distance(self, other):
76
"""L2 (Euclidean) distance operation."""
77
78
def max_inner_product(self, other):
79
"""Maximum inner product operation."""
80
81
def cosine_distance(self, other):
82
"""Cosine distance operation."""
83
84
def l1_distance(self, other):
85
"""L1 (Manhattan) distance operation."""
86
87
class SPARSEVEC(UserDefinedType):
88
"""
89
SQLAlchemy type for SparseVector columns.
90
91
Args:
92
dim (int, optional): Fixed number of dimensions
93
"""
94
95
def __init__(self, dim=None):
96
"""Initialize SPARSEVEC type with optional dimension constraint."""
97
98
def get_col_spec(self, **kw) -> str:
99
"""Get column specification for DDL generation."""
100
101
def bind_processor(self, dialect):
102
"""Return a conversion function for processing bind values."""
103
104
def literal_processor(self, dialect):
105
"""Return a conversion function for processing literal values."""
106
107
def result_processor(self, dialect, coltype):
108
"""Return a conversion function for processing result values."""
109
110
class comparator_factory(UserDefinedType.Comparator):
111
"""Comparator for sparse vector distance operations."""
112
113
def l2_distance(self, other):
114
"""L2 (Euclidean) distance operation."""
115
116
def max_inner_product(self, other):
117
"""Maximum inner product operation."""
118
119
def cosine_distance(self, other):
120
"""Cosine distance operation."""
121
122
def l1_distance(self, other):
123
"""L1 (Manhattan) distance operation."""
124
125
class BIT(UserDefinedType):
126
"""
127
SQLAlchemy type for Bit vector columns.
128
129
Args:
130
length (int, optional): Fixed bit length
131
"""
132
133
def __init__(self, length=None):
134
"""Initialize BIT type with optional length constraint."""
135
136
def get_col_spec(self, **kw) -> str:
137
"""Get column specification for DDL generation."""
138
139
class comparator_factory(UserDefinedType.Comparator):
140
"""Comparator for bit vector distance operations."""
141
142
def hamming_distance(self, other):
143
"""Hamming distance operation."""
144
145
def jaccard_distance(self, other):
146
"""Jaccard distance operation."""
147
148
# Type alias for compatibility
149
Vector = VECTOR
150
```
151
152
**Usage Examples:**
153
154
```python
155
from sqlalchemy import Column, Integer, Text, create_engine
156
from sqlalchemy.ext.declarative import declarative_base
157
from sqlalchemy.orm import sessionmaker
158
from pgvector.sqlalchemy import VECTOR, HALFVEC, SPARSEVEC, BIT, Vector
159
from pgvector import Vector as PgVector, HalfVector, SparseVector, Bit
160
161
Base = declarative_base()
162
163
class Document(Base):
164
__tablename__ = 'documents'
165
166
id = Column(Integer, primary_key=True)
167
content = Column(Text)
168
embedding = Column(VECTOR(1536)) # OpenAI embeddings
169
title_embedding = Column(HALFVEC(768)) # Memory efficient
170
sparse_features = Column(SPARSEVEC(10000)) # High-dimensional sparse
171
binary_hash = Column(BIT()) # Binary features
172
173
# Create engine and session
174
engine = create_engine('postgresql://user:pass@localhost/db')
175
Base.metadata.create_all(engine)
176
Session = sessionmaker(bind=engine)
177
session = Session()
178
179
# Create document with vectors
180
doc = Document(
181
content="Sample document",
182
embedding=PgVector([0.1, 0.2, 0.3] * 512),
183
title_embedding=HalfVector([0.5, 0.6, 0.7] * 256),
184
sparse_features=SparseVector({0: 1.0, 500: 2.5}, 10000),
185
binary_hash=Bit("1010110")
186
)
187
session.add(doc)
188
session.commit()
189
```
190
191
### Distance Operations
192
193
Use the built-in distance methods for similarity queries:
194
195
```python
196
from sqlalchemy import select
197
198
# L2 distance (Euclidean)
199
query_vector = PgVector([0.2, 0.3, 0.4] * 512)
200
nearest = session.scalars(
201
select(Document)
202
.order_by(Document.embedding.l2_distance(query_vector))
203
.limit(5)
204
).all()
205
206
# Cosine distance
207
similar = session.scalars(
208
select(Document)
209
.order_by(Document.embedding.cosine_distance(query_vector))
210
.limit(10)
211
).all()
212
213
# Maximum inner product (similarity)
214
products = session.scalars(
215
select(Document)
216
.order_by(Document.embedding.max_inner_product(query_vector))
217
.limit(10)
218
).all()
219
220
# Get distance values
221
distances = session.scalars(
222
select(Document.embedding.l2_distance(query_vector))
223
).all()
224
225
# Filter by distance threshold
226
nearby = session.scalars(
227
select(Document)
228
.filter(Document.embedding.l2_distance(query_vector) < 0.5)
229
).all()
230
```
231
232
### Aggregate Functions
233
234
Built-in aggregate functions for vector operations:
235
236
```python { .api }
237
def avg(*args):
238
"""
239
Calculate average vector across multiple rows.
240
241
Args:
242
*args: Vector columns or expressions
243
244
Returns:
245
Average vector
246
"""
247
248
def sum(*args):
249
"""
250
Calculate sum of vectors across multiple rows.
251
252
Args:
253
*args: Vector columns or expressions
254
255
Returns:
256
Sum vector
257
"""
258
```
259
260
**Usage Examples:**
261
262
```python
263
from pgvector.sqlalchemy import avg, sum
264
265
# Calculate average embedding
266
avg_embedding = session.scalars(
267
select(avg(Document.embedding))
268
).first()
269
270
# Calculate sum of embeddings
271
sum_embedding = session.scalars(
272
select(sum(Document.embedding))
273
).first()
274
275
# Group by category and average
276
from sqlalchemy import func
277
category_averages = session.execute(
278
select(Document.category, avg(Document.embedding))
279
.group_by(Document.category)
280
).all()
281
```
282
283
### Index Creation
284
285
Create approximate indexes for faster similarity search:
286
287
```python
288
from sqlalchemy import Index
289
290
# HNSW index for L2 distance
291
hnsw_index = Index(
292
'embedding_hnsw_idx',
293
Document.embedding,
294
postgresql_using='hnsw',
295
postgresql_with={'m': 16, 'ef_construction': 64},
296
postgresql_ops={'embedding': 'vector_l2_ops'}
297
)
298
299
# IVFFlat index for L2 distance
300
ivfflat_index = Index(
301
'embedding_ivfflat_idx',
302
Document.embedding,
303
postgresql_using='ivfflat',
304
postgresql_with={'lists': 100},
305
postgresql_ops={'embedding': 'vector_l2_ops'}
306
)
307
308
# Create indexes
309
hnsw_index.create(engine)
310
ivfflat_index.create(engine)
311
312
# Use different operators for different distance types:
313
# - vector_l2_ops: L2 distance (default)
314
# - vector_ip_ops: Inner product
315
# - vector_cosine_ops: Cosine distance
316
```
317
318
### Half-Precision Indexing
319
320
Index vectors at half-precision for memory efficiency:
321
322
```python
323
from sqlalchemy.sql import func
324
325
# Create half-precision index
326
half_index = Index(
327
'embedding_half_idx',
328
func.cast(Document.embedding, HALFVEC(1536)).label('embedding'),
329
postgresql_using='hnsw',
330
postgresql_with={'m': 16, 'ef_construction': 64},
331
postgresql_ops={'embedding': 'halfvec_l2_ops'}
332
)
333
334
# Query with half-precision casting
335
query_vector = PgVector([0.1, 0.2, 0.3] * 512)
336
half_results = session.scalars(
337
select(Document)
338
.order_by(
339
func.cast(Document.embedding, HALFVEC(1536))
340
.l2_distance(query_vector)
341
)
342
.limit(5)
343
).all()
344
```
345
346
### Array Support
347
348
Store arrays of vectors with proper type registration:
349
350
```python
351
from sqlalchemy import ARRAY, event
352
from pgvector.psycopg import register_vector
353
354
class MultiVectorDocument(Base):
355
__tablename__ = 'multi_vector_docs'
356
357
id = Column(Integer, primary_key=True)
358
embeddings = Column(ARRAY(VECTOR(768))) # Array of vectors
359
360
# Register vector types with the database connection
361
@event.listens_for(engine, "connect")
362
def connect(dbapi_connection, connection_record):
363
register_vector(dbapi_connection)
364
365
# For async connections
366
from pgvector.psycopg import register_vector_async
367
368
@event.listens_for(engine.sync_engine, "connect")
369
def connect_async(dbapi_connection, connection_record):
370
dbapi_connection.run_async(register_vector_async)
371
```