Fuses semantic vector search with lexical full-text search using Reciprocal Rank Fusion in a PL/pgSQL RPC.
100
100%
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Passed
No known issues
pgvector extension MUST be enabled in the target database.vector column and an HNSW index MUST already exist.tsvector column with a GIN index.pgvector-hnsw-index-selection skill MUST be installed and verified. See skills/pgvector-hnsw-index-selection.md for setup instructions.SELECT extname FROM pg_extension WHERE extname = 'vector'; — HALT if empty.vector column and a tsvector column.Replace your_table, embedding_col, and tsvector_col with the actual table and column names from the validated schema, then execute the following:
CREATE OR REPLACE FUNCTION hybrid_search(
query_embedding vector,
query_text text,
match_count int DEFAULT 20,
k int DEFAULT 60 -- RRF constant; 60 is the standard default
)
RETURNS TABLE(
id bigint,
combined_score float,
semantic_score float,
fulltext_score float
)
LANGUAGE plpgsql
AS $$
BEGIN
RETURN QUERY
WITH
-- Semantic CTE: rank rows by cosine distance using pgvector
semantic AS (
SELECT
t.id,
ROW_NUMBER() OVER (ORDER BY t.embedding_col <=> query_embedding) AS rank
FROM your_table t
ORDER BY t.embedding_col <=> query_embedding
LIMIT match_count
),
-- Full-text CTE: rank rows by tsvector match quality
fulltext AS (
SELECT
t.id,
ROW_NUMBER() OVER (
ORDER BY ts_rank_cd(t.tsvector_col, plainto_tsquery(query_text)) DESC
) AS rank
FROM your_table t
WHERE t.tsvector_col @@ plainto_tsquery(query_text)
ORDER BY ts_rank_cd(t.tsvector_col, plainto_tsquery(query_text)) DESC
LIMIT match_count
)
-- RRF fusion: FULL OUTER JOIN so rows from either CTE are retained;
-- COALESCE(..., 0.0) gives a score of 0 for the missing component
SELECT
COALESCE(s.id, f.id) AS id,
(COALESCE(1.0 / (k + s.rank), 0.0) +
COALESCE(1.0 / (k + f.rank), 0.0)) AS combined_score,
COALESCE(1.0 / (k + s.rank), 0.0) AS semantic_score,
COALESCE(1.0 / (k + f.rank), 0.0) AS fulltext_score
FROM semantic s
FULL OUTER JOIN fulltext f ON s.id = f.id
ORDER BY combined_score DESC;
END;
$$;
GRANT EXECUTE ON FUNCTION hybrid_search(vector, text, int, int) TO authenticated;SELECT * FROM hybrid_search(...) with a test embedding and test text.combined_score.combined_score than a row appearing in only one.EXPLAIN (ANALYZE, FORMAT JSON) on the RPC call — HALT if a sequential scan appears on the vector or tsvector column.60 is the standard RRF default and works well in most cases. Lower values amplify rank differences; higher values flatten them.COALESCE(..., 0.0) ensures rows that appear in only one CTE receive a score of 0 for the missing component — they are still returned, just ranked lower than dual-match rows.combined_score expression.