A Python library providing efficient algorithms for set similarity search operations with Jaccard, Cosine, and Containment similarity functions.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
The SearchIndex class provides an efficient data structure for similarity search queries against large collections of sets. It combines prefix filter and position filter techniques to enable fast similarity queries without scanning the entire collection.
A data structure that supports set similarity search queries with pre-built indexing for optimal query performance.
class SearchIndex:
def __init__(self, sets, similarity_func_name="jaccard", similarity_threshold=0.5):
"""
Build a search index for set similarity queries.
The algorithm is a combination of the prefix filter and position filter
techniques.
Parameters:
- sets (list): a list of sets, each entry is an iterable representing a
set. Note, it is the caller's responsibility to ensure the elements
in each set are unique, and duplicate elements will cause incorrect
result.
- similarity_func_name (str): the name of the similarity function used;
this function currently supports "jaccard", "cosine", "containment",
and "containment_min".
- similarity_threshold (float): the threshold used, must be a float
between 0 and 1.0.
Raises:
ValueError: If sets is not a non-empty list, similarity function is not
supported, or threshold is out of range.
"""
def query(self, s):
"""
Query the search index for sets similar to the query set.
Parameters:
- s (Iterable): the query set with unique elements.
Returns:
list: a list of tuples (index, similarity) where the index
is the index of the matching sets in the original list of sets.
"""from SetSimilaritySearch import SearchIndex
# Build index on a collection of sets
document_sets = [
["python", "programming", "language"],
["java", "programming", "language", "object"],
["python", "data", "science", "analysis"],
["java", "enterprise", "development"],
["programming", "tutorial", "beginner"]
]
# Create search index with Jaccard similarity
index = SearchIndex(
document_sets,
similarity_func_name="jaccard",
similarity_threshold=0.2
)
# Query for sets similar to a target
query_set = ["python", "programming"]
results = index.query(query_set)
print(f"Sets similar to {query_set}:")
for idx, similarity in results:
print(f" Set {idx}: {document_sets[idx]} (similarity: {similarity:.3f})")from SetSimilaritySearch import SearchIndex
# Product feature sets
products = [
["wireless", "bluetooth", "headphones", "music"],
["bluetooth", "speaker", "portable", "music"],
["headphones", "noise", "canceling", "premium"],
["wireless", "earbuds", "compact", "sport"],
["speaker", "home", "smart", "voice"]
]
# Compare different similarity functions
query = ["wireless", "bluetooth", "music"]
# Jaccard similarity
jaccard_index = SearchIndex(products, "jaccard", 0.1)
jaccard_results = jaccard_index.query(query)
print("Jaccard results:")
for idx, sim in jaccard_results:
print(f" Product {idx}: {sim:.3f}")
# Cosine similarity
cosine_index = SearchIndex(products, "cosine", 0.1)
cosine_results = cosine_index.query(query)
print("\nCosine results:")
for idx, sim in cosine_results:
print(f" Product {idx}: {sim:.3f}")
# Containment similarity (query containment in indexed sets)
containment_index = SearchIndex(products, "containment", 0.3)
containment_results = containment_index.query(query)
print("\nContainment results:")
for idx, sim in containment_results:
print(f" Product {idx}: {sim:.3f}")from SetSimilaritySearch import SearchIndex
import time
# Large collection simulation
num_docs = 10000
vocabulary = [f"term_{i}" for i in range(1000)]
# Generate document sets
import random
documents = []
for i in range(num_docs):
doc_size = random.randint(10, 50)
doc_terms = random.sample(vocabulary, doc_size)
documents.append(doc_terms)
# Build index (one-time cost)
print("Building search index...")
start_time = time.time()
index = SearchIndex(documents, "jaccard", 0.3)
build_time = time.time() - start_time
print(f"Index built in {build_time:.2f} seconds")
# Perform multiple queries (fast)
query_times = []
for _ in range(100):
query_terms = random.sample(vocabulary, 15)
start_time = time.time()
results = index.query(query_terms)
query_time = time.time() - start_time
query_times.append(query_time)
avg_query_time = sum(query_times) / len(query_times)
print(f"Average query time: {avg_query_time*1000:.2f} ms")
print(f"Average results per query: {sum(len(index.query(random.sample(vocabulary, 15))) for _ in range(10))/10:.1f}")from SetSimilaritySearch import SearchIndex
# Example: Finding similar user interests
user_interests = [
["photography", "travel", "nature", "hiking"],
["cooking", "food", "travel", "culture"],
["programming", "technology", "ai", "machine_learning"],
["photography", "art", "design", "creativity"],
["hiking", "outdoor", "fitness", "nature"],
["travel", "adventure", "photography", "culture"],
["technology", "programming", "startups", "innovation"]
]
# Build index for user matching
user_index = SearchIndex(
user_interests,
similarity_func_name="jaccard",
similarity_threshold=0.25
)
# Find users with similar interests
target_user_interests = ["travel", "photography", "adventure"]
similar_users = user_index.query(target_user_interests)
print(f"Users with interests similar to {target_user_interests}:")
for user_idx, similarity in similar_users:
print(f" User {user_idx}: {user_interests[user_idx]} (similarity: {similarity:.3f})")The SearchIndex supports both symmetric and asymmetric similarity functions:
The SearchIndex raises ValueError in these cases:
# Empty or invalid input during construction
SearchIndex([], "jaccard", 0.5) # ValueError: sets must be non-empty list
# Unsupported similarity function
SearchIndex(sets, "invalid", 0.5) # ValueError: similarity function not supported
# Invalid threshold
SearchIndex(sets, "jaccard", -0.1) # ValueError: threshold must be in [0, 1]
SearchIndex(sets, "jaccard", 1.5) # ValueError: threshold must be in [0, 1]
# No errors on query - handles empty queries gracefully
index.query([]) # Returns empty list
index.query(["nonexistent_token"]) # Returns empty listO(vocabulary_size + sum(prefix_sizes))Install with Tessl CLI
npx tessl i tessl/pypi-set-similarity-search