A library for summarizing data in streams for which it is infeasible to store all events
—
Bloom filters and related data structures for probabilistic set membership testing. These provide memory-efficient ways to test whether an element is in a set, with configurable false positive rates but no false negatives.
Base class for membership filters providing common functionality.
/**
* Abstract base class for membership filters
*/
public abstract class Filter {
protected int hashCount;
/**
* Get number of hash functions used
* @return hash function count
*/
public int getHashCount();
/**
* Get hash buckets for string key
* @param key string to hash
* @return array of hash bucket indices
*/
protected int[] getHashBuckets(String key);
/**
* Get hash buckets for byte array key
* @param key bytes to hash
* @return array of hash bucket indices
*/
protected int[] getHashBuckets(byte[] key);
/**
* Static method to get hash buckets
* @param key string to hash
* @param hashCount number of hash functions
* @param max maximum bucket index
* @return array of hash bucket indices
*/
public static int[] getHashBuckets(String key, int hashCount, int max);
}Bloom filter implementation for probabilistic set membership testing with configurable false positive probability.
/**
* Bloom filter for set membership testing
*/
public class BloomFilter extends Filter {
/**
* Create Bloom filter with specified capacity and buckets per element
* @param numElements expected number of elements
* @param bucketsPerElement buckets per element (affects false positive rate)
*/
public BloomFilter(int numElements, int bucketsPerElement);
/**
* Create Bloom filter with specified capacity and false positive probability
* @param numElements expected number of elements
* @param maxFalsePosProbability maximum false positive probability (0.0 to 1.0)
*/
public BloomFilter(int numElements, double maxFalsePosProbability);
/**
* Clear all elements from the filter
*/
public void clear();
/**
* Get number of buckets in the filter
* @return bucket count
*/
public int buckets();
/**
* Test if string key might be in the set
* @param key string to test
* @return false if definitely not in set, true if might be in set
*/
public boolean isPresent(String key);
/**
* Test if byte array key might be in the set
* @param key bytes to test
* @return false if definitely not in set, true if might be in set
*/
public boolean isPresent(byte[] key);
/**
* Add string key to the filter
* @param key string to add
*/
public void add(String key);
/**
* Add byte array key to the filter
* @param key bytes to add
*/
public void add(byte[] key);
/**
* Merge another Bloom filter into this one
* @param other Bloom filter to merge (must be compatible)
*/
public void addAll(BloomFilter other);
/**
* Merge multiple filters into a new filter
* @param filters filters to merge
* @return new merged filter
*/
public Filter merge(Filter... filters);
/**
* Create filter that always returns true for isPresent
* @return always-matching Bloom filter
*/
public static BloomFilter alwaysMatchingBloomFilter();
/**
* Get serializer for Bloom filters
* @return compact serializer instance
*/
public static ICompactSerializer<BloomFilter> serializer();
}Usage Examples:
import com.clearspring.analytics.stream.membership.BloomFilter;
// Create filter expecting 1000 elements with 1% false positive rate
BloomFilter filter = new BloomFilter(1000, 0.01);
// Add elements to the set
filter.add("user123");
filter.add("user456");
filter.add("user789");
// Test membership
boolean mightContain123 = filter.isPresent("user123"); // true
boolean mightContain999 = filter.isPresent("user999"); // false (or true if false positive)
// Add more elements
filter.add("session_abc");
filter.add("session_def");
// Test with byte arrays
byte[] keyBytes = "some_key".getBytes();
filter.add(keyBytes);
boolean containsKey = filter.isPresent(keyBytes); // trueUtility class for calculating optimal Bloom filter parameters.
/**
* Utility class for Bloom filter parameter calculations
*/
public class BloomCalculations {
/**
* Compute optimal number of hash functions
* @param bucketsPerElement buckets per element ratio
* @return optimal number of hash functions
*/
public static int computeBestK(int bucketsPerElement);
/**
* Compute optimal buckets and hash functions for target false positive rate
* @param maxFalsePosProbability desired false positive probability
* @return specification with optimal parameters
*/
public static BloomSpecification computeBucketsAndK(double maxFalsePosProbability);
/**
* Bloom filter specification with calculated parameters
*/
public static class BloomSpecification {
public final int K; // Number of hash functions
public final int bucketsPerElement; // Buckets per element
public BloomSpecification(int k, int bucketsPerElement);
}
}Interface for compact serialization of data structures.
/**
* Interface for compact serialization
*/
public interface ICompactSerializer<T> {
/**
* Serialize object to data output
* @param t object to serialize
* @param out output stream
* @throws IOException if serialization fails
*/
void serialize(T t, DataOutput out) throws IOException;
/**
* Deserialize object from data input
* @param in input stream
* @return deserialized object
* @throws IOException if deserialization fails
*/
T deserialize(DataInput in) throws IOException;
/**
* Get serialized size of object
* @param t object to measure
* @return size in bytes
*/
long serializedSize(T t);
}Utility classes for buffered I/O operations.
/**
* Buffer for data input operations
*/
public class DataInputBuffer implements DataInput {
/**
* Create empty input buffer
*/
public DataInputBuffer();
/**
* Create input buffer with initial data
* @param bytes initial data
*/
public DataInputBuffer(byte[] bytes);
/**
* Reset buffer with new input data
* @param input new data array
*/
public void reset(byte[] input);
/**
* Reset buffer with range of input data
* @param input data array
* @param start start index
* @param length number of bytes
*/
public void reset(byte[] input, int start, int length);
/**
* Get underlying data array
* @return data bytes
*/
public byte[] getData();
/**
* Get current position in buffer
* @return current position
*/
public int getPosition();
/**
* Get length of valid data
* @return data length
*/
public int getLength();
}
/**
* Buffer for data output operations
*/
public class DataOutputBuffer implements DataOutput {
/**
* Create output buffer with default size
*/
public DataOutputBuffer();
/**
* Create output buffer with specified initial size
* @param size initial buffer size
*/
public DataOutputBuffer(int size);
/**
* Get data written to buffer
* @return byte array of written data
*/
public byte[] getData();
/**
* Get length of data written
* @return number of bytes written
*/
public int getLength();
/**
* Reset buffer to empty state
*/
public void reset();
/**
* Write buffer contents to output stream
* @param out output stream
* @throws IOException if write fails
*/
public void writeTo(OutputStream out) throws IOException;
}Utility for serializing BitSet objects.
/**
* Serialization utilities for BitSet
*/
public class BitSetSerializer {
/**
* Serialize BitSet to data output
* @param bs BitSet to serialize
* @param out output stream
* @throws IOException if serialization fails
*/
public static void serialize(OpenBitSet bs, DataOutput out) throws IOException;
/**
* Deserialize BitSet from data input
* @param in input stream
* @return deserialized BitSet
* @throws IOException if deserialization fails
*/
public static OpenBitSet deserialize(DataInput in) throws IOException;
}// Create filter for expected 10,000 elements with 0.1% false positive rate
BloomFilter filter = new BloomFilter(10000, 0.001);
// Build the set
Set<String> actualSet = new HashSet<>();
for (String item : itemsToAdd) {
filter.add(item);
actualSet.add(item);
}
// Test membership (filter provides fast pre-filtering)
String candidate = "test_item";
if (filter.isPresent(candidate)) {
// Might be in set, check actual set for confirmation
boolean definitiveAnswer = actualSet.contains(candidate);
} else {
// Definitely not in set, no need to check further
System.out.println(candidate + " is not in the set");
}// Create compatible filters on different nodes
BloomFilter filter1 = new BloomFilter(5000, 0.01);
BloomFilter filter2 = new BloomFilter(5000, 0.01);
// Each node adds its elements
filter1.add("node1_item1");
filter1.add("node1_item2");
filter2.add("node2_item1");
filter2.add("node2_item2");
// Merge filters to represent union of sets
filter1.addAll(filter2);
// Now filter1 represents union of both sets
boolean mightContain = filter1.isPresent("node2_item1"); // true// Use Bloom filter to avoid expensive cache misses
BloomFilter cacheFilter = new BloomFilter(100000, 0.01);
// When adding to cache, also add to filter
void addToCache(String key, Object value) {
cache.put(key, value);
cacheFilter.add(key);
}
// Check filter before expensive cache lookup
Object getValue(String key) {
if (!cacheFilter.isPresent(key)) {
// Definitely not in cache, skip lookup
return computeExpensiveValue(key);
}
// Might be in cache, check cache
Object cached = cache.get(key);
if (cached != null) {
return cached;
}
// False positive, compute value
return computeExpensiveValue(key);
}Elements vs Memory Trade-off:
False Positive Rate Impact:
Hash Function Count:
Bloom filter memory usage: (numElements × bucketsPerElement) bits
Examples:
Install with Tessl CLI
npx tessl i tessl/maven-com-clearspring-analytics--stream