A library for summarizing data in streams for which it is infeasible to store all events
—
Data structures for estimating the frequency of items in a data stream with configurable error bounds and confidence levels. These algorithms provide memory-efficient approximations for tracking item counts when exact counting would require too much memory.
Common interface implemented by all frequency estimation algorithms.
/**
* Interface for frequency estimation algorithms
*/
public interface IFrequency {
/**
* Add count for a long item
* @param item the item to count
* @param count number to add to the item's count
*/
void add(long item, long count);
/**
* Add count for a string item
* @param item the item to count
* @param count number to add to the item's count
*/
void add(String item, long count);
/**
* Estimate count for a long item
* @param item the item to query
* @return estimated count (may overestimate, never underestimates)
*/
long estimateCount(long item);
/**
* Estimate count for a string item
* @param item the item to query
* @return estimated count (may overestimate, never underestimates)
*/
long estimateCount(String item);
/**
* Get total size/count processed
* @return total number of items processed
*/
long size();
}Count-Min Sketch data structure for frequency estimation with configurable error bounds and confidence levels.
/**
* Count-Min Sketch frequency estimator
*/
public class CountMinSketch implements IFrequency, Serializable {
public static final long PRIME_MODULUS = (1L << 31) - 1;
/**
* Create Count-Min Sketch with specified dimensions
* @param depth number of hash functions (affects confidence)
* @param width width of each hash table (affects accuracy)
* @param seed random seed for hash functions
*/
public CountMinSketch(int depth, int width, int seed);
/**
* Create Count-Min Sketch with error and confidence parameters
* @param epsOfTotalCount relative error as fraction of total count
* @param confidence confidence level (e.g., 0.99 for 99%)
* @param seed random seed for hash functions
*/
public CountMinSketch(double epsOfTotalCount, double confidence, int seed);
/**
* Get relative error bound
* @return relative error as fraction
*/
public double getRelativeError();
/**
* Get confidence level
* @return confidence level
*/
public double getConfidence();
/**
* Merge multiple Count-Min Sketches
* @param estimators sketches to merge (must have same dimensions)
* @return merged sketch
* @throws CMSMergeException if sketches are incompatible
*/
public static CountMinSketch merge(CountMinSketch... estimators) throws CMSMergeException;
/**
* Serialize sketch to byte array
* @param sketch sketch to serialize
* @return serialized bytes
* @throws IOException if serialization fails
*/
public static byte[] serialize(CountMinSketch sketch) throws IOException;
/**
* Deserialize sketch from byte array
* @param data serialized sketch data
* @return deserialized sketch
* @throws IOException if deserialization fails
*/
public static CountMinSketch deserialize(byte[] data) throws IOException;
/**
* Exception for Count-Min Sketch merge errors
*/
public static class CMSMergeException extends FrequencyMergeException {
public CMSMergeException(String message);
}
}Usage Examples:
import com.clearspring.analytics.stream.frequency.CountMinSketch;
// Create with 1% error, 99% confidence
CountMinSketch cms = new CountMinSketch(0.01, 0.99, 1234);
// Add items
cms.add("apple", 5);
cms.add("banana", 3);
cms.add("cherry", 7);
cms.add("apple", 2); // apple now has count ~7
// Query frequencies
long appleCount = cms.estimateCount("apple"); // returns >= 7
long bananaCount = cms.estimateCount("banana"); // returns >= 3
long unknownCount = cms.estimateCount("unknown"); // returns >= 0
// Total items processed
long totalSize = cms.size(); // returns 17Conservative update variant of Count-Min Sketch that reduces overestimation by using minimum update strategy.
/**
* Conservative Count-Min Sketch with reduced overestimation
*/
public class ConservativeAddSketch extends CountMinSketch {
/**
* Create Conservative Count-Min Sketch with specified dimensions
* @param depth number of hash functions
* @param width width of each hash table
* @param seed random seed for hash functions
*/
public ConservativeAddSketch(int depth, int width, int seed);
/**
* Create Conservative Count-Min Sketch with error and confidence parameters
* @param epsOfTotalCount relative error as fraction of total count
* @param confidence confidence level
* @param seed random seed for hash functions
*/
public ConservativeAddSketch(double epsOfTotalCount, double confidence, int seed);
}Base exception class for frequency estimation merge errors.
/**
* Base exception for frequency estimation merge errors
*/
public abstract class FrequencyMergeException extends Exception {
public FrequencyMergeException();
public FrequencyMergeException(String message);
}// Create sketch with desired accuracy
CountMinSketch cms = new CountMinSketch(0.01, 0.99, 42);
// Process stream data
cms.add("user123", 1);
cms.add("user456", 1);
cms.add("user123", 1); // user123 now has count 2
// Query frequencies
long user123Count = cms.estimateCount("user123"); // >= 2
long user456Count = cms.estimateCount("user456"); // >= 1CountMinSketch cms = new CountMinSketch(0.001, 0.999, 1);
long threshold = 1000; // define heavy hitter threshold
// Process data stream
for (String item : streamData) {
cms.add(item, 1);
// Check if item is a heavy hitter
if (cms.estimateCount(item) >= threshold) {
System.out.println(item + " is a heavy hitter");
}
}// Create compatible sketches on different nodes
CountMinSketch cms1 = new CountMinSketch(0.01, 0.99, 123);
CountMinSketch cms2 = new CountMinSketch(0.01, 0.99, 123);
// Process data on each node
cms1.add("item1", 10);
cms1.add("item2", 5);
cms2.add("item1", 8);
cms2.add("item3", 3);
// Merge sketches
CountMinSketch merged = CountMinSketch.merge(cms1, cms2);
// Query combined frequencies
long item1Count = merged.estimateCount("item1"); // >= 18
long item2Count = merged.estimateCount("item2"); // >= 5
long item3Count = merged.estimateCount("item3"); // >= 3CountMinSketch cms = new CountMinSketch(0.01, 0.99, 456);
// Add data
cms.add("data1", 100);
cms.add("data2", 200);
// Serialize to bytes
byte[] serialized = CountMinSketch.serialize(cms);
// Later, deserialize
CountMinSketch restored = CountMinSketch.deserialize(serialized);
// Verify data is preserved
long data1Count = restored.estimateCount("data1"); // >= 100Depth (d): Controls confidence level
Width (w): Controls accuracy
Conservative vs Standard:
Count-Min Sketch memory usage: depth × width × 8 bytes
Examples:
Install with Tessl CLI
npx tessl i tessl/maven-com-clearspring-analytics--stream