A library for summarizing data in streams for which it is infeasible to store all events
—
Probabilistic algorithms for estimating the number of unique elements in a data stream. These algorithms provide memory-efficient approximations with configurable accuracy trade-offs and support merging for distributed computing scenarios.
Common interface implemented by all cardinality estimation algorithms.
/**
* Interface for cardinality estimation algorithms
*/
public interface ICardinality {
/**
* Add element to the estimator
* @param o stream element
* @return false if cardinality estimate is unaffected by this element
*/
boolean offer(Object o);
/**
* Add pre-hashed element to the estimator
* @param hashedLong pre-computed hash of the element
* @return false if cardinality estimate is unaffected
*/
boolean offerHashed(long hashedLong);
/**
* Add pre-hashed element to the estimator
* @param hashedInt pre-computed hash of the element
* @return false if cardinality estimate is unaffected
*/
boolean offerHashed(int hashedInt);
/**
* Get estimated number of unique elements
* @return estimated cardinality
*/
long cardinality();
/**
* Get size in bytes needed for serialization
* @return byte size
*/
int sizeof();
/**
* Serialize the estimator to byte array
* @return serialized bytes
* @throws IOException if serialization fails
*/
byte[] getBytes() throws IOException;
/**
* Merge estimators to produce combined estimate
* @param estimators compatible estimators to merge
* @return new estimator for combined streams
* @throws CardinalityMergeException if estimators are incompatible
*/
ICardinality merge(ICardinality... estimators) throws CardinalityMergeException;
}HyperLogLog algorithm for cardinality estimation with accuracy = 1.04/sqrt(m) where m = 2^b. Requires 64% less space than LogLog for the same accuracy.
/**
* HyperLogLog cardinality estimator
*/
public class HyperLogLog implements ICardinality, Serializable {
/**
* Create HyperLogLog with specified relative standard deviation
* @param rsd relative standard deviation (e.g., 0.1 for 10%)
*/
public HyperLogLog(double rsd);
/**
* Create HyperLogLog with specified log2m parameter
* @param log2m log base 2 of number of buckets (4-16 recommended)
*/
public HyperLogLog(int log2m);
/**
* Create HyperLogLog with log2m parameter and existing register set
* @param log2m log base 2 of number of buckets
* @param registerSet existing register set to use
*/
public HyperLogLog(int log2m, RegisterSet registerSet);
/**
* Merge another HyperLogLog into this one
* @param other HyperLogLog to merge
*/
public void addAll(HyperLogLog other);
/**
* Builder for HyperLogLog configuration
*/
public static class Builder implements IBuilder<HyperLogLog> {
public Builder(double rsd);
public Builder withb(int b);
public HyperLogLog build();
public int sizeof();
// Static factory methods
public static Builder withLog2m(int log2m);
public static Builder withRsd(double rsd);
public static Builder withAccuracy(double accuracy);
// Build from serialized data
public static HyperLogLog build(byte[] bytes) throws IOException;
public static HyperLogLog build(DataInput serializedByteStream) throws IOException;
}
public static class HyperLogLogMergeException extends CardinalityMergeException {
public HyperLogLogMergeException(String message);
}
}Usage Examples:
import com.clearspring.analytics.stream.cardinality.HyperLogLog;
// Create with 10% relative standard deviation
HyperLogLog hll = new HyperLogLog(0.1);
// Add elements
hll.offer("user123");
hll.offer("user456");
hll.offer("user123"); // duplicate, won't affect cardinality much
// Get estimate
long uniqueUsers = hll.cardinality();
// Merge with another HLL
HyperLogLog hll2 = new HyperLogLog(0.1);
hll2.offer("user789");
HyperLogLog merged = (HyperLogLog) hll.merge(hll2);Enhanced HyperLogLog with improved accuracy for small cardinalities through sparse representation.
/**
* HyperLogLog++ with improved small cardinality accuracy
*/
public class HyperLogLogPlus implements ICardinality, Serializable {
/**
* Create HyperLogLogPlus with precision parameter
* @param p precision parameter (4-25 recommended)
*/
public HyperLogLogPlus(int p);
/**
* Create HyperLogLogPlus with precision and sparse precision
* @param p normal precision parameter
* @param sp sparse precision parameter
*/
public HyperLogLogPlus(int p, int sp);
/**
* Builder for HyperLogLogPlus configuration
*/
public static class Builder implements IBuilder<HyperLogLogPlus> {
public Builder(int p);
public Builder(int p, int sp);
public HyperLogLogPlus build();
public int sizeof();
}
}Original LogLog cardinality estimation algorithm. Less memory efficient than HyperLogLog but simpler implementation.
/**
* LogLog cardinality estimator
*/
public class LogLog implements ICardinality {
/**
* Create LogLog with k parameter
* @param k parameter controlling accuracy vs memory (4-16 recommended)
*/
public LogLog(int k);
/**
* Create LogLog from existing data
* @param M byte array representing internal state
*/
public LogLog(byte[] M);
/**
* Merge multiple LogLog estimators
* @param estimators LogLog instances to merge
* @return merged LogLog estimator
* @throws LogLogMergeException if estimators are incompatible
*/
public static LogLog mergeEstimators(LogLog... estimators) throws LogLogMergeException;
/**
* Helper function for LogLog algorithm
* @param x input value
* @param k parameter
* @return processed value
*/
public static int rho(int x, int k);
/**
* Builder for LogLog configuration
*/
public static class Builder implements IBuilder<LogLog> {
public Builder(int k);
public LogLog build();
public int sizeof();
}
public static class LogLogMergeException extends CardinalityMergeException {
public LogLogMergeException(String message);
}
}Linear counting algorithm for cardinality estimation using bit arrays. More accurate for small cardinalities but uses more memory.
/**
* Linear counting cardinality estimator
*/
public class LinearCounting implements ICardinality {
/**
* Create LinearCounting with bit array size
* @param size size of the bit array
*/
public LinearCounting(int size);
/**
* Create LinearCounting from existing bit map
* @param map byte array representing bit map
*/
public LinearCounting(byte[] map);
/**
* Get utilization ratio of the bit array
* @return ratio of set bits (0.0 to 1.0)
*/
public double getUtilization();
/**
* Get count of unset bits
* @return number of zero bits
*/
public int getCount();
/**
* Check if the bit array is saturated
* @return true if too many bits are set for accurate estimation
*/
public boolean isSaturated();
/**
* Merge multiple LinearCounting estimators
* @param estimators LinearCounting instances to merge
* @return merged LinearCounting estimator
* @throws LinearCountingMergeException if estimators are incompatible
*/
public static LinearCounting mergeEstimators(LinearCounting... estimators)
throws LinearCountingMergeException;
/**
* Advanced builder with error calculation
*/
public static class Builder implements IBuilder<LinearCounting> {
public Builder(int size);
public Builder withSize(int size);
public LinearCounting build();
public int sizeof();
}
public static class LinearCountingMergeException extends CardinalityMergeException {
public LinearCountingMergeException(String message);
}
}Adaptive counting algorithm that automatically switches between different estimation techniques based on the current cardinality.
/**
* Adaptive counting with automatic algorithm switching
*/
public class AdaptiveCounting extends LogLog {
/**
* Create AdaptiveCounting with k parameter
* @param k parameter controlling accuracy vs memory
*/
public AdaptiveCounting(int k);
/**
* Create AdaptiveCounting from existing data
* @param M byte array representing internal state
*/
public AdaptiveCounting(byte[] M);
}Hybrid approach that counts exactly up to a threshold, then switches to probabilistic estimation.
/**
* Hybrid exact counting then estimation
*/
public class CountThenEstimate implements ICardinality {
/**
* Create hybrid estimator
* @param exactCountingThreshold threshold for switching to estimation
* @param backingEstimator probabilistic estimator to use after threshold
*/
public CountThenEstimate(int exactCountingThreshold, ICardinality backingEstimator);
}Efficient bit-packed register storage used internally by HyperLogLog algorithms.
/**
* Bit-packed register storage for HyperLogLog
*/
public class RegisterSet {
public static final int LOG2_BITS_PER_WORD = 6;
public static final int REGISTER_SIZE = 5;
/**
* Create register set with specified count
* @param count number of registers
*/
public RegisterSet(int count);
/**
* Create register set with initial values
* @param count number of registers
* @param initialValues initial register values
*/
public RegisterSet(int count, int[] initialValues);
/**
* Calculate bits needed for count registers
* @param count number of registers
* @return bits required
*/
public static int getBits(int count);
/**
* Calculate size for count registers
* @param count number of registers
* @return size in integers
*/
public static int getSizeForCount(int count);
/**
* Set register value
* @param position register position
* @param value value to set
*/
public void set(int position, int value);
/**
* Get register value
* @param position register position
* @return register value
*/
public int get(int position);
/**
* Update register if new value is greater
* @param position register position
* @param value potential new value
* @return true if register was updated
*/
public boolean updateIfGreater(int position, int value);
/**
* Merge with another register set
* @param that register set to merge with
*/
public void merge(RegisterSet that);
/**
* Get copy of internal bit array
* @return copy of internal array
*/
public int[] bits();
}// For general use, HyperLogLog is recommended
HyperLogLog hll = new HyperLogLog(0.05); // 5% relative standard deviation
// Add elements
hll.offer("element1");
hll.offer("element2");
hll.offer("element1"); // duplicate
// Get estimate
long cardinality = hll.cardinality();// Create compatible estimators
HyperLogLog hll1 = new HyperLogLog(0.1);
HyperLogLog hll2 = new HyperLogLog(0.1);
// Process data on different nodes
hll1.offer("user123");
hll1.offer("user456");
hll2.offer("user789");
hll2.offer("user456"); // duplicate across nodes
// Merge estimators
HyperLogLog combined = (HyperLogLog) hll1.merge(hll2);
long totalUniqueUsers = combined.cardinality();Install with Tessl CLI
npx tessl i tessl/maven-com-clearspring-analytics--stream