or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cardinality.mdfrequency.mdhash.mdindex.mdmembership.mdquantile.mdstream-summary.md

cardinality.mddocs/

0

# Cardinality Estimation

1

2

Probabilistic algorithms for estimating the number of unique elements in a data stream. These algorithms provide memory-efficient approximations with configurable accuracy trade-offs and support merging for distributed computing scenarios.

3

4

## Capabilities

5

6

### ICardinality Interface

7

8

Common interface implemented by all cardinality estimation algorithms.

9

10

```java { .api }

11

/**

12

* Interface for cardinality estimation algorithms

13

*/

14

public interface ICardinality {

15

/**

16

* Add element to the estimator

17

* @param o stream element

18

* @return false if cardinality estimate is unaffected by this element

19

*/

20

boolean offer(Object o);

21

22

/**

23

* Add pre-hashed element to the estimator

24

* @param hashedLong pre-computed hash of the element

25

* @return false if cardinality estimate is unaffected

26

*/

27

boolean offerHashed(long hashedLong);

28

29

/**

30

* Add pre-hashed element to the estimator

31

* @param hashedInt pre-computed hash of the element

32

* @return false if cardinality estimate is unaffected

33

*/

34

boolean offerHashed(int hashedInt);

35

36

/**

37

* Get estimated number of unique elements

38

* @return estimated cardinality

39

*/

40

long cardinality();

41

42

/**

43

* Get size in bytes needed for serialization

44

* @return byte size

45

*/

46

int sizeof();

47

48

/**

49

* Serialize the estimator to byte array

50

* @return serialized bytes

51

* @throws IOException if serialization fails

52

*/

53

byte[] getBytes() throws IOException;

54

55

/**

56

* Merge estimators to produce combined estimate

57

* @param estimators compatible estimators to merge

58

* @return new estimator for combined streams

59

* @throws CardinalityMergeException if estimators are incompatible

60

*/

61

ICardinality merge(ICardinality... estimators) throws CardinalityMergeException;

62

}

63

```

64

65

### HyperLogLog

66

67

HyperLogLog algorithm for cardinality estimation with accuracy = 1.04/sqrt(m) where m = 2^b. Requires 64% less space than LogLog for the same accuracy.

68

69

```java { .api }

70

/**

71

* HyperLogLog cardinality estimator

72

*/

73

public class HyperLogLog implements ICardinality, Serializable {

74

/**

75

* Create HyperLogLog with specified relative standard deviation

76

* @param rsd relative standard deviation (e.g., 0.1 for 10%)

77

*/

78

public HyperLogLog(double rsd);

79

80

/**

81

* Create HyperLogLog with specified log2m parameter

82

* @param log2m log base 2 of number of buckets (4-16 recommended)

83

*/

84

public HyperLogLog(int log2m);

85

86

/**

87

* Create HyperLogLog with log2m parameter and existing register set

88

* @param log2m log base 2 of number of buckets

89

* @param registerSet existing register set to use

90

*/

91

public HyperLogLog(int log2m, RegisterSet registerSet);

92

93

/**

94

* Merge another HyperLogLog into this one

95

* @param other HyperLogLog to merge

96

*/

97

public void addAll(HyperLogLog other);

98

99

/**

100

* Builder for HyperLogLog configuration

101

*/

102

public static class Builder implements IBuilder<HyperLogLog> {

103

public Builder(double rsd);

104

public Builder withb(int b);

105

public HyperLogLog build();

106

public int sizeof();

107

108

// Static factory methods

109

public static Builder withLog2m(int log2m);

110

public static Builder withRsd(double rsd);

111

public static Builder withAccuracy(double accuracy);

112

113

// Build from serialized data

114

public static HyperLogLog build(byte[] bytes) throws IOException;

115

public static HyperLogLog build(DataInput serializedByteStream) throws IOException;

116

}

117

118

public static class HyperLogLogMergeException extends CardinalityMergeException {

119

public HyperLogLogMergeException(String message);

120

}

121

}

122

```

123

124

**Usage Examples:**

125

126

```java

127

import com.clearspring.analytics.stream.cardinality.HyperLogLog;

128

129

// Create with 10% relative standard deviation

130

HyperLogLog hll = new HyperLogLog(0.1);

131

132

// Add elements

133

hll.offer("user123");

134

hll.offer("user456");

135

hll.offer("user123"); // duplicate, won't affect cardinality much

136

137

// Get estimate

138

long uniqueUsers = hll.cardinality();

139

140

// Merge with another HLL

141

HyperLogLog hll2 = new HyperLogLog(0.1);

142

hll2.offer("user789");

143

HyperLogLog merged = (HyperLogLog) hll.merge(hll2);

144

```

145

146

### HyperLogLogPlus

147

148

Enhanced HyperLogLog with improved accuracy for small cardinalities through sparse representation.

149

150

```java { .api }

151

/**

152

* HyperLogLog++ with improved small cardinality accuracy

153

*/

154

public class HyperLogLogPlus implements ICardinality, Serializable {

155

/**

156

* Create HyperLogLogPlus with precision parameter

157

* @param p precision parameter (4-25 recommended)

158

*/

159

public HyperLogLogPlus(int p);

160

161

/**

162

* Create HyperLogLogPlus with precision and sparse precision

163

* @param p normal precision parameter

164

* @param sp sparse precision parameter

165

*/

166

public HyperLogLogPlus(int p, int sp);

167

168

/**

169

* Builder for HyperLogLogPlus configuration

170

*/

171

public static class Builder implements IBuilder<HyperLogLogPlus> {

172

public Builder(int p);

173

public Builder(int p, int sp);

174

public HyperLogLogPlus build();

175

public int sizeof();

176

}

177

}

178

```

179

180

### LogLog

181

182

Original LogLog cardinality estimation algorithm. Less memory efficient than HyperLogLog but simpler implementation.

183

184

```java { .api }

185

/**

186

* LogLog cardinality estimator

187

*/

188

public class LogLog implements ICardinality {

189

/**

190

* Create LogLog with k parameter

191

* @param k parameter controlling accuracy vs memory (4-16 recommended)

192

*/

193

public LogLog(int k);

194

195

/**

196

* Create LogLog from existing data

197

* @param M byte array representing internal state

198

*/

199

public LogLog(byte[] M);

200

201

/**

202

* Merge multiple LogLog estimators

203

* @param estimators LogLog instances to merge

204

* @return merged LogLog estimator

205

* @throws LogLogMergeException if estimators are incompatible

206

*/

207

public static LogLog mergeEstimators(LogLog... estimators) throws LogLogMergeException;

208

209

/**

210

* Helper function for LogLog algorithm

211

* @param x input value

212

* @param k parameter

213

* @return processed value

214

*/

215

public static int rho(int x, int k);

216

217

/**

218

* Builder for LogLog configuration

219

*/

220

public static class Builder implements IBuilder<LogLog> {

221

public Builder(int k);

222

public LogLog build();

223

public int sizeof();

224

}

225

226

public static class LogLogMergeException extends CardinalityMergeException {

227

public LogLogMergeException(String message);

228

}

229

}

230

```

231

232

### LinearCounting

233

234

Linear counting algorithm for cardinality estimation using bit arrays. More accurate for small cardinalities but uses more memory.

235

236

```java { .api }

237

/**

238

* Linear counting cardinality estimator

239

*/

240

public class LinearCounting implements ICardinality {

241

/**

242

* Create LinearCounting with bit array size

243

* @param size size of the bit array

244

*/

245

public LinearCounting(int size);

246

247

/**

248

* Create LinearCounting from existing bit map

249

* @param map byte array representing bit map

250

*/

251

public LinearCounting(byte[] map);

252

253

/**

254

* Get utilization ratio of the bit array

255

* @return ratio of set bits (0.0 to 1.0)

256

*/

257

public double getUtilization();

258

259

/**

260

* Get count of unset bits

261

* @return number of zero bits

262

*/

263

public int getCount();

264

265

/**

266

* Check if the bit array is saturated

267

* @return true if too many bits are set for accurate estimation

268

*/

269

public boolean isSaturated();

270

271

/**

272

* Merge multiple LinearCounting estimators

273

* @param estimators LinearCounting instances to merge

274

* @return merged LinearCounting estimator

275

* @throws LinearCountingMergeException if estimators are incompatible

276

*/

277

public static LinearCounting mergeEstimators(LinearCounting... estimators)

278

throws LinearCountingMergeException;

279

280

/**

281

* Advanced builder with error calculation

282

*/

283

public static class Builder implements IBuilder<LinearCounting> {

284

public Builder(int size);

285

public Builder withSize(int size);

286

public LinearCounting build();

287

public int sizeof();

288

}

289

290

public static class LinearCountingMergeException extends CardinalityMergeException {

291

public LinearCountingMergeException(String message);

292

}

293

}

294

```

295

296

### AdaptiveCounting

297

298

Adaptive counting algorithm that automatically switches between different estimation techniques based on the current cardinality.

299

300

```java { .api }

301

/**

302

* Adaptive counting with automatic algorithm switching

303

*/

304

public class AdaptiveCounting extends LogLog {

305

/**

306

* Create AdaptiveCounting with k parameter

307

* @param k parameter controlling accuracy vs memory

308

*/

309

public AdaptiveCounting(int k);

310

311

/**

312

* Create AdaptiveCounting from existing data

313

* @param M byte array representing internal state

314

*/

315

public AdaptiveCounting(byte[] M);

316

}

317

```

318

319

### CountThenEstimate

320

321

Hybrid approach that counts exactly up to a threshold, then switches to probabilistic estimation.

322

323

```java { .api }

324

/**

325

* Hybrid exact counting then estimation

326

*/

327

public class CountThenEstimate implements ICardinality {

328

/**

329

* Create hybrid estimator

330

* @param exactCountingThreshold threshold for switching to estimation

331

* @param backingEstimator probabilistic estimator to use after threshold

332

*/

333

public CountThenEstimate(int exactCountingThreshold, ICardinality backingEstimator);

334

}

335

```

336

337

### RegisterSet

338

339

Efficient bit-packed register storage used internally by HyperLogLog algorithms.

340

341

```java { .api }

342

/**

343

* Bit-packed register storage for HyperLogLog

344

*/

345

public class RegisterSet {

346

public static final int LOG2_BITS_PER_WORD = 6;

347

public static final int REGISTER_SIZE = 5;

348

349

/**

350

* Create register set with specified count

351

* @param count number of registers

352

*/

353

public RegisterSet(int count);

354

355

/**

356

* Create register set with initial values

357

* @param count number of registers

358

* @param initialValues initial register values

359

*/

360

public RegisterSet(int count, int[] initialValues);

361

362

/**

363

* Calculate bits needed for count registers

364

* @param count number of registers

365

* @return bits required

366

*/

367

public static int getBits(int count);

368

369

/**

370

* Calculate size for count registers

371

* @param count number of registers

372

* @return size in integers

373

*/

374

public static int getSizeForCount(int count);

375

376

/**

377

* Set register value

378

* @param position register position

379

* @param value value to set

380

*/

381

public void set(int position, int value);

382

383

/**

384

* Get register value

385

* @param position register position

386

* @return register value

387

*/

388

public int get(int position);

389

390

/**

391

* Update register if new value is greater

392

* @param position register position

393

* @param value potential new value

394

* @return true if register was updated

395

*/

396

public boolean updateIfGreater(int position, int value);

397

398

/**

399

* Merge with another register set

400

* @param that register set to merge with

401

*/

402

public void merge(RegisterSet that);

403

404

/**

405

* Get copy of internal bit array

406

* @return copy of internal array

407

*/

408

public int[] bits();

409

}

410

```

411

412

## Usage Patterns

413

414

### Basic Cardinality Estimation

415

416

```java

417

// For general use, HyperLogLog is recommended

418

HyperLogLog hll = new HyperLogLog(0.05); // 5% relative standard deviation

419

420

// Add elements

421

hll.offer("element1");

422

hll.offer("element2");

423

hll.offer("element1"); // duplicate

424

425

// Get estimate

426

long cardinality = hll.cardinality();

427

```

428

429

### Distributed Cardinality Estimation

430

431

```java

432

// Create compatible estimators

433

HyperLogLog hll1 = new HyperLogLog(0.1);

434

HyperLogLog hll2 = new HyperLogLog(0.1);

435

436

// Process data on different nodes

437

hll1.offer("user123");

438

hll1.offer("user456");

439

440

hll2.offer("user789");

441

hll2.offer("user456"); // duplicate across nodes

442

443

// Merge estimators

444

HyperLogLog combined = (HyperLogLog) hll1.merge(hll2);

445

long totalUniqueUsers = combined.cardinality();

446

```

447

448

### Algorithm Selection Guidelines

449

450

- **HyperLogLog**: Best general-purpose algorithm, good accuracy and memory efficiency

451

- **HyperLogLogPlus**: Use when small cardinalities need high accuracy

452

- **LinearCounting**: Use for small cardinalities when memory is not a constraint

453

- **LogLog**: Use when simplicity is preferred over efficiency

454

- **AdaptiveCounting**: Use when cardinality range is unknown

455

- **CountThenEstimate**: Use when exact small counts are needed but large counts can be estimated