or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cardinality.mdfrequency.mdhash.mdindex.mdmembership.mdquantile.mdstream-summary.md

membership.mddocs/

0

# Set Membership Testing

1

2

Bloom filters and related data structures for probabilistic set membership testing. These provide memory-efficient ways to test whether an element is in a set, with configurable false positive rates but no false negatives.

3

4

## Capabilities

5

6

### Filter (Abstract Base Class)

7

8

Base class for membership filters providing common functionality.

9

10

```java { .api }

11

/**

12

* Abstract base class for membership filters

13

*/

14

public abstract class Filter {

15

protected int hashCount;

16

17

/**

18

* Get number of hash functions used

19

* @return hash function count

20

*/

21

public int getHashCount();

22

23

/**

24

* Get hash buckets for string key

25

* @param key string to hash

26

* @return array of hash bucket indices

27

*/

28

protected int[] getHashBuckets(String key);

29

30

/**

31

* Get hash buckets for byte array key

32

* @param key bytes to hash

33

* @return array of hash bucket indices

34

*/

35

protected int[] getHashBuckets(byte[] key);

36

37

/**

38

* Static method to get hash buckets

39

* @param key string to hash

40

* @param hashCount number of hash functions

41

* @param max maximum bucket index

42

* @return array of hash bucket indices

43

*/

44

public static int[] getHashBuckets(String key, int hashCount, int max);

45

}

46

```

47

48

### BloomFilter

49

50

Bloom filter implementation for probabilistic set membership testing with configurable false positive probability.

51

52

```java { .api }

53

/**

54

* Bloom filter for set membership testing

55

*/

56

public class BloomFilter extends Filter {

57

/**

58

* Create Bloom filter with specified capacity and buckets per element

59

* @param numElements expected number of elements

60

* @param bucketsPerElement buckets per element (affects false positive rate)

61

*/

62

public BloomFilter(int numElements, int bucketsPerElement);

63

64

/**

65

* Create Bloom filter with specified capacity and false positive probability

66

* @param numElements expected number of elements

67

* @param maxFalsePosProbability maximum false positive probability (0.0 to 1.0)

68

*/

69

public BloomFilter(int numElements, double maxFalsePosProbability);

70

71

/**

72

* Clear all elements from the filter

73

*/

74

public void clear();

75

76

/**

77

* Get number of buckets in the filter

78

* @return bucket count

79

*/

80

public int buckets();

81

82

/**

83

* Test if string key might be in the set

84

* @param key string to test

85

* @return false if definitely not in set, true if might be in set

86

*/

87

public boolean isPresent(String key);

88

89

/**

90

* Test if byte array key might be in the set

91

* @param key bytes to test

92

* @return false if definitely not in set, true if might be in set

93

*/

94

public boolean isPresent(byte[] key);

95

96

/**

97

* Add string key to the filter

98

* @param key string to add

99

*/

100

public void add(String key);

101

102

/**

103

* Add byte array key to the filter

104

* @param key bytes to add

105

*/

106

public void add(byte[] key);

107

108

/**

109

* Merge another Bloom filter into this one

110

* @param other Bloom filter to merge (must be compatible)

111

*/

112

public void addAll(BloomFilter other);

113

114

/**

115

* Merge multiple filters into a new filter

116

* @param filters filters to merge

117

* @return new merged filter

118

*/

119

public Filter merge(Filter... filters);

120

121

/**

122

* Create filter that always returns true for isPresent

123

* @return always-matching Bloom filter

124

*/

125

public static BloomFilter alwaysMatchingBloomFilter();

126

127

/**

128

* Get serializer for Bloom filters

129

* @return compact serializer instance

130

*/

131

public static ICompactSerializer<BloomFilter> serializer();

132

}

133

```

134

135

**Usage Examples:**

136

137

```java

138

import com.clearspring.analytics.stream.membership.BloomFilter;

139

140

// Create filter expecting 1000 elements with 1% false positive rate

141

BloomFilter filter = new BloomFilter(1000, 0.01);

142

143

// Add elements to the set

144

filter.add("user123");

145

filter.add("user456");

146

filter.add("user789");

147

148

// Test membership

149

boolean mightContain123 = filter.isPresent("user123"); // true

150

boolean mightContain999 = filter.isPresent("user999"); // false (or true if false positive)

151

152

// Add more elements

153

filter.add("session_abc");

154

filter.add("session_def");

155

156

// Test with byte arrays

157

byte[] keyBytes = "some_key".getBytes();

158

filter.add(keyBytes);

159

boolean containsKey = filter.isPresent(keyBytes); // true

160

```

161

162

### BloomCalculations

163

164

Utility class for calculating optimal Bloom filter parameters.

165

166

```java { .api }

167

/**

168

* Utility class for Bloom filter parameter calculations

169

*/

170

public class BloomCalculations {

171

/**

172

* Compute optimal number of hash functions

173

* @param bucketsPerElement buckets per element ratio

174

* @return optimal number of hash functions

175

*/

176

public static int computeBestK(int bucketsPerElement);

177

178

/**

179

* Compute optimal buckets and hash functions for target false positive rate

180

* @param maxFalsePosProbability desired false positive probability

181

* @return specification with optimal parameters

182

*/

183

public static BloomSpecification computeBucketsAndK(double maxFalsePosProbability);

184

185

/**

186

* Bloom filter specification with calculated parameters

187

*/

188

public static class BloomSpecification {

189

public final int K; // Number of hash functions

190

public final int bucketsPerElement; // Buckets per element

191

192

public BloomSpecification(int k, int bucketsPerElement);

193

}

194

}

195

```

196

197

### ICompactSerializer

198

199

Interface for compact serialization of data structures.

200

201

```java { .api }

202

/**

203

* Interface for compact serialization

204

*/

205

public interface ICompactSerializer<T> {

206

/**

207

* Serialize object to data output

208

* @param t object to serialize

209

* @param out output stream

210

* @throws IOException if serialization fails

211

*/

212

void serialize(T t, DataOutput out) throws IOException;

213

214

/**

215

* Deserialize object from data input

216

* @param in input stream

217

* @return deserialized object

218

* @throws IOException if deserialization fails

219

*/

220

T deserialize(DataInput in) throws IOException;

221

222

/**

223

* Get serialized size of object

224

* @param t object to measure

225

* @return size in bytes

226

*/

227

long serializedSize(T t);

228

}

229

```

230

231

### Data Buffers

232

233

Utility classes for buffered I/O operations.

234

235

```java { .api }

236

/**

237

* Buffer for data input operations

238

*/

239

public class DataInputBuffer implements DataInput {

240

/**

241

* Create empty input buffer

242

*/

243

public DataInputBuffer();

244

245

/**

246

* Create input buffer with initial data

247

* @param bytes initial data

248

*/

249

public DataInputBuffer(byte[] bytes);

250

251

/**

252

* Reset buffer with new input data

253

* @param input new data array

254

*/

255

public void reset(byte[] input);

256

257

/**

258

* Reset buffer with range of input data

259

* @param input data array

260

* @param start start index

261

* @param length number of bytes

262

*/

263

public void reset(byte[] input, int start, int length);

264

265

/**

266

* Get underlying data array

267

* @return data bytes

268

*/

269

public byte[] getData();

270

271

/**

272

* Get current position in buffer

273

* @return current position

274

*/

275

public int getPosition();

276

277

/**

278

* Get length of valid data

279

* @return data length

280

*/

281

public int getLength();

282

}

283

284

/**

285

* Buffer for data output operations

286

*/

287

public class DataOutputBuffer implements DataOutput {

288

/**

289

* Create output buffer with default size

290

*/

291

public DataOutputBuffer();

292

293

/**

294

* Create output buffer with specified initial size

295

* @param size initial buffer size

296

*/

297

public DataOutputBuffer(int size);

298

299

/**

300

* Get data written to buffer

301

* @return byte array of written data

302

*/

303

public byte[] getData();

304

305

/**

306

* Get length of data written

307

* @return number of bytes written

308

*/

309

public int getLength();

310

311

/**

312

* Reset buffer to empty state

313

*/

314

public void reset();

315

316

/**

317

* Write buffer contents to output stream

318

* @param out output stream

319

* @throws IOException if write fails

320

*/

321

public void writeTo(OutputStream out) throws IOException;

322

}

323

```

324

325

### BitSetSerializer

326

327

Utility for serializing BitSet objects.

328

329

```java { .api }

330

/**

331

* Serialization utilities for BitSet

332

*/

333

public class BitSetSerializer {

334

/**

335

* Serialize BitSet to data output

336

* @param bs BitSet to serialize

337

* @param out output stream

338

* @throws IOException if serialization fails

339

*/

340

public static void serialize(OpenBitSet bs, DataOutput out) throws IOException;

341

342

/**

343

* Deserialize BitSet from data input

344

* @param in input stream

345

* @return deserialized BitSet

346

* @throws IOException if deserialization fails

347

*/

348

public static OpenBitSet deserialize(DataInput in) throws IOException;

349

}

350

```

351

352

## Usage Patterns

353

354

### Basic Set Membership Testing

355

356

```java

357

// Create filter for expected 10,000 elements with 0.1% false positive rate

358

BloomFilter filter = new BloomFilter(10000, 0.001);

359

360

// Build the set

361

Set<String> actualSet = new HashSet<>();

362

for (String item : itemsToAdd) {

363

filter.add(item);

364

actualSet.add(item);

365

}

366

367

// Test membership (filter provides fast pre-filtering)

368

String candidate = "test_item";

369

if (filter.isPresent(candidate)) {

370

// Might be in set, check actual set for confirmation

371

boolean definitiveAnswer = actualSet.contains(candidate);

372

} else {

373

// Definitely not in set, no need to check further

374

System.out.println(candidate + " is not in the set");

375

}

376

```

377

378

### Distributed Set Operations

379

380

```java

381

// Create compatible filters on different nodes

382

BloomFilter filter1 = new BloomFilter(5000, 0.01);

383

BloomFilter filter2 = new BloomFilter(5000, 0.01);

384

385

// Each node adds its elements

386

filter1.add("node1_item1");

387

filter1.add("node1_item2");

388

389

filter2.add("node2_item1");

390

filter2.add("node2_item2");

391

392

// Merge filters to represent union of sets

393

filter1.addAll(filter2);

394

395

// Now filter1 represents union of both sets

396

boolean mightContain = filter1.isPresent("node2_item1"); // true

397

```

398

399

### Cache Filtering

400

401

```java

402

// Use Bloom filter to avoid expensive cache misses

403

BloomFilter cacheFilter = new BloomFilter(100000, 0.01);

404

405

// When adding to cache, also add to filter

406

void addToCache(String key, Object value) {

407

cache.put(key, value);

408

cacheFilter.add(key);

409

}

410

411

// Check filter before expensive cache lookup

412

Object getValue(String key) {

413

if (!cacheFilter.isPresent(key)) {

414

// Definitely not in cache, skip lookup

415

return computeExpensiveValue(key);

416

}

417

418

// Might be in cache, check cache

419

Object cached = cache.get(key);

420

if (cached != null) {

421

return cached;

422

}

423

424

// False positive, compute value

425

return computeExpensiveValue(key);

426

}

427

```

428

429

### Parameter Selection Guidelines

430

431

**Elements vs Memory Trade-off**:

432

- More buckets per element = lower false positive rate but more memory

433

- 8-10 buckets per element gives ~1% false positive rate

434

- 14 buckets per element gives ~0.1% false positive rate

435

436

**False Positive Rate Impact**:

437

- 1% false positive: Good for most applications

438

- 0.1% false positive: Better for applications where false positives are expensive

439

- 0.01% false positive: Use when false positives must be minimized

440

441

**Hash Function Count**:

442

- Optimal K ≈ 0.7 × (buckets per element)

443

- Too few hash functions: Higher false positive rate

444

- Too many hash functions: Slower operations, higher false positive rate

445

446

## Memory Usage

447

448

Bloom filter memory usage: `(numElements × bucketsPerElement) bits`

449

450

Examples:

451

- 10,000 elements, 1% false positive: ~9.6 KB

452

- 100,000 elements, 0.1% false positive: ~180 KB

453

- 1,000,000 elements, 0.01% false positive: ~2.4 MB

454

455

## Performance Characteristics

456

457

- **Add operation**: O(k) where k is number of hash functions

458

- **Membership test**: O(k) where k is number of hash functions

459

- **Space complexity**: O(m) where m is number of bits

460

- **Merge operation**: O(m) bitwise OR of bit arrays