or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

bloom-filters.mdfrequency-counters.mdhash-maps.mdindex.md

bloom-filters.mddocs/

0

# Bloom Filters

1

2

Space-efficient probabilistic data structure for membership testing. Provides fast membership queries with configurable error rates, no false negatives, but possible false positives.

3

4

## Capabilities

5

6

### BloomFilter Class

7

8

Bloom filter implementation with configurable parameters and serialization support.

9

10

```python { .api }

11

class BloomFilter:

12

"""

13

Bloom filter that allows for basic membership tests.

14

Only integers are supported as keys.

15

"""

16

def __init__(self, size: int = 1024, hash_funcs: int = 23, seed: int = 0):

17

"""

18

Create a new bloom filter.

19

20

Parameters:

21

- size: Size of bit array (must be > 0)

22

- hash_funcs: Number of hash functions (must be > 0)

23

- seed: Seed for hash functions (default 0)

24

25

Raises:

26

- AssertionError: If size or hash_funcs <= 0

27

"""

28

29

@classmethod

30

def from_error_rate(cls, members: int, error_rate: float = 1e-4):

31

"""

32

Create bloom filter optimized for specific member count and error rate.

33

34

Parameters:

35

- members: Expected number of members to add

36

- error_rate: Desired false positive rate (default 1e-4)

37

38

Returns:

39

- BloomFilter instance with optimal parameters

40

"""

41

42

def add(self, item: int) -> None:

43

"""

44

Add item to bloom filter.

45

46

Parameters:

47

- item: Integer item to add

48

"""

49

50

def __contains__(self, item: int) -> bool:

51

"""

52

Test membership in bloom filter.

53

54

Parameters:

55

- item: Integer item to test

56

57

Returns:

58

- True if item might be in set (possible false positive)

59

- False if item is definitely not in set (no false negatives)

60

"""

61

```

62

63

### Serialization

64

65

Methods for persisting and loading bloom filter state.

66

67

```python { .api }

68

def to_bytes(self) -> bytes:

69

"""

70

Serialize bloom filter to bytes.

71

72

Returns:

73

- Byte representation of bloom filter

74

"""

75

76

def from_bytes(self, byte_string: bytes):

77

"""

78

Deserialize bloom filter from bytes.

79

80

Parameters:

81

- byte_string: Bytes containing serialized bloom filter

82

83

Returns:

84

- Self (for method chaining)

85

86

Raises:

87

- ValueError: If serialization version is unknown or incompatible

88

"""

89

```

90

91

### Utility Functions

92

93

Helper functions for bloom filter optimization.

94

95

```python { .api }

96

def calculate_size_and_hash_count(members: int, error_rate: float) -> tuple[int, int]:

97

"""

98

Calculate optimal size and hash function count for bloom filter.

99

100

Parameters:

101

- members: Expected number of members

102

- error_rate: Desired false positive rate

103

104

Returns:

105

- Tuple of (bit_count, hash_count) for optimal performance

106

"""

107

```

108

109

## Usage Examples

110

111

### Basic Membership Testing

112

113

```python

114

from preshed.bloom import BloomFilter

115

116

# Create bloom filter

117

bloom = BloomFilter(size=1024, hash_funcs=3)

118

119

# Add items

120

bloom.add(12345)

121

bloom.add(54321)

122

bloom.add(98765)

123

124

# Test membership

125

print(12345 in bloom) # True (definitely added)

126

print(54321 in bloom) # True (definitely added)

127

print(99999 in bloom) # False (probably not added, no false negatives)

128

print(11111 in bloom) # False or True (possible false positive)

129

```

130

131

### Optimized for Error Rate

132

133

```python

134

from preshed.bloom import BloomFilter

135

136

# Create bloom filter optimized for 1000 members with 0.01% error rate

137

bloom = BloomFilter.from_error_rate(members=1000, error_rate=0.0001)

138

139

# Add expected number of items

140

for i in range(1000):

141

bloom.add(i)

142

143

# Test membership

144

print(500 in bloom) # True (added)

145

print(1500 in bloom) # False or True (very low probability of false positive)

146

```

147

148

### Manual Parameter Calculation

149

150

```python

151

from preshed.bloom import BloomFilter, calculate_size_and_hash_count

152

153

# Calculate optimal parameters

154

members = 5000

155

error_rate = 0.001

156

size, hash_funcs = calculate_size_and_hash_count(members, error_rate)

157

158

print(f"Optimal size: {size} bits")

159

print(f"Optimal hash functions: {hash_funcs}")

160

161

# Create bloom filter with calculated parameters

162

bloom = BloomFilter(size=size, hash_funcs=hash_funcs)

163

```

164

165

### Serialization and Persistence

166

167

```python

168

from preshed.bloom import BloomFilter

169

170

# Create and populate bloom filter

171

bloom1 = BloomFilter(size=1024, hash_funcs=3)

172

for i in range(100):

173

bloom1.add(i)

174

175

# Serialize to bytes

176

data = bloom1.to_bytes()

177

print(f"Serialized size: {len(data)} bytes")

178

179

# Create new bloom filter and deserialize

180

bloom2 = BloomFilter()

181

bloom2.from_bytes(data)

182

183

# Verify they work identically

184

for i in range(100):

185

assert (i in bloom1) == (i in bloom2)

186

187

print("Serialization successful!")

188

```

189

190

### Large-Scale Membership Testing

191

192

```python

193

from preshed.bloom import BloomFilter

194

import random

195

196

# Create large bloom filter for millions of items

197

bloom = BloomFilter.from_error_rate(members=1_000_000, error_rate=0.0001)

198

199

# Add random items

200

added_items = set()

201

for _ in range(50000):

202

item = random.randint(1, 10_000_000)

203

bloom.add(item)

204

added_items.add(item)

205

206

# Test known items (should all be True)

207

true_positives = 0

208

for item in list(added_items)[:1000]:

209

if item in bloom:

210

true_positives += 1

211

212

print(f"True positives: {true_positives}/1000") # Should be 1000

213

214

# Test random items (some false positives expected)

215

false_positives = 0

216

for _ in range(10000):

217

item = random.randint(10_000_001, 20_000_000) # Definitely not added

218

if item in bloom:

219

false_positives += 1

220

221

print(f"False positives: {false_positives}/10000") # Should be ~1 (0.01%)

222

```

223

224

### Bloom Filter as Set Alternative

225

226

```python

227

from preshed.bloom import BloomFilter

228

229

def is_prime(n):

230

"""Simple prime check for demonstration."""

231

if n < 2:

232

return False

233

for i in range(2, int(n**0.5) + 1):

234

if n % i == 0:

235

return False

236

return True

237

238

# Store known primes in bloom filter for fast lookup

239

prime_bloom = BloomFilter.from_error_rate(members=10000, error_rate=0.0001)

240

241

# Add first 10000 primes

242

count = 0

243

num = 2

244

while count < 10000:

245

if is_prime(num):

246

prime_bloom.add(num)

247

count += 1

248

num += 1

249

250

# Fast membership testing

251

def might_be_prime(n):

252

"""Quick filter for prime candidates."""

253

if n in prime_bloom:

254

return True # Might be prime (check further)

255

else:

256

return False # Definitely not prime (if n < threshold)

257

258

# Test some numbers

259

test_numbers = [17, 97, 997, 9973, 10007, 10009]

260

for num in test_numbers:

261

if might_be_prime(num):

262

print(f"{num}: Possible prime (check with full algorithm)")

263

else:

264

print(f"{num}: Definitely not prime")

265

```

266

267

### Memory-Efficient Deduplication

268

269

```python

270

from preshed.bloom import BloomFilter

271

272

# Use bloom filter for memory-efficient duplicate detection

273

seen_bloom = BloomFilter.from_error_rate(members=100000, error_rate=0.001)

274

definite_duplicates = set()

275

276

def process_item(item_id):

277

"""Process item, tracking potential duplicates."""

278

if item_id in seen_bloom:

279

# Might be duplicate - need further verification

280

if item_id in definite_duplicates:

281

print(f"Definite duplicate: {item_id}")

282

return False

283

else:

284

# First potential duplicate

285

definite_duplicates.add(item_id)

286

print(f"Potential duplicate (false positive?): {item_id}")

287

else:

288

# Definitely not seen before

289

seen_bloom.add(item_id)

290

print(f"New item: {item_id}")

291

292

return True

293

294

# Process some items

295

items = [1, 2, 3, 1, 4, 2, 5, 6, 1]

296

for item in items:

297

process_item(item)

298

```

299

300

## Performance Characteristics

301

302

- **Space efficiency**: Uses much less memory than storing actual items

303

- **Time complexity**: O(k) where k is number of hash functions

304

- **False positives**: Controllable via size and hash function parameters

305

- **False negatives**: Never occur (guaranteed accuracy for "not in set")

306

- **Scalability**: Performance independent of number of items added

307

- **Hash functions**: Uses MurmurHash for good distribution properties