Cython hash table that trusts the keys are pre-hashed
—
Efficient frequency counting for uint64-valued keys with optional Good-Turing smoothing for probability estimation. Designed for statistical applications and natural language processing tasks.
Frequency counter with statistical operations and probability estimation capabilities.
class PreshCounter:
"""
Count occurrences of uint64-valued keys with optional smoothing.
"""
def __init__(self, initial_size: int = 8):
"""
Create a new frequency counter.
Parameters:
- initial_size: Initial capacity (must be power of 2, defaults to 8)
"""
def __getitem__(self, key: int) -> int:
"""
Get count for key.
Parameters:
- key: 64-bit unsigned integer key
Returns:
- Count for key (0 if key not seen)
"""
def __len__(self) -> int:
"""
Get capacity of underlying map.
Returns:
- Capacity of counter
"""
def __iter__(self) -> Iterator[tuple[int, int]]:
"""
Iterate over (key, count) pairs.
Returns:
- Iterator yielding (key, count) tuples
"""Core counting functionality for frequency tracking.
def inc(self, key: int, inc: int) -> int:
"""
Increment counter for key.
Parameters:
- key: 64-bit unsigned integer key
- inc: Amount to increment (can be negative)
Returns:
- New count for key
Raises:
- Exception: On error
"""Probability estimation and smoothing operations.
def prob(self, key: int) -> float:
"""
Get probability for key.
Parameters:
- key: 64-bit unsigned integer key
Returns:
- Probability (count/total) or smoothed probability if smoother is set
"""
def smooth(self) -> None:
"""
Apply Good-Turing smoothing to the frequency distribution.
Creates a GaleSmoother instance for probability estimation.
Raises:
- AssertionError: If data cannot be smoothed (need items with count 1 and 2)
"""@property
def total(self) -> int:
"""
Get total count across all keys.
Returns:
- Sum of all counts
"""
@property
def length(self) -> int:
"""
Get capacity of underlying map.
Returns:
- Capacity of counter
"""
@property
def smoother(self) -> GaleSmoother | None:
"""
Get smoother instance (None until smooth() is called).
Returns:
- GaleSmoother instance or None
"""Good-Turing smoother for probability estimation (created by smooth() method).
class GaleSmoother:
"""
Good-Turing smoother for frequency distributions.
"""
def __call__(self, count: int) -> float:
"""
Get smoothed count for raw count value.
Parameters:
- count: Raw frequency count
Returns:
- Smoothed count value
"""
@property
def cutoff(self) -> int:
"""
Get smoothing cutoff value.
Returns:
- Cutoff value for smoothing
"""
@property
def total(self) -> float:
"""
Get total smoothed count.
Returns:
- Total of all smoothed counts
"""
def count_count(self, r: int) -> int:
"""
Get count of items with specific frequency.
Parameters:
- r: Frequency value to query
Returns:
- Number of items that occur exactly r times
"""from preshed.counter import PreshCounter
# Create counter
counter = PreshCounter()
# Count occurrences
counter.inc(42, 1) # Increment key 42 by 1
counter.inc(42, 4) # Increment key 42 by 4 more
counter.inc(100, 10) # Increment key 100 by 10
print(counter[42]) # 5
print(counter[100]) # 10
print(counter[999]) # 0 (never seen)
print(counter.total) # 15from preshed.counter import PreshCounter
counter = PreshCounter()
# Add some data
counter.inc(1, 10) # 10 occurrences
counter.inc(2, 20) # 20 occurrences
counter.inc(3, 5) # 5 occurrences
# Get raw probabilities
print(counter.prob(1)) # 10/35 ≈ 0.286
print(counter.prob(2)) # 20/35 ≈ 0.571
print(counter.prob(3)) # 5/35 ≈ 0.143
print(counter.prob(999)) # 0.0 (unseen)from preshed.counter import PreshCounter
# Create frequency distribution
counter = PreshCounter()
# Add data (need items with count 1 and 2 for smoothing)
for i in range(10):
counter.inc(100 + i, 1) # 10 items with frequency 1
for i in range(6):
counter.inc(200 + i, 2) # 6 items with frequency 2
for i in range(4):
counter.inc(300 + i, 3) # 4 items with frequency 3
print(f"Total before smoothing: {counter.total}")
# Apply smoothing
counter.smooth()
# Now probabilities are smoothed
print(counter.prob(100)) # Smoothed probability for freq-1 item
print(counter.prob(999)) # Non-zero probability for unseen item
# Access smoother directly
smoother = counter.smoother
print(smoother(1)) # Smoothed count for frequency 1
print(smoother.total) # Total smoothed countfrom preshed.counter import PreshCounter
counter = PreshCounter()
counter.inc(1, 5)
counter.inc(2, 3)
counter.inc(3, 8)
# Iterate over (key, count) pairs
for key, count in counter:
print(f"Key {key}: {count} occurrences")
# Sort by frequency
sorted_items = sorted(counter, key=lambda x: x[1], reverse=True)
for key, count in sorted_items:
print(f"Key {key}: {count} occurrences")from preshed.counter import PreshCounter
# Build frequency distribution
counter = PreshCounter()
# Add data points
data = [1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5]
for item in data:
counter.inc(item, 1)
print(f"Total items: {counter.total}") # 15
print(f"Unique items: {len([k for k, c in counter if c > 0])}")
# Get frequency statistics
frequencies = [count for key, count in counter if count > 0]
print(f"Max frequency: {max(frequencies)}")
print(f"Min frequency: {min(frequencies)}")
# Apply smoothing for better probability estimates
if len(set(frequencies)) >= 2: # Need variety for smoothing
counter.smooth()
print(f"Smoothed probability for unseen item: {counter.prob(999)}")Install with Tessl CLI
npx tessl i tessl/pypi-preshed