CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-preshed

Cython hash table that trusts the keys are pre-hashed

Pending
Overview
Eval results
Files

frequency-counters.mddocs/

Frequency Counters

Efficient frequency counting for uint64-valued keys with optional Good-Turing smoothing for probability estimation. Designed for statistical applications and natural language processing tasks.

Capabilities

PreshCounter Class

Frequency counter with statistical operations and probability estimation capabilities.

class PreshCounter:
    """
    Count occurrences of uint64-valued keys with optional smoothing.
    """
    def __init__(self, initial_size: int = 8):
        """
        Create a new frequency counter.
        
        Parameters:
        - initial_size: Initial capacity (must be power of 2, defaults to 8)
        """
        
    def __getitem__(self, key: int) -> int:
        """
        Get count for key.
        
        Parameters:
        - key: 64-bit unsigned integer key
        
        Returns:
        - Count for key (0 if key not seen)
        """
        
    def __len__(self) -> int:
        """
        Get capacity of underlying map.
        
        Returns:
        - Capacity of counter
        """
        
    def __iter__(self) -> Iterator[tuple[int, int]]:
        """
        Iterate over (key, count) pairs.
        
        Returns:
        - Iterator yielding (key, count) tuples
        """

Counting Operations

Core counting functionality for frequency tracking.

def inc(self, key: int, inc: int) -> int:
    """
    Increment counter for key.
    
    Parameters:
    - key: 64-bit unsigned integer key
    - inc: Amount to increment (can be negative)
    
    Returns:
    - New count for key
    
    Raises:
    - Exception: On error
    """

Statistical Operations

Probability estimation and smoothing operations.

def prob(self, key: int) -> float:
    """
    Get probability for key.
    
    Parameters:
    - key: 64-bit unsigned integer key
    
    Returns:
    - Probability (count/total) or smoothed probability if smoother is set
    """
    
def smooth(self) -> None:
    """
    Apply Good-Turing smoothing to the frequency distribution.
    Creates a GaleSmoother instance for probability estimation.
    
    Raises:
    - AssertionError: If data cannot be smoothed (need items with count 1 and 2)
    """

Properties

@property
def total(self) -> int:
    """
    Get total count across all keys.
    
    Returns:
    - Sum of all counts
    """
    
@property
def length(self) -> int:
    """
    Get capacity of underlying map.
    
    Returns:
    - Capacity of counter
    """
    
@property
def smoother(self) -> GaleSmoother | None:
    """
    Get smoother instance (None until smooth() is called).
    
    Returns:
    - GaleSmoother instance or None
    """

GaleSmoother Class

Good-Turing smoother for probability estimation (created by smooth() method).

class GaleSmoother:
    """
    Good-Turing smoother for frequency distributions.
    """
    def __call__(self, count: int) -> float:
        """
        Get smoothed count for raw count value.
        
        Parameters:
        - count: Raw frequency count
        
        Returns:
        - Smoothed count value
        """
        
    @property
    def cutoff(self) -> int:
        """
        Get smoothing cutoff value.
        
        Returns:
        - Cutoff value for smoothing
        """
        
    @property
    def total(self) -> float:
        """
        Get total smoothed count.
        
        Returns:
        - Total of all smoothed counts
        """
        
    def count_count(self, r: int) -> int:
        """
        Get count of items with specific frequency.
        
        Parameters:
        - r: Frequency value to query
        
        Returns:
        - Number of items that occur exactly r times
        """

Usage Examples

Basic Counting

from preshed.counter import PreshCounter

# Create counter
counter = PreshCounter()

# Count occurrences
counter.inc(42, 1)  # Increment key 42 by 1
counter.inc(42, 4)  # Increment key 42 by 4 more
counter.inc(100, 10)  # Increment key 100 by 10

print(counter[42])   # 5
print(counter[100])  # 10
print(counter[999])  # 0 (never seen)

print(counter.total)  # 15

Probability Estimation

from preshed.counter import PreshCounter

counter = PreshCounter()

# Add some data
counter.inc(1, 10)  # 10 occurrences
counter.inc(2, 20)  # 20 occurrences
counter.inc(3, 5)   # 5 occurrences

# Get raw probabilities
print(counter.prob(1))  # 10/35 ≈ 0.286
print(counter.prob(2))  # 20/35 ≈ 0.571
print(counter.prob(3))  # 5/35 ≈ 0.143
print(counter.prob(999))  # 0.0 (unseen)

Good-Turing Smoothing

from preshed.counter import PreshCounter

# Create frequency distribution
counter = PreshCounter()

# Add data (need items with count 1 and 2 for smoothing)
for i in range(10):
    counter.inc(100 + i, 1)  # 10 items with frequency 1

for i in range(6):
    counter.inc(200 + i, 2)  # 6 items with frequency 2

for i in range(4):
    counter.inc(300 + i, 3)  # 4 items with frequency 3

print(f"Total before smoothing: {counter.total}")

# Apply smoothing
counter.smooth()

# Now probabilities are smoothed
print(counter.prob(100))  # Smoothed probability for freq-1 item
print(counter.prob(999))  # Non-zero probability for unseen item

# Access smoother directly
smoother = counter.smoother
print(smoother(1))  # Smoothed count for frequency 1
print(smoother.total)  # Total smoothed count

Iteration

from preshed.counter import PreshCounter

counter = PreshCounter()
counter.inc(1, 5)
counter.inc(2, 3)
counter.inc(3, 8)

# Iterate over (key, count) pairs
for key, count in counter:
    print(f"Key {key}: {count} occurrences")

# Sort by frequency
sorted_items = sorted(counter, key=lambda x: x[1], reverse=True)
for key, count in sorted_items:
    print(f"Key {key}: {count} occurrences")

Statistical Analysis

from preshed.counter import PreshCounter

# Build frequency distribution
counter = PreshCounter()

# Add data points
data = [1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5]
for item in data:
    counter.inc(item, 1)

print(f"Total items: {counter.total}")  # 15
print(f"Unique items: {len([k for k, c in counter if c > 0])}")

# Get frequency statistics
frequencies = [count for key, count in counter if count > 0]
print(f"Max frequency: {max(frequencies)}")
print(f"Min frequency: {min(frequencies)}")

# Apply smoothing for better probability estimates
if len(set(frequencies)) >= 2:  # Need variety for smoothing
    counter.smooth()
    print(f"Smoothed probability for unseen item: {counter.prob(999)}")

Performance Characteristics

  • Counting: O(1) average case for increment operations
  • Probability: O(1) for probability calculations
  • Smoothing: O(n) where n is number of unique frequencies
  • Memory: Efficient storage using underlying hash map
  • Statistical accuracy: Good-Turing smoothing provides improved probability estimates

Install with Tessl CLI

npx tessl i tessl/pypi-preshed

docs

bloom-filters.md

frequency-counters.md

hash-maps.md

index.md

tile.json