Cython hash table that trusts the keys are pre-hashed
npx @tessl/cli install tessl/pypi-preshed@4.0.0A high-performance Cython-based hash table library for Python that assumes keys come pre-hashed. Inspired by Jeff Preshing's hash table design, preshed provides efficient data structures optimized for use cases where keys are already randomized, such as natural language processing pipelines, caching systems, and other performance-critical applications.
cymem>=2.0.2,<2.1.0, murmurhash>=0.28.0,<1.1.0pip install preshedfrom preshed.maps import PreshMap
from preshed.counter import PreshCounter
from preshed.bloom import BloomFilter, calculate_size_and_hash_countPackage metadata:
from preshed.about import __version__, __author__, __license__
print(__version__) # "4.0.0"
print(__author__) # "Explosion"
print(__license__) # "MIT"
# Or access via main module
import preshed
print(preshed.__version__) # "4.0.0"from preshed.maps import PreshMap
from preshed.counter import PreshCounter
from preshed.bloom import BloomFilter
# Hash map for pre-hashed keys
hash_map = PreshMap()
hash_map[12345] = 67890
print(hash_map[12345]) # 67890
# Counter with frequency tracking
counter = PreshCounter()
counter.inc(42, 5) # Increment key 42 by 5
print(counter[42]) # 5
print(counter.total) # 5
# Bloom filter for membership testing
bloom = BloomFilter(size=1024, hash_funcs=3)
bloom.add(12345)
print(12345 in bloom) # True
print(54321 in bloom) # False (probably)Preshed provides three main high-performance data structures:
All data structures are implemented in Cython for maximum performance and designed to work with pre-hashed keys to minimize hash computation overhead. The library compiles to native C extensions, providing near-C performance while maintaining a Python interface.
High-performance hash table that maps pre-hashed uint64_t keys to uint64_t values using open addressing with linear probing. Provides dictionary-like interface with O(1) average-case performance.
class PreshMap:
def __init__(self, initial_size: int = 8): ...
def __getitem__(self, key: int) -> int | None: ... # Returns None if key not found
def __setitem__(self, key: int, value: int) -> None: ...
def __delitem__(self, key: int) -> None: ... # Raises KeyError if key not found
def __len__(self) -> int: ...
def __contains__(self, key: int) -> bool: ...
def pop(self, key: int, default=None) -> int | None: ...
def items(self) -> Iterator[tuple[int, int]]: ...
def keys(self) -> Iterator[int]: ...
def values(self) -> Iterator[int]: ...
@property
def capacity(self) -> int: ...Efficient frequency counting for uint64-valued keys with optional Good-Turing smoothing for probability estimation. Supports statistical operations and probability calculations.
class PreshCounter:
def __init__(self, initial_size: int = 8): ...
def __getitem__(self, key: int) -> int: ...
def __len__(self) -> int: ...
def __iter__(self) -> Iterator[tuple[int, int]]: ... # Yields (key, count) pairs
def inc(self, key: int, inc: int) -> int: ...
def prob(self, key: int) -> float: ...
def smooth(self) -> None: ...
@property
def total(self) -> int: ...
@property
def length(self) -> int: ...Space-efficient probabilistic data structure for membership testing. Provides configurable error rates and serialization capabilities with no false negatives but possible false positives.
class BloomFilter:
def __init__(self, size: int = 1024, hash_funcs: int = 23, seed: int = 0): ...
@classmethod
def from_error_rate(cls, members: int, error_rate: float = 1e-4): ...
def add(self, item: int) -> None: ...
def __contains__(self, item: int) -> bool: ...
def to_bytes(self) -> bytes: ...
def from_bytes(self, byte_string: bytes): ...
def calculate_size_and_hash_count(members: int, error_rate: float) -> tuple[int, int]: ...# All keys are expected to be 64-bit unsigned integers (pre-hashed)
Key = int # uint64_t
Value = int # uint64_t for maps, int64_t for counters
Count = int # int64_t