Static memory-efficient and fast Trie-like structures for Python based on MARISA-trie C++ library.
—
Primary trie implementations for mapping keys to auto-generated unique IDs. The Trie class handles unicode strings while BinaryTrie works with binary data, both providing efficient key-to-ID mapping with comprehensive lookup and prefix search capabilities.
Maps unicode string keys to auto-generated unique integer IDs with support for prefix operations, key restoration, iteration, and serialization.
class Trie:
def __init__(self, arg=None, num_tries=DEFAULT_NUM_TRIES, binary=False,
cache_size=DEFAULT_CACHE, order=DEFAULT_ORDER, weights=None):
"""
Create a trie from unicode string keys.
Args:
arg (iterable, optional): Iterable of unicode string keys
num_tries (int): Number of tries to use (MIN_NUM_TRIES to MAX_NUM_TRIES)
binary (bool): Use binary tail storage instead of text
cache_size (int): Cache size constant for performance tuning
order (int): Node ordering (LABEL_ORDER or WEIGHT_ORDER)
weights (iterable, optional): Lookup frequency weights for optimization
"""
def key_id(self, key: str) -> int:
"""
Return unique ID for a unicode key.
Args:
key (str): Unicode key to look up
Returns:
int: Unique integer ID for the key
Raises:
KeyError: If key is not present in trie
"""
def restore_key(self, index: int) -> str:
"""
Return unicode key corresponding to given ID.
Args:
index (int): Key ID to look up
Returns:
str: Unicode key for the ID
Raises:
KeyError: If ID is not valid
"""
def get(self, key, default=None):
"""
Return ID for key or default if not found.
Args:
key: Key to look up
default: Value to return if key not found
Returns:
int or default: Key ID or default value
"""
def keys(self, prefix=None) -> list:
"""
Return list of keys with optional prefix filter.
Args:
prefix (str, optional): Prefix to filter keys
Returns:
list: List of unicode keys
"""
def iterkeys(self, prefix=None):
"""
Return iterator over keys with optional prefix filter.
Args:
prefix (str, optional): Prefix to filter keys
Yields:
str: Unicode keys
"""
def prefixes(self, key: str) -> list:
"""
Return list of all prefixes of given key that exist in trie.
Args:
key (str): Key to find prefixes for
Returns:
list: List of prefix strings
"""
def iter_prefixes(self, key: str):
"""
Return iterator over all prefixes of given key.
Args:
key (str): Key to find prefixes for
Yields:
str: Prefix strings
"""
def iter_prefixes_with_ids(self, key: str):
"""
Return iterator of (prefix, id) pairs for all prefixes.
Args:
key (str): Key to find prefixes for
Yields:
tuple: (prefix_string, prefix_id) pairs
"""
def items(self, prefix="") -> list:
"""
Return list of (key, id) pairs with optional prefix.
Args:
prefix (str): Prefix to filter items
Returns:
list: List of (key, id) tuples
"""
def iteritems(self, prefix=""):
"""
Return iterator over (key, id) pairs with optional prefix.
Args:
prefix (str): Prefix to filter items
Yields:
tuple: (key, id) pairs
"""Maps bytes keys to auto-generated unique integer IDs with support for binary data and prefix operations.
class BinaryTrie:
def __init__(self, arg=None, **options):
"""
Create a trie from bytes keys.
Args:
arg (iterable, optional): Iterable of bytes keys
**options: Same options as Trie class
"""
def key_id(self, key: bytes) -> int:
"""
Return unique ID for a bytes key.
Args:
key (bytes): Bytes key to look up
Returns:
int: Unique integer ID for the key
Raises:
KeyError: If key is not present in trie
"""
def restore_key(self, index: int) -> bytes:
"""
Return bytes key corresponding to given ID.
Args:
index (int): Key ID to look up
Returns:
bytes: Bytes key for the ID
Raises:
KeyError: If ID is not valid
"""
def get(self, key: bytes, default=None):
"""
Return ID for bytes key or default if not found.
Args:
key (bytes): Bytes key to look up
default: Value to return if key not found
Returns:
int or default: Key ID or default value
"""
def iter_prefixes(self, key: bytes):
"""
Return iterator over all prefixes of given bytes key.
Args:
key (bytes): Bytes key to find prefixes for
Yields:
bytes: Prefix bytes objects
"""
def prefixes(self, key: bytes) -> list:
"""
Return list of all prefixes of given bytes key.
Args:
key (bytes): Bytes key to find prefixes for
Returns:
list: List of prefix bytes objects
"""
def items(self, prefix=b"") -> list:
"""
Return list of (key, id) pairs with optional bytes prefix.
Args:
prefix (bytes): Bytes prefix to filter items
Returns:
list: List of (bytes_key, id) tuples
"""
def iteritems(self, prefix=b""):
"""
Return iterator over (key, id) pairs with optional bytes prefix.
Args:
prefix (bytes): Bytes prefix to filter items
Yields:
tuple: (bytes_key, id) pairs
"""
def keys(self, prefix=b"") -> list:
"""
Return list of bytes keys with optional prefix.
Args:
prefix (bytes): Bytes prefix to filter keys
Returns:
list: List of bytes keys
"""
def iterkeys(self, prefix=b""):
"""
Return iterator over bytes keys with optional prefix.
Args:
prefix (bytes): Bytes prefix to filter keys
Yields:
bytes: Bytes keys
"""Both Trie and BinaryTrie classes support these common operations:
# Container operations
def __contains__(self, key) -> bool:
"""Check if key exists in trie."""
def __len__(self) -> int:
"""Return number of keys in trie."""
def __iter__(self):
"""Iterate over all keys."""
def __getitem__(self, key) -> int:
"""Get key ID using dictionary syntax."""
# Serialization operations
def save(self, path: str):
"""Save trie to file path."""
def load(self, path: str):
"""Load trie from file path."""
def tobytes(self) -> bytes:
"""Return raw trie content as bytes."""
def frombytes(self, data: bytes):
"""Load trie from raw bytes."""
def mmap(self, path: str):
"""Memory map trie file for efficient access."""
def map(self, buffer):
"""Load trie from buffer protocol object."""import marisa_trie
# Create trie with string keys
words = ['apple', 'application', 'apply', 'banana', 'band', 'bandana']
trie = marisa_trie.Trie(words)
# Key lookup operations
print(trie.key_id('apple')) # Get unique ID
print(trie.restore_key(0)) # Get key from ID
print('app' in trie) # False - exact match only
print(len(trie)) # 6
# Dictionary-style access
apple_id = trie['apple']
print(f"Apple ID: {apple_id}")# Find keys with prefix
app_words = trie.keys(prefix='app')
print(f"Words starting with 'app': {app_words}")
# Output: ['apple', 'application', 'apply']
# Find prefixes of a key
prefixes = trie.prefixes('application')
print(f"Prefixes of 'application': {prefixes}")
# Output: ['app', 'appli', 'application'] (only existing keys)
# Iterate with IDs
for word, word_id in trie.iteritems(prefix='ban'):
print(f"{word}: {word_id}")# Use weights for frequently accessed keys
keys = ['common', 'rare', 'frequent']
weights = [100, 1, 50] # Higher weights for common lookups
optimized_trie = marisa_trie.Trie(
keys,
weights=weights,
cache_size=marisa_trie.LARGE_CACHE,
order=marisa_trie.WEIGHT_ORDER
)# Create trie with binary keys
binary_keys = [b'header\x00\x01', b'data\x02\x03', b'footer\xff']
binary_trie = marisa_trie.BinaryTrie(binary_keys)
# Binary operations
data_id = binary_trie.key_id(b'data\x02\x03')
restored = binary_trie.restore_key(data_id)
print(f"Restored binary key: {restored}")
# Prefix search with binary data
prefixes = binary_trie.prefixes(b'header\x00\x01\x02')
print(f"Binary prefixes: {prefixes}")# Save trie to file
trie.save('dictionary.trie')
# Load trie from file
loaded_trie = marisa_trie.Trie()
loaded_trie.load('dictionary.trie')
# Memory-efficient loading via memory mapping
mapped_trie = marisa_trie.Trie()
mapped_trie.mmap('dictionary.trie')
# Serialize to bytes for network transfer
trie_bytes = trie.tobytes()
new_trie = marisa_trie.Trie()
new_trie.frombytes(trie_bytes)These methods are still available but deprecated and will be removed in future versions:
# Deprecated serialization methods (use save/load instead)
def read(self, f):
"""
Read trie from an open file descriptor.
Args:
f: Open file object with file descriptor
.. deprecated:: 0.7.3
Use load() instead. Will be removed in 0.8.0.
"""
def write(self, f):
"""
Write trie to an open file descriptor.
Args:
f: Open file object with file descriptor
.. deprecated:: 0.7.3
Use save() instead. Will be removed in 0.8.0.
"""
def has_keys_with_prefix(self, prefix="") -> bool:
"""
Check if any keys begin with given prefix.
Args:
prefix (str): Prefix to check
Returns:
bool: True if any keys start with prefix
.. deprecated:: 0.7.3
Use iterkeys() instead. Will be removed in 0.8.0.
"""Methods supporting Python's built-in operations and serialization protocols:
# Comparison operations
def __richcmp__(self, other, int op) -> bool:
"""
Rich comparison supporting == and != operators.
Args:
other: Another Trie instance to compare
op: Comparison operation (2 for ==, 3 for !=)
Returns:
bool: True if tries are equal (for ==) or not equal (for !=)
Raises:
TypeError: For unsupported comparison operations (<, >, <=, >=)
"""
# Pickle serialization support
def __reduce__(self) -> tuple:
"""
Support for pickle serialization.
Returns:
tuple: (class, args, state) for pickle reconstruction
"""
def __setstate__(self, state: bytes):
"""
Support for pickle deserialization (alias for frombytes).
Args:
state (bytes): Serialized trie data from __reduce__
"""The marisa-trie classes follow a hierarchical design:
_Trie (base class)
├── BinaryTrie (binary keys → IDs)
└── _UnicodeKeyedTrie (unicode key handling)
├── Trie (unicode keys → IDs)
└── BytesTrie (unicode keys → bytes payloads)
└── _UnpackTrie (value packing/unpacking)
└── RecordTrie (unicode keys → structured data)Base Class Methods: The _Trie base class provides core functionality including:
__contains__, __len__, __iter__)save, load, tobytes, frombytes, mmap, map)keys, iterkeys)Key-ID Methods: Only Trie and BinaryTrie provide:
key_id() and restore_key() for key-ID mapping__getitem__Unicode-Specific Methods: Only Trie provides:
iter_prefixes_with_ids() for prefix-ID pairsAll trie classes raise specific exceptions for error conditions:
# KeyError for missing keys or invalid IDs
try:
key_id = trie.key_id('nonexistent')
except KeyError as e:
print(f"Key not found: {e}")
try:
key = trie.restore_key(999999)
except KeyError as e:
print(f"Invalid ID: {e}")
# ValueError for invalid configuration
try:
invalid_trie = marisa_trie.Trie(['test'], num_tries=999)
except ValueError as e:
print(f"Configuration error: {e}")
# File I/O exceptions during serialization
try:
trie.load('nonexistent.trie')
except FileNotFoundError as e:
print(f"File error: {e}")Install with Tessl CLI
npx tessl i tessl/pypi-marisa-trie