Static memory-efficient and fast Trie-like structures for Python based on MARISA-trie C++ library.
—
Advanced trie implementations that map unicode keys to lists of custom data payloads. BytesTrie handles arbitrary bytes objects while RecordTrie provides structured data support with automatic serialization using Python's struct module.
Maps unicode string keys to lists of bytes objects, enabling storage of binary data, serialized objects, or any bytes-based payloads associated with string keys.
class BytesTrie:
def __init__(self, arg=None, value_separator=b'\xff', **options):
"""
Create a trie mapping unicode keys to lists of bytes payloads.
Args:
arg (iterable, optional): Iterable of (unicode_key, bytes_payload) tuples
value_separator (bytes): Separator between keys and payloads (default: b'\xff')
**options: Same configuration options as Trie class
"""
def get(self, key, default=None) -> list:
"""
Return list of bytes payloads for key or default if not found.
Args:
key: Unicode key to look up (str or bytes)
default: Value to return if key not found
Returns:
list or default: List of bytes objects or default value
"""
def __getitem__(self, key) -> list:
"""
Return list of bytes payloads for key.
Args:
key: Unicode key to look up
Returns:
list: List of bytes objects
Raises:
KeyError: If key is not present
"""
def get_value(self, key: str) -> list:
"""
Return list of bytes payloads for unicode key.
Args:
key (str): Unicode key to look up
Returns:
list: List of bytes objects
"""
def b_get_value(self, key: bytes) -> list:
"""
Return list of bytes payloads for UTF-8 encoded key.
Args:
key (bytes): UTF-8 encoded key to look up
Returns:
list: List of bytes objects
"""
def prefixes(self, key: str) -> list:
"""
Return list of all prefixes of key that have values.
Args:
key (str): Unicode key to find prefixes for
Returns:
list: List of prefix strings that exist in trie
"""
def items(self, prefix="") -> list:
"""
Return list of (key, payload) pairs with optional prefix.
Args:
prefix (str): Unicode prefix to filter items
Returns:
list: List of (unicode_key, bytes_payload) tuples
"""
def iteritems(self, prefix=""):
"""
Return iterator over (key, payload) pairs with optional prefix.
Args:
prefix (str): Unicode prefix to filter items
Yields:
tuple: (unicode_key, bytes_payload) pairs
"""
def keys(self, prefix="") -> list:
"""
Return list of unicode keys with optional prefix.
Args:
prefix (str): Unicode prefix to filter keys
Returns:
list: List of unicode keys
"""
def iterkeys(self, prefix=""):
"""
Return iterator over unicode keys with optional prefix.
Args:
prefix (str): Unicode prefix to filter keys
Yields:
str: Unicode keys
"""
def _raw_key(self, key: str, payload: bytes) -> bytes:
"""
Combine unicode key with bytes payload using value separator.
Args:
key (str): Unicode key
payload (bytes): Bytes payload to combine with key
Returns:
bytes: Combined key and payload with separator
"""Maps unicode string keys to lists of structured data tuples using Python's struct module for automatic serialization and deserialization.
class RecordTrie:
def __init__(self, fmt: str, arg=None, **options):
"""
Create a trie mapping unicode keys to lists of structured data tuples.
Args:
fmt (str): Struct format string for data serialization
arg (iterable, optional): Iterable of (unicode_key, data_tuple) pairs
**options: Same configuration options as Trie class
"""
def get(self, key, default=None) -> list:
"""
Return list of data tuples for key or default if not found.
Args:
key: Unicode key to look up
default: Value to return if key not found
Returns:
list or default: List of unpacked data tuples or default value
"""
def __getitem__(self, key) -> list:
"""
Return list of data tuples for key.
Args:
key: Unicode key to look up
Returns:
list: List of unpacked data tuples
Raises:
KeyError: If key is not present
"""
def items(self, prefix="") -> list:
"""
Return list of (key, data_tuple) pairs with optional prefix.
Args:
prefix (str): Unicode prefix to filter items
Returns:
list: List of (unicode_key, data_tuple) pairs
"""
def iteritems(self, prefix=""):
"""
Return iterator over (key, data_tuple) pairs with optional prefix.
Args:
prefix (str): Unicode prefix to filter items
Yields:
tuple: (unicode_key, data_tuple) pairs
"""Both BytesTrie and RecordTrie inherit container and serialization operations:
# Container operations
def __contains__(self, key) -> bool:
"""Check if key exists in trie."""
def __len__(self) -> int:
"""Return number of key-value pairs."""
def __iter__(self):
"""Iterate over all keys."""
# Serialization operations inherited from base trie
def save(self, path: str):
"""Save trie to file path."""
def load(self, path: str):
"""Load trie from file path."""
def tobytes(self) -> bytes:
"""Return raw trie content as bytes."""
def frombytes(self, data: bytes):
"""Load trie from raw bytes."""
def mmap(self, path: str):
"""Memory map trie file for efficient access."""import marisa_trie
import json
# Store JSON data as bytes payloads
data = [
('user:john', json.dumps({'id': 1, 'name': 'John'}).encode('utf-8')),
('user:jane', json.dumps({'id': 2, 'name': 'Jane'}).encode('utf-8')),
('user:john', json.dumps({'role': 'admin'}).encode('utf-8')), # Multiple values per key
('config:db', b'host=localhost;port=5432'),
('config:cache', b'redis://localhost:6379')
]
bytes_trie = marisa_trie.BytesTrie(data)
# Retrieve all values for a key (returns list)
user_data = bytes_trie['user:john']
print(f"User john data: {[json.loads(d.decode()) for d in user_data]}")
# Output: [{'id': 1, 'name': 'John'}, {'role': 'admin'}]
# Get single value or default
cache_config = bytes_trie.get('config:cache', [b'default'])[0]
print(f"Cache config: {cache_config.decode()}")
# Find all keys with prefix
user_keys = bytes_trie.keys(prefix='user:')
print(f"User keys: {user_keys}")# Use custom separator to avoid conflicts with data
custom_trie = marisa_trie.BytesTrie(
[('key1', b'data\xff'), ('key2', b'more\xff')],
value_separator=b'\x00' # Use null byte as separator
)
values = custom_trie['key1']
print(f"Values: {values}") # [b'data\xff']import marisa_trie
# Store structured numeric data
# Struct format: '<H?' = little-endian unsigned short + boolean
record_data = [
('product:apple', (100, True)), # (price_cents, in_stock)
('product:apple', (95, True)), # Price history - multiple records per key
('product:banana', (50, False)),
('product:orange', (75, True))
]
record_trie = marisa_trie.RecordTrie('<H?', record_data)
# Retrieve structured data (automatically unpacked)
apple_records = record_trie['product:apple']
print(f"Apple records: {apple_records}")
# Output: [(100, True), (95, True)]
for price, in_stock in apple_records:
print(f"Apple: ${price/100:.2f}, Available: {in_stock}")
# Iterate over all products
for key, (price, in_stock) in record_trie.iteritems():
product = key.split(':')[1]
print(f"{product}: ${price/100:.2f}, Available: {in_stock}")# More complex struct format for mixed data types
# Format: '<10sHf?' = 10-char string + unsigned short + float + boolean
complex_data = [
('server:web1', (b'nginx ', 80, 99.5, True)), # (name, port, uptime%, active)
('server:web2', (b'apache ', 8080, 95.2, True)),
('server:db1', (b'postgres ', 5432, 99.9, True)),
]
server_trie = marisa_trie.RecordTrie('<10sHf?', complex_data)
for server, (name, port, uptime, active) in server_trie.iteritems():
name_str = name.decode().strip()
status = "UP" if active else "DOWN"
print(f"{server}: {name_str}:{port} ({uptime:.1f}% uptime) - {status}")# Find all configuration entries
config_items = bytes_trie.items(prefix='config:')
for key, payload in config_items:
setting = key.split(':')[1]
value = payload.decode()
print(f"{setting}: {value}")
# Find users and their data
for user_key in bytes_trie.keys(prefix='user:'):
user_payloads = bytes_trie[user_key]
user_name = user_key.split(':')[1]
print(f"User {user_name} has {len(user_payloads)} data entries")# Save specialized tries
bytes_trie.save('data_store.trie')
record_trie.save('records.trie')
# Load with proper format specification for RecordTrie
loaded_records = marisa_trie.RecordTrie('<H?')
loaded_records.load('records.trie')
# Verify data integrity
assert loaded_records['product:apple'] == [(100, True), (95, True)]# For large datasets, use appropriate configuration
large_bytes_trie = marisa_trie.BytesTrie(
large_data,
cache_size=marisa_trie.HUGE_CACHE,
order=marisa_trie.WEIGHT_ORDER, # Optimize for frequent lookups
binary=True # Use binary tail storage for better compression
)
# Memory mapping for very large tries
large_bytes_trie.save('large_data.trie')
mapped_trie = marisa_trie.BytesTrie()
mapped_trie.mmap('large_data.trie') # Memory-efficient accessInstall with Tessl CLI
npx tessl i tessl/pypi-marisa-trie