Static memory-efficient and fast Trie-like structures for Python based on MARISA-trie C++ library.
—
Configuration constants for optimizing trie performance, memory usage, and behavior. These constants control cache sizes, node ordering, tail storage methods, and trie count limits to fine-tune performance for specific use cases.
Controls the cache size used during trie construction and queries, affecting both performance and memory usage during operations.
# Cache size constants (in order of size)
DEFAULT_CACHE: int # Default cache size for balanced performance
HUGE_CACHE: int # Largest cache for maximum performance
LARGE_CACHE: int # Large cache for high-performance scenarios
NORMAL_CACHE: int # Normal cache size
SMALL_CACHE: int # Small cache for memory-constrained environments
TINY_CACHE: int # Smallest cache for minimal memory usageDetermines how nodes are arranged within the trie structure, affecting both lookup performance and the order of iteration results.
# Node ordering constants
LABEL_ORDER: int # Arrange nodes in ascending label order (predictable iteration)
WEIGHT_ORDER: int # Arrange nodes in descending weight order (faster matching)
DEFAULT_ORDER: int # Default node ordering strategyControls how the trie stores the tail portions of keys, affecting memory usage and compatibility with different data types.
# Tail storage method constants
TEXT_TAIL: int # Store tails as null-terminated strings (text data)
BINARY_TAIL: int # Store tails as byte sequences with bit vectors (binary data)
DEFAULT_TAIL: int # Default tail storage methodDefines the valid range for the number of tries used in the underlying MARISA-trie structure.
# Trie count constants
MIN_NUM_TRIES: int # Minimum number of tries allowed
MAX_NUM_TRIES: int # Maximum number of tries allowed
DEFAULT_NUM_TRIES: int # Default number of triesimport marisa_trie
# High-performance configuration for frequent lookups
fast_trie = marisa_trie.Trie(
['frequent', 'lookups', 'data'],
cache_size=marisa_trie.HUGE_CACHE, # Maximum cache for speed
order=marisa_trie.WEIGHT_ORDER, # Optimize for lookup performance
num_tries=7 # Higher trie count for speed
)
# Memory-efficient configuration
compact_trie = marisa_trie.Trie(
['memory', 'efficient', 'storage'],
cache_size=marisa_trie.TINY_CACHE, # Minimal cache for memory savings
binary=True, # Use binary tail storage
num_tries=marisa_trie.MIN_NUM_TRIES # Minimum tries for space
)# Optimize for text data with predictable ordering
text_trie = marisa_trie.Trie(
sorted_word_list,
order=marisa_trie.LABEL_ORDER, # Maintain alphabetical order
binary=False # Use text tail storage (default)
)
# Optimize for binary data or data with null bytes
binary_trie = marisa_trie.BinaryTrie(
binary_keys,
binary=True, # Force binary tail storage
cache_size=marisa_trie.LARGE_CACHE
)# Use weights with appropriate ordering for best performance
high_freq_words = ['the', 'and', 'or', 'but']
low_freq_words = ['sesquipedalian', 'antidisestablishmentarianism']
all_words = high_freq_words + low_freq_words
# Assign higher weights to frequently accessed words
weights = [100] * len(high_freq_words) + [1] * len(low_freq_words)
optimized_trie = marisa_trie.Trie(
all_words,
weights=weights,
order=marisa_trie.WEIGHT_ORDER, # Essential for weight optimization
cache_size=marisa_trie.LARGE_CACHE
)
# Verify that high-frequency words get better performance
print(f"'the' lookup speed optimized: {optimized_trie.key_id('the')}")def create_optimized_trie(keys, performance_level='balanced'):
"""Create trie with performance-appropriate settings."""
configs = {
'minimal': {
'cache_size': marisa_trie.TINY_CACHE,
'num_tries': marisa_trie.MIN_NUM_TRIES,
'binary': True
},
'balanced': {
'cache_size': marisa_trie.DEFAULT_CACHE,
'num_tries': marisa_trie.DEFAULT_NUM_TRIES,
'order': marisa_trie.DEFAULT_ORDER
},
'maximum': {
'cache_size': marisa_trie.HUGE_CACHE,
'num_tries': marisa_trie.MAX_NUM_TRIES,
'order': marisa_trie.WEIGHT_ORDER
}
}
config = configs.get(performance_level, configs['balanced'])
return marisa_trie.Trie(keys, **config)
# Usage
words = ['apple', 'banana', 'cherry']
fast_trie = create_optimized_trie(words, 'maximum')
compact_trie = create_optimized_trie(words, 'minimal')import time
import marisa_trie
large_keys = [f"key_{i:06d}" for i in range(10000)]
# Measure performance with different cache sizes
def benchmark_cache_size(keys, cache_size, name):
start = time.time()
trie = marisa_trie.Trie(keys, cache_size=cache_size)
build_time = time.time() - start
start = time.time()
for i in range(1000):
_ = trie.key_id(keys[i % len(keys)])
lookup_time = time.time() - start
print(f"{name}: Build={build_time:.3f}s, Lookup={lookup_time:.3f}s")
# Compare cache configurations
benchmark_cache_size(large_keys, marisa_trie.TINY_CACHE, "Tiny Cache")
benchmark_cache_size(large_keys, marisa_trie.DEFAULT_CACHE, "Default Cache")
benchmark_cache_size(large_keys, marisa_trie.HUGE_CACHE, "Huge Cache")try:
# Invalid num_tries value
invalid_trie = marisa_trie.Trie(
['test'],
num_tries=999 # Exceeds MAX_NUM_TRIES
)
except ValueError as e:
print(f"Configuration error: {e}")
# Check valid ranges
print(f"Valid trie count range: {marisa_trie.MIN_NUM_TRIES} to {marisa_trie.MAX_NUM_TRIES}")
print(f"Default configuration: {marisa_trie.DEFAULT_NUM_TRIES} tries")
# Validate configuration before use
def validate_trie_config(num_tries):
if not (marisa_trie.MIN_NUM_TRIES <= num_tries <= marisa_trie.MAX_NUM_TRIES):
raise ValueError(
f"num_tries must be between {marisa_trie.MIN_NUM_TRIES} "
f"and {marisa_trie.MAX_NUM_TRIES}, got {num_tries}"
)
return True
# Safe configuration
try:
validate_trie_config(5)
safe_trie = marisa_trie.Trie(['safe', 'config'], num_tries=5)
print("Configuration validated and trie created successfully")
except ValueError as e:
print(f"Invalid configuration: {e}")# Configuration for different use cases
def get_trie_config(use_case):
"""Return optimal configuration for specific use cases."""
configs = {
'autocomplete': {
'cache_size': marisa_trie.LARGE_CACHE,
'order': marisa_trie.LABEL_ORDER, # Predictable prefix order
'binary': False
},
'dictionary_lookup': {
'cache_size': marisa_trie.DEFAULT_CACHE,
'order': marisa_trie.WEIGHT_ORDER, # Optimize for frequent words
'binary': False
},
'binary_data': {
'cache_size': marisa_trie.NORMAL_CACHE,
'binary': True, # Handle null bytes properly
'order': marisa_trie.DEFAULT_ORDER
},
'memory_constrained': {
'cache_size': marisa_trie.TINY_CACHE,
'num_tries': marisa_trie.MIN_NUM_TRIES,
'binary': True # Better compression
}
}
return configs.get(use_case, configs['dictionary_lookup'])
# Usage examples
autocomplete_trie = marisa_trie.Trie(
search_suggestions,
**get_trie_config('autocomplete')
)
binary_lookup = marisa_trie.BinaryTrie(
binary_patterns,
**get_trie_config('binary_data')
)Install with Tessl CLI
npx tessl i tessl/pypi-marisa-trie