tessl/pypi-marisa-trie

Static memory-efficient and fast Trie-like structures for Python based on MARISA-trie C++ library.

—

Pending

Overview

Eval results

Files

Configuration and Constants

Name: tessl/pypi-marisa-trie
Author: tessl

Configuration constants for optimizing trie performance, memory usage, and behavior. These constants control cache sizes, node ordering, tail storage methods, and trie count limits to fine-tune performance for specific use cases.

Capabilities

Cache Size Configuration

Controls the cache size used during trie construction and queries, affecting both performance and memory usage during operations.

# Cache size constants (in order of size)
DEFAULT_CACHE: int  # Default cache size for balanced performance
HUGE_CACHE: int     # Largest cache for maximum performance  
LARGE_CACHE: int    # Large cache for high-performance scenarios
NORMAL_CACHE: int   # Normal cache size
SMALL_CACHE: int    # Small cache for memory-constrained environments
TINY_CACHE: int     # Smallest cache for minimal memory usage

Node Ordering Configuration

Determines how nodes are arranged within the trie structure, affecting both lookup performance and the order of iteration results.

# Node ordering constants
LABEL_ORDER: int    # Arrange nodes in ascending label order (predictable iteration)
WEIGHT_ORDER: int   # Arrange nodes in descending weight order (faster matching)  
DEFAULT_ORDER: int  # Default node ordering strategy

Tail Storage Configuration

Controls how the trie stores the tail portions of keys, affecting memory usage and compatibility with different data types.

# Tail storage method constants
TEXT_TAIL: int      # Store tails as null-terminated strings (text data)
BINARY_TAIL: int    # Store tails as byte sequences with bit vectors (binary data)
DEFAULT_TAIL: int   # Default tail storage method

Trie Count Limits

Defines the valid range for the number of tries used in the underlying MARISA-trie structure.

# Trie count constants
MIN_NUM_TRIES: int     # Minimum number of tries allowed
MAX_NUM_TRIES: int     # Maximum number of tries allowed  
DEFAULT_NUM_TRIES: int # Default number of tries

Usage Examples

Performance Optimization

import marisa_trie

# High-performance configuration for frequent lookups
fast_trie = marisa_trie.Trie(
    ['frequent', 'lookups', 'data'],
    cache_size=marisa_trie.HUGE_CACHE,    # Maximum cache for speed
    order=marisa_trie.WEIGHT_ORDER,       # Optimize for lookup performance
    num_tries=7                           # Higher trie count for speed
)

# Memory-efficient configuration  
compact_trie = marisa_trie.Trie(
    ['memory', 'efficient', 'storage'],
    cache_size=marisa_trie.TINY_CACHE,    # Minimal cache for memory savings
    binary=True,                          # Use binary tail storage
    num_tries=marisa_trie.MIN_NUM_TRIES   # Minimum tries for space
)

Data Type Optimization

# Optimize for text data with predictable ordering
text_trie = marisa_trie.Trie(
    sorted_word_list,
    order=marisa_trie.LABEL_ORDER,    # Maintain alphabetical order
    binary=False                      # Use text tail storage (default)
)

# Optimize for binary data or data with null bytes
binary_trie = marisa_trie.BinaryTrie(
    binary_keys,
    binary=True,                      # Force binary tail storage
    cache_size=marisa_trie.LARGE_CACHE
)

Weighted Optimization

# Use weights with appropriate ordering for best performance
high_freq_words = ['the', 'and', 'or', 'but']
low_freq_words = ['sesquipedalian', 'antidisestablishmentarianism']
all_words = high_freq_words + low_freq_words

# Assign higher weights to frequently accessed words
weights = [100] * len(high_freq_words) + [1] * len(low_freq_words)

optimized_trie = marisa_trie.Trie(
    all_words,
    weights=weights,
    order=marisa_trie.WEIGHT_ORDER,   # Essential for weight optimization
    cache_size=marisa_trie.LARGE_CACHE
)

# Verify that high-frequency words get better performance
print(f"'the' lookup speed optimized: {optimized_trie.key_id('the')}")

Custom Configuration Validation

def create_optimized_trie(keys, performance_level='balanced'):
    """Create trie with performance-appropriate settings."""
    
    configs = {
        'minimal': {
            'cache_size': marisa_trie.TINY_CACHE,
            'num_tries': marisa_trie.MIN_NUM_TRIES,
            'binary': True
        },
        'balanced': {
            'cache_size': marisa_trie.DEFAULT_CACHE,
            'num_tries': marisa_trie.DEFAULT_NUM_TRIES,
            'order': marisa_trie.DEFAULT_ORDER
        },
        'maximum': {
            'cache_size': marisa_trie.HUGE_CACHE,
            'num_tries': marisa_trie.MAX_NUM_TRIES,
            'order': marisa_trie.WEIGHT_ORDER
        }
    }
    
    config = configs.get(performance_level, configs['balanced'])
    return marisa_trie.Trie(keys, **config)

# Usage
words = ['apple', 'banana', 'cherry']
fast_trie = create_optimized_trie(words, 'maximum')
compact_trie = create_optimized_trie(words, 'minimal')

Configuration Impact Examples

import time
import marisa_trie

large_keys = [f"key_{i:06d}" for i in range(10000)]

# Measure performance with different cache sizes
def benchmark_cache_size(keys, cache_size, name):
    start = time.time()
    trie = marisa_trie.Trie(keys, cache_size=cache_size)
    build_time = time.time() - start
    
    start = time.time()
    for i in range(1000):
        _ = trie.key_id(keys[i % len(keys)])
    lookup_time = time.time() - start
    
    print(f"{name}: Build={build_time:.3f}s, Lookup={lookup_time:.3f}s")

# Compare cache configurations
benchmark_cache_size(large_keys, marisa_trie.TINY_CACHE, "Tiny Cache")
benchmark_cache_size(large_keys, marisa_trie.DEFAULT_CACHE, "Default Cache")  
benchmark_cache_size(large_keys, marisa_trie.HUGE_CACHE, "Huge Cache")

Error Handling for Configuration

try:
    # Invalid num_tries value
    invalid_trie = marisa_trie.Trie(
        ['test'], 
        num_tries=999  # Exceeds MAX_NUM_TRIES
    )
except ValueError as e:
    print(f"Configuration error: {e}")

# Check valid ranges
print(f"Valid trie count range: {marisa_trie.MIN_NUM_TRIES} to {marisa_trie.MAX_NUM_TRIES}")
print(f"Default configuration: {marisa_trie.DEFAULT_NUM_TRIES} tries")

# Validate configuration before use
def validate_trie_config(num_tries):
    if not (marisa_trie.MIN_NUM_TRIES <= num_tries <= marisa_trie.MAX_NUM_TRIES):
        raise ValueError(
            f"num_tries must be between {marisa_trie.MIN_NUM_TRIES} "
            f"and {marisa_trie.MAX_NUM_TRIES}, got {num_tries}"
        )
    return True

# Safe configuration
try:
    validate_trie_config(5)
    safe_trie = marisa_trie.Trie(['safe', 'config'], num_tries=5)
    print("Configuration validated and trie created successfully")
except ValueError as e:
    print(f"Invalid configuration: {e}")

Advanced Configuration Patterns

# Configuration for different use cases
def get_trie_config(use_case):
    """Return optimal configuration for specific use cases."""
    
    configs = {
        'autocomplete': {
            'cache_size': marisa_trie.LARGE_CACHE,
            'order': marisa_trie.LABEL_ORDER,  # Predictable prefix order
            'binary': False
        },
        'dictionary_lookup': {
            'cache_size': marisa_trie.DEFAULT_CACHE,
            'order': marisa_trie.WEIGHT_ORDER,  # Optimize for frequent words
            'binary': False
        },
        'binary_data': {
            'cache_size': marisa_trie.NORMAL_CACHE,
            'binary': True,  # Handle null bytes properly
            'order': marisa_trie.DEFAULT_ORDER
        },
        'memory_constrained': {
            'cache_size': marisa_trie.TINY_CACHE,
            'num_tries': marisa_trie.MIN_NUM_TRIES,
            'binary': True  # Better compression
        }
    }
    
    return configs.get(use_case, configs['dictionary_lookup'])

# Usage examples
autocomplete_trie = marisa_trie.Trie(
    search_suggestions,
    **get_trie_config('autocomplete')
)

binary_lookup = marisa_trie.BinaryTrie(
    binary_patterns,
    **get_trie_config('binary_data')
)

Install with Tessl CLI