tessl/pypi-marisa-trie

Static memory-efficient and fast Trie-like structures for Python based on MARISA-trie C++ library.

—

Pending

Overview

Eval results

Files

Specialized Payload Tries

Name: tessl/pypi-marisa-trie
Author: tessl

Advanced trie implementations that map unicode keys to lists of custom data payloads. BytesTrie handles arbitrary bytes objects while RecordTrie provides structured data support with automatic serialization using Python's struct module.

Capabilities

Bytes Payload Trie

Maps unicode string keys to lists of bytes objects, enabling storage of binary data, serialized objects, or any bytes-based payloads associated with string keys.

class BytesTrie:
    def __init__(self, arg=None, value_separator=b'\xff', **options):
        """
        Create a trie mapping unicode keys to lists of bytes payloads.
        
        Args:
            arg (iterable, optional): Iterable of (unicode_key, bytes_payload) tuples
            value_separator (bytes): Separator between keys and payloads (default: b'\xff')
            **options: Same configuration options as Trie class
        """
    
    def get(self, key, default=None) -> list:
        """
        Return list of bytes payloads for key or default if not found.
        
        Args:
            key: Unicode key to look up (str or bytes)
            default: Value to return if key not found
            
        Returns:
            list or default: List of bytes objects or default value
        """
    
    def __getitem__(self, key) -> list:
        """
        Return list of bytes payloads for key.
        
        Args:
            key: Unicode key to look up
            
        Returns:
            list: List of bytes objects
            
        Raises:
            KeyError: If key is not present
        """
    
    def get_value(self, key: str) -> list:
        """
        Return list of bytes payloads for unicode key.
        
        Args:
            key (str): Unicode key to look up
            
        Returns:
            list: List of bytes objects
        """
    
    def b_get_value(self, key: bytes) -> list:
        """
        Return list of bytes payloads for UTF-8 encoded key.
        
        Args:
            key (bytes): UTF-8 encoded key to look up
            
        Returns:
            list: List of bytes objects
        """
    
    def prefixes(self, key: str) -> list:
        """
        Return list of all prefixes of key that have values.
        
        Args:
            key (str): Unicode key to find prefixes for
            
        Returns:
            list: List of prefix strings that exist in trie
        """
    
    def items(self, prefix="") -> list:
        """
        Return list of (key, payload) pairs with optional prefix.
        
        Args:
            prefix (str): Unicode prefix to filter items
            
        Returns:
            list: List of (unicode_key, bytes_payload) tuples
        """
    
    def iteritems(self, prefix=""):
        """
        Return iterator over (key, payload) pairs with optional prefix.
        
        Args:
            prefix (str): Unicode prefix to filter items
            
        Yields:
            tuple: (unicode_key, bytes_payload) pairs
        """
    
    def keys(self, prefix="") -> list:
        """
        Return list of unicode keys with optional prefix.
        
        Args:
            prefix (str): Unicode prefix to filter keys
            
        Returns:
            list: List of unicode keys
        """
    
    def iterkeys(self, prefix=""):
        """
        Return iterator over unicode keys with optional prefix.
        
        Args:
            prefix (str): Unicode prefix to filter keys
            
        Yields:
            str: Unicode keys
        """
    
    def _raw_key(self, key: str, payload: bytes) -> bytes:
        """
        Combine unicode key with bytes payload using value separator.
        
        Args:
            key (str): Unicode key
            payload (bytes): Bytes payload to combine with key
            
        Returns:
            bytes: Combined key and payload with separator
        """

Record Payload Trie

Maps unicode string keys to lists of structured data tuples using Python's struct module for automatic serialization and deserialization.

class RecordTrie:
    def __init__(self, fmt: str, arg=None, **options):
        """
        Create a trie mapping unicode keys to lists of structured data tuples.
        
        Args:
            fmt (str): Struct format string for data serialization
            arg (iterable, optional): Iterable of (unicode_key, data_tuple) pairs
            **options: Same configuration options as Trie class
        """
    
    def get(self, key, default=None) -> list:
        """
        Return list of data tuples for key or default if not found.
        
        Args:
            key: Unicode key to look up
            default: Value to return if key not found
            
        Returns:
            list or default: List of unpacked data tuples or default value
        """
    
    def __getitem__(self, key) -> list:
        """
        Return list of data tuples for key.
        
        Args:
            key: Unicode key to look up
            
        Returns:
            list: List of unpacked data tuples
            
        Raises:
            KeyError: If key is not present
        """
    
    def items(self, prefix="") -> list:
        """
        Return list of (key, data_tuple) pairs with optional prefix.
        
        Args:
            prefix (str): Unicode prefix to filter items
            
        Returns:
            list: List of (unicode_key, data_tuple) pairs
        """
    
    def iteritems(self, prefix=""):
        """
        Return iterator over (key, data_tuple) pairs with optional prefix.
        
        Args:
            prefix (str): Unicode prefix to filter items
            
        Yields:
            tuple: (unicode_key, data_tuple) pairs
        """

Common Inherited Operations

Both BytesTrie and RecordTrie inherit container and serialization operations:

# Container operations
def __contains__(self, key) -> bool:
    """Check if key exists in trie."""

def __len__(self) -> int:
    """Return number of key-value pairs."""

def __iter__(self):
    """Iterate over all keys."""

# Serialization operations inherited from base trie
def save(self, path: str):
    """Save trie to file path."""

def load(self, path: str):
    """Load trie from file path."""

def tobytes(self) -> bytes:
    """Return raw trie content as bytes."""

def frombytes(self, data: bytes):
    """Load trie from raw bytes."""

def mmap(self, path: str):
    """Memory map trie file for efficient access."""

Usage Examples

BytesTrie for Binary Data Storage

import marisa_trie
import json

# Store JSON data as bytes payloads
data = [
    ('user:john', json.dumps({'id': 1, 'name': 'John'}).encode('utf-8')),
    ('user:jane', json.dumps({'id': 2, 'name': 'Jane'}).encode('utf-8')),
    ('user:john', json.dumps({'role': 'admin'}).encode('utf-8')),  # Multiple values per key
    ('config:db', b'host=localhost;port=5432'),
    ('config:cache', b'redis://localhost:6379')
]

bytes_trie = marisa_trie.BytesTrie(data)

# Retrieve all values for a key (returns list)
user_data = bytes_trie['user:john']
print(f"User john data: {[json.loads(d.decode()) for d in user_data]}")
# Output: [{'id': 1, 'name': 'John'}, {'role': 'admin'}]

# Get single value or default
cache_config = bytes_trie.get('config:cache', [b'default'])[0]
print(f"Cache config: {cache_config.decode()}")

# Find all keys with prefix
user_keys = bytes_trie.keys(prefix='user:')
print(f"User keys: {user_keys}")

BytesTrie with Custom Separators

# Use custom separator to avoid conflicts with data
custom_trie = marisa_trie.BytesTrie(
    [('key1', b'data\xff'), ('key2', b'more\xff')],
    value_separator=b'\x00'  # Use null byte as separator
)

values = custom_trie['key1']
print(f"Values: {values}")  # [b'data\xff']

RecordTrie for Structured Data

import marisa_trie

# Store structured numeric data  
# Struct format: '<H?' = little-endian unsigned short + boolean
record_data = [
    ('product:apple', (100, True)),    # (price_cents, in_stock)
    ('product:apple', (95, True)),     # Price history - multiple records per key
    ('product:banana', (50, False)),
    ('product:orange', (75, True))
]

record_trie = marisa_trie.RecordTrie('<H?', record_data)

# Retrieve structured data (automatically unpacked)
apple_records = record_trie['product:apple']
print(f"Apple records: {apple_records}")
# Output: [(100, True), (95, True)]

for price, in_stock in apple_records:
    print(f"Apple: ${price/100:.2f}, Available: {in_stock}")

# Iterate over all products
for key, (price, in_stock) in record_trie.iteritems():
    product = key.split(':')[1]
    print(f"{product}: ${price/100:.2f}, Available: {in_stock}")

Complex Record Formats

# More complex struct format for mixed data types
# Format: '<10sHf?' = 10-char string + unsigned short + float + boolean  
complex_data = [
    ('server:web1', (b'nginx     ', 80, 99.5, True)),    # (name, port, uptime%, active)
    ('server:web2', (b'apache    ', 8080, 95.2, True)),
    ('server:db1', (b'postgres  ', 5432, 99.9, True)),
]

server_trie = marisa_trie.RecordTrie('<10sHf?', complex_data)

for server, (name, port, uptime, active) in server_trie.iteritems():
    name_str = name.decode().strip()
    status = "UP" if active else "DOWN"
    print(f"{server}: {name_str}:{port} ({uptime:.1f}% uptime) - {status}")

Prefix Search with Payloads

# Find all configuration entries
config_items = bytes_trie.items(prefix='config:')
for key, payload in config_items:
    setting = key.split(':')[1]
    value = payload.decode()
    print(f"{setting}: {value}")

# Find users and their data
for user_key in bytes_trie.keys(prefix='user:'):
    user_payloads = bytes_trie[user_key]
    user_name = user_key.split(':')[1]
    print(f"User {user_name} has {len(user_payloads)} data entries")

Serialization and Persistence

# Save specialized tries
bytes_trie.save('data_store.trie')
record_trie.save('records.trie')

# Load with proper format specification for RecordTrie
loaded_records = marisa_trie.RecordTrie('<H?')
loaded_records.load('records.trie')

# Verify data integrity
assert loaded_records['product:apple'] == [(100, True), (95, True)]

Performance Considerations

# For large datasets, use appropriate configuration
large_bytes_trie = marisa_trie.BytesTrie(
    large_data,
    cache_size=marisa_trie.HUGE_CACHE,
    order=marisa_trie.WEIGHT_ORDER,  # Optimize for frequent lookups
    binary=True  # Use binary tail storage for better compression
)

# Memory mapping for very large tries
large_bytes_trie.save('large_data.trie')
mapped_trie = marisa_trie.BytesTrie()
mapped_trie.mmap('large_data.trie')  # Memory-efficient access

Install with Tessl CLI