CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-pyahocorasick

Fast and memory efficient library for exact or approximate multi-pattern string search using the Aho-Corasick algorithm

Pending
Overview
Eval results
Files

dictionary-interface.mddocs/

Dictionary Interface

Dict-like operations for accessing stored patterns and values, including existence checking, value retrieval, and iteration over keys, values, and items with optional filtering.

Capabilities

Value Access

Retrieve values associated with keys using dict-like methods.

def get(self, key, default=None):
    """
    Return the value associated with the key string.

    Parameters:
    - key: Key to look up
    - default: Optional value to return if key not found. Omitting it is
      not the same as asking for None: as stated under "Raises", a missing
      key with no default supplied raises KeyError.

    Returns:
    The value associated with key, or default if key not found

    Raises:
    - KeyError: If key not found and no default provided
    """

def __getitem__(self, key):
    """
    Get value for key using bracket notation.

    Parameters:
    - key: Key to look up

    Returns:
    The value associated with key

    Raises:
    - KeyError: If key not found
    """

Usage Examples

import ahocorasick

automaton = ahocorasick.Automaton()
automaton.add_word('hello', 'greeting')
automaton.add_word('world', 'place')
automaton.add_word('python', {'type': 'language', 'year': 1991})

# Using get() method
greeting = automaton.get('hello')  # 'greeting'
missing = automaton.get('missing', 'not found')  # 'not found'

# Using bracket notation
place = automaton['world']  # 'place'
lang_info = automaton['python']  # {'type': 'language', 'year': 1991}

# KeyError when key doesn't exist
try:
    value = automaton['missing']
except KeyError:
    print("Key not found")

Existence Checking

Check if keys exist in the automaton.

def exists(self, key):
    """
    Return True if the key is present in the trie.

    Parameters:
    - key: Key to check

    Returns:
    bool: True if key exists, False otherwise
    """

def __contains__(self, key):
    """
    Support for 'in' operator.

    Parameters:
    - key: Key to check

    Returns:
    bool: True if key exists, False otherwise
    """

Usage Examples

automaton = ahocorasick.Automaton()
automaton.add_word('cat', 'animal')
automaton.add_word('car', 'vehicle')

# Using exists() method
has_cat = automaton.exists('cat')  # True
has_dog = automaton.exists('dog')  # False

# Using 'in' operator
if 'car' in automaton:
    print("Found car!")

if 'bike' not in automaton:
    print("Bike not found")

Prefix Matching

Check if a key is a prefix of any stored pattern.

def match(self, key):
    """
    Return True if there is a prefix (or key) equal to key present in the trie.

    Parameters:
    - key: Key to check as prefix

    Returns:
    bool: True if key is a prefix of any stored pattern

    Examples:
    If 'example' is in the trie, then match('e'), match('ex'), 
    match('exa'), ..., match('example') all return True.
    """

def longest_prefix(self, string):
    """
    Return the length of the longest prefix of string that exists in the trie.

    Parameters:
    - string: String to check

    Returns:
    int: Length of longest matching prefix
    """

Usage Examples

automaton = ahocorasick.Automaton()
automaton.add_word('example', 'demo')
automaton.add_word('explain', 'clarify')

# Prefix matching
print(automaton.match('e'))       # True - 'e' is prefix of 'example'
print(automaton.match('ex'))      # True - 'ex' is prefix of 'example'
print(automaton.match('exam'))    # True - 'exam' is prefix of 'example'
print(automaton.match('example')) # True - exact match
print(automaton.match('test'))    # False - no pattern starts with 'test'

# Longest prefix
length = automaton.longest_prefix('examples')  # 7 (length of 'example')
length = automaton.longest_prefix('expla')     # 5 (length of 'expla')
length = automaton.longest_prefix('xyz')       # 0 (no matching prefix)

Key Iteration

Iterate over stored keys with optional filtering.

def keys(self, prefix=None, wildcard=None, how=ahocorasick.MATCH_AT_LEAST_PREFIX):
    """
    Return an iterator on keys.

    Parameters:
    - prefix: Optional prefix string to filter keys
    - wildcard: Optional single character for pattern matching
    - how: How to match patterns (MATCH_EXACT_LENGTH, MATCH_AT_LEAST_PREFIX, 
           MATCH_AT_MOST_PREFIX)

    Returns:
    Iterator yielding keys that match the criteria
    """

def __iter__(self):
    """
    Default iteration over all keys.

    Returns:
    Iterator over all keys in the automaton
    """

Usage Examples

automaton = ahocorasick.Automaton()
words = ['cat', 'car', 'card', 'care', 'careful', 'dog', 'door']
for word in words:
    automaton.add_word(word, len(word))

# Iterate over all keys
all_keys = list(automaton.keys())
print("All keys:", all_keys)

# Alternative using __iter__
all_keys_iter = list(automaton)
print("All keys (iter):", all_keys_iter)

# Filter by prefix
car_words = list(automaton.keys(prefix='car'))
print("Keys starting with 'car':", car_words)  # ['car', 'card', 'care', 'careful']

# Wildcard matching
pattern_keys = list(automaton.keys(prefix='ca.', wildcard='.'))
print("Keys matching 'ca.':", pattern_keys)  # ['cat', 'car']

# Different matching modes with wildcards
exact_match = list(automaton.keys(prefix='ca.', wildcard='.', 
                                 how=ahocorasick.MATCH_EXACT_LENGTH))
print("Exact length match:", exact_match)  # ['cat', 'car'] (exactly 3 chars)

at_least_match = list(automaton.keys(prefix='ca.', wildcard='.', 
                                    how=ahocorasick.MATCH_AT_LEAST_PREFIX))
print("At least prefix:", at_least_match)  # ['cat', 'car', 'card', 'care', 'careful']

Value Iteration

Iterate over stored values with same filtering options as keys.

def values(self, prefix=None, wildcard=None, how=ahocorasick.MATCH_AT_LEAST_PREFIX):
    """
    Return an iterator on values associated with keys.

    Parameters:
    - prefix: Optional prefix string to filter keys
    - wildcard: Optional single character for pattern matching  
    - how: How to match patterns

    Returns:
    Iterator yielding values for keys that match the criteria
    """

Usage Example

automaton = ahocorasick.Automaton()
words = {'cat': 'animal', 'car': 'vehicle', 'card': 'object', 'dog': 'animal'}
for word, category in words.items():
    automaton.add_word(word, category)

# All values
all_values = list(automaton.values())
print("All values:", all_values)

# Values for keys starting with 'car'
car_values = list(automaton.values(prefix='car'))
print("Values for 'car' prefix:", car_values)  # ['vehicle', 'object']

Item Iteration

Iterate over key-value pairs with filtering options.

def items(self, prefix=None, wildcard=None, how=ahocorasick.MATCH_AT_LEAST_PREFIX):
    """
    Return an iterator on tuples of (key, value).

    Parameters:
    - prefix: Optional prefix string to filter keys
    - wildcard: Optional single character for pattern matching
    - how: How to match patterns

    Returns:
    Iterator yielding (key, value) tuples for keys that match criteria
    """

Usage Example

automaton = ahocorasick.Automaton()
animals = {'cat': 'feline', 'car': 'vehicle', 'care': 'concern', 'dog': 'canine'}
for word, meaning in animals.items():
    automaton.add_word(word, meaning)

# All items
all_items = list(automaton.items())
print("All items:", all_items)

# Items with prefix
car_items = list(automaton.items(prefix='car'))
print("Items with 'car' prefix:", car_items)  # [('car', 'vehicle'), ('care', 'concern')]

# Items matching wildcard pattern
three_char_items = list(automaton.items(prefix='...', wildcard='.', 
                                       how=ahocorasick.MATCH_EXACT_LENGTH))
print("3-character items:", three_char_items)  # [('cat', 'feline'), ('car', 'vehicle'), ('dog', 'canine')]

Length Operation

Get the number of stored patterns.

def __len__(self):
    """
    Return the number of distinct keys added to the trie.

    Returns:
    int: Number of keys in the automaton
    """

Usage Example

automaton = ahocorasick.Automaton()
print(len(automaton))  # 0

automaton.add_word('hello', 1)
automaton.add_word('world', 2)
print(len(automaton))  # 2

automaton.add_word('hello', 3)  # Updating existing key
print(len(automaton))  # Still 2 (no new key added)

Pattern Matching Modes

When using wildcard patterns, you can control how matches are found:

MATCH_EXACT_LENGTH (default when a wildcard is given)

Match keys that have exactly the same length as the pattern.

# Pattern: 'c.t' (3 characters)
# Matches: 'cat', 'cut', 'cot'
# Doesn't match: 'cart', 'c', 'cute'

MATCH_AT_LEAST_PREFIX (default when no wildcard is given)

Match keys that are at least as long as the pattern.

# Pattern: 'c.t' (3 characters)  
# Matches: 'cat', 'cute', 'cattle'
# Doesn't match: 'c', 'ca', 'cart' (third character is not 't')

MATCH_AT_MOST_PREFIX

Match keys that are at most as long as the pattern.

# Pattern: 'c.t' (3 characters)
# Matches: 'cat', 'c', 'ca'
# Doesn't match: 'cart', 'cute'

Advanced Usage Patterns

Batch Operations

def batch_check_existence(automaton, keys_to_check):
    """Report, for each requested key, whether the automaton contains it.

    Parameters:
    - automaton: Object supporting the ``in`` operator for keys
    - keys_to_check: Iterable of keys to test

    Returns:
    dict mapping every key from keys_to_check to True or False
    """
    return {candidate: candidate in automaton for candidate in keys_to_check}

def batch_get_values(automaton, keys_to_get, default=None):
    """Look up several keys at once, substituting a default for misses.

    Parameters:
    - automaton: Object exposing ``get(key, default)``
    - keys_to_get: Iterable of keys to look up
    - default: Value stored for keys that are not found

    Returns:
    dict mapping every key from keys_to_get to its value or to default
    """
    return {wanted: automaton.get(wanted, default) for wanted in keys_to_get}

Pattern Statistics

def analyze_patterns(automaton):
    """Summarise the keys stored in an automaton.

    Walks ``automaton.keys()`` once and returns a dict with:

    - 'total_patterns': ``len(automaton)``
    - 'avg_length': summed key length divided by 'total_patterns'
      (left at 0 when the automaton is empty)
    - 'length_distribution': key length -> number of keys of that length
    - 'prefix_groups': first element of a key -> list of keys starting
      with it, in iteration order (empty keys are not grouped)
    """
    pattern_count = len(automaton)
    by_length = {}
    by_first = {}
    length_sum = 0

    for key in automaton.keys():
        size = len(key)
        length_sum += size

        # Tally how many keys share this length.
        by_length[size] = by_length.get(size, 0) + 1

        # An empty key has no first element to group on.
        if size > 0:
            by_first.setdefault(key[0], []).append(key)

    # Guard the division so an empty automaton reports an average of 0.
    mean_length = length_sum / pattern_count if pattern_count > 0 else 0

    return {
        'total_patterns': pattern_count,
        'avg_length': mean_length,
        'length_distribution': by_length,
        'prefix_groups': by_first,
    }

Custom Filtering

def filter_by_value_type(automaton, value_type):
    """Collect the keys whose stored value is an instance of a given type.

    Parameters:
    - automaton: Object exposing ``items()`` yielding (key, value) pairs
    - value_type: Type (or tuple of types) passed to ``isinstance``

    Returns:
    list of matching keys, in iteration order
    """
    return [
        name
        for name, payload in automaton.items()
        if isinstance(payload, value_type)
    ]

def filter_by_value_condition(automaton, condition_func):
    """Collect the (key, value) pairs whose value passes a predicate.

    Parameters:
    - automaton: Object exposing ``items()`` yielding (key, value) pairs
    - condition_func: Callable applied to each value; a truthy result
      keeps the pair

    Returns:
    list of (key, value) tuples, in iteration order
    """
    return [
        (name, payload)
        for name, payload in automaton.items()
        if condition_func(payload)
    ]

Install with Tessl CLI

npx tessl i tessl/pypi-pyahocorasick

docs

automaton-construction.md

dictionary-interface.md

index.md

pattern-search.md

serialization.md

tile.json