Fast and memory efficient library for exact or approximate multi-pattern string search using the Aho-Corasick algorithm
—
Dict-like operations for accessing stored patterns and values, including existence checking, value retrieval, and iteration over keys, values, and items with optional filtering.
Retrieve values associated with keys using dict-like methods.
def get(self, key, default=None):
"""
Return the value associated with the key string.
Parameters:
- key: Key to look up
- default: Value to return if key not found
Returns:
The value associated with key, or default if key not found
Raises:
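- KeyError: If key is not found and no default argument is passed (a bare get(key) raises rather than returning None; pass default explicitly to avoid the exception)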
"""
def __getitem__(self, key):
"""
Get value for key using bracket notation.
Parameters:
- key: Key to look up
Returns:
The value associated with key
Raises:
- KeyError: If key not found
"""import ahocorasick
automaton = ahocorasick.Automaton()
automaton.add_word('hello', 'greeting')
automaton.add_word('world', 'place')
automaton.add_word('python', {'type': 'language', 'year': 1991})
# Using get() method
greeting = automaton.get('hello') # 'greeting'
missing = automaton.get('missing', 'not found') # 'not found'
# Using bracket notation
place = automaton['world'] # 'place'
lang_info = automaton['python'] # {'type': 'language', 'year': 1991}
# KeyError when key doesn't exist
try:
value = automaton['missing']
except KeyError:
print("Key not found")

Check if keys exist in the automaton.
def exists(self, key):
"""
Return True if the key is present in the trie.
Parameters:
- key: Key to check
Returns:
bool: True if key exists, False otherwise
"""
def __contains__(self, key):
"""
Support for 'in' operator.
Parameters:
- key: Key to check
Returns:
bool: True if key exists, False otherwise
"""automaton = ahocorasick.Automaton()
automaton.add_word('cat', 'animal')
automaton.add_word('car', 'vehicle')
# Using exists() method
has_cat = automaton.exists('cat') # True
has_dog = automaton.exists('dog') # False
# Using 'in' operator
if 'car' in automaton:
print("Found car!")
if 'bike' not in automaton:
print("Bike not found")

Check if a key is a prefix of any stored pattern.
def match(self, key):
"""
Return True if there is a prefix (or key) equal to key present in the trie.
Parameters:
- key: Key to check as prefix
Returns:
bool: True if key is a prefix of any stored pattern
Examples:
If 'example' is in the trie, then match('e'), match('ex'),
match('exa'), ..., match('example') all return True.
"""
def longest_prefix(self, string):
"""
Return the length of the longest prefix of string that exists in the trie.
Parameters:
- string: String to check
Returns:
int: Length of longest matching prefix
"""automaton = ahocorasick.Automaton()
automaton.add_word('example', 'demo')
automaton.add_word('explain', 'clarify')
# Prefix matching
print(automaton.match('e')) # True - 'e' is prefix of 'example'
print(automaton.match('ex')) # True - 'ex' is prefix of 'example'
print(automaton.match('exam')) # True - 'exam' is prefix of 'example'
print(automaton.match('example')) # True - exact match
print(automaton.match('test')) # False - no pattern starts with 'test'
# Longest prefix
length = automaton.longest_prefix('examples') # 7 (length of 'example')
length = automaton.longest_prefix('expla') # 5 (length of 'expla')
length = automaton.longest_prefix('xyz') # 0 (no matching prefix)

Iterate over stored keys with optional filtering.
def keys(self, prefix=None, wildcard=None, how=ahocorasick.MATCH_AT_LEAST_PREFIX):
"""
Return an iterator on keys.
Parameters:
- prefix: Optional prefix string to filter keys
- wildcard: Optional single character for pattern matching
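- how: How keys are matched when a wildcard is given (MATCH_EXACT_LENGTH,
MATCH_AT_LEAST_PREFIX, MATCH_AT_MOST_PREFIX). Note: the upstream pyahocorasick
documentation describes MATCH_EXACT_LENGTH as the default for wildcard patterns,
which is what the 'ca.' example result ['cat', 'car'] below reflects.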
Returns:
Iterator yielding keys that match the criteria
"""
def __iter__(self):
"""
Default iteration over all keys.
Returns:
Iterator over all keys in the automaton
"""automaton = ahocorasick.Automaton()
words = ['cat', 'car', 'card', 'care', 'careful', 'dog', 'door']
for word in words:
automaton.add_word(word, len(word))
# Iterate over all keys
all_keys = list(automaton.keys())
print("All keys:", all_keys)
# Alternative using __iter__
all_keys_iter = list(automaton)
print("All keys (iter):", all_keys_iter)
# Filter by prefix
car_words = list(automaton.keys(prefix='car'))
print("Keys starting with 'car':", car_words) # ['car', 'card', 'care', 'careful']
# Wildcard matching
pattern_keys = list(automaton.keys(prefix='ca.', wildcard='.'))
print("Keys matching 'ca.':", pattern_keys) # ['cat', 'car']
# Different matching modes with wildcards
exact_match = list(automaton.keys(prefix='ca.', wildcard='.',
how=ahocorasick.MATCH_EXACT_LENGTH))
print("Exact length match:", exact_match) # ['cat', 'car'] (exactly 3 chars)
at_least_match = list(automaton.keys(prefix='ca.', wildcard='.',
how=ahocorasick.MATCH_AT_LEAST_PREFIX))
print("At least prefix:", at_least_match) # ['cat', 'car', 'card', 'care', 'careful']Iterate over stored values with same filtering options as keys.
def values(self, prefix=None, wildcard=None, how=ahocorasick.MATCH_AT_LEAST_PREFIX):
"""
Return an iterator on values associated with keys.
Parameters:
- prefix: Optional prefix string to filter keys
- wildcard: Optional single character for pattern matching
- how: How to match patterns
Returns:
Iterator yielding values for keys that match the criteria
"""automaton = ahocorasick.Automaton()
words = {'cat': 'animal', 'car': 'vehicle', 'card': 'object', 'dog': 'animal'}
for word, category in words.items():
automaton.add_word(word, category)
# All values
all_values = list(automaton.values())
print("All values:", all_values)
# Values for keys starting with 'car'
car_values = list(automaton.values(prefix='car'))
print("Values for 'car' prefix:", car_values) # ['vehicle', 'object']

Iterate over key-value pairs with filtering options.
def items(self, prefix=None, wildcard=None, how=ahocorasick.MATCH_AT_LEAST_PREFIX):
"""
Return an iterator on tuples of (key, value).
Parameters:
- prefix: Optional prefix string to filter keys
- wildcard: Optional single character for pattern matching
- how: How to match patterns
Returns:
Iterator yielding (key, value) tuples for keys that match criteria
"""automaton = ahocorasick.Automaton()
animals = {'cat': 'feline', 'car': 'vehicle', 'care': 'concern', 'dog': 'canine'}
for word, meaning in animals.items():
automaton.add_word(word, meaning)
# All items
all_items = list(automaton.items())
print("All items:", all_items)
# Items with prefix
car_items = list(automaton.items(prefix='car'))
print("Items with 'car' prefix:", car_items) # [('car', 'vehicle'), ('care', 'concern')]
# Items matching wildcard pattern
three_char_items = list(automaton.items(prefix='...', wildcard='.',
how=ahocorasick.MATCH_EXACT_LENGTH))
print("3-character items:", three_char_items) # [('cat', 'feline'), ('car', 'vehicle'), ('dog', 'canine')]

Get the number of stored patterns.
def __len__(self):
"""
Return the number of distinct keys added to the trie.
Returns:
int: Number of keys in the automaton
"""automaton = ahocorasick.Automaton()
print(len(automaton)) # 0
automaton.add_word('hello', 1)
automaton.add_word('world', 2)
print(len(automaton)) # 2
automaton.add_word('hello', 3) # Updating existing key
print(len(automaton)) # Still 2 (no new key added)

When using wildcard patterns, you can control how matches are found:
MATCH_EXACT_LENGTH: match keys that have exactly the same length as the pattern.
# Pattern: 'c.t' (3 characters)
# Matches: 'cat', 'cut', 'cot'
# Doesn't match: 'cart', 'c', 'cute'

MATCH_AT_LEAST_PREFIX: match keys that are at least as long as the pattern.
# Pattern: 'c.t' (3 characters)
# Matches: 'cat', 'cats', 'cute', 'cattle' (not 'cart': its third character is 'r', not 't')
# Doesn't match: 'c', 'ca'

MATCH_AT_MOST_PREFIX: match keys that are at most as long as the pattern.
# Pattern: 'c.t' (3 characters)
# Matches: 'cat', 'c', 'ca'
# Doesn't match: 'cart', 'cute'


def batch_check_existence(automaton, keys_to_check):
    """Report which of the given keys are stored in the automaton.

    Parameters:
    - automaton: Object supporting the 'in' operator for key lookup
    - keys_to_check: Iterable of keys to test
    Returns:
    dict: Maps each checked key to True if it is present, False otherwise
    """
    return {key: key in automaton for key in keys_to_check}
def batch_get_values(automaton, keys_to_get, default=None):
    """Look up several keys at once, substituting a default for missing ones.

    Parameters:
    - automaton: Object exposing a dict-like get(key, default) method
    - keys_to_get: Iterable of keys to look up
    - default: Value stored for keys that are not found
    Returns:
    dict: Maps each requested key to its value, or to default if absent
    """
    return {key: automaton.get(key, default) for key in keys_to_get}


def analyze_patterns(automaton):
    """Summarize the keys stored in the automaton.

    Parameters:
    - automaton: Object supporting len() and keys(); each key must support
      len() and indexing
    Returns:
    dict: With the entries
      'total_patterns': len(automaton)
      'avg_length': total key length / total_patterns (0 when there are none)
      'length_distribution': key length -> number of keys of that length
      'prefix_groups': first element of a key -> list of keys starting with it
    """
    length_distribution = {}
    prefix_groups = {}
    total_length = 0

    for key in automaton.keys():
        length = len(key)
        total_length += length
        length_distribution[length] = length_distribution.get(length, 0) + 1
        # An empty key has no first element, so it joins no prefix group.
        if length > 0:
            prefix_groups.setdefault(key[0], []).append(key)

    total_patterns = len(automaton)
    # Guard the division so an empty automaton reports 0 instead of raising.
    avg_length = total_length / total_patterns if total_patterns > 0 else 0

    return {
        'total_patterns': total_patterns,
        'avg_length': avg_length,
        'length_distribution': length_distribution,
        'prefix_groups': prefix_groups,
    }


def filter_by_value_type(automaton, value_type):
    """Collect the keys whose stored value is an instance of value_type.

    Parameters:
    - automaton: Object exposing items() that yields (key, value) pairs
    - value_type: Type (or tuple of types) accepted by isinstance()
    Returns:
    list: Keys whose values pass the isinstance() check, in iteration order
    """
    return [
        key for key, value in automaton.items() if isinstance(value, value_type)
    ]
def filter_by_value_condition(automaton, condition_func):
    """Collect the (key, value) pairs whose value satisfies a predicate.

    Parameters:
    - automaton: Object exposing items() that yields (key, value) pairs
    - condition_func: Callable taking a value; a truthy result keeps the pair
    Returns:
    list: (key, value) tuples that passed the predicate, in iteration order
    """
    return [
        (key, value) for key, value in automaton.items() if condition_func(value)
    ]


# Install with Tessl CLI
npx tessl i tessl/pypi-pyahocorasick