Fast and memory efficient library for exact or approximate multi-pattern string search using the Aho-Corasick algorithm
—
Core functionality for creating and managing Aho-Corasick automata, including adding and removing patterns, configuring storage types, and converting tries to search-ready automatons.
Create a new empty Automaton with configurable storage and key types.
class Automaton:
def __init__(self, store=ahocorasick.STORE_ANY, key_type=ahocorasick.KEY_STRING):
"""
Create a new empty Automaton.
Parameters:
- store: Storage type for values (STORE_ANY, STORE_INTS, STORE_LENGTH)
- key_type: Type of keys (KEY_STRING, KEY_SEQUENCE)
"""import ahocorasick
# Default: store any objects with string keys
automaton = ahocorasick.Automaton()
# Store integers only
int_automaton = ahocorasick.Automaton(ahocorasick.STORE_INTS)
# Store key lengths automatically
length_automaton = ahocorasick.Automaton(ahocorasick.STORE_LENGTH)
# Use integer sequences as keys
seq_automaton = ahocorasick.Automaton(key_type=ahocorasick.KEY_SEQUENCE)Add key strings to the trie with associated values.
def add_word(self, key, value=None):
"""
Add a key string to the trie and associate with a value.
Parameters:
- key: String key to add (or integer sequence if KEY_SEQUENCE)
- value: Associated value (required for STORE_ANY, optional for STORE_INTS,
not allowed for STORE_LENGTH)
Returns:
bool: True if new word added, False if word already exists
Raises:
- ValueError: If value type doesn't match store type
- TypeError: If key type is incorrect
"""# STORE_ANY (default) - value required
automaton = ahocorasick.Automaton()
automaton.add_word('hello', 'greeting')
automaton.add_word('world', {'type': 'noun', 'meaning': 'earth'})
# STORE_INTS - value optional (defaults to insertion order)
int_automaton = ahocorasick.Automaton(ahocorasick.STORE_INTS)
int_automaton.add_word('cat', 1)
int_automaton.add_word('dog') # defaults to 1 (second word added)
# STORE_LENGTH - value not allowed (automatically uses key length)
length_automaton = ahocorasick.Automaton(ahocorasick.STORE_LENGTH)
length_automaton.add_word('cat') # value will be 3
length_automaton.add_word('mouse') # value will be 5
# KEY_SEQUENCE - use integer sequences as keys
seq_automaton = ahocorasick.Automaton(key_type=ahocorasick.KEY_SEQUENCE)
seq_automaton.add_word([1, 2, 3], 'sequence1')
seq_automaton.add_word((4, 5, 6), 'sequence2')Remove words from the trie.
def remove_word(self, key):
"""
Remove given word from the trie.
Parameters:
- key: Word to remove
Returns:
bool: True if word was found and removed, False otherwise
"""
def pop(self, key):
"""
Remove word from trie and return its associated value.
Parameters:
- key: Word to remove
Returns:
The value associated with key
Raises:
- KeyError: If key not found
"""automaton = ahocorasick.Automaton()
automaton.add_word('hello', 'greeting')
automaton.add_word('world', 'noun')
# Remove word and check if it existed
was_removed = automaton.remove_word('hello') # True
was_removed = automaton.remove_word('missing') # False
# Remove word and get its value
value = automaton.pop('world') # 'noun'
# Pop raises KeyError if key not found
try:
value = automaton.pop('missing')
except KeyError:
print("Key not found")Remove all words from the automaton.
def clear(self):
"""
Remove all keys from the trie.
This method invalidates all iterators.
"""Convert the trie to an Aho-Corasick automaton for efficient searching.
def make_automaton(self):
"""
Finalize and create the Aho-Corasick automaton based on the keys
already added to the trie. After successful creation, the
automaton.kind attribute is set to ahocorasick.AHOCORASICK.
"""automaton = ahocorasick.Automaton()
# Add words to the trie
words = ['he', 'she', 'his', 'hers']
for i, word in enumerate(words):
automaton.add_word(word, i)
print(automaton.kind) # ahocorasick.TRIE
# Convert to automaton for searching
automaton.make_automaton()
print(automaton.kind) # ahocorasick.AHOCORASICK
# Now ready for efficient pattern search
text = "she sells seashells"
matches = list(automaton.iter(text))Access automaton state and configuration information.
@property
def kind(self):
"""
Current state of automaton.
Returns:
int: One of EMPTY, TRIE, or AHOCORASICK
"""
@property
def store(self):
"""
Storage type used by automaton.
Returns:
int: One of STORE_ANY, STORE_INTS, or STORE_LENGTH
"""
def __len__(self):
"""
Number of words in automaton.
Returns:
int: Count of distinct words added
"""Get detailed information about the automaton's structure and memory usage.
def get_stats(self):
"""
Return a dictionary containing Automaton statistics.
Returns:
dict: Statistics with keys:
- nodes_count: Total number of nodes
- words_count: Number of distinct words (same as len(automaton))
- longest_word: Length of the longest word
- links_count: Number of edges
- sizeof_node: Size of single node in bytes
- total_size: Total size of trie in bytes
"""automaton = ahocorasick.Automaton()
for word in ['cat', 'car', 'card', 'care', 'careful']:
automaton.add_word(word, len(word))
automaton.make_automaton()
stats = automaton.get_stats()
print(f"Nodes: {stats['nodes_count']}")
print(f"Words: {stats['words_count']}")
print(f"Longest word: {stats['longest_word']}")
print(f"Memory usage: {stats['total_size']} bytes")Get the approximate memory usage of the automaton instance.
def __sizeof__(self):
"""
Return the approximate size in bytes occupied by the Automaton instance in memory.
Returns:
int: Size in bytes (excludes size of associated objects when using STORE_ANY)
"""Get detailed internal structure information for debugging and inspection.
def dump(self):
"""
Return a three-tuple of lists describing the Automaton as a graph of nodes, edges, failure links.
Returns:
tuple: (nodes, edges, failure_links) where:
- nodes: list of (node_id, end_of_word_marker) pairs
- edges: list of (node_id, label_char, child_node_id) triples
- failure_links: list of (source_node_id, target_node_id) pairs
Returns None if automaton is empty.
"""automaton = ahocorasick.Automaton()
automaton.add_word('cat', 1)
automaton.add_word('car', 2)
automaton.make_automaton()
# Get memory usage
memory_size = automaton.__sizeof__()
print(f"Automaton uses {memory_size} bytes")
# Get internal structure for debugging
structure = automaton.dump()
if structure:
nodes, edges, failure_links = structure
print(f"Nodes: {len(nodes)}")
print(f"Edges: {len(edges)}")
print(f"Failure links: {len(failure_links)}")
# Example: print first few nodes
for node_id, is_end in nodes[:3]:
print(f"Node {node_id}: end_marker={is_end}")add_word()add_word()Install with Tessl CLI
npx tessl i tessl/pypi-pyahocorasick