Fast and memory efficient library for exact or approximate multi-pattern string search using the Aho-Corasick algorithm
—
Efficient multi-pattern search operations using the built automaton, supporting various search modes including standard iteration, longest-match iteration, and callback-based processing.
Iterate over all pattern matches in text, returning overlapping matches.
def iter(self, string, start=0, end=None, ignore_white_space=False):
"""
Perform Aho-Corasick search procedure using the provided input string.
Parameters:
- string: Input text to search
- start: Optional start index for search slice
- end: Optional end index for search slice
- ignore_white_space: If True, ignore whitespace characters in matching
Returns:
Iterator of tuples (end_index, value) where:
- end_index is the end index in input string where a trie key was found
- value is the value associated with found key string
Raises:
- AttributeError: If automaton not built with make_automaton()
"""import ahocorasick
# Create and build automaton
automaton = ahocorasick.Automaton()
words = ['he', 'she', 'his', 'hers']
for i, word in enumerate(words):
automaton.add_word(word, (i, word))
automaton.make_automaton()
# Basic search
text = "she sells seashells by the seashore"
for end_index, (insert_order, original_string) in automaton.iter(text):
start_index = end_index - len(original_string) + 1
print(f"Found '{original_string}' at {start_index}-{end_index}")
# Search with slice bounds
text = "he she his hers"
matches = list(automaton.iter(text, start=3, end=10)) # "she hi"
for end_index, (insert_order, original_string) in matches:
print(f"Match: '{original_string}' ending at {end_index}")
# Ignore whitespace during matching
automaton_no_space = ahocorasick.Automaton()
automaton_no_space.add_word('hello', 'greeting')
automaton_no_space.make_automaton()
text_with_spaces = "h e l l o w o r l d"
matches = list(automaton_no_space.iter(text_with_spaces, ignore_white_space=True))
print(matches) # Will find 'hello' despite spacesIterate over non-overlapping longest matches only.
def iter_long(self, string, start=0, end=None):
"""
Perform modified Aho-Corasick search which matches the longest words from set.
Parameters:
- string: Input text to search
- start: Optional start index for search slice
- end: Optional end index for search slice
Returns:
Iterator of tuples (end_index, value) for non-overlapping longest matches
Raises:
- AttributeError: If automaton not built with make_automaton()
"""automaton = ahocorasick.Automaton()
# Add overlapping patterns of different lengths
patterns = ['he', 'her', 'hers', 'she']
for i, pattern in enumerate(patterns):
automaton.add_word(pattern, (i, pattern))
automaton.make_automaton()
text = "she hers"
# Standard iterator finds all overlapping matches
standard_matches = list(automaton.iter(text))
print("Standard matches:", standard_matches)
# Output: [(2, (3, 'she')), (2, (0, 'he')), (7, (0, 'he')), (7, (1, 'her')), (7, (2, 'hers'))]
# Long iterator finds only longest non-overlapping matches
long_matches = list(automaton.iter_long(text))
print("Longest matches:", long_matches)
# Output: [(2, (3, 'she')), (7, (2, 'hers'))]Search for patterns and invoke a callback function for each match.
def find_all(self, string, callback, start=0, end=None):
"""
Perform Aho-Corasick search procedure and invoke callback for each match.
Parameters:
- string: Input text to search
- callback: Callable that accepts (end_index, value) arguments
- start: Optional start index for search slice
- end: Optional end index for search slice
Raises:
- AttributeError: If automaton not built with make_automaton()
"""automaton = ahocorasick.Automaton()
words = ['cat', 'car', 'card']
for word in words:
automaton.add_word(word, word.upper())
automaton.make_automaton()
# Callback function to process matches
matches_found = []
def collect_matches(end_index, value):
matches_found.append({
'end_position': end_index,
'matched_word': value,
'length': len(value)
})
# Search with callback
text = "the car and cat ran to the card table"
automaton.find_all(text, collect_matches)
for match in matches_found:
start_pos = match['end_position'] - match['length'] + 1
print(f"Found '{match['matched_word']}' at position {start_pos}-{match['end_position']}")Advanced control over search iteration state.
class AutomatonSearchIter:
def set(self, string, reset=False):
"""
Set a new string to search in the iterator.
Parameters:
- string: New string to search
- reset: If False (default), continue search state;
if True, reset automaton state
Usage:
Allows searching large strings in chunks or continuing
search across multiple string segments.
"""automaton = ahocorasick.Automaton()
automaton.add_word('hello', 'greeting')
automaton.add_word('world', 'place')
automaton.make_automaton()
# Get iterator
search_iter = automaton.iter("hello")
# Process first chunk
for match in search_iter:
print(f"Match in first chunk: {match}")
# Continue with second chunk (maintains state)
search_iter.set(" world", reset=False)
for match in search_iter:
print(f"Match in second chunk: {match}")
# Reset and search new content
search_iter.set("hello again", reset=True)
for match in search_iter:
print(f"Match after reset: {match}")Advanced control over longest-match search iteration state.
class AutomatonSearchIterLong:
def set(self, string, reset=False):
"""
Set a new string to search in the longest-match iterator.
Parameters:
- string: New string to search
- reset: If False (default), continue search state;
if True, reset automaton state
Usage:
Allows searching large strings in chunks for longest matches
or continuing search across multiple string segments.
"""automaton = ahocorasick.Automaton()
automaton.add_word('he', 'short')
automaton.add_word('hello', 'long')
automaton.make_automaton()
# Get longest-match iterator
long_iter = automaton.iter_long("hello")
# Process first chunk
for match in long_iter:
print(f"Longest match in first chunk: {match}")
# Continue with second chunk (maintains state)
long_iter.set(" world", reset=False)
for match in long_iter:
print(f"Longest match in second chunk: {match}")
# Reset and search new content
long_iter.set("hello again", reset=True)
for match in long_iter:
print(f"Longest match after reset: {match}")iter(): Returns all overlapping matchesiter_long(): Returns only longest non-overlapping matchesfind_all(): Processes all overlapping matches via callbackdef extract_matches(automaton, text):
"""Extract all matches with their positions and original text."""
matches = []
for end_index, value in automaton.iter(text):
# Reconstruct original pattern length
if isinstance(value, tuple) and len(value) == 2:
_, pattern = value
start_index = end_index - len(pattern) + 1
matches.append({
'pattern': pattern,
'start': start_index,
'end': end_index,
'text': text[start_index:end_index + 1]
})
return matchesdef count_patterns(automaton, text):
"""Count occurrences of each pattern."""
counts = {}
for end_index, value in automaton.iter(text):
pattern = value if isinstance(value, str) else value[1]
counts[pattern] = counts.get(pattern, 0) + 1
return countsdef find_non_overlapping(automaton, text):
"""Find all non-overlapping matches (greedy left-to-right)."""
matches = []
last_end = -1
for end_index, value in automaton.iter(text):
pattern = value if isinstance(value, str) else value[1]
start_index = end_index - len(pattern) + 1
if start_index > last_end:
matches.append((start_index, end_index, pattern))
last_end = end_index
return matchesInstall with Tessl CLI
npx tessl i tessl/pypi-pyahocorasick