CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-pyahocorasick

Fast and memory efficient library for exact or approximate multi-pattern string search using the Aho-Corasick algorithm

Pending
Overview
Eval results
Files

pattern-search.mddocs/

Pattern Search

Efficient multi-pattern search operations using the built automaton, supporting various search modes including standard iteration, longest-match iteration, and callback-based processing.

Capabilities

Standard Search Iterator

Iterate over all pattern matches in text, returning overlapping matches.

def iter(self, string, start=0, end=None, ignore_white_space=False):
    """
    Perform Aho-Corasick search procedure using the provided input string.

    Parameters:
    - string: Input text to search
    - start: Optional start index for search slice
    - end: Optional end index for search slice  
    - ignore_white_space: If True, ignore whitespace characters in matching

    Returns:
    Iterator of tuples (end_index, value) where:
    - end_index is the end index in input string where a trie key was found
    - value is the value associated with found key string

    Raises:
    - AttributeError: If automaton not built with make_automaton()
    """

Usage Examples

import ahocorasick

# Create and build automaton
automaton = ahocorasick.Automaton()
words = ['he', 'she', 'his', 'hers']
for i, word in enumerate(words):
    automaton.add_word(word, (i, word))

automaton.make_automaton()

# Basic search
text = "she sells seashells by the seashore"
for end_index, (insert_order, original_string) in automaton.iter(text):
    start_index = end_index - len(original_string) + 1
    print(f"Found '{original_string}' at {start_index}-{end_index}")

# Search with slice bounds
text = "he she his hers"
matches = list(automaton.iter(text, start=3, end=10))  # "she hi"
for end_index, (insert_order, original_string) in matches:
    print(f"Match: '{original_string}' ending at {end_index}")

# Ignore whitespace during matching
automaton_no_space = ahocorasick.Automaton()
automaton_no_space.add_word('hello', 'greeting')
automaton_no_space.make_automaton()

text_with_spaces = "h e l l o   w o r l d"
matches = list(automaton_no_space.iter(text_with_spaces, ignore_white_space=True))
print(matches)  # Will find 'hello' despite spaces

Longest Match Iterator

Iterate over non-overlapping longest matches only.

def iter_long(self, string, start=0, end=None):
    """
    Perform modified Aho-Corasick search which matches the longest words from set.

    Parameters:
    - string: Input text to search
    - start: Optional start index for search slice
    - end: Optional end index for search slice

    Returns:
    Iterator of tuples (end_index, value) for non-overlapping longest matches

    Raises:
    - AttributeError: If automaton not built with make_automaton()
    """

Usage Example

automaton = ahocorasick.Automaton()
# Add overlapping patterns of different lengths
patterns = ['he', 'her', 'hers', 'she']
for i, pattern in enumerate(patterns):
    automaton.add_word(pattern, (i, pattern))

automaton.make_automaton()

text = "she hers"

# Standard iterator finds all overlapping matches
standard_matches = list(automaton.iter(text))
print("Standard matches:", standard_matches)
# Output: [(2, (3, 'she')), (2, (0, 'he')), (7, (0, 'he')), (7, (1, 'her')), (7, (2, 'hers'))]

# Long iterator finds only longest non-overlapping matches
long_matches = list(automaton.iter_long(text))
print("Longest matches:", long_matches)
# Output: [(2, (3, 'she')), (7, (2, 'hers'))]

Callback-based Search

Search for patterns and invoke a callback function for each match.

def find_all(self, string, callback, start=0, end=None):
    """
    Perform Aho-Corasick search procedure and invoke callback for each match.

    Parameters:
    - string: Input text to search
    - callback: Callable that accepts (end_index, value) arguments
    - start: Optional start index for search slice
    - end: Optional end index for search slice

    Raises:
    - AttributeError: If automaton not built with make_automaton()
    """

Usage Example

automaton = ahocorasick.Automaton()
words = ['cat', 'car', 'card']
for word in words:
    automaton.add_word(word, word.upper())

automaton.make_automaton()

# Callback function to process matches
matches_found = []
def collect_matches(end_index, value):
    matches_found.append({
        'end_position': end_index,
        'matched_word': value,
        'length': len(value)
    })

# Search with callback
text = "the car and cat ran to the card table"
automaton.find_all(text, collect_matches)

for match in matches_found:
    start_pos = match['end_position'] - match['length'] + 1
    print(f"Found '{match['matched_word']}' at position {start_pos}-{match['end_position']}")

Search Iterator Manipulation

Advanced control over search iteration state.

class AutomatonSearchIter:
    def set(self, string, reset=False):
        """
        Set a new string to search in the iterator.

        Parameters:
        - string: New string to search
        - reset: If False (default), continue search state; 
                 if True, reset automaton state

        Usage:
        Allows searching large strings in chunks or continuing
        search across multiple string segments.
        """

Usage Example

automaton = ahocorasick.Automaton()
automaton.add_word('hello', 'greeting')
automaton.add_word('world', 'place')
automaton.make_automaton()

# Get iterator
search_iter = automaton.iter("hello")

# Process first chunk
for match in search_iter:
    print(f"Match in first chunk: {match}")

# Continue with second chunk (maintains state)
search_iter.set(" world", reset=False)
for match in search_iter:
    print(f"Match in second chunk: {match}")

# Reset and search new content
search_iter.set("hello again", reset=True)
for match in search_iter:
    print(f"Match after reset: {match}")

Long Search Iterator Manipulation

Advanced control over longest-match search iteration state.

class AutomatonSearchIterLong:
    def set(self, string, reset=False):
        """
        Set a new string to search in the longest-match iterator.

        Parameters:
        - string: New string to search
        - reset: If False (default), continue search state; 
                 if True, reset automaton state

        Usage:
        Allows searching large strings in chunks for longest matches
        or continuing search across multiple string segments.
        """

Usage Example

automaton = ahocorasick.Automaton()
automaton.add_word('he', 'short')
automaton.add_word('hello', 'long')
automaton.make_automaton()

# Get longest-match iterator
long_iter = automaton.iter_long("hello")

# Process first chunk
for match in long_iter:
    print(f"Longest match in first chunk: {match}")

# Continue with second chunk (maintains state)
long_iter.set(" world", reset=False)
for match in long_iter:
    print(f"Longest match in second chunk: {match}")

# Reset and search new content
long_iter.set("hello again", reset=True)
for match in long_iter:
    print(f"Longest match after reset: {match}")

Search Performance Notes

Memory Efficiency

  • All search methods return iterators, not lists
  • Memory usage is constant regardless of text size or match count
  • Suitable for processing large texts or streaming data

Time Complexity

  • Search time is O(n + m) where n is text length and m is number of matches
  • Performance independent of number of patterns (within reasonable limits)
  • Worst-case and average-case performance are similar

Pattern Overlap Handling

  • iter(): Returns all overlapping matches
  • iter_long(): Returns only longest non-overlapping matches
  • find_all(): Processes all overlapping matches via callback

Common Search Patterns

Extract All Matches with Positions

def extract_matches(automaton, text):
    """Extract all matches with their positions and original text."""
    matches = []
    for end_index, value in automaton.iter(text):
        # Reconstruct original pattern length
        if isinstance(value, tuple) and len(value) == 2:
            _, pattern = value
            start_index = end_index - len(pattern) + 1
            matches.append({
                'pattern': pattern,
                'start': start_index,
                'end': end_index,
                'text': text[start_index:end_index + 1]
            })
    return matches

Count Pattern Occurrences

def count_patterns(automaton, text):
    """Count occurrences of each pattern."""
    counts = {}
    for end_index, value in automaton.iter(text):
        pattern = value if isinstance(value, str) else value[1]
        counts[pattern] = counts.get(pattern, 0) + 1
    return counts

Find Non-overlapping Matches

def find_non_overlapping(automaton, text):
    """Find all non-overlapping matches (greedy left-to-right)."""
    matches = []
    last_end = -1
    
    for end_index, value in automaton.iter(text):
        pattern = value if isinstance(value, str) else value[1]
        start_index = end_index - len(pattern) + 1
        
        if start_index > last_end:
            matches.append((start_index, end_index, pattern))
            last_end = end_index
    
    return matches

Install with Tessl CLI

npx tessl i tessl/pypi-pyahocorasick

docs

automaton-construction.md

dictionary-interface.md

index.md

pattern-search.md

serialization.md

tile.json