tessl/pypi-pyahocorasick

Fast and memory efficient library for exact or approximate multi-pattern string search using the Aho-Corasick algorithm

—

Pending

Overview

Eval results

Files

Pattern Search

Name: tessl/pypi-pyahocorasick
Author: tessl

Efficient multi-pattern search operations using the built automaton, supporting various search modes including standard iteration, longest-match iteration, and callback-based processing.

Capabilities

Standard Search Iterator

Iterate over all pattern matches in text, returning overlapping matches.

def iter(self, string, start=0, end=None, ignore_white_space=False):
    """
    Perform Aho-Corasick search procedure using the provided input string.

    Parameters:
    - string: Input text to search
    - start: Optional start index for search slice
    - end: Optional end index for search slice  
    - ignore_white_space: If True, ignore whitespace characters in matching

    Returns:
    Iterator of tuples (end_index, value) where:
    - end_index is the end index in input string where a trie key was found
    - value is the value associated with found key string

    Raises:
    - AttributeError: If automaton not built with make_automaton()
    """

Usage Examples

import ahocorasick

# Create and build automaton
automaton = ahocorasick.Automaton()
words = ['he', 'she', 'his', 'hers']
for i, word in enumerate(words):
    automaton.add_word(word, (i, word))

automaton.make_automaton()

# Basic search
text = "she sells seashells by the seashore"
for end_index, (insert_order, original_string) in automaton.iter(text):
    start_index = end_index - len(original_string) + 1
    print(f"Found '{original_string}' at {start_index}-{end_index}")

# Search with slice bounds
text = "he she his hers"
matches = list(automaton.iter(text, start=3, end=10))  # "she hi"
for end_index, (insert_order, original_string) in matches:
    print(f"Match: '{original_string}' ending at {end_index}")

# Ignore whitespace during matching
automaton_no_space = ahocorasick.Automaton()
automaton_no_space.add_word('hello', 'greeting')
automaton_no_space.make_automaton()

text_with_spaces = "h e l l o   w o r l d"
matches = list(automaton_no_space.iter(text_with_spaces, ignore_white_space=True))
print(matches)  # Will find 'hello' despite spaces

Longest Match Iterator

Iterate over non-overlapping longest matches only.

def iter_long(self, string, start=0, end=None):
    """
    Perform modified Aho-Corasick search which matches the longest words from set.

    Parameters:
    - string: Input text to search
    - start: Optional start index for search slice
    - end: Optional end index for search slice

    Returns:
    Iterator of tuples (end_index, value) for non-overlapping longest matches

    Raises:
    - AttributeError: If automaton not built with make_automaton()
    """

Usage Example

automaton = ahocorasick.Automaton()
# Add overlapping patterns of different lengths
patterns = ['he', 'her', 'hers', 'she']
for i, pattern in enumerate(patterns):
    automaton.add_word(pattern, (i, pattern))

automaton.make_automaton()

text = "she hers"

# Standard iterator finds all overlapping matches
standard_matches = list(automaton.iter(text))
print("Standard matches:", standard_matches)
# Output: [(2, (3, 'she')), (2, (0, 'he')), (7, (0, 'he')), (7, (1, 'her')), (7, (2, 'hers'))]

# Long iterator finds only longest non-overlapping matches
long_matches = list(automaton.iter_long(text))
print("Longest matches:", long_matches)
# Output: [(2, (3, 'she')), (7, (2, 'hers'))]

Callback-based Search

Search for patterns and invoke a callback function for each match.

def find_all(self, string, callback, start=0, end=None):
    """
    Perform Aho-Corasick search procedure and invoke callback for each match.

    Parameters:
    - string: Input text to search
    - callback: Callable that accepts (end_index, value) arguments
    - start: Optional start index for search slice
    - end: Optional end index for search slice

    Raises:
    - AttributeError: If automaton not built with make_automaton()
    """

Usage Example

automaton = ahocorasick.Automaton()
words = ['cat', 'car', 'card']
for word in words:
    automaton.add_word(word, word.upper())

automaton.make_automaton()

# Callback function to process matches
matches_found = []
def collect_matches(end_index, value):
    matches_found.append({
        'end_position': end_index,
        'matched_word': value,
        'length': len(value)
    })

# Search with callback
text = "the car and cat ran to the card table"
automaton.find_all(text, collect_matches)

for match in matches_found:
    start_pos = match['end_position'] - match['length'] + 1
    print(f"Found '{match['matched_word']}' at position {start_pos}-{match['end_position']}")

Search Iterator Manipulation

Advanced control over search iteration state.

class AutomatonSearchIter:
    def set(self, string, reset=False):
        """
        Set a new string to search in the iterator.

        Parameters:
        - string: New string to search
        - reset: If False (default), continue search state; 
                 if True, reset automaton state

        Usage:
        Allows searching large strings in chunks or continuing
        search across multiple string segments.
        """

Usage Example

automaton = ahocorasick.Automaton()
automaton.add_word('hello', 'greeting')
automaton.add_word('world', 'place')
automaton.make_automaton()

# Get iterator
search_iter = automaton.iter("hello")

# Process first chunk
for match in search_iter:
    print(f"Match in first chunk: {match}")

# Continue with second chunk (maintains state)
search_iter.set(" world", reset=False)
for match in search_iter:
    print(f"Match in second chunk: {match}")

# Reset and search new content
search_iter.set("hello again", reset=True)
for match in search_iter:
    print(f"Match after reset: {match}")

Long Search Iterator Manipulation

Advanced control over longest-match search iteration state.

class AutomatonSearchIterLong:
    def set(self, string, reset=False):
        """
        Set a new string to search in the longest-match iterator.

        Parameters:
        - string: New string to search
        - reset: If False (default), continue search state; 
                 if True, reset automaton state

        Usage:
        Allows searching large strings in chunks for longest matches
        or continuing search across multiple string segments.
        """

Usage Example

automaton = ahocorasick.Automaton()
automaton.add_word('he', 'short')
automaton.add_word('hello', 'long')
automaton.make_automaton()

# Get longest-match iterator
long_iter = automaton.iter_long("hello")

# Process first chunk
for match in long_iter:
    print(f"Longest match in first chunk: {match}")

# Continue with second chunk (maintains state)
long_iter.set(" world", reset=False)
for match in long_iter:
    print(f"Longest match in second chunk: {match}")

# Reset and search new content
long_iter.set("hello again", reset=True)
for match in long_iter:
    print(f"Longest match after reset: {match}")

Search Performance Notes

Memory Efficiency

All search methods return iterators, not lists
Memory usage is constant regardless of text size or match count
Suitable for processing large texts or streaming data

Time Complexity

Search time is O(n + m) where n is text length and m is number of matches
Performance independent of number of patterns (within reasonable limits)
Worst-case and average-case performance are similar

Pattern Overlap Handling

iter(): Returns all overlapping matches
iter_long(): Returns only longest non-overlapping matches
find_all(): Processes all overlapping matches via callback

Common Search Patterns

Extract All Matches with Positions

def extract_matches(automaton, text):
    """Extract all matches with their positions and original text."""
    matches = []
    for end_index, value in automaton.iter(text):
        # Reconstruct original pattern length
        if isinstance(value, tuple) and len(value) == 2:
            _, pattern = value
            start_index = end_index - len(pattern) + 1
            matches.append({
                'pattern': pattern,
                'start': start_index,
                'end': end_index,
                'text': text[start_index:end_index + 1]
            })
    return matches

Count Pattern Occurrences

def count_patterns(automaton, text):
    """Count occurrences of each pattern."""
    counts = {}
    for end_index, value in automaton.iter(text):
        pattern = value if isinstance(value, str) else value[1]
        counts[pattern] = counts.get(pattern, 0) + 1
    return counts

Find Non-overlapping Matches

def find_non_overlapping(automaton, text):
    """Find all non-overlapping matches (greedy left-to-right)."""
    matches = []
    last_end = -1
    
    for end_index, value in automaton.iter(text):
        pattern = value if isinstance(value, str) else value[1]
        start_index = end_index - len(pattern) + 1
        
        if start_index > last_end:
            matches.append((start_index, end_index, pattern))
            last_end = end_index
    
    return matches

Install with Tessl CLI