Fast and memory efficient library for exact or approximate multi-pattern string search using the Aho-Corasick algorithm
—
Save and load automaton instances to/from disk with support for custom serialization functions for arbitrary object storage and efficient built-in serialization for integer storage.
Save an automaton to disk for later reuse.
def save(self, path, serializer=None):
    """
    Save the content of the automaton to a file on disk.

    Parameters:
    - path: File path to save to.
    - serializer: Callable for converting Python objects to bytes
      (required for STORE_ANY, not used for STORE_INTS/STORE_LENGTH).

    Raises:
    - ValueError: If a serializer is required but not provided.
    - IOError: If the file cannot be written.
    """
import ahocorasick
import pickle

# STORE_ANY - values are arbitrary Python objects (dicts here), so a
# serializer is required when saving
automaton = ahocorasick.Automaton(ahocorasick.STORE_ANY)
automaton.add_word('hello', {'type': 'greeting', 'lang': 'en'})
automaton.add_word('world', {'type': 'noun', 'meaning': 'earth'})
automaton.make_automaton()
# Save with pickle serializer: pickle.dumps converts each stored value to bytes
automaton.save('my_automaton.dat', pickle.dumps)

# STORE_INTS - no serializer needed
int_automaton = ahocorasick.Automaton(ahocorasick.STORE_INTS)
int_automaton.add_word('cat', 1)
int_automaton.add_word('dog', 2)
int_automaton.make_automaton()
# Save without serializer
int_automaton.save('int_automaton.dat')

# STORE_LENGTH - no serializer needed; no value is passed to add_word
length_automaton = ahocorasick.Automaton(ahocorasick.STORE_LENGTH)
length_automaton.add_word('apple')  # value = 5
length_automaton.add_word('orange')  # value = 6
length_automaton.make_automaton()
# Save without serializer
length_automaton.save('length_automaton.dat')

Load a previously saved automaton from disk.
def ahocorasick.load(path, deserializer=None):
    """
    Load an automaton previously stored on disk using the save method.

    Parameters:
    - path: File path to load from.
    - deserializer: Callable for converting bytes back to Python objects
      (required for STORE_ANY automatons, not used for others).

    Returns:
    Automaton: Loaded automaton instance ready for use.

    Raises:
    - ValueError: If a deserializer is required but not provided.
    - IOError: If the file cannot be read.
    - PickleError: If deserialization fails.
    """
import ahocorasick
import pickle

# Load STORE_ANY automaton - requires deserializer.
# NOTE: pickle.loads can execute arbitrary code while unpickling, so only
# load files that come from a trusted source.
loaded_automaton = ahocorasick.load('my_automaton.dat', pickle.loads)
# Verify it works
print(loaded_automaton.get('hello'))  # {'type': 'greeting', 'lang': 'en'}
text = "hello world"
matches = list(loaded_automaton.iter(text))
print(matches)

# Load STORE_INTS automaton - no deserializer needed
int_automaton = ahocorasick.load('int_automaton.dat')
print(int_automaton.get('cat'))  # 1
print(int_automaton.get('dog'))  # 2

# Load STORE_LENGTH automaton - no deserializer needed
length_automaton = ahocorasick.load('length_automaton.dat')
print(length_automaton.get('apple'))  # 5
print(length_automaton.get('orange')) # 6

Automatons support Python's standard pickle module for serialization.
def __reduce__(self):
    """
    Return pickle-able data for this automaton instance.

    Returns:
    tuple: Data needed to reconstruct the automaton.

    Usage:
    This method enables standard pickle.dump() and pickle.load() operations.
    """
import ahocorasick
import pickle

# Create and populate automaton; each word is stored with its index in `words`
automaton = ahocorasick.Automaton()
words = ['the', 'quick', 'brown', 'fox']
for i, word in enumerate(words):
    automaton.add_word(word, i)
automaton.make_automaton()

# Pickle to bytes
pickled_data = pickle.dumps(automaton)
# Unpickle from bytes
restored_automaton = pickle.loads(pickled_data)
# Verify functionality
print(restored_automaton.get('quick'))  # 1
matches = list(restored_automaton.iter('the quick brown fox'))
print(len(matches))  # 4

# Pickle to file (binary mode, since pickle produces bytes)
with open('automaton.pickle', 'wb') as f:
    pickle.dump(automaton, f)
# Unpickle from file
with open('automaton.pickle', 'rb') as f:
    file_automaton = pickle.load(f)
print(file_automaton.get('fox')) # 3

| Feature | save/load | pickle |
|---|---|---|
| Performance | Faster for large automatons | Slower, more overhead |
| File Size | Smaller files | Larger files |
| Portability | pyahocorasick specific | Standard Python |
| Flexibility | Custom serializers | Full object graph |
| Memory Usage | Lower during operation | Higher during operation |

| Storage Type | save/load Serializer | pickle Support | Notes |
|---|---|---|---|
| STORE_INTS | Not required | Yes | Most efficient |
| STORE_LENGTH | Not required | Yes | Very efficient |
| STORE_ANY | Required | Yes | Depends on object complexity |
import ahocorasick
import json
import pickle
class CustomSerializer:
    """Serializer/deserializer pair for automaton values.

    Dicts that JSON can represent without loss are stored as UTF-8 JSON
    text; every other object is stored as a pickle. ``deserialize`` accepts
    both formats, so the two static methods can be passed directly as the
    ``serializer`` / ``deserializer`` callables.
    """

    @staticmethod
    def serialize(obj):
        """Convert object to bytes.

        Parameters:
        - obj: The value to encode.

        Returns:
        bytes: UTF-8 JSON for dicts that survive a JSON round trip
        unchanged, otherwise the pickle of ``obj``.
        """
        if isinstance(obj, dict):
            try:
                encoded = json.dumps(obj)
                # JSON rewrites non-string keys as strings and tuples as
                # lists, so only use it when decoding gives back an equal
                # object; otherwise the value would change on load.
                if json.loads(encoded) == obj:
                    return encoded.encode('utf-8')
            except (TypeError, ValueError):
                # Value type JSON cannot encode, or a circular reference:
                # fall through to pickle instead of failing the save.
                pass
        return pickle.dumps(obj)

    @staticmethod
    def deserialize(data):
        """Convert bytes back to object.

        Parameters:
        - data: Bytes previously produced by ``serialize``.

        Returns:
        The decoded object.
        """
        try:
            # Try JSON first. Binary pickles (protocol 2 and later) start
            # with byte 0x80, which is not valid UTF-8, so they fail here
            # and take the fallback below.
            return json.loads(data.decode('utf-8'))
        except (UnicodeDecodeError, json.JSONDecodeError):
            # Fall back to pickle.
            # NOTE: unpickling can execute arbitrary code; only load data
            # from a trusted source.
            return pickle.loads(data)
# Usage
automaton = ahocorasick.Automaton()
# One dict value and one list value, so both serializer branches are exercised
automaton.add_word('config', {'host': 'localhost', 'port': 8080})
automaton.add_word('data', [1, 2, 3, 4, 5])
automaton.make_automaton()
# Save with custom serializer
automaton.save('custom.dat', CustomSerializer.serialize)
# Load with custom deserializer (must be the counterpart of the serializer above)
loaded = ahocorasick.load('custom.dat', CustomSerializer.deserialize)
print(loaded.get('config'))  # {'host': 'localhost', 'port': 8080}


def conditional_serializer(obj):
    """Serialize only a restricted set of object types.

    Parameters:
    - obj: The value to encode.

    Returns:
    bytes: A pickle for str/int/float/bool values, or UTF-8 JSON for a dict
    whose keys are all strings.

    Raises:
    - ValueError: If ``obj`` is of any other type.
    """
    scalar_types = (str, int, float, bool)
    if isinstance(obj, scalar_types):
        return pickle.dumps(obj)

    has_only_str_keys = isinstance(obj, dict) and all(
        isinstance(key, str) for key in obj.keys()
    )
    if has_only_str_keys:
        return json.dumps(obj).encode('utf-8')

    raise ValueError(f"Cannot serialize object of type {type(obj)}")
def conditional_deserializer(data):
    """Deserialize with format detection.

    Parameters:
    - data: Bytes holding either UTF-8 JSON text or a pickle.

    Returns:
    The decoded object: parsed JSON when ``data`` is valid JSON text,
    otherwise the unpickled value.
    """
    try:
        return json.loads(data.decode('utf-8'))
    except (UnicodeDecodeError, json.JSONDecodeError):
        # Only "this is not JSON" should trigger the fallback; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        # NOTE: unpickling can execute arbitrary code; only load data from a
        # trusted source.
        return pickle.loads(data)


import gzip
import pickle
def compressed_save(automaton, path):
    """Pickle ``automaton`` into a gzip-compressed file.

    Parameters:
    - automaton: The object to pickle.
    - path: Destination file path, opened in binary write mode.
    """
    gz_file = gzip.open(path, 'wb')
    with gz_file:
        pickle.dump(automaton, gz_file)
def compressed_load(path):
    """Read back an object written as a gzip-compressed pickle.

    Parameters:
    - path: Path of the gzip file to read.

    Returns:
    The unpickled object.
    """
    gz_file = gzip.open(path, 'rb')
    with gz_file:
        # NOTE: unpickling can execute arbitrary code; only open files from
        # a trusted source.
        restored = pickle.load(gz_file)
    return restored
# Usage
automaton = ahocorasick.Automaton()
# ... populate automaton ...
# Write the automaton as a gzip-compressed pickle, then read it back
compressed_save(automaton, 'compressed_automaton.pkl.gz')
loaded = compressed_load('compressed_automaton.pkl.gz')

import ahocorasick
import pickle
class VersionedAutomaton:
    """Wrap an automaton together with a format version string.

    The wrapper is persisted as a pickled dict with two keys: ``'version'``
    and ``'automaton_data'`` (the pickled automaton).
    """

    # Version written into every saved file and expected on load.
    VERSION = "1.0"

    def __init__(self, automaton):
        """Wrap ``automaton`` and tag it with the current ``VERSION``."""
        self.automaton = automaton
        self.version = self.VERSION

    def save(self, path):
        """Write the version tag and the pickled automaton to ``path``."""
        payload = dict(
            version=self.version,
            automaton_data=pickle.dumps(self.automaton),
        )
        with open(path, 'wb') as sink:
            pickle.dump(payload, sink)

    @classmethod
    def load(cls, path):
        """Read a file written by ``save`` and return a new wrapper.

        A version mismatch only prints a warning; the automaton is still
        loaded and wrapped.
        """
        # NOTE: unpickling can execute arbitrary code; only load files from
        # a trusted source.
        with open(path, 'rb') as source:
            data = pickle.load(source)

        version_matches = data['version'] == cls.VERSION
        if not version_matches:
            print(f"Warning: Version mismatch. Expected {cls.VERSION}, got {data['version']}")

        return cls(pickle.loads(data['automaton_data']))
# Usage
automaton = ahocorasick.Automaton()
# ... populate automaton ...
# Wrap the automaton so the saved file carries a version tag
versioned = VersionedAutomaton(automaton)
versioned.save('versioned_automaton.dat')
loaded_versioned = VersionedAutomaton.load('versioned_automaton.dat')

Common serialization errors and solutions:
import ahocorasick
import os
def safe_save(automaton, path, serializer=None):
    """Save with error handling.

    Parameters:
    - automaton: Object whose ``save(path, serializer)`` method is called.
    - path: Destination file path; missing parent directories are created.
    - serializer: Passed through unchanged to ``automaton.save``.

    Returns:
    bool: True if the save succeeded, False if a permission or I/O error
    was reported. Other exceptions propagate to the caller.
    """
    try:
        # Ensure directory exists. A bare filename has an empty dirname and
        # os.makedirs('') raises FileNotFoundError, so skip it in that case.
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        automaton.save(path, serializer)
        return True
    except PermissionError:
        print(f"Permission denied: {path}")
        return False
    except IOError as e:
        print(f"IO error: {e}")
        return False
def safe_load(path, deserializer=None):
    """Load an automaton, reporting failures instead of raising.

    Parameters:
    - path: File path to load from.
    - deserializer: Passed through unchanged to ``ahocorasick.load``.

    Returns:
    The loaded automaton, or None if the file is missing or loading failed
    (a message is printed in each failure case).
    """
    result = None
    try:
        if os.path.exists(path):
            result = ahocorasick.load(path, deserializer)
        else:
            print(f"File not found: {path}")
    except IOError as e:
        print(f"IO error: {e}")
    except Exception as e:
        # Anything that is not an I/O error is reported as a
        # deserialization problem.
        print(f"Deserialization error: {e}")
    return result


def validate_serializer(serializer, deserializer, test_obj):
    """Check that a serializer/deserializer pair round-trips a sample object.

    Parameters:
    - serializer: Callable applied to ``test_obj``.
    - deserializer: Callable applied to the serializer's output.
    - test_obj: Sample value to round-trip.

    Returns:
    The result of comparing the round-tripped value with ``test_obj``, or
    False if any step raised (the error is printed).
    """
    try:
        round_tripped = deserializer(serializer(test_obj))
        return round_tripped == test_obj
    except Exception as e:
        print(f"Serializer validation failed: {e}")
        return False
# Usage
test_data = {'test': 'data', 'number': 42}
if validate_serializer(pickle.dumps, pickle.loads, test_data):
    print("Serializer pair is valid")

Install with Tessl CLI
npx tessl i tessl/pypi-pyahocorasick