Fuzzy string matching library using Levenshtein Distance calculations for approximate string comparison and search
—
String preprocessing, validation functions, and utility classes for handling edge cases and optimizing fuzzy string matching operations.
Core string preprocessing function that normalizes strings for consistent fuzzy matching.
def full_process(s: str, force_ascii: bool = False) -> str:
"""
Process string by removing non-alphanumeric characters, trimming, and lowercasing.
Processing steps:
1. Convert to ASCII if force_ascii=True
2. Replace non-letters and non-numbers with whitespace
3. Convert to lowercase
4. Strip leading and trailing whitespace
Parameters:
s: String to process
force_ascii: Force conversion to ASCII, removing non-ASCII characters
Returns:
str: Processed string ready for fuzzy matching
"""

Usage Example:
from fuzzywuzzy import utils
# Standard processing
processed = utils.full_process(" Hello, World! 123 ")
print(processed) # "hello world 123"
# With ASCII forcing
processed = utils.full_process("Café naïve résumé", force_ascii=True)
print(processed) # "caf nave rsum" (non-ASCII characters are removed, not transliterated)
# Remove punctuation and normalize
processed = utils.full_process("user@example.com")
print(processed) # "user example com"

Validate that strings are suitable for fuzzy matching operations.
def validate_string(s) -> bool:
"""
Check input has length and that length > 0.
Parameters:
s: Input to validate
Returns:
bool: True if len(s) > 0; False otherwise, including when len(s) raises TypeError
"""

Usage Example:
from fuzzywuzzy import utils
print(utils.validate_string("hello")) # True
print(utils.validate_string("")) # False
print(utils.validate_string(None)) # False
print(utils.validate_string(123)) # False (TypeError)

Ensure both strings are the same type (str or unicode) for consistent comparison.
def make_type_consistent(s1, s2):
"""
If the two objects are not both str instances or both unicode instances, force both to unicode.
Parameters:
s1: First string
s2: Second string
Returns:
tuple: (s1, s2) with consistent types
"""

Functions for handling ASCII conversion and character filtering.
def asciidammit(s) -> str:
"""
Force string to ASCII by removing or converting non-ASCII characters.
Parameters:
s: String to convert
Returns:
str: ASCII-only version of input string
"""
def asciionly(s) -> str:
"""
Remove non-ASCII characters from string.
Parameters:
s: String to filter
Returns:
str: String with non-ASCII characters removed
"""

Usage Example:
from fuzzywuzzy import utils
# Force ASCII conversion
ascii_str = utils.asciidammit("Café naïve résumé")
print(ascii_str) # "Caf nave rsum" (non-ASCII characters are removed, not transliterated)
# Remove non-ASCII only
filtered = utils.asciionly("Hello 世界")
print(filtered) # "Hello "

Helper functions for numerical operations in fuzzy matching.
def intr(n) -> int:
"""
Return a correctly rounded integer.
Parameters:
n: Number to round
Returns:
int: Rounded integer value
"""

Usage Example:
from fuzzywuzzy import utils
print(utils.intr(97.6)) # 98
print(utils.intr(97.4)) # 97
print(utils.intr(97.5)) # 98

Advanced string processing utilities with optimized methods.
class StringProcessor:
"""
String processing utilities class with efficient methods for
text normalization and cleaning operations.
"""
@classmethod
def replace_non_letters_non_numbers_with_whitespace(cls, a_string: str) -> str:
"""
Replace any sequence of non-letters and non-numbers with single whitespace.
Parameters:
a_string: String to process
Returns:
str: String with non-alphanumeric sequences replaced by spaces
"""
@staticmethod
def strip(s: str) -> str:
"""Remove leading and trailing whitespace."""
@staticmethod
def to_lower_case(s: str) -> str:
"""Convert string to lowercase."""
@staticmethod
def to_upper_case(s: str) -> str:
"""Convert string to uppercase."""

Usage Example:
from fuzzywuzzy.string_processing import StringProcessor
# Advanced string processing
text = "Hello!!! @#$ World??? 123"
processed = StringProcessor.replace_non_letters_non_numbers_with_whitespace(text)
print(processed) # "Hello World 123"
# Standard operations
lower_text = StringProcessor.to_lower_case("HELLO WORLD")
print(lower_text) # "hello world"
stripped = StringProcessor.strip(" hello world ")
print(stripped) # "hello world"

High-performance string matching class available when python-Levenshtein is installed.
class StringMatcher:
"""
A SequenceMatcher-like class built on top of Levenshtein distance calculations.
Provides significant performance improvements when python-Levenshtein is available.
This class provides a SequenceMatcher-compatible interface while using the
highly optimized Levenshtein library for calculations.
"""
def __init__(self, isjunk=None, seq1: str = '', seq2: str = ''):
"""
Initialize StringMatcher with two sequences.
Parameters:
isjunk: Junk function (ignored, not implemented - will show warning)
seq1: First string to compare (default: '')
seq2: Second string to compare (default: '')
"""
def set_seqs(self, seq1: str, seq2: str):
"""
Set both sequences for comparison and reset cache.
Parameters:
seq1: First string to compare
seq2: Second string to compare
"""
def set_seq1(self, seq1: str):
"""
Set first sequence and reset cache.
Parameters:
seq1: First string to compare
"""
def set_seq2(self, seq2: str):
"""
Set second sequence and reset cache.
Parameters:
seq2: Second string to compare
"""
def ratio(self) -> float:
"""
Get similarity ratio between sequences using Levenshtein calculation.
Returns:
float: Similarity ratio between 0.0 and 1.0
"""
def quick_ratio(self) -> float:
"""
Get quick similarity ratio (same as ratio() in this implementation).
Returns:
float: Similarity ratio between 0.0 and 1.0
"""
def real_quick_ratio(self) -> float:
"""
Get a very quick similarity estimate based on string lengths.
Returns:
float: Quick similarity estimate between 0.0 and 1.0
"""
def distance(self) -> int:
"""
Get Levenshtein distance between sequences.
Returns:
int: Edit distance (number of operations to transform seq1 to seq2)
"""
def get_opcodes(self):
"""
Get operation codes for sequence comparison.
Returns:
List of operation codes compatible with difflib.SequenceMatcher
"""
def get_editops(self):
"""
Get edit operations for transforming one sequence to another.
Returns:
List of edit operations (insertions, deletions, substitutions)
"""
def get_matching_blocks(self):
"""
Get matching blocks between sequences.
Returns:
List of matching blocks compatible with difflib.SequenceMatcher
"""

Usage Example:
# Only available if python-Levenshtein is installed
try:
from fuzzywuzzy.StringMatcher import StringMatcher
matcher = StringMatcher(seq1="hello world", seq2="hallo world")
ratio = matcher.ratio()
print(f"Similarity: {ratio}") # High-performance ratio calculation
distance = matcher.distance()
print(f"Edit distance: {distance}") # Levenshtein distance
except ImportError:
print("python-Levenshtein not installed, using standard algorithms")

Internal constants used by fuzzywuzzy for compatibility and character handling.
PY3: bool # True if running Python 3, False for Python 2
bad_chars: str # String containing the 128 characters with code points 128-255 (outside the ASCII range), used for filtering
translation_table: dict # Translation table for removing non-ASCII chars (Python 3 only)
unicode: type # str type in Python 3, unicode type in Python 2

Usage Example:
from fuzzywuzzy import utils
# Check Python version
if utils.PY3:
print("Running on Python 3")
else:
print("Running on Python 2")
# Access character filtering components
print(f"Bad chars string length: {len(utils.bad_chars)}") # 128 characters

These decorators are used internally by fuzzywuzzy but can be useful for custom scoring functions. They handle common edge cases in string comparison.
def check_for_equivalence(func):
"""
Decorator that returns 100 if both input strings are identical.
This decorator checks if args[0] == args[1] and returns 100 (perfect match)
if they are equal, otherwise calls the decorated function.
Parameters:
func: Function to decorate that takes two string arguments
Returns:
function: Decorated function that handles string equivalence
"""
def check_for_none(func):
"""
Decorator that returns 0 if either input string is None.
This decorator checks if args[0] or args[1] is None and returns 0
(no match) if either is None, otherwise calls the decorated function.
Parameters:
func: Function to decorate that takes two string arguments
Returns:
function: Decorated function that handles None inputs
"""
def check_empty_string(func):
"""
Decorator that returns 0 if either input string is empty.
This decorator checks if len(args[0]) == 0 or len(args[1]) == 0 and
returns 0 (no match) if either is empty, otherwise calls the decorated function.
Parameters:
func: Function to decorate that takes two string arguments
Returns:
function: Decorated function that handles empty string inputs
"""

Usage Example:
from fuzzywuzzy import utils
@utils.check_for_none
@utils.check_for_equivalence
def custom_scorer(s1, s2):
# Your custom scoring logic here
return 50 # Example score
# Decorators handle edge cases automatically
print(custom_scorer("hello", "hello")) # 100 (equivalence)
print(custom_scorer("hello", None)) # 0 (none check)
print(custom_scorer("hello", "world")) # 50 (custom logic)

Install with Tessl CLI
npx tessl i tessl/pypi-fuzzywuzzy