Machine Learning Library Extensions (mlxtend): essential tools for day-to-day data science tasks.

Text processing utilities for natural language processing, including person-name normalization and tokenization.
def generalize_names(name):
    """
    Generalize a person name for consistency.

    Parameters:
    - name: str, person name to generalize

    Returns:
    - generalized_name: str, normalized name

    NOTE(review): documentation stub — no implementation is visible in this
    file; the exact normalization rules are defined in mlxtend.text.
    """
def generalize_names_duplcheck(name_list):
    """
    Generalize names with duplicate checking and removal.

    Parameters:
    - name_list: list, list of person names

    Returns:
    - unique_names: list, deduplicated normalized names

    NOTE(review): documentation stub — no implementation is visible in this
    file; see mlxtend.text for the actual behavior.
    """


# Tokenization utilities for text processing, including emoticon handling.
def tokenizer_words_and_emoticons(text):
    """
    Tokenize text, keeping both words and emoticons.

    Parameters:
    - text: str, input text to tokenize

    Returns:
    - tokens: list, list of word and emoticon tokens

    NOTE(review): documentation stub — no implementation is visible in this
    file; tokenization rules are defined in mlxtend.text.
    """
def tokenizer_emoticons(text):
    """
    Extract emoticons from text.

    Parameters:
    - text: str, input text

    Returns:
    - emoticons: list, list of emoticon tokens found in text

    NOTE(review): documentation stub — no implementation is visible in this
    file; see mlxtend.text for the actual behavior.
    """


# Example usage of the mlxtend.text utilities documented above.
from mlxtend.text import generalize_names, generalize_names_duplcheck
from mlxtend.text import tokenizer_words_and_emoticons, tokenizer_emoticons

# Name processing examples
name = "Dr. John Smith Jr."
normalized = generalize_names(name)
print(f"Original: {name}")
print(f"Normalized: {normalized}")

# Duplicate name handling
names = ["John Smith", "J. Smith", "John Smith", "Jane Doe"]
unique_names = generalize_names_duplcheck(names)
print(f"Original names: {names}")
print(f"Unique normalized: {unique_names}")

# Text tokenization with emoticons
text = "I love machine learning! :) It's so cool :D"
tokens = tokenizer_words_and_emoticons(text)
emoticons = tokenizer_emoticons(text)
print(f"Text: {text}")
print(f"All tokens: {tokens}")
print(f"Emoticons only: {emoticons}")

# Install with the Tessl CLI:
#   npx tessl i tessl/pypi-mlxtend