A little word cloud generator for creating visually appealing word clouds from text data.
Advanced text analysis capabilities for word cloud generation, including intelligent tokenization, stopword filtering, plural normalization, and statistical bigram detection for meaningful phrase extraction.
Extract individual words and statistically significant two-word phrases from text, with automatic filtering and collocation detection.
def unigrams_and_bigrams(words, stopwords, normalize_plurals=True, collocation_threshold=30):
"""
Extract unigrams and statistically significant bigrams from word list.
Processes a list of word tokens to identify meaningful single words and two-word
phrases based on statistical collocation analysis using Dunning likelihood ratios.
Filters out stopwords and optionally normalizes plural forms.
Parameters:
- words (list): List of word tokens from text
- stopwords (set): Set of stopwords to filter out
- normalize_plurals (bool): Whether to merge plural forms with singular (default: True)
- collocation_threshold (int): Minimum collocation score for bigram inclusion (default: 30)
Returns:
- dict: Dictionary mapping words/phrases to their frequencies, with bigrams
included only if they exceed the collocation threshold
"""Normalize word tokens by handling case variations and plural forms for consistent word cloud representation.
def process_tokens(words, normalize_plurals=True):
"""
Normalize word cases and optionally remove plural forms.
Processes word tokens to establish canonical forms: most frequent case
representation is used for each word, and plural forms are optionally
merged with singular forms based on simple heuristics.
Parameters:
- words (iterable): Iterable of word strings to process
- normalize_plurals (bool): Whether to merge plurals with singular forms (default: True)
Returns:
- tuple: (counts_dict, standard_forms_dict) where:
- counts_dict (dict): Word frequencies with canonical case forms
- standard_forms_dict (dict): Mapping from lowercase to canonical case
"""Internal function for calculating statistical significance of word pairs using Dunning likelihood ratios.
def score(count_bigram, count1, count2, n_words):
"""
Calculate Dunning likelihood collocation score for word pairs.
Computes statistical significance of bigram co-occurrence using likelihood
ratio test to identify meaningful phrases versus random word combinations.
Parameters:
- count_bigram (int): Frequency of the bigram
- count1 (int): Frequency of first word
- count2 (int): Frequency of second word
- n_words (int): Total number of words in corpus
Returns:
- float: Collocation score (higher values indicate stronger association)
"""Pre-defined set of common English words to filter from word cloud generation.
STOPWORDS: set[str]  # Comprehensive set of English stopwords

The STOPWORDS constant contains common English words (articles, prepositions, pronouns, etc.) that are typically filtered out of word clouds to focus on meaningful content words.
from wordcloud.tokenization import unigrams_and_bigrams, process_tokens
from wordcloud import STOPWORDS
# Process text tokens
words = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
word_counts = unigrams_and_bigrams(words, STOPWORDS)
print(word_counts) # {'quick': 1, 'brown': 1, 'fox': 1, 'jumps': 1, ...}

from wordcloud.tokenization import unigrams_and_bigrams
from wordcloud import STOPWORDS
# Add custom stopwords
custom_stopwords = STOPWORDS.copy()
custom_stopwords.update(['custom', 'specific', 'terms'])
# Process with custom stopwords
words = text.split()
word_counts = unigrams_and_bigrams(words, custom_stopwords)

from wordcloud.tokenization import unigrams_and_bigrams
from wordcloud import STOPWORDS
# Text with potential bigrams
text = "machine learning algorithms are powerful tools for data science"
words = text.split()
# Extract with bigram detection
word_counts = unigrams_and_bigrams(
words,
STOPWORDS,
normalize_plurals=True,
collocation_threshold=10 # Lower threshold for more bigrams
)
# May include bigrams like "machine learning" or "data science"
print(word_counts)

from wordcloud.tokenization import process_tokens
# Words with case variations and plurals
words = ["Python", "python", "PYTHON", "pythons", "cats", "cat", "Dogs", "dog"]
# Normalize tokens
counts, standard_forms = process_tokens(words, normalize_plurals=True)
print(counts) # {'Python': 4, 'cat': 2, 'Dogs': 2}
print(standard_forms) # {'python': 'Python', 'cat': 'cat', 'dog': 'Dogs', ...}

from wordcloud import WordCloud, STOPWORDS
import re
# Custom text processing with WordCloud
def custom_preprocess(text):
# Custom tokenization
words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
# Add domain-specific stopwords
custom_stops = STOPWORDS.copy()
custom_stops.update(['said', 'would', 'could'])
return words, custom_stops
# Use with WordCloud
text = "Your text data here..."
words, stopwords = custom_preprocess(text)
# WordCloud will use its internal processing, but you can also
# use the tokenization functions directly for more control
from wordcloud.tokenization import unigrams_and_bigrams
frequencies = unigrams_and_bigrams(words, stopwords)
wc = WordCloud().generate_from_frequencies(frequencies)

from wordcloud.tokenization import process_tokens
# Keep plurals separate
words = ["cat", "cats", "dog", "dogs", "analysis", "analyses"]
counts_separate, _ = process_tokens(words, normalize_plurals=False)
print(counts_separate) # {'cat': 1, 'cats': 1, 'dog': 1, 'dogs': 1, ...}
# Merge plurals (default behavior)
counts_merged, _ = process_tokens(words, normalize_plurals=True)
print(counts_merged) # {'cat': 2, 'dog': 2, 'analysis': 2}

from wordcloud.tokenization import unigrams_and_bigrams
from wordcloud import STOPWORDS
text = "New York City is a great place to visit in New York state"
words = text.split()
# High threshold - fewer bigrams
strict_counts = unigrams_and_bigrams(words, STOPWORDS, collocation_threshold=50)
# Low threshold - more bigrams
loose_counts = unigrams_and_bigrams(words, STOPWORDS, collocation_threshold=5)
# Compare results
print("Strict:", strict_counts)
print("Loose:", loose_counts) # May include "New York" as bigramInstall with Tessl CLI
npx tessl i tessl/pypi-wordcloud