
tessl/pypi-wordcloud

A little word cloud generator for creating visually appealing word clouds from text data.


Text Processing and Tokenization

Advanced text analysis capabilities for word cloud generation, including intelligent tokenization, stopword filtering, plural normalization, and statistical bigram detection for meaningful phrase extraction.

Capabilities

Unigram and Bigram Extraction

Extract individual words and statistically significant two-word phrases from text, with automatic filtering and collocation detection.

def unigrams_and_bigrams(words, stopwords, normalize_plurals=True, collocation_threshold=30):
    """
    Extract unigrams and statistically significant bigrams from word list.

    Processes a list of word tokens to identify meaningful single words and two-word
    phrases based on statistical collocation analysis using Dunning likelihood ratios.
    Filters out stopwords and optionally normalizes plural forms.

    Parameters:
    - words (list): List of word tokens from text
    - stopwords (set): Set of stopwords to filter out
    - normalize_plurals (bool): Whether to merge plural forms with singular (default: True)
    - collocation_threshold (int): Minimum collocation score for bigram inclusion (default: 30)

    Returns:
    - dict: Dictionary mapping words/phrases to their frequencies, with bigrams
            included only if they exceed the collocation threshold
    """

Token Processing and Normalization

Normalize word tokens by handling case variations and plural forms for consistent word cloud representation.

def process_tokens(words, normalize_plurals=True):
    """
    Normalize word cases and optionally remove plural forms.

    Processes word tokens to establish canonical forms: most frequent case
    representation is used for each word, and plural forms are optionally
    merged with singular forms based on simple heuristics.

    Parameters:
    - words (iterable): Iterable of word strings to process
    - normalize_plurals (bool): Whether to merge plurals with singular forms (default: True)

    Returns:
    - tuple: (counts_dict, standard_forms_dict) where:
        - counts_dict (dict): Word frequencies with canonical case forms
        - standard_forms_dict (dict): Mapping from lowercase to canonical case
    """

Statistical Collocation Scoring

Internal function for calculating statistical significance of word pairs using Dunning likelihood ratios.

def score(count_bigram, count1, count2, n_words):
    """
    Calculate Dunning likelihood collocation score for word pairs.

    Computes statistical significance of bigram co-occurrence using likelihood
    ratio test to identify meaningful phrases versus random word combinations.

    Parameters:
    - count_bigram (int): Frequency of the bigram
    - count1 (int): Frequency of first word
    - count2 (int): Frequency of second word  
    - n_words (int): Total number of words in corpus

    Returns:
    - float: Collocation score (higher values indicate stronger association)
    """

STOPWORDS Constant

Pre-defined set of common English words to filter from word cloud generation.

STOPWORDS: set[str]  # Comprehensive set of English stopwords

The STOPWORDS constant contains common English words (articles, prepositions, pronouns, etc.) that are typically filtered out of word clouds to focus on meaningful content words.
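Filtering amounts to a membership test against the lowercased token. The sketch below uses a tiny stand-in set rather than the full STOPWORDS constant:

```python
# Tiny stand-in for the full STOPWORDS set (illustration only)
stopwords = {"the", "a", "an", "and", "of", "to", "in", "over"}

words = "The quick brown fox jumps over the lazy dog".split()
content_words = [w for w in words if w.lower() not in stopwords]
print(content_words)  # ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
```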

Usage Examples

Basic Tokenization

from wordcloud.tokenization import unigrams_and_bigrams, process_tokens
from wordcloud import STOPWORDS

# Process text tokens
words = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
word_counts = unigrams_and_bigrams(words, STOPWORDS)
print(word_counts)  # {'quick': 1, 'brown': 1, 'fox': 1, 'jumps': 1, ...}

Custom Stopwords

from wordcloud.tokenization import unigrams_and_bigrams
from wordcloud import STOPWORDS

# Add custom stopwords
custom_stopwords = STOPWORDS.copy()
custom_stopwords.update(['custom', 'specific', 'terms'])

# Process with custom stopwords
text = "sample text mentioning custom terms and other specific words"
words = text.split()
word_counts = unigrams_and_bigrams(words, custom_stopwords)

Bigram Detection

from wordcloud.tokenization import unigrams_and_bigrams
from wordcloud import STOPWORDS

# Text with potential bigrams
text = "machine learning algorithms are powerful tools for data science"
words = text.split()

# Extract with bigram detection
word_counts = unigrams_and_bigrams(
    words, 
    STOPWORDS, 
    normalize_plurals=True, 
    collocation_threshold=10  # Lower threshold for more bigrams
)

# May include bigrams like "machine learning" or "data science"
print(word_counts)

Token Normalization

from wordcloud.tokenization import process_tokens

# Words with case variations and plurals
words = ["Python", "python", "PYTHON", "pythons", "cats", "cat", "Dogs", "dog"]

# Normalize tokens
counts, standard_forms = process_tokens(words, normalize_plurals=True)

print(counts)          # e.g. {'python': 4, 'cat': 2, 'dog': 2} -- the most frequent case form wins
print(standard_forms)  # e.g. {'python': 'python', 'cats': 'cat', 'dogs': 'dog', ...}

Integration with WordCloud

from wordcloud import WordCloud, STOPWORDS
import re

# Custom text processing with WordCloud
def custom_preprocess(text):
    # Custom tokenization
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    
    # Add domain-specific stopwords
    custom_stops = STOPWORDS.copy()
    custom_stops.update(['said', 'would', 'could'])
    
    return words, custom_stops

# Use with WordCloud
text = "Your text data here..."
words, stopwords = custom_preprocess(text)

# WordCloud will use its internal processing, but you can also
# use the tokenization functions directly for more control
from wordcloud.tokenization import unigrams_and_bigrams
frequencies = unigrams_and_bigrams(words, stopwords)

wc = WordCloud().generate_from_frequencies(frequencies)

Controlling Plural Normalization

from wordcloud.tokenization import process_tokens

# Keep plurals separate
words = ["cat", "cats", "dog", "dogs", "analysis", "analyses"]
counts_separate, _ = process_tokens(words, normalize_plurals=False)
print(counts_separate)  # {'cat': 1, 'cats': 1, 'dog': 1, 'dogs': 1, 'analysis': 1, 'analyses': 1}

# Merge simple "+s" plurals (default behavior); irregular plurals such
# as "analyses" are not matched by the heuristic and stay separate
counts_merged, _ = process_tokens(words, normalize_plurals=True)
print(counts_merged)    # {'cat': 2, 'dog': 2, 'analysis': 1, 'analyses': 1}

Advanced Collocation Control

from wordcloud.tokenization import unigrams_and_bigrams
from wordcloud import STOPWORDS

text = "New York City is a great place to visit in New York state"
words = text.split()

# High threshold - fewer bigrams
strict_counts = unigrams_and_bigrams(words, STOPWORDS, collocation_threshold=50)

# Low threshold - more bigrams  
loose_counts = unigrams_and_bigrams(words, STOPWORDS, collocation_threshold=5)

# Compare results
print("Strict:", strict_counts)
print("Loose:", loose_counts)  # May include "New York" as bigram

Install with Tessl CLI

npx tessl i tessl/pypi-wordcloud
