# Text Analysis

Specialized visualizers for text analysis and natural language processing, providing tools for exploring text corpora, visualizing document embeddings, and analyzing linguistic patterns. These visualizers support various NLP workflows and text preprocessing pipelines.

## Capabilities

### Text Embeddings Visualization

High-dimensional text embedding visualization using dimensionality reduction techniques like t-SNE and UMAP for exploring document similarity and clustering patterns.

```python { .api }
class TSNEVisualizer(Visualizer):
    """
    t-SNE visualization for text embeddings and high-dimensional data.

    Parameters:
    - labels: list, text labels for data points
    - classes: list, class labels for coloring
    - random_state: int, random state for reproducibility
    - perplexity: float, t-SNE perplexity parameter
    - early_exaggeration: float, early exaggeration parameter
    - learning_rate: float, learning rate parameter
    - n_iter: int, number of iterations
    - metric: str, distance metric
    """
    def __init__(self, labels=None, classes=None, random_state=None, perplexity=30.0, early_exaggeration=12.0, learning_rate=200.0, n_iter=1000, metric='euclidean', **kwargs): ...
    def fit(self, X, y=None, **kwargs): ...
    def show(self, **kwargs): ...

class UMAPVisualizer(Visualizer):
    """
    UMAP visualization for text embeddings and high-dimensional data.

    Parameters:
    - labels: list, text labels for data points
    - classes: list, class labels for coloring
    - random_state: int, random state for reproducibility
    - n_neighbors: int, number of neighbors parameter
    - min_dist: float, minimum distance parameter
    - metric: str, distance metric
    """
    def __init__(self, labels=None, classes=None, random_state=None, n_neighbors=15, min_dist=0.1, metric='euclidean', **kwargs): ...
    def fit(self, X, y=None, **kwargs): ...
    def show(self, **kwargs): ...

def tsne(X, y=None, labels=None, classes=None, **kwargs):
    """
    Functional API for t-SNE visualization.

    Parameters:
    - X: feature matrix (document embeddings)
    - y: target vector (optional)
    - labels: list, text labels for data points
    - classes: list, class labels

    Returns:
    TSNEVisualizer instance
    """

def umap(X, y=None, labels=None, classes=None, **kwargs):
    """
    Functional API for UMAP visualization.

    Parameters:
    - X: feature matrix (document embeddings)
    - y: target vector (optional)
    - labels: list, text labels for data points
    - classes: list, class labels

    Returns:
    UMAPVisualizer instance
    """
```

**Usage Example:**

```python
from yellowbrick.text import TSNEVisualizer, UMAPVisualizer, tsne, umap
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

# Load text data
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups = fetch_20newsgroups(subset='train', categories=categories)
corpus = newsgroups.data
labels = newsgroups.target_names

# Vectorize text
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(corpus)

# t-SNE visualization
tsne_viz = TSNEVisualizer(labels=labels, classes=newsgroups.target_names)
tsne_viz.fit(X.toarray(), newsgroups.target)
tsne_viz.show()

# UMAP visualization
umap_viz = UMAPVisualizer(labels=labels, classes=newsgroups.target_names)
umap_viz.fit(X.toarray(), newsgroups.target)
umap_viz.show()

# Functional API
tsne(X.toarray(), newsgroups.target, classes=newsgroups.target_names)
umap(X.toarray(), newsgroups.target, classes=newsgroups.target_names)
```

### Frequency Distribution Analysis

Word and token frequency distribution visualization for understanding vocabulary characteristics and identifying important terms in text corpora.

```python { .api }
class FreqDistVisualizer(Visualizer):
    """
    Frequency distribution visualizer for text analysis.

    Parameters:
    - features: list, feature names (words/tokens)
    - n: int, number of top features to display
    - orient: str, orientation ('h' for horizontal, 'v' for vertical)
    """
    def __init__(self, features=None, n=50, orient='h', **kwargs): ...
    def fit(self, corpus, **kwargs): ...
    def show(self, **kwargs): ...

def freqdist(corpus, features=None, n=50, **kwargs):
    """
    Functional API for frequency distribution visualization.

    Parameters:
    - corpus: text corpus or frequency data
    - features: list, feature names
    - n: int, number of top features to display

    Returns:
    FreqDistVisualizer instance
    """
```

**Usage Example:**

```python
from yellowbrick.text import FreqDistVisualizer, freqdist
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import re

# Prepare text data
documents = [
    "The quick brown fox jumps over the lazy dog",
    "A journey of a thousand miles begins with a single step",
    "To be or not to be that is the question"
]

# Method 1: Using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
features = vectorizer.get_feature_names_out()

# Sum word frequencies across documents
word_frequencies = X.sum(axis=0).A1
freq_data = dict(zip(features, word_frequencies))

viz = FreqDistVisualizer(features=features)
viz.fit(freq_data)
viz.show()

# Method 2: Using raw text with Counter
text = ' '.join(documents).lower()
words = re.findall(r'\b\w+\b', text)
word_counts = Counter(words)

freqdist(word_counts, n=20)
```

### Part-of-Speech Analysis

Part-of-speech tag distribution visualization for analyzing grammatical patterns and linguistic structure in text corpora.

```python { .api }
class PosTagVisualizer(Visualizer):
    """
    Part-of-speech tag visualizer for linguistic analysis.

    Parameters:
    - tagset: str, POS tagset to use ('universal', 'penn')
    - colormap: str, matplotlib colormap for bars
    """
    def __init__(self, tagset='universal', colormap='Set2', **kwargs): ...
    def fit(self, corpus, **kwargs): ...
    def show(self, **kwargs): ...
```

**Usage Example:**

```python
from yellowbrick.text import PosTagVisualizer
import nltk
from nltk import pos_tag, word_tokenize

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

# Prepare text data
documents = [
    "The quick brown fox jumps over the lazy dog",
    "Natural language processing is fascinating",
    "Machine learning algorithms can analyze text effectively"
]

# Tokenize and tag
tagged_corpus = []
for doc in documents:
    tokens = word_tokenize(doc.lower())
    tags = pos_tag(tokens, tagset='universal')
    tagged_corpus.extend(tags)

# Visualize POS distribution
pos_viz = PosTagVisualizer(tagset='universal')
pos_viz.fit(tagged_corpus)
pos_viz.show()
```

### Word Dispersion Plot

Word dispersion visualization showing the distribution of specific words throughout a text corpus, useful for analyzing word usage patterns and document structure.

```python { .api }
class DispersionPlot(Visualizer):
    """
    Word dispersion plot for analyzing word distribution in text.

    Parameters:
    - words: list, target words to analyze
    - labels: list, labels for documents or text segments
    - ignore_case: bool, whether to ignore case differences
    """
    def __init__(self, words, labels=None, ignore_case=True, **kwargs): ...
    def fit(self, corpus, **kwargs): ...
    def show(self, **kwargs): ...

def dispersion(corpus, words, labels=None, **kwargs):
    """
    Functional API for word dispersion visualization.

    Parameters:
    - corpus: text corpus or list of documents
    - words: list, target words to analyze
    - labels: list, document labels

    Returns:
    DispersionPlot instance
    """
```

**Usage Example:**

```python
from yellowbrick.text import DispersionPlot, dispersion

# Sample text corpus
corpus = [
    "The data science field is rapidly evolving with machine learning",
    "Machine learning algorithms require large datasets for training",
    "Data analysis and data visualization are key data science skills",
    "Python and R are popular programming languages for data science",
    "Deep learning is a subset of machine learning with neural networks"
]

# Target words to analyze
target_words = ['data', 'machine', 'learning', 'science']

# Create dispersion plot
dispersion_viz = DispersionPlot(words=target_words)
dispersion_viz.fit(corpus)
dispersion_viz.show()

# Functional API
dispersion(corpus, target_words)
```

### Word Correlation Analysis

Word correlation visualization for understanding relationships between words and identifying semantic clusters in text data.

```python { .api }
class WordCorrelationPlot(Visualizer):
    """
    Word correlation plot for analyzing semantic relationships.

    Parameters:
    - words: list, words to analyze correlations
    - method: str, correlation method ('pearson', 'spearman')
    - colormap: str, matplotlib colormap for heatmap
    """
    def __init__(self, words=None, method='pearson', colormap='RdYlBu_r', **kwargs): ...
    def fit(self, X, **kwargs): ...
    def show(self, **kwargs): ...

def word_correlation(X, words=None, method='pearson', **kwargs):
    """
    Functional API for word correlation visualization.

    Parameters:
    - X: document-term matrix
    - words: list, words to analyze
    - method: str, correlation method

    Returns:
    WordCorrelationPlot instance
    """
```

**Usage Example:**

```python
from yellowbrick.text import WordCorrelationPlot, word_correlation
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Sample documents
documents = [
    "machine learning algorithms process data efficiently",
    "data science involves statistical analysis and visualization",
    "artificial intelligence and machine learning are related fields",
    "deep learning uses neural networks for pattern recognition",
    "data analysis requires statistical knowledge and programming skills"
]

# Vectorize documents
vectorizer = TfidfVectorizer(max_features=20, stop_words='english')
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

# Select specific words for correlation analysis
target_words = ['machine', 'learning', 'data', 'analysis', 'statistical']
word_indices = [i for i, word in enumerate(feature_names) if word in target_words]

# Extract relevant columns
X_subset = X.toarray()[:, word_indices]
subset_words = [feature_names[i] for i in word_indices]

# Create correlation plot
corr_viz = WordCorrelationPlot(words=subset_words, method='pearson')
corr_viz.fit(X_subset)
corr_viz.show()

# Functional API
word_correlation(X_subset, words=subset_words, method='spearman')
```

## Usage Patterns

### Comprehensive Text Analysis Pipeline

```python
from yellowbrick.text import TSNEVisualizer, FreqDistVisualizer, DispersionPlot
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt

# Load text dataset
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
newsgroups = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
corpus = newsgroups.data[:1000]  # Use subset for faster processing
target = newsgroups.target[:1000]
target_names = [newsgroups.target_names[i] for i in range(len(categories))]

# Step 1: Frequency analysis
print("Step 1: Word frequency analysis")
count_vectorizer = CountVectorizer(max_features=100, stop_words='english', min_df=2)
count_matrix = count_vectorizer.fit_transform(corpus)
feature_names = count_vectorizer.get_feature_names_out()

# Create frequency distribution
word_frequencies = count_matrix.sum(axis=0).A1
freq_data = dict(zip(feature_names, word_frequencies))
freq_viz = FreqDistVisualizer(features=feature_names, n=30)
freq_viz.fit(freq_data)
freq_viz.show()

# Step 2: Document embedding visualization
print("Step 2: Document embedding visualization")
tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english', min_df=2, max_df=0.8)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# t-SNE visualization
tsne_viz = TSNEVisualizer(classes=target_names, random_state=42)
tsne_viz.fit(tfidf_matrix.toarray(), target)
tsne_viz.show()

# Step 3: Word dispersion analysis
print("Step 3: Word dispersion analysis")
# Select most frequent words for dispersion analysis
top_words = sorted(freq_data.items(), key=lambda x: x[1], reverse=True)[:8]
dispersion_words = [word for word, _ in top_words]

dispersion_viz = DispersionPlot(words=dispersion_words)
dispersion_viz.fit(corpus)
dispersion_viz.show()
```

### Comparative Text Analysis

```python
from yellowbrick.text import TSNEVisualizer, UMAPVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# Compare t-SNE and UMAP embeddings
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(corpus)

# Create side-by-side comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# t-SNE visualization
tsne_viz = TSNEVisualizer(classes=target_names, ax=axes[0], random_state=42)
tsne_viz.fit(X.toarray(), target)
tsne_viz.finalize()
axes[0].set_title('t-SNE Embedding')

# UMAP visualization
umap_viz = UMAPVisualizer(classes=target_names, ax=axes[1], random_state=42)
umap_viz.fit(X.toarray(), target)
umap_viz.finalize()
axes[1].set_title('UMAP Embedding')

plt.tight_layout()
plt.show()
```
432
433
### Topic Modeling Visualization
434
435
```python
436
from yellowbrick.text import TSNEVisualizer
437
from sklearn.decomposition import LatentDirichletAllocation
438
from sklearn.feature_extraction.text import CountVectorizer
439
import numpy as np
440
441
# Prepare data for topic modeling
442
vectorizer = CountVectorizer(max_features=1000, stop_words='english', min_df=2, max_df=0.8)
443
doc_term_matrix = vectorizer.fit_transform(corpus)
444
445
# Fit LDA model
446
n_topics = 4
447
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
448
doc_topic_matrix = lda_model.fit_transform(doc_term_matrix)
449
450
# Assign documents to dominant topics
451
dominant_topics = np.argmax(doc_topic_matrix, axis=1)
452
topic_names = [f'Topic {i}' for i in range(n_topics)]
453
454
# Visualize documents in topic space
455
tsne_viz = TSNEVisualizer(classes=topic_names, random_state=42)
456
tsne_viz.fit(doc_topic_matrix, dominant_topics)
457
tsne_viz.show()
458
459
# Print top words for each topic
460
feature_names = vectorizer.get_feature_names_out()
461
for topic_idx, topic in enumerate(lda_model.components_):
462
top_words = [feature_names[i] for i in topic.argsort()[-10:][::-1]]
463
print(f"Topic {topic_idx}: {', '.join(top_words)}")
464
```

### Multilingual Text Analysis

```python
from yellowbrick.text import FreqDistVisualizer, TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import re

# Sample multilingual text (English and Spanish)
multilingual_corpus = [
    "machine learning is transforming technology",
    "el aprendizaje automático está transformando la tecnología",
    "data science involves statistical analysis",
    "la ciencia de datos involucra análisis estadístico",
    "artificial intelligence enables automation",
    "la inteligencia artificial permite la automatización"
]

# Language labels
languages = ['English', 'Spanish', 'English', 'Spanish', 'English', 'Spanish']

# Character-level analysis for language detection
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 3), max_features=100)
char_features = char_vectorizer.fit_transform(multilingual_corpus)

# Visualize language clustering
tsne_viz = TSNEVisualizer(classes=['English', 'Spanish'], random_state=42)
tsne_viz.fit(char_features.toarray(), [0 if lang == 'English' else 1 for lang in languages])
tsne_viz.show()

# Word frequency analysis per language
english_docs = [doc for doc, lang in zip(multilingual_corpus, languages) if lang == 'English']
spanish_docs = [doc for doc, lang in zip(multilingual_corpus, languages) if lang == 'Spanish']

for lang, docs in [('English', english_docs), ('Spanish', spanish_docs)]:
    print(f"\n{lang} word frequencies:")
    all_words = ' '.join(docs).lower()
    words = re.findall(r'\b\w+\b', all_words)
    word_counts = Counter(words)

    freq_viz = FreqDistVisualizer(n=10)
    freq_viz.fit(word_counts)
    freq_viz.show()
```