docs/text.md

# Text Analysis

Specialized visualizers for text analysis and natural language processing, providing tools for exploring text corpora, visualizing document embeddings, and analyzing linguistic patterns. These visualizers support various NLP workflows and text preprocessing pipelines.

## Capabilities

### Text Embeddings Visualization

High-dimensional text embedding visualization using dimensionality reduction techniques like t-SNE and UMAP for exploring document similarity and clustering patterns.

```python { .api }
class TSNEVisualizer(Visualizer):
    """
    t-SNE visualization for text embeddings and high-dimensional data.

    Parameters:
    - labels: list, text labels for data points
    - classes: list, class labels for coloring
    - random_state: int, random state for reproducibility
    - perplexity: float, t-SNE perplexity parameter
    - early_exaggeration: float, early exaggeration parameter
    - learning_rate: float, learning rate parameter
    - n_iter: int, number of iterations
    - metric: str, distance metric
    """
    def __init__(self, labels=None, classes=None, random_state=None, perplexity=30.0, early_exaggeration=12.0, learning_rate=200.0, n_iter=1000, metric='euclidean', **kwargs): ...
    def fit(self, X, y=None, **kwargs): ...
    def show(self, **kwargs): ...

class UMAPVisualizer(Visualizer):
    """
    UMAP visualization for text embeddings and high-dimensional data.

    Parameters:
    - labels: list, text labels for data points
    - classes: list, class labels for coloring
    - random_state: int, random state for reproducibility
    - n_neighbors: int, number of neighbors parameter
    - min_dist: float, minimum distance parameter
    - metric: str, distance metric
    """
    def __init__(self, labels=None, classes=None, random_state=None, n_neighbors=15, min_dist=0.1, metric='euclidean', **kwargs): ...
    def fit(self, X, y=None, **kwargs): ...
    def show(self, **kwargs): ...

def tsne(X, y=None, labels=None, classes=None, **kwargs):
    """
    Functional API for t-SNE visualization.

    Parameters:
    - X: feature matrix (document embeddings)
    - y: target vector (optional)
    - labels: list, text labels for data points
    - classes: list, class labels

    Returns:
    TSNEVisualizer instance
    """

def umap(X, y=None, labels=None, classes=None, **kwargs):
    """
    Functional API for UMAP visualization.

    Parameters:
    - X: feature matrix (document embeddings)
    - y: target vector (optional)
    - labels: list, text labels for data points
    - classes: list, class labels

    Returns:
    UMAPVisualizer instance
    """
```

**Usage Example:**

```python
from yellowbrick.text import TSNEVisualizer, UMAPVisualizer, tsne, umap
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

# Load text data
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups = fetch_20newsgroups(subset='train', categories=categories)
corpus = newsgroups.data
labels = newsgroups.target_names

# Vectorize text
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(corpus)

# t-SNE visualization
tsne_viz = TSNEVisualizer(labels=labels, classes=newsgroups.target_names)
tsne_viz.fit(X.toarray(), newsgroups.target)
tsne_viz.show()

# UMAP visualization
umap_viz = UMAPVisualizer(labels=labels, classes=newsgroups.target_names)
umap_viz.fit(X.toarray(), newsgroups.target)
umap_viz.show()

# Functional API
tsne(X.toarray(), newsgroups.target, classes=newsgroups.target_names)
umap(X.toarray(), newsgroups.target, classes=newsgroups.target_names)
```

### Frequency Distribution Analysis

Word and token frequency distribution visualization for understanding vocabulary characteristics and identifying important terms in text corpora.

```python { .api }
class FreqDistVisualizer(Visualizer):
    """
    Frequency distribution visualizer for text analysis.

    Parameters:
    - features: list, feature names (words/tokens)
    - n: int, number of top features to display
    - orient: str, orientation ('h' for horizontal, 'v' for vertical)
    """
    def __init__(self, features=None, n=50, orient='h', **kwargs): ...
    def fit(self, corpus, **kwargs): ...
    def show(self, **kwargs): ...

def freqdist(corpus, features=None, n=50, **kwargs):
    """
    Functional API for frequency distribution visualization.

    Parameters:
    - corpus: text corpus or frequency data
    - features: list, feature names
    - n: int, number of top features to display

    Returns:
    FreqDistVisualizer instance
    """
```

**Usage Example:**

```python
from yellowbrick.text import FreqDistVisualizer, freqdist
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import re

# Prepare text data
documents = [
    "The quick brown fox jumps over the lazy dog",
    "A journey of a thousand miles begins with a single step",
    "To be or not to be that is the question"
]

# Method 1: Using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
features = vectorizer.get_feature_names_out()

# Sum word frequencies across documents
word_frequencies = X.sum(axis=0).A1
freq_data = dict(zip(features, word_frequencies))

viz = FreqDistVisualizer(features=features)
viz.fit(freq_data)
viz.show()

# Method 2: Using raw text with Counter
text = ' '.join(documents).lower()
words = re.findall(r'\b\w+\b', text)
word_counts = Counter(words)

freqdist(word_counts, n=20)
```

### Part-of-Speech Analysis

Part-of-speech tag distribution visualization for analyzing grammatical patterns and linguistic structure in text corpora.

```python { .api }
class PosTagVisualizer(Visualizer):
    """
    Part-of-speech tag visualizer for linguistic analysis.

    Parameters:
    - tagset: str, POS tagset to use ('universal', 'penn')
    - colormap: str, matplotlib colormap for bars
    """
    def __init__(self, tagset='universal', colormap='Set2', **kwargs): ...
    def fit(self, corpus, **kwargs): ...
    def show(self, **kwargs): ...
```

**Usage Example:**

```python
from yellowbrick.text import PosTagVisualizer
import nltk
from nltk import pos_tag, word_tokenize

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

# Prepare text data
documents = [
    "The quick brown fox jumps over the lazy dog",
    "Natural language processing is fascinating",
    "Machine learning algorithms can analyze text effectively"
]

# Tokenize and tag
tagged_corpus = []
for doc in documents:
    tokens = word_tokenize(doc.lower())
    tags = pos_tag(tokens, tagset='universal')
    tagged_corpus.extend(tags)

# Visualize POS distribution
pos_viz = PosTagVisualizer(tagset='universal')
pos_viz.fit(tagged_corpus)
pos_viz.show()
```

### Word Dispersion Plot

Word dispersion visualization showing the distribution of specific words throughout a text corpus, useful for analyzing word usage patterns and document structure.

```python { .api }
class DispersionPlot(Visualizer):
    """
    Word dispersion plot for analyzing word distribution in text.

    Parameters:
    - words: list, target words to analyze
    - labels: list, labels for documents or text segments
    - ignore_case: bool, whether to ignore case differences
    """
    def __init__(self, words, labels=None, ignore_case=True, **kwargs): ...
    def fit(self, corpus, **kwargs): ...
    def show(self, **kwargs): ...

def dispersion(corpus, words, labels=None, **kwargs):
    """
    Functional API for word dispersion visualization.

    Parameters:
    - corpus: text corpus or list of documents
    - words: list, target words to analyze
    - labels: list, document labels

    Returns:
    DispersionPlot instance
    """
```

**Usage Example:**

```python
from yellowbrick.text import DispersionPlot, dispersion

# Sample text corpus
corpus = [
    "The data science field is rapidly evolving with machine learning",
    "Machine learning algorithms require large datasets for training",
    "Data analysis and data visualization are key data science skills",
    "Python and R are popular programming languages for data science",
    "Deep learning is a subset of machine learning with neural networks"
]

# Target words to analyze
target_words = ['data', 'machine', 'learning', 'science']

# Create dispersion plot
dispersion_viz = DispersionPlot(words=target_words)
dispersion_viz.fit(corpus)
dispersion_viz.show()

# Functional API
dispersion(corpus, target_words)
```

### Word Correlation Analysis

Word correlation visualization for understanding relationships between words and identifying semantic clusters in text data.

```python { .api }
class WordCorrelationPlot(Visualizer):
    """
    Word correlation plot for analyzing semantic relationships.

    Parameters:
    - words: list, words to analyze correlations
    - method: str, correlation method ('pearson', 'spearman')
    - colormap: str, matplotlib colormap for heatmap
    """
    def __init__(self, words=None, method='pearson', colormap='RdYlBu_r', **kwargs): ...
    def fit(self, X, **kwargs): ...
    def show(self, **kwargs): ...

def word_correlation(X, words=None, method='pearson', **kwargs):
    """
    Functional API for word correlation visualization.

    Parameters:
    - X: document-term matrix
    - words: list, words to analyze
    - method: str, correlation method

    Returns:
    WordCorrelationPlot instance
    """
```

**Usage Example:**

```python
from yellowbrick.text import WordCorrelationPlot, word_correlation
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Sample documents
documents = [
    "machine learning algorithms process data efficiently",
    "data science involves statistical analysis and visualization",
    "artificial intelligence and machine learning are related fields",
    "deep learning uses neural networks for pattern recognition",
    "data analysis requires statistical knowledge and programming skills"
]

# Vectorize documents
vectorizer = TfidfVectorizer(max_features=20, stop_words='english')
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

# Select specific words for correlation analysis
target_words = ['machine', 'learning', 'data', 'analysis', 'statistical']
word_indices = [i for i, word in enumerate(feature_names) if word in target_words]

# Extract relevant columns
X_subset = X.toarray()[:, word_indices]
subset_words = [feature_names[i] for i in word_indices]

# Create correlation plot
corr_viz = WordCorrelationPlot(words=subset_words, method='pearson')
corr_viz.fit(X_subset)
corr_viz.show()

# Functional API
word_correlation(X_subset, words=subset_words, method='spearman')
```

## Usage Patterns

### Comprehensive Text Analysis Pipeline

```python
from yellowbrick.text import TSNEVisualizer, FreqDistVisualizer, DispersionPlot
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt

# Load text dataset
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
newsgroups = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
corpus = newsgroups.data[:1000]  # Use subset for faster processing
target = newsgroups.target[:1000]
target_names = [newsgroups.target_names[i] for i in range(len(categories))]

# Step 1: Frequency analysis
print("Step 1: Word frequency analysis")
count_vectorizer = CountVectorizer(max_features=100, stop_words='english', min_df=2)
count_matrix = count_vectorizer.fit_transform(corpus)
feature_names = count_vectorizer.get_feature_names_out()

# Create frequency distribution
word_frequencies = count_matrix.sum(axis=0).A1
freq_data = dict(zip(feature_names, word_frequencies))
freq_viz = FreqDistVisualizer(features=feature_names, n=30)
freq_viz.fit(freq_data)
freq_viz.show()

# Step 2: Document embedding visualization
print("Step 2: Document embedding visualization")
tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english', min_df=2, max_df=0.8)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# t-SNE visualization
tsne_viz = TSNEVisualizer(classes=target_names, random_state=42)
tsne_viz.fit(tfidf_matrix.toarray(), target)
tsne_viz.show()

# Step 3: Word dispersion analysis
print("Step 3: Word dispersion analysis")
# Select most frequent words for dispersion analysis
top_words = sorted(freq_data.items(), key=lambda x: x[1], reverse=True)[:8]
dispersion_words = [word for word, _ in top_words]

dispersion_viz = DispersionPlot(words=dispersion_words)
dispersion_viz.fit(corpus)
dispersion_viz.show()
```

### Comparative Text Analysis

```python
from yellowbrick.text import TSNEVisualizer, UMAPVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# Compare t-SNE and UMAP embeddings
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(corpus)

# Create side-by-side comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# t-SNE visualization
tsne_viz = TSNEVisualizer(classes=target_names, ax=axes[0], random_state=42)
tsne_viz.fit(X.toarray(), target)
tsne_viz.finalize()
axes[0].set_title('t-SNE Embedding')

# UMAP visualization
umap_viz = UMAPVisualizer(classes=target_names, ax=axes[1], random_state=42)
umap_viz.fit(X.toarray(), target)
umap_viz.finalize()
axes[1].set_title('UMAP Embedding')

plt.tight_layout()
plt.show()
```

### Topic Modeling Visualization

```python
from yellowbrick.text import TSNEVisualizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Prepare data for topic modeling
vectorizer = CountVectorizer(max_features=1000, stop_words='english', min_df=2, max_df=0.8)
doc_term_matrix = vectorizer.fit_transform(corpus)

# Fit LDA model
n_topics = 4
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
doc_topic_matrix = lda_model.fit_transform(doc_term_matrix)

# Assign documents to dominant topics
dominant_topics = np.argmax(doc_topic_matrix, axis=1)
topic_names = [f'Topic {i}' for i in range(n_topics)]

# Visualize documents in topic space
tsne_viz = TSNEVisualizer(classes=topic_names, random_state=42)
tsne_viz.fit(doc_topic_matrix, dominant_topics)
tsne_viz.show()

# Print top words for each topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda_model.components_):
    top_words = [feature_names[i] for i in topic.argsort()[-10:][::-1]]
    print(f"Topic {topic_idx}: {', '.join(top_words)}")
```

### Multilingual Text Analysis

```python
from yellowbrick.text import FreqDistVisualizer, TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import re

# Sample multilingual text (English and Spanish)
multilingual_corpus = [
    "machine learning is transforming technology",
    "el aprendizaje automático está transformando la tecnología",
    "data science involves statistical analysis",
    "la ciencia de datos involucra análisis estadístico",
    "artificial intelligence enables automation",
    "la inteligencia artificial permite la automatización"
]

# Language labels
languages = ['English', 'Spanish', 'English', 'Spanish', 'English', 'Spanish']

# Character-level analysis for language detection
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 3), max_features=100)
char_features = char_vectorizer.fit_transform(multilingual_corpus)

# Visualize language clustering
tsne_viz = TSNEVisualizer(classes=['English', 'Spanish'], random_state=42)
tsne_viz.fit(char_features.toarray(), [0 if lang == 'English' else 1 for lang in languages])
tsne_viz.show()

# Word frequency analysis per language
english_docs = [doc for doc, lang in zip(multilingual_corpus, languages) if lang == 'English']
spanish_docs = [doc for doc, lang in zip(multilingual_corpus, languages) if lang == 'Spanish']

for lang, docs in [('English', english_docs), ('Spanish', spanish_docs)]:
    print(f"\n{lang} word frequencies:")
    all_words = ' '.join(docs).lower()
    words = re.findall(r'\b\w+\b', all_words)
    word_counts = Counter(words)

    freq_viz = FreqDistVisualizer(n=10)
    freq_viz.fit(word_counts)
    freq_viz.show()
```