# Text Preprocessing

Comprehensive text preprocessing pipeline with stemming, tokenization, and text cleaning functions. Gensim's preprocessing tools prepare raw text for NLP analysis by normalizing, filtering, and transforming textual data.

## Capabilities

### Text Preprocessing Functions

Core text preprocessing operations that can be chained together to create custom preprocessing pipelines.

```python { .api }
def preprocess_string(
    s: str,
    filters: list = None
) -> list:
    """
    Apply preprocessing filters to a single string.

    Parameters:
    - s: Input text string
    - filters: List of preprocessing functions to apply

    Returns:
    List of processed tokens
    """

def preprocess_documents(documents, filters=None):
    """
    Apply preprocessing filters to multiple documents.

    Parameters:
    - documents: Iterable of text strings
    - filters: List of preprocessing functions to apply

    Returns:
    Generator yielding lists of processed tokens
    """

def remove_stopwords(s: str) -> str:
    """
    Remove stopwords from text string.

    Parameters:
    - s: Input text string

    Returns:
    Text with stopwords removed
    """

def strip_punctuation(s: str) -> str:
    """
    Remove punctuation from text string.

    Parameters:
    - s: Input text string

    Returns:
    Text with punctuation removed
    """

def strip_tags(s: str) -> str:
    """
    Remove HTML/XML tags from text string.

    Parameters:
    - s: Input text string

    Returns:
    Text with tags removed
    """

def strip_numeric(s: str) -> str:
    """
    Remove numeric tokens from text string.

    Parameters:
    - s: Input text string

    Returns:
    Text with numeric tokens removed
    """

def strip_non_alphanum(s: str) -> str:
    """
    Remove non-alphanumeric characters from text string.

    Parameters:
    - s: Input text string

    Returns:
    Text with only alphanumeric characters
    """

def strip_multiple_whitespaces(s: str) -> str:
    """
    Normalize multiple whitespaces to single spaces.

    Parameters:
    - s: Input text string

    Returns:
    Text with normalized whitespace
    """

def strip_short(s: str, minsize: int = 3) -> str:
    """
    Remove tokens shorter than minimum size.

    Parameters:
    - s: Input text string
    - minsize: Minimum token length

    Returns:
    Text with short tokens removed
    """

def split_alphanum(s: str) -> str:
    """
    Split alphanumeric tokens into separate alphabetic and numeric parts.

    Parameters:
    - s: Input text string

    Returns:
    Text with split alphanumeric tokens
    """

def stem_text(text: str) -> str:
    """
    Apply stemming to text using Porter stemmer.

    Parameters:
    - text: Input text string

    Returns:
    Text with stemmed tokens
    """
```
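
When `filters` is omitted, `preprocess_string` falls back to gensim's default filter chain. A minimal sketch, assuming the `DEFAULT_FILTERS` constant exported by `gensim.parsing.preprocessing` (its exact composition may vary between gensim versions):

```python
# A minimal sketch, assuming gensim.parsing.preprocessing exposes DEFAULT_FILTERS
# (its exact composition can differ between gensim versions).
from gensim.parsing.preprocessing import DEFAULT_FILTERS, preprocess_string

text = "The Quick Brown Foxes are Running!"

# Omitting filters and passing DEFAULT_FILTERS explicitly should be equivalent.
print(preprocess_string(text))                   # e.g. ['quick', 'brown', 'fox', 'run']
print(preprocess_string(text, DEFAULT_FILTERS))  # same result

# The default chain roughly amounts to: lowercase, strip_tags, strip_punctuation,
# strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text.
```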

### File I/O Functions

Functions for reading and preprocessing text files and directories.

```python { .api }
def read_file(path: str) -> str:
    """
    Read and return contents of a text file.

    Parameters:
    - path: Path to text file

    Returns:
    File contents as string
    """

def read_files(pattern: str):
    """
    Read multiple files matching a pattern.

    Parameters:
    - pattern: File path pattern (supports wildcards)

    Returns:
    Generator yielding file contents
    """
```

### General Utility Functions

Core utility functions for tokenization and text normalization from the gensim.utils module.

```python { .api }
def tokenize(
    text,
    lowercase=False,
    deacc=False,
    encoding='utf8',
    errors="strict",
    to_lower=False,
    lower=False
):
    """
    Iteratively yield tokens as unicode strings.

    Parameters:
    - text: Input string or bytes
    - lowercase: Convert to lowercase (deprecated, use lower)
    - deacc: Remove accentuation using deaccent()
    - encoding: Encoding of input string
    - errors: Error handling for encoding
    - to_lower: Convert to lowercase (deprecated, use lower)
    - lower: Convert to lowercase

    Returns:
    Generator yielding unicode tokens
    """

def simple_preprocess(doc, deacc=False, min_len=2, max_len=15):
    """
    Convert document into list of lowercase tokens.

    Parameters:
    - doc: Input document string
    - deacc: Remove accent marks using deaccent()
    - min_len: Minimum token length
    - max_len: Maximum token length

    Returns:
    List of processed tokens
    """

def deaccent(text):
    """
    Remove letter accents from the given string.

    Parameters:
    - text: Input string

    Returns:
    String with accents removed
    """
```

### Stemming

Porter stemming algorithm implementation for reducing words to their root forms.

```python { .api }
class PorterStemmer:
    """Porter stemming algorithm implementation."""

    def __init__(self): ...

    def stem(self, word: str, i: int = None, j: int = None) -> str:
        """
        Stem a single word.

        Parameters:
        - word: Word to stem
        - i: Start position (optional)
        - j: End position (optional)

        Returns:
        Stemmed word
        """
```
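
The `stem_text` filter in the preprocessing module is, in effect, a thin wrapper around `PorterStemmer`. The sketch below illustrates that relationship; `stem_text_sketch` is a hypothetical stand-in, not gensim's own implementation:

```python
# Rough sketch: lowercase the input and run each whitespace-separated token
# through the Porter stemmer. This mirrors what stem_text does, but it is not
# copied from the library source.
from gensim.parsing.porter import PorterStemmer

def stem_text_sketch(text: str) -> str:
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in text.lower().split())

print(stem_text_sketch("Running dogs were computing"))  # e.g. "run dog were comput"
```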

## Usage Examples

### Basic Text Preprocessing

```python
from gensim.parsing.preprocessing import (
    preprocess_string, remove_stopwords, strip_punctuation,
    strip_numeric, strip_short, stem_text
)
from gensim.utils import tokenize, simple_preprocess, deaccent

# Single document preprocessing
text = "This is a sample document with some numbers 123 and punctuation!"

# Apply individual filters
no_punct = strip_punctuation(text)
print(f"No punctuation: {no_punct}")

no_numbers = strip_numeric(no_punct)
print(f"No numbers: {no_numbers}")

no_stopwords = remove_stopwords(no_numbers)
print(f"No stopwords: {no_stopwords}")

# Apply multiple filters at once using default preprocessing
tokens = preprocess_string(text)
print(f"Preprocessed tokens: {tokens}")

# Use utility functions for basic tokenization
basic_tokens = list(tokenize(text, lower=True, deacc=True))
print(f"Basic tokenization: {basic_tokens}")

# Use simple_preprocess for quick preprocessing
simple_tokens = simple_preprocess(text, deacc=True, min_len=2, max_len=15)
print(f"Simple preprocessing: {simple_tokens}")

# Remove accents from text
accented_text = "café naïve résumé"
clean_text = deaccent(accented_text)
print(f"Deaccented text: {clean_text}")
```

### Custom Preprocessing Pipeline

```python
from gensim.parsing.preprocessing import (
    preprocess_string, strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric,
    remove_stopwords, strip_short, stem_text
)

# Define custom preprocessing pipeline
CUSTOM_FILTERS = [
    strip_tags,                  # Remove HTML/XML tags
    strip_punctuation,           # Remove punctuation
    strip_multiple_whitespaces,  # Normalize whitespace
    strip_numeric,               # Remove numbers
    remove_stopwords,            # Remove stopwords
    strip_short,                 # Remove short words
    stem_text                    # Apply stemming
]

# Apply custom pipeline
text = "<p>This is some HTML text with numbers 123 and stopwords!</p>"
processed_tokens = preprocess_string(text, CUSTOM_FILTERS)
print(f"Custom preprocessing result: {processed_tokens}")
```
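
Filters passed to `preprocess_string` must be single-argument callables that take a string and return a string, so parameterized filters such as `strip_short` with a non-default `minsize` need to be wrapped first. A small sketch using `functools.partial`; the `strip_short_5` name is illustrative, not part of gensim:

```python
from functools import partial

from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_short

# Wrap strip_short so it drops tokens shorter than 5 characters instead of the default 3.
# strip_short_5 is an illustrative name, not a gensim function.
strip_short_5 = partial(strip_short, minsize=5)

FILTERS_WITH_LONG_TOKENS = [strip_punctuation, strip_short_5]

print(preprocess_string("a tiny cat sat on an enormous keyboard!", FILTERS_WITH_LONG_TOKENS))
# e.g. ['enormous', 'keyboard']
```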

### Batch Document Preprocessing

```python
from gensim.parsing.preprocessing import preprocess_documents

# Process multiple documents
documents = [
    "This is the first document about machine learning.",
    "The second document discusses natural language processing.",
    "Here's a third document on information retrieval.",
    "<html>Some HTML content with <b>tags</b> and numbers 42.</html>"
]

# Apply preprocessing to all documents
# (CUSTOM_FILTERS is the pipeline defined in the Custom Preprocessing Pipeline example)
processed_docs = list(preprocess_documents(documents, CUSTOM_FILTERS))
print("Processed documents:")
for i, tokens in enumerate(processed_docs):
    print(f"Doc {i+1}: {tokens}")
```

### Porter Stemming

```python
from gensim.parsing.porter import PorterStemmer

# Create stemmer instance
stemmer = PorterStemmer()

# Stem individual words
words = ['running', 'runs', 'ran', 'easily', 'fairly', 'computing', 'computed']
stemmed_words = [stemmer.stem(word) for word in words]

print("Original -> Stemmed:")
for original, stemmed in zip(words, stemmed_words):
    print(f"{original} -> {stemmed}")
```

### Reading and Preprocessing Files

```python
from gensim.parsing.preprocessing import read_file, preprocess_string
import os

# Read single file and preprocess
if os.path.exists('/tmp/sample.txt'):
    file_content = read_file('/tmp/sample.txt')
    processed_content = preprocess_string(file_content)
    print(f"File preprocessing result: {processed_content}")

# Note: read_files supports pattern matching and would be used similarly
# for batch file processing (see the sketch below).
```
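
Following up on the note above, here is a hedged sketch of batch reading with `read_files`; the `/tmp/corpus/*.txt` pattern is illustrative, and the function is assumed to yield each matching file's contents as documented in the API section:

```python
from gensim.parsing.preprocessing import read_files, preprocess_string

# Illustrative glob pattern; adjust to wherever your text files live.
pattern = '/tmp/corpus/*.txt'

# read_files yields the contents of each matching file, so the corpus can be
# preprocessed one file at a time without loading everything into memory.
for file_number, contents in enumerate(read_files(pattern), start=1):
    tokens = preprocess_string(contents)
    print(f"File {file_number}: {tokens[:10]}")  # show the first few tokens
```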

### Creating Reusable Preprocessing Functions

```python
from gensim.parsing.preprocessing import (
    preprocess_string, strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric, strip_non_alphanum,
    remove_stopwords, strip_short, stem_text
)

def clean_text_simple(text):
    """Simple text cleaning pipeline."""
    simple_filters = [
        strip_punctuation,
        strip_numeric,
        strip_multiple_whitespaces,
        remove_stopwords,
        strip_short
    ]
    return preprocess_string(text, simple_filters)

def clean_text_aggressive(text):
    """Aggressive text cleaning with stemming."""
    aggressive_filters = [
        strip_tags,
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
        strip_non_alphanum,
        remove_stopwords,
        strip_short,
        stem_text
    ]
    return preprocess_string(text, aggressive_filters)

# Test different cleaning approaches
test_text = "<p>The running dogs are quickly computing solutions!</p>"

simple_result = clean_text_simple(test_text)
aggressive_result = clean_text_aggressive(test_text)

print(f"Simple cleaning: {simple_result}")
print(f"Aggressive cleaning: {aggressive_result}")
```

### Integration with Corpus Processing

```python
from gensim import corpora
from gensim.parsing.preprocessing import preprocess_documents

# Preprocess documents for corpus creation
raw_documents = [
    "Machine learning algorithms process data efficiently.",
    "Natural language processing enables text analysis.",
    "Information retrieval systems find relevant documents."
]

# Preprocess all documents
processed_docs = list(preprocess_documents(raw_documents))

# Create dictionary and corpus
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(tokens) for tokens in processed_docs]

print(f"Dictionary size: {len(dictionary)}")
print(f"Sample processed document: {processed_docs[0]}")
print(f"Sample BOW representation: {corpus[0]}")
```

### Custom Filter Functions

```python
import re

from gensim.parsing.preprocessing import (
    preprocess_string, strip_punctuation, remove_stopwords, strip_short
)

def remove_urls(text):
    """Custom filter to remove URLs."""
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.sub('', text)

def remove_email(text):
    """Custom filter to remove email addresses."""
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    return email_pattern.sub('', text)

# Create custom preprocessing pipeline
CUSTOM_WEB_FILTERS = [
    remove_urls,
    remove_email,
    strip_punctuation,
    remove_stopwords,
    strip_short
]

# Test with web content
web_text = "Check out https://example.com or email me at user@example.com for more info!"
cleaned_web = preprocess_string(web_text, CUSTOM_WEB_FILTERS)
print(f"Web content cleaned: {cleaned_web}")
```

### Performance Optimization

```python
from gensim.parsing.preprocessing import preprocess_string

# For large-scale preprocessing, consider using generators
def preprocess_large_corpus(documents, filters):
    """Memory-efficient preprocessing for large corpora."""
    for doc in documents:
        yield preprocess_string(doc, filters)

# Process documents one at a time to save memory
# (CUSTOM_FILTERS is the pipeline defined in the Custom Preprocessing Pipeline example)
large_documents = ["doc1", "doc2", "doc3"]  # Imagine this is very large
processed_generator = preprocess_large_corpus(large_documents, CUSTOM_FILTERS)

# Process incrementally
for i, processed_doc in enumerate(processed_generator):
    print(f"Processed document {i+1}: {processed_doc}")
    # Process one document at a time without loading all into memory
```
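
Because `corpora.Dictionary` accepts any iterable of token lists, the streaming generator above can feed dictionary construction directly, without materializing the whole preprocessed corpus. A small sketch, reusing the `preprocess_large_corpus` helper and `CUSTOM_FILTERS` pipeline defined earlier:

```python
from gensim import corpora

# Build the dictionary straight from the streaming generator; documents are
# preprocessed lazily, one at a time, as the Dictionary consumes them.
dictionary = corpora.Dictionary(preprocess_large_corpus(large_documents, CUSTOM_FILTERS))
print(f"Dictionary built incrementally, size: {len(dictionary)}")
```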