# Text Preprocessing

Comprehensive text preprocessing pipeline with stemming, tokenization, and text cleaning functions. Gensim's preprocessing tools prepare raw text for NLP analysis by normalizing, filtering, and transforming textual data.

## Capabilities

### Text Preprocessing Functions

Core text preprocessing operations that can be chained together to create custom preprocessing pipelines.

```python { .api }
def preprocess_string(s: str, filters: list = None) -> list:
    """
    Apply preprocessing filters to a single string.

    Parameters:
    - s: Input text string
    - filters: List of preprocessing functions to apply (the module's
      DEFAULT_FILTERS chain is applied when no list is given)

    Returns:
    List of processed tokens
    """

def preprocess_documents(docs):
    """
    Apply the default preprocessing filters (DEFAULT_FILTERS) to multiple documents.

    Parameters:
    - docs: Iterable of text strings

    Returns:
    List of token lists, one per document
    """

def remove_stopwords(s: str) -> str:
    """
    Remove stopwords from a text string.

    Parameters:
    - s: Input text string

    Returns:
    Text with stopwords removed
    """

def strip_punctuation(s: str) -> str:
    """
    Remove punctuation from a text string.

    Parameters:
    - s: Input text string

    Returns:
    Text with punctuation removed
    """

def strip_tags(s: str) -> str:
    """
    Remove HTML/XML tags from a text string.

    Parameters:
    - s: Input text string

    Returns:
    Text with tags removed
    """

def strip_numeric(s: str) -> str:
    """
    Remove numeric tokens from a text string.

    Parameters:
    - s: Input text string

    Returns:
    Text with numeric tokens removed
    """

def strip_non_alphanum(s: str) -> str:
    """
    Remove non-alphanumeric characters from a text string.

    Parameters:
    - s: Input text string

    Returns:
    Text with only alphanumeric characters
    """

def strip_multiple_whitespaces(s: str) -> str:
    """
    Normalize runs of whitespace to single spaces.

    Parameters:
    - s: Input text string

    Returns:
    Text with normalized whitespace
    """

def strip_short(s: str, minsize: int = 3) -> str:
    """
    Remove tokens shorter than the minimum size.

    Parameters:
    - s: Input text string
    - minsize: Minimum token length

    Returns:
    Text with short tokens removed
    """

def split_alphanum(s: str) -> str:
    """
    Split alphanumeric tokens into separate alphabetic and numeric parts.

    Parameters:
    - s: Input text string

    Returns:
    Text with split alphanumeric tokens
    """

def stem_text(text: str) -> str:
    """
    Apply stemming to text using the Porter stemmer.

    Parameters:
    - text: Input text string

    Returns:
    Text with stemmed tokens
    """
```
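
When no filter list is passed, `preprocess_string` applies the module-level `DEFAULT_FILTERS` chain (roughly: lowercasing, tag, punctuation, whitespace, and numeric stripping, stopword removal, short-token removal, and stemming). A minimal sketch of the two call styles:

```python
from gensim.parsing.preprocessing import (
    preprocess_string, DEFAULT_FILTERS, strip_punctuation, remove_stopwords
)

text = "The <b>quick</b> brown foxes were running near 3 rivers!"

# Default chain: equivalent to preprocess_string(text, DEFAULT_FILTERS)
print(preprocess_string(text))

# Explicit chain: only the listed filters are applied, in order
print(preprocess_string(text, [strip_punctuation, remove_stopwords]))
```

Passing an explicit filter list is how the custom pipelines in the usage examples below are built.
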
### File I/O Functions

Functions for reading and preprocessing text files and directories.

```python { .api }
def read_file(path: str) -> str:
    """
    Read and return the contents of a text file.

    Parameters:
    - path: Path to the text file

    Returns:
    File contents
    """

def read_files(pattern: str) -> list:
    """
    Read multiple files matching a glob pattern.

    Parameters:
    - pattern: File path pattern (supports wildcards)

    Returns:
    List of file contents, one entry per matching file
    """
```

### General Utility Functions

Core utility functions for tokenization and text normalization from the gensim.utils module.

```python { .api }
def tokenize(
    text,
    lowercase=False,
    deacc=False,
    encoding='utf8',
    errors="strict",
    to_lower=False,
    lower=False
):
    """
    Iteratively yield tokens as unicode strings.

    Parameters:
    - text: Input string or bytes
    - lowercase: Convert to lowercase (deprecated, use lower)
    - deacc: Remove accentuation using deaccent()
    - encoding: Encoding of the input string
    - errors: Error handling scheme used when decoding
    - to_lower: Convert to lowercase (deprecated, use lower)
    - lower: Convert to lowercase

    Returns:
    Generator yielding unicode tokens
    """

def simple_preprocess(doc, deacc=False, min_len=2, max_len=15):
    """
    Convert a document into a list of lowercase tokens.

    Parameters:
    - doc: Input document string
    - deacc: Remove accent marks using deaccent()
    - min_len: Minimum token length
    - max_len: Maximum token length

    Returns:
    List of processed tokens
    """

def deaccent(text):
    """
    Remove letter accents from the given string.

    Parameters:
    - text: Input string

    Returns:
    String with accents removed
    """
```

### Stemming

Porter stemming algorithm implementation for reducing words to their root forms.

```python { .api }
class PorterStemmer:
    """Porter stemming algorithm implementation."""

    def __init__(self): ...

    def stem(self, word: str, i: int = None, j: int = None) -> str:
        """
        Stem a single word.

        Parameters:
        - word: Word to stem
        - i: Start position (optional)
        - j: End position (optional)

        Returns:
        Stemmed word
        """
```

## Usage Examples

### Basic Text Preprocessing

```python
from gensim.parsing.preprocessing import (
    preprocess_string, remove_stopwords, strip_punctuation,
    strip_numeric, strip_short, stem_text
)
from gensim.utils import tokenize, simple_preprocess, deaccent

# Single document preprocessing
text = "This is a sample document with some numbers 123 and punctuation!"

# Apply individual filters
no_punct = strip_punctuation(text)
print(f"No punctuation: {no_punct}")

no_numbers = strip_numeric(no_punct)
print(f"No numbers: {no_numbers}")

no_stopwords = remove_stopwords(no_numbers)
print(f"No stopwords: {no_stopwords}")

# Apply multiple filters at once using the default preprocessing chain
tokens = preprocess_string(text)
print(f"Preprocessed tokens: {tokens}")

# Use utility functions for basic tokenization
basic_tokens = list(tokenize(text, lower=True, deacc=True))
print(f"Basic tokenization: {basic_tokens}")

# Use simple_preprocess for quick preprocessing
simple_tokens = simple_preprocess(text, deacc=True, min_len=2, max_len=15)
print(f"Simple preprocessing: {simple_tokens}")

# Remove accents from text
accented_text = "café naïve résumé"
clean_text = deaccent(accented_text)
print(f"Deaccented text: {clean_text}")
```

### Custom Preprocessing Pipeline

```python
from gensim.parsing.preprocessing import (
    preprocess_string, strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric,
    remove_stopwords, strip_short, stem_text
)

# Define a custom preprocessing pipeline
CUSTOM_FILTERS = [
    strip_tags,                  # Remove HTML/XML tags
    strip_punctuation,           # Remove punctuation
    strip_multiple_whitespaces,  # Normalize whitespace
    strip_numeric,               # Remove numbers
    remove_stopwords,            # Remove stopwords
    strip_short,                 # Remove short words
    stem_text                    # Apply stemming
]

# Apply the custom pipeline
text = "<p>This is some HTML text with numbers 123 and stopwords!</p>"
processed_tokens = preprocess_string(text, CUSTOM_FILTERS)
print(f"Custom preprocessing result: {processed_tokens}")
```

### Batch Document Preprocessing

```python
from gensim.parsing.preprocessing import preprocess_documents

# Process multiple documents
documents = [
    "This is the first document about machine learning.",
    "The second document discusses natural language processing.",
    "Here's a third document on information retrieval.",
    "<html>Some HTML content with <b>tags</b> and numbers 42.</html>"
]

# preprocess_documents applies the default filter chain to every document.
# To use a custom filter list, map preprocess_string over the documents instead,
# e.g. [preprocess_string(doc, CUSTOM_FILTERS) for doc in documents]
processed_docs = preprocess_documents(documents)
print("Processed documents:")
for i, tokens in enumerate(processed_docs):
    print(f"Doc {i + 1}: {tokens}")
```

### Porter Stemming

```python
from gensim.parsing.porter import PorterStemmer

# Create a stemmer instance
stemmer = PorterStemmer()

# Stem individual words
words = ['running', 'runs', 'ran', 'easily', 'fairly', 'computing', 'computed']
stemmed_words = [stemmer.stem(word) for word in words]

print("Original -> Stemmed:")
for original, stemmed in zip(words, stemmed_words):
    print(f"{original} -> {stemmed}")
```

### Reading and Preprocessing Files

```python
from gensim.parsing.preprocessing import read_file, preprocess_string
import os

# Read a single file and preprocess its contents
if os.path.exists('/tmp/sample.txt'):
    file_content = read_file('/tmp/sample.txt')
    processed_content = preprocess_string(file_content)
    print(f"File preprocessing result: {processed_content}")

# read_files covers the multi-file case with pattern matching; see the sketch below.
```
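
A minimal sketch of batch file preprocessing with `read_files`, assuming a hypothetical directory of `.txt` files under `/tmp/corpus/`:

```python
from gensim.parsing.preprocessing import read_files, preprocess_string

# read_files expands the glob pattern and returns the contents of every matching file
for file_content in read_files('/tmp/corpus/*.txt'):  # hypothetical location
    tokens = preprocess_string(file_content)
    print(f"First tokens: {tokens[:10]}")
```
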
### Creating Reusable Preprocessing Functions

```python
from gensim.parsing.preprocessing import (
    preprocess_string, strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric, strip_non_alphanum,
    remove_stopwords, strip_short, stem_text
)

def clean_text_simple(text):
    """Simple text cleaning pipeline."""
    simple_filters = [
        strip_punctuation,
        strip_numeric,
        strip_multiple_whitespaces,
        remove_stopwords,
        strip_short
    ]
    return preprocess_string(text, simple_filters)

def clean_text_aggressive(text):
    """Aggressive text cleaning with stemming."""
    aggressive_filters = [
        strip_tags,
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
        strip_non_alphanum,
        remove_stopwords,
        strip_short,
        stem_text
    ]
    return preprocess_string(text, aggressive_filters)

# Test the different cleaning approaches
test_text = "<p>The running dogs are quickly computing solutions!</p>"

simple_result = clean_text_simple(test_text)
aggressive_result = clean_text_aggressive(test_text)

print(f"Simple cleaning: {simple_result}")
print(f"Aggressive cleaning: {aggressive_result}")
```

### Integration with Corpus Processing

```python
from gensim import corpora
from gensim.parsing.preprocessing import preprocess_documents

# Preprocess documents for corpus creation
raw_documents = [
    "Machine learning algorithms process data efficiently.",
    "Natural language processing enables text analysis.",
    "Information retrieval systems find relevant documents."
]

# Preprocess all documents with the default filter chain
processed_docs = list(preprocess_documents(raw_documents))

# Create a dictionary and bag-of-words corpus
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(tokens) for tokens in processed_docs]

print(f"Dictionary size: {len(dictionary)}")
print(f"Sample processed document: {processed_docs[0]}")
print(f"Sample BOW representation: {corpus[0]}")
```

### Custom Filter Functions

```python
import re

from gensim.parsing.preprocessing import (
    preprocess_string, strip_punctuation, remove_stopwords, strip_short
)

def remove_urls(text):
    """Custom filter to remove URLs."""
    url_pattern = re.compile(r'https?://\S+')
    return url_pattern.sub('', text)

def remove_email(text):
    """Custom filter to remove email addresses."""
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    return email_pattern.sub('', text)

# Create a custom pipeline that mixes custom and built-in filters
CUSTOM_WEB_FILTERS = [
    remove_urls,
    remove_email,
    strip_punctuation,
    remove_stopwords,
    strip_short
]

# Test with web content
web_text = "Check out https://example.com or email me at user@example.com for more info!"
cleaned_web = preprocess_string(web_text, CUSTOM_WEB_FILTERS)
print(f"Web content cleaned: {cleaned_web}")
```

### Performance Optimization

```python
from gensim.parsing.preprocessing import preprocess_string

# For large-scale preprocessing, use a generator so only one document
# is held in memory at a time
def preprocess_large_corpus(documents, filters):
    """Memory-efficient preprocessing for large corpora."""
    for doc in documents:
        yield preprocess_string(doc, filters)

# CUSTOM_FILTERS is the filter list defined in the custom pipeline example above
large_documents = ["doc1", "doc2", "doc3"]  # Imagine this is very large
processed_generator = preprocess_large_corpus(large_documents, CUSTOM_FILTERS)

# Process incrementally, one document at a time, without loading all results into memory
for i, processed_doc in enumerate(processed_generator):
    print(f"Processed document {i + 1}: {processed_doc}")
```
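
Because `gensim.corpora.Dictionary` accepts any iterable of token lists, the generator can also be streamed straight into it, so neither the raw documents nor the processed token lists need to be held in memory at once. A sketch reusing `preprocess_large_corpus` and `CUSTOM_FILTERS` from the examples above:

```python
from gensim import corpora

# The generator is consumed lazily: one processed document in memory at a time
dictionary = corpora.Dictionary(preprocess_large_corpus(large_documents, CUSTOM_FILTERS))
print(f"Dictionary built incrementally with {len(dictionary)} unique tokens")
```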