Gensim: a Python library for topic modelling, document indexing and similarity retrieval with large corpora.
A comprehensive text-preprocessing pipeline with stemming, tokenization, and text-cleaning functions. Gensim's preprocessing tools prepare raw text for NLP analysis by normalizing, filtering, and transforming textual data.
These core text-preprocessing operations can be chained together to create custom preprocessing pipelines.
def preprocess_string(
s: str,
filters: list = None
) -> list:
"""
Apply a sequence of preprocessing filters to a single string.

Parameters:
- s: Input text string.
- filters: Preprocessing functions applied in order. When None, a
  default filter chain is used (see the example `preprocess_string(text)`
  call below).
Returns:
List of processed tokens.
"""
def preprocess_documents(documents, filters=None):
"""
Apply preprocessing filters to multiple documents.

Parameters:
- documents: Iterable of text strings.
- filters: Preprocessing functions applied in order; when None, the
  default filter chain is used.
Returns:
Generator yielding one list of processed tokens per input document.
"""
def remove_stopwords(s: str) -> str:
"""
Remove stopword tokens from a text string.

Parameters:
- s: Input text string.
Returns:
Text with stopwords removed; usable standalone or inside a
preprocess_string filter chain.
"""
def strip_punctuation(s: str) -> str:
"""
Remove punctuation characters from a text string.

Parameters:
- s: Input text string.
Returns:
Text with punctuation removed; usable standalone or inside a
preprocess_string filter chain.
"""
def strip_tags(s: str) -> str:
"""
Remove HTML/XML tags from a text string.

Parameters:
- s: Input text string, possibly containing markup.
Returns:
Text with tags removed (tag contents are kept; only the markup goes).
"""
def strip_numeric(s: str) -> str:
"""
Remove numeric tokens from a text string.

Parameters:
- s: Input text string.
Returns:
Text with numeric tokens removed.
"""
def strip_non_alphanum(s: str) -> str:
"""
Remove non-alphanumeric characters from a text string.

Parameters:
- s: Input text string.
Returns:
Text containing only alphanumeric characters (and token separators).
"""
def strip_multiple_whitespaces(s: str) -> str:
"""
Collapse runs of whitespace into single spaces.

Parameters:
- s: Input text string.
Returns:
Text with normalized (single-space) whitespace.
"""
def strip_short(s: str, minsize: int = 3) -> str:
"""
Remove tokens shorter than a minimum length.

Parameters:
- s: Input text string.
- minsize: Minimum token length to keep (default 3).
Returns:
Text with short tokens removed.
"""
def split_alphanum(s: str) -> str:
"""
Split mixed alphanumeric tokens into separate alphabetic and numeric parts.

Parameters:
- s: Input text string.
Returns:
Text with alphabetic and numeric runs separated into distinct tokens.
"""
def stem_text(text: str) -> str:
"""
Apply Porter stemming to every token in the text.

Parameters:
- text: Input text string.
Returns:
Text with each token replaced by its stem (see PorterStemmer below).
"""Functions for reading and preprocessing text files and directories.
def read_file(path: str) -> str:
"""
Read and return the contents of a text file.

Parameters:
- path: Path to the text file.
Returns:
The file contents as a single string.
"""
def read_files(pattern: str):
"""
Read multiple files matching a path pattern.

Parameters:
- pattern: File path pattern (supports wildcards).
Returns:
Generator yielding the contents of each matching file as a string.
"""Core utility functions for tokenization and text normalization from the gensim.utils module.
def tokenize(
text,
lowercase=False,
deacc=False,
encoding='utf8',
errors="strict",
to_lower=False,
lower=False
):
"""
Iteratively yield tokens from *text* as unicode strings.

Parameters:
- text: Input string or bytes.
- lowercase: Deprecated alias for `lower`.
- deacc: Remove accentuation using deaccent().
- encoding: Encoding used when `text` is bytes.
- errors: Error-handling mode passed to the decoder.
- to_lower: Deprecated alias for `lower`.
- lower: Convert tokens to lowercase.
Returns:
Generator yielding unicode tokens.
"""
def simple_preprocess(doc, deacc=False, min_len=2, max_len=15):
"""
Convert a document into a list of lowercase tokens.

Parameters:
- doc: Input document string.
- deacc: Remove accent marks using deaccent().
- min_len: Minimum token length to keep.
- max_len: Maximum token length to keep.
Returns:
List of processed, lowercased tokens within the length bounds.
"""
def deaccent(text):
"""
Remove letter accents from the given string (e.g. in the example below,
"café naïve résumé" loses its accent marks).

Parameters:
- text: Input string.
Returns:
String with accents removed.
"""Porter stemming algorithm implementation for reducing words to their root forms.
class PorterStemmer:
"""Implementation of the Porter stemming algorithm (reduces words to their root forms)."""
def __init__(self): ...
def stem(self, word: str, i: int = None, j: int = None) -> str:
"""
Stem a single word.

Parameters:
- word: Word to stem.
- i: Optional start position within `word`.
- j: Optional end position within `word`.
Returns:
The stemmed word.
"""from gensim.parsing.preprocessing import (
preprocess_string, remove_stopwords, strip_punctuation,
strip_numeric, strip_short, stem_text
)
from gensim.utils import tokenize, simple_preprocess, deaccent
# Single document preprocessing
text = "This is a sample document with some numbers 123 and punctuation!"
# Apply individual filters
no_punct = strip_punctuation(text)
print(f"No punctuation: {no_punct}")
no_numbers = strip_numeric(no_punct)
print(f"No numbers: {no_numbers}")
no_stopwords = remove_stopwords(no_numbers)
print(f"No stopwords: {no_stopwords}")
# Apply multiple filters at once using default preprocessing
tokens = preprocess_string(text)
print(f"Preprocessed tokens: {tokens}")
# Use utility functions for basic tokenization
basic_tokens = list(tokenize(text, lower=True, deacc=True))
print(f"Basic tokenization: {basic_tokens}")
# Use simple_preprocess for quick preprocessing
simple_tokens = simple_preprocess(text, deacc=True, min_len=2, max_len=15)
print(f"Simple preprocessing: {simple_tokens}")
# Remove accents from text
accented_text = "café naïve résumé"
clean_text = deaccent(accented_text)
print(f"Deaccented text: {clean_text}")
from gensim.parsing.preprocessing import (
preprocess_string, strip_tags, strip_punctuation,
strip_multiple_whitespaces, strip_numeric,
remove_stopwords, strip_short, stem_text
)

# Build a reusable custom pipeline; filters run in list order.
CUSTOM_FILTERS = [
    strip_tags,                  # drop HTML/XML markup
    strip_punctuation,           # drop punctuation
    strip_multiple_whitespaces,  # collapse whitespace runs
    strip_numeric,               # drop numbers
    remove_stopwords,            # drop stopwords
    strip_short,                 # drop short tokens
    stem_text,                   # Porter-stem the rest
]

# Run the custom pipeline over a raw HTML snippet.
text = "<p>This is some HTML text with numbers 123 and stopwords!</p>"
processed_tokens = preprocess_string(text, CUSTOM_FILTERS)
print(f"Custom preprocessing result: {processed_tokens}")
from gensim.parsing.preprocessing import preprocess_documents

# Batch preprocessing over a small collection of documents.
documents = [
    "This is the first document about machine learning.",
    "The second document discusses natural language processing.",
    "Here's a third document on information retrieval.",
    "<html>Some HTML content with <b>tags</b> and numbers 42.</html>"
]
# preprocess_documents yields one token list per input document.
processed_docs = list(preprocess_documents(documents, CUSTOM_FILTERS))
print("Processed documents:")
for idx, doc_tokens in enumerate(processed_docs):
    print(f"Doc {idx+1}: {doc_tokens}")
from gensim.parsing.porter import PorterStemmer
# Instantiate the Porter stemmer and show before/after for a few words.
stemmer = PorterStemmer()
words = ['running', 'runs', 'ran', 'easily', 'fairly', 'computing', 'computed']
print("Original -> Stemmed:")
for original in words:
    stemmed = stemmer.stem(original)
    print(f"{original} -> {stemmed}")
from gensim.parsing.preprocessing import read_file, preprocess_string
import os

# Read and preprocess a single file, if it exists.
sample_path = '/tmp/sample.txt'
if os.path.exists(sample_path):
    processed_content = preprocess_string(read_file(sample_path))
    print(f"File preprocessing result: {processed_content}")
# Note: read_files function for multiple files with pattern matching
# would be used similarly for batch file processing
def clean_text_simple(text):
    """Light cleaning: drop punctuation, digits, repeated whitespace, stopwords and short tokens."""
    return preprocess_string(
        text,
        [
            strip_punctuation,
            strip_numeric,
            strip_multiple_whitespaces,
            remove_stopwords,
            strip_short,
        ],
    )


def clean_text_aggressive(text):
    """Aggressive cleaning: also strip tags and non-alphanumerics, then Porter-stem the result."""
    return preprocess_string(
        text,
        [
            strip_tags,
            strip_punctuation,
            strip_multiple_whitespaces,
            strip_numeric,
            strip_non_alphanum,
            remove_stopwords,
            strip_short,
            stem_text,
        ],
    )
# Compare the two cleaning pipelines on the same sentence.
test_text = "<p>The running dogs are quickly computing solutions!</p>"
simple_result = clean_text_simple(test_text)
print(f"Simple cleaning: {simple_result}")
aggressive_result = clean_text_aggressive(test_text)
print(f"Aggressive cleaning: {aggressive_result}")
from gensim import corpora
from gensim.parsing.preprocessing import preprocess_documents

# Preprocess raw text ahead of dictionary/corpus construction.
raw_documents = [
    "Machine learning algorithms process data efficiently.",
    "Natural language processing enables text analysis.",
    "Information retrieval systems find relevant documents."
]
processed_docs = list(preprocess_documents(raw_documents))
# Map tokens to ids, then convert each document to a bag-of-words vector.
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in processed_docs]
print(f"Dictionary size: {len(dictionary)}")
print(f"Sample processed document: {processed_docs[0]}")
print(f"Sample BOW representation: {corpus[0]}")
import re
def remove_urls(text):
    """Strip http/https URLs from *text*, replacing each match with the empty string."""
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return pattern.sub('', text)
def remove_email(text):
    """Strip email addresses from *text*.

    Parameters:
    - text: Input string possibly containing email addresses.
    Returns:
    Text with each email address replaced by the empty string.
    """
    # NOTE: inside a character class '|' is a literal pipe, so the original
    # [A-Z|a-z]{2,} also matched '|' characters in the TLD; [A-Za-z] fixes that.
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    return email_pattern.sub('', text)
# Assemble a pipeline that scrubs web artifacts before the standard filters.
CUSTOM_WEB_FILTERS = [
    remove_urls,        # strip URLs first
    remove_email,       # then email addresses
    strip_punctuation,
    remove_stopwords,
    strip_short
]

# Exercise the pipeline on typical web content.
web_text = "Check out https://example.com or email me at user@example.com for more info!"
cleaned_web = preprocess_string(web_text, CUSTOM_WEB_FILTERS)
print(f"Web content cleaned: {cleaned_web}")
# For large-scale preprocessing, consider using generators
def preprocess_large_corpus(documents, filters):
    """Lazily apply *filters* to each document, yielding one token list at a time."""
    for doc in documents:
        yield preprocess_string(doc, filters)

# Stream the corpus instead of materializing it all at once.
large_documents = ["doc1", "doc2", "doc3"]  # Imagine this is very large
processed_generator = preprocess_large_corpus(large_documents, CUSTOM_FILTERS)
for i, processed_doc in enumerate(processed_generator):
    print(f"Processed document {i+1}: {processed_doc}")
# Process one document at a time without loading all into memoryInstall with Tessl CLI
npx tessl i tessl/pypi-gensimdocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9