Comprehensive Python stemming library providing 74 stemmers for 31 languages generated from Snowball algorithms.
npx @tessl/cli install tessl/pypi-snowballstemmer@3.0.0

A comprehensive Python stemming library providing 74 stemmers for 31 languages generated from Snowball algorithms. Enables text processing applications to reduce words to their base forms for improved search and analysis, supporting multilingual text processing systems, search engines, and data analysis pipelines.
pip install snowballstemmer

import snowballstemmer

Access the public API functions:

from snowballstemmer import algorithms, stemmer
import snowballstemmer
# Quickstart: discover available algorithms, then stem words in several languages.
languages = snowballstemmer.algorithms()
print(f"Available languages: {len(languages)} total")

# Build a stemmer instance for English.
stemmer = snowballstemmer.stemmer('english')

# Single-word stemming.
root_form = stemmer.stemWord('running')
print(f"running -> {root_form}") # prints: running -> run

# Batch stemming preserves input order.
sample = ['running', 'connected', 'connections', 'easily']
roots = stemmer.stemWords(sample)
print(f"Original: {sample}")
print(f"Stemmed: {roots}")

# Each language gets its own stemmer instance.
french_stemmer = snowballstemmer.stemmer('french')
spanish_stemmer = snowballstemmer.stemmer('spanish')
print(f"French: 'connexions' -> {french_stemmer.stemWord('connexions')}")
print(f"Spanish: 'corriendo' -> {spanish_stemmer.stemWord('corriendo')}")

Retrieve available stemming algorithms and supported languages.
def algorithms():
    """
    Get list of available stemming algorithm names.

    Returns:
        list: List of strings representing available language codes.

    Note:
        Automatically returns C extension algorithms if available,
        otherwise pure Python algorithms. This function checks for the
        Stemmer C extension and falls back gracefully.
    """

# Create stemmer instances for specific languages with automatic fallback between C extension and pure Python implementations.
def stemmer(lang):
    """
    Create a stemmer instance for the specified language.

    Parameters:
        lang (str): Language code for desired stemming algorithm.
            Supports multiple formats: 'english', 'en', 'eng'

    Returns:
        Stemmer: A stemmer instance with stemWord() and stemWords() methods.

    Raises:
        KeyError: If stemming algorithm for language not found.

    Note:
        Automatically uses C extension (Stemmer.Stemmer) if available,
        otherwise falls back to pure Python implementation.
    """

# Core stemming functionality for reducing words to their base forms. The stemmer instances returned by stemmer() provide these methods:
# Stemmer instance methods (available on returned stemmer objects)
def stemWord(word: str) -> str:
    """
    Stem a single word to its base form.

    Parameters:
        word (str): Word to stem.

    Returns:
        str: Stemmed word (the input reduced to its Snowball stem).
    """
def stemWords(words):
    """
    Stem multiple words to their base forms.

    Parameters:
        words (list): List of words to stem.

    Returns:
        list: List of stemmed words in same order.
    """

# Snowball stemmer supports 33 language algorithms across 31 languages, with multiple aliases for each.
The library automatically uses C extensions when available for significant performance improvements. The algorithms() and stemmer() functions transparently choose the best available implementation:
import snowballstemmer

# Automatically uses C extension if available, pure Python otherwise.
stemmer = snowballstemmer.stemmer('english')
# Both implementations provide identical API:
#   C extension: faster performance
#   Pure Python: broader compatibility

try:
    # Invalid language code — stemmer() raises KeyError for unknown algorithms.
    stemmer = snowballstemmer.stemmer('klingon')
except KeyError as e:
    print(f"Language not supported: {e}")

# Safe language checking: consult algorithms() before constructing a stemmer.
available_langs = snowballstemmer.algorithms()
if 'german' in available_langs:
    german_stemmer = snowballstemmer.stemmer('german')
else:
    print("German stemming not available")

import snowballstemmer
def process_multilingual_text(text_dict):
    """Stem every word list in *text_dict*, keyed by language.

    Languages with no available stemming algorithm are passed through
    unchanged after printing a warning.
    """
    stemmed_by_lang = {}
    for language, word_list in text_dict.items():
        try:
            lang_stemmer = snowballstemmer.stemmer(language)
            stemmed_by_lang[language] = lang_stemmer.stemWords(word_list)
        except KeyError:
            print(f"Warning: Language '{language}' not supported")
            stemmed_by_lang[language] = word_list  # fall back to the original words
    return stemmed_by_lang
# Example usage: stem word lists for several languages at once.
texts = {
    'english': ['running', 'connection', 'easily'],
    'french': ['connexions', 'facilement', 'courant'],
    'spanish': ['corriendo', 'conexión', 'fácilmente']
}
stemmed_results = process_multilingual_text(texts)
for lang, words in stemmed_results.items():
    print(f"{lang}: {words}")

import snowballstemmer
import re
class SearchIndexer:
    """Tokenize text and reduce tokens to stems for search matching.

    Documents and queries are normalized through the same pipeline so
    that stemmed document terms match stemmed query terms.
    """

    def __init__(self, language='english'):
        # stemmer() raises KeyError if no algorithm exists for *language*.
        self.stemmer = snowballstemmer.stemmer(language)
        self.word_pattern = re.compile(r'\b\w+\b')

    def _stem_terms(self, text):
        """Lowercase *text*, extract word tokens, and stem them in order."""
        words = self.word_pattern.findall(text.lower())
        return self.stemmer.stemWords(words)

    def index_document(self, text):
        """Extract and stem words from document text."""
        return self._stem_terms(text)

    def normalize_query(self, query):
        """Normalize search query for matching."""
        return self._stem_terms(query)
# Demo: index a document and normalize a query through the same stemmer.
indexer = SearchIndexer('english')
sample_document = "The quick brown foxes are running through the connected fields"
sample_query = "quick brown fox running connections"

indexed_terms = indexer.index_document(sample_document)
normalized_terms = indexer.normalize_query(sample_query)
print(f"Document terms: {indexed_terms}")
print(f"Query terms: {normalized_terms}")
# Both 'running' and 'connected'/'connections' will match their stemmed forms