Python library for topic modelling, document indexing and similarity retrieval with large corpora.
Comprehensive corpus I/O system supporting streaming document collections in multiple formats. Gensim's corpus infrastructure enables memory-efficient processing of datasets larger than available RAM through lazy evaluation and format-agnostic interfaces.
Core vocabulary management with word-to-integer ID mappings, corpus statistics, and vocabulary filtering operations.
class Dictionary:
    """Mapping between words and their integer IDs."""
    # Optionally consumes an iterable of tokenized documents at construction;
    # the vocabulary is pruned whenever it grows past `prune_at` entries.
    def __init__(self, documents=None, prune_at=2000000): ...
    def add_documents(self, documents, prune_at=2000000): ...
    # Convert one tokenized document into its bag-of-words representation;
    # `allow_update` lets unseen tokens extend the vocabulary.
    def doc2bow(self, document, allow_update=False, return_missing=False): ...
    # Vocabulary filtering: drop tokens in fewer than `no_below` documents or
    # more than `no_above` (fraction) of documents, capping at `keep_n`.
    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None): ...
    # NOTE: `remove_n` REMOVES the n most frequent tokens (it does not keep them).
    def filter_n_most_frequent(self, remove_n): ...
    def filter_tokens(self, bad_ids=None, good_ids=None): ...
    # Reassign ids to close gaps left behind by filtering.
    def compactify(self, sort_by_word=True): ...
    # Plain-text persistence; `load_from_text` below is the inverse.
    def save_as_text(self, fname, sort_by_word=True): ...
    def merge_with(self, other): ...
    def patch_with_special_tokens(self, special_tokens): ...
    def most_common(self, n=None): ...
    # Alternate constructors.
    @classmethod
    def load_from_text(cls, fname): ...
    @classmethod
    def from_documents(cls, documents): ...
    @classmethod
    def from_corpus(cls, corpus, id2word=None): ...
    # Mapping protocol: id -> word lookup, size, membership test by token id.
    def __getitem__(self, tokenid): ...
    def __len__(self): ...
    def __str__(self): ...
    def keys(self): ...
    def __contains__(self, tokenid): ...
class HashDictionary:
    """Memory-efficient dictionary using hashing."""
    # Token ids are derived by hashing into a fixed `id_range`, so no growing
    # id table is needed; `debug` presumably retains extra token bookkeeping —
    # confirm against the gensim hashdictionary docs.
    def __init__(self, documents=None, id_range=32000, debug=True): ...
    def add_documents(self, documents): ...
    def doc2bow(self, document, allow_update=False, return_missing=False): ...
    def filter_tokens(self, bad_ids=None, good_ids=None): ...
    def save_as_text(self, fname, sort_by_word=True): ...
    def __getitem__(self, tokenid): ...
    def __len__(self): ...
    def keys(self): ...

Multiple corpus I/O formats for different data exchange standards and compatibility with external tools.
class MmCorpus:
    """Matrix Market format corpus."""
    def __init__(self, fname): ...
    # Stream `corpus` out to `fname` in Matrix Market format.
    @staticmethod
    def save_corpus(fname, corpus, id2word=None, labels=None, comments=None, metadata=False): ...
    # Save and additionally build a document index (see `index_fname`),
    # enabling the random access exposed by `docbyoffset`.
    @staticmethod
    def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): ...
    def __iter__(self): ...
    def __len__(self): ...
    # Read the single document stored at the given byte offset.
    def docbyoffset(self, offset): ...
class BleiCorpus:
    """David Blei's LDA-C format corpus."""
    def __init__(self, fname): ...
    # Stream `corpus` out to `fname` in LDA-C format.
    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...
    def __iter__(self): ...
    def __len__(self): ...
class SvmLightCorpus:
    """SVMlight format corpus."""
    # `store_labels` controls whether per-document SVMlight labels are kept.
    def __init__(self, fname, store_labels=True): ...
    @staticmethod
    def save_corpus(fname, corpus, id2word=None, labels=None, metadata=False): ...
    def __iter__(self): ...
    def __len__(self): ...
class LowCorpus:
    """GibbsLDA++ format corpus."""
    def __init__(self, fname): ...
    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...
    # NOTE: unlike the other formats listed here, no __len__ is declared.
    def __iter__(self): ...
class UciCorpus:
    """UCI Bag-of-Words format corpus."""
    def __init__(self, fname): ...
    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...
    def __iter__(self): ...
    def __len__(self): ...
class MalletCorpus:
    """Mallet format corpus."""
    def __init__(self, fname): ...
    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...
    # NOTE: no __len__ is declared for this format.
    def __iter__(self): ...
class OpinosisCorpus:
    """Opinosis dataset corpus format."""
    # Read-only corpus: unlike the formats above, no save_corpus is declared.
    def __init__(self, fname): ...
    def __iter__(self): ...
    def __len__(self): ...

Specialized corpus classes for processing text documents with built-in preprocessing and tokenization.
class TextCorpus:
    """Generic text corpus with preprocessing."""
    # Pluggable preprocessing pipeline: character filters, a tokenizer, and
    # token filters (presumably applied in that order — confirm in gensim docs).
    def __init__(
        self,
        input=None,
        dictionary=None,
        metadata=False,
        character_filters=None,
        tokenizer=None,
        token_filters=None
    ): ...
    # Apply the configured preprocessing pipeline to a single text.
    def preprocess_text(self, text): ...
    # Random sample of `n` texts whose length falls within `length_range`.
    def sample_texts(self, n, seed=None, length_range=(10, 500)): ...
    def __iter__(self): ...
    def __len__(self): ...
    # Raw document stream prior to preprocessing.
    def getstream(self): ...
class TextDirectoryCorpus(TextCorpus):
    """Corpus from directory of text files."""
    # Semantics inferred from parameter names — walks `input` between
    # `min_depth` and `max_depth`, keeping files that match `pattern` and
    # skipping `exclude_pattern`; with `lines_are_documents=True` each line
    # is treated as one document.
    def __init__(
        self,
        input,
        dictionary=None,
        metadata=False,
        min_depth=0,
        max_depth=None,
        pattern=None,
        exclude_pattern=None,
        lines_are_documents=False,
        **kwargs
    ): ...
    def iter_filepaths(self): ...

Domain-specific corpus processors for particular data sources like Wikipedia.
class WikiCorpus:
    """Wikipedia dump corpus processor."""
    # NOTE(review): the `tokenize` default below is a tokenizer defined
    # elsewhere (gensim.corpora.wikicorpus.tokenize), not in this listing.
    def __init__(
        self,
        fname,
        processes=None,
        lemmatize=True,
        dictionary=None,
        filter_namespaces=('0',),
        tokenizer_func=tokenize,
        article_min_tokens=50,
        token_min_len=2,
        token_max_len=15,
        lower=True
    ): ...
    # Stream tokenized article texts; articles shorter than
    # `article_min_tokens` are presumably skipped — confirm in gensim docs.
    def get_texts(self): ...
    # Write extracted pages to `out`, optionally compressed.
    def extract_pages(self, out, compress=True): ...
    def __iter__(self): ...
    def __len__(self): ...
class IndexedCorpus:
    """Base class for indexed corpora with random access."""
    # `index_fname` points at the side-car offset index; defaults derived
    # from `fname` when omitted — confirm in gensim docs.
    def __init__(self, fname, index_fname=None): ...
    # Random access by document number, via the index.
    def __getitem__(self, docno): ...
    def __iter__(self): ...
    def __len__(self): ...
    # Persistence; pickle_protocol=2 keeps saved objects readable by
    # older Python picklers.
    def save(self, fname_or_handle, separately=None, sep_limit=10485760, ignore=frozenset(), pickle_protocol=2): ...
    def load(self, fname, mmap=None): ...

from gensim import corpora
from gensim.test.utils import common_texts

# Build a token <-> integer-id mapping from the bundled example documents.
dictionary = corpora.Dictionary(common_texts)
print(f"Dictionary size: {len(dictionary)}")

# Vectorize every document as a sparse bag-of-words.
corpus = list(map(dictionary.doc2bow, common_texts))
print(f"Corpus: {corpus[0]}") # Show first document

# Trim the vocabulary: tokens must appear in at least 2 documents
# and in no more than 80% of them.
dictionary.filter_extremes(no_below=2, no_above=0.8)

# Persist the dictionary to disk (reloaded on the next line of the guide).
dictionary.save('/tmp/dictionary.dict')
loaded_dict = corpora.Dictionary.load('/tmp/dictionary.dict')

from gensim.corpora import MmCorpus, SvmLightCorpus
# Save corpus in Matrix Market format
MmCorpus.save_corpus('/tmp/corpus.mm', corpus, id2word=dictionary)
# Load corpus back from disk
mm_corpus = MmCorpus('/tmp/corpus.mm')
print(f"Corpus length: {len(mm_corpus)}")
# Convert the same corpus to SVMlight format
SvmLightCorpus.save_corpus('/tmp/corpus.svmlight', corpus, id2word=dictionary)
svm_corpus = SvmLightCorpus('/tmp/corpus.svmlight')
# Iterate over documents — corpora stream lazily, so this stays
# memory-efficient even for collections larger than RAM
for doc in mm_corpus:
    print(doc)
    break  # Just show first document

from gensim.corpora import TextDirectoryCorpus
# Build a corpus from every text file under a directory tree
# (min_depth=1 skips files in the root directory itself).
text_corpus = TextDirectoryCorpus('/path/to/text/files', min_depth=1)

# The corpus constructs its own Dictionary while scanning the files.
dictionary = text_corpus.dictionary
bow_corpus = [dictionary.doc2bow(doc) for doc in text_corpus.get_texts()]

from gensim.corpora import WikiCorpus
# Stream articles out of a compressed Wikipedia dump,
# lemmatizing tokens and parsing with 4 worker processes.
wiki_corpus = WikiCorpus(
    '/path/to/wikipedia/dump.xml.bz2', lemmatize=True, processes=4
)

# Dump the extracted articles as (compressed) plain text.
wiki_corpus.extract_pages('/tmp/wiki_articles', compress=True)

# Dictionary accumulated while parsing the dump.
dictionary = wiki_corpus.dictionary
bow_corpus = [dictionary.doc2bow(article) for article in wiki_corpus.get_texts()]

# Filter extremes: remove words that appear in fewer than 5 documents
# or more than 50% of documents
dictionary.filter_extremes(no_below=5, no_above=0.5)
# Remove the 10000 most frequent words — note filter_n_most_frequent
# takes `remove_n`: it DROPS that many top-frequency tokens, it does
# not keep them (see the Dictionary API listing above)
dictionary.filter_n_most_frequent(10000)
# Merge another dictionary's vocabulary into this one
other_dict = corpora.Dictionary(other_documents)
dictionary.merge_with(other_dict)
# Get word frequencies (cfs presumably maps token id -> corpus
# frequency — confirm against gensim docs)
word_freq = dictionary.cfs
most_common = dictionary.most_common(10)
print(f"Most common words: {most_common}")
# Check if word exists before looking up its id
if 'computer' in dictionary.token2id:
    word_id = dictionary.token2id['computer']
    print(f"'computer' has ID: {word_id}")

# Get corpus statistics
# Corpus-level statistics.
num_docs = len(corpus)
doc_lengths = [sum(freq for _, freq in doc) for doc in corpus]
num_tokens = sum(doc_lengths)
print(f"Corpus: {num_docs} documents, {num_tokens} tokens")
# Mean document length in tokens.
avg_length = sum(doc_lengths) / len(doc_lengths)
print(f"Average document length: {avg_length:.2f} tokens")
# Documents with few unique terms (len(doc) counts distinct token ids).
sparse_docs = [i for i, doc in enumerate(corpus) if len(doc) < 10]
print(f"Sparse documents (< 10 unique tokens): {len(sparse_docs)}")

Install with Tessl CLI
npx tessl i tessl/pypi-gensimdocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9