CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-gensim

Python library for topic modelling, document indexing and similarity retrieval with large corpora

78

1.02x
Overview
Eval results
Files

docs/corpus-management.md

Corpus Management

Comprehensive corpus I/O system supporting streaming document collections in multiple formats. Gensim's corpus infrastructure enables memory-efficient processing of datasets larger than available RAM through lazy evaluation and format-agnostic interfaces.

Capabilities

Dictionary Management

Core vocabulary management with word-to-integer ID mappings, corpus statistics, and vocabulary filtering operations.

class Dictionary:
    """Mapping between words and their integer IDs."""
    
    # documents: iterable of tokenized documents; prune_at caps vocabulary
    # size while building (presumably drops rarest ids — confirm vs gensim docs).
    def __init__(self, documents=None, prune_at=2000000): ...
    
    # Extend the vocabulary incrementally with more tokenized documents.
    def add_documents(self, documents, prune_at=2000000): ...
    # Convert one tokenized document to bag-of-words [(token_id, count), ...].
    def doc2bow(self, document, allow_update=False, return_missing=False): ...
    # Drop tokens appearing in fewer than no_below docs or in more than
    # no_above fraction of docs; keep at most keep_n tokens (keep_tokens exempt).
    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None): ...
    # NOTE: REMOVES the remove_n most frequent tokens — it does not keep them.
    def filter_n_most_frequent(self, remove_n): ...
    def filter_tokens(self, bad_ids=None, good_ids=None): ...
    # Reassign ids to close gaps left by filtering.
    def compactify(self, sort_by_word=True): ...
    def save_as_text(self, fname, sort_by_word=True): ...
    def merge_with(self, other): ...
    def patch_with_special_tokens(self, special_tokens): ...
    def most_common(self, n=None): ...
    
    # Alternate constructors.
    @classmethod
    def load_from_text(cls, fname): ...
    @classmethod  
    def from_documents(cls, documents): ...
    @classmethod
    def from_corpus(cls, corpus, id2word=None): ...
    
    # Mapping-style protocol: id -> word lookup, size, membership.
    def __getitem__(self, tokenid): ...
    def __len__(self): ...
    def __str__(self): ...
    def keys(self): ...
    def __contains__(self, tokenid): ...

class HashDictionary:
    """Memory-efficient dictionary using hashing."""
    
    # Token ids come from hashing into [0, id_range), so collisions are
    # possible; debug=True presumably retains reverse token mappings for
    # inspection — confirm against gensim docs.
    def __init__(self, documents=None, id_range=32000, debug=True): ...
    
    def add_documents(self, documents): ...
    # Convert one tokenized document to bag-of-words [(token_id, count), ...].
    def doc2bow(self, document, allow_update=False, return_missing=False): ...
    def filter_tokens(self, bad_ids=None, good_ids=None): ...
    def save_as_text(self, fname, sort_by_word=True): ...
    
    def __getitem__(self, tokenid): ...
    def __len__(self): ...
    def keys(self): ...

Corpus Formats

Multiple corpus I/O formats for different data exchange standards and compatibility with external tools.

class MmCorpus:
    """Matrix Market format corpus."""
    
    # Stream documents lazily from an existing .mm file.
    def __init__(self, fname): ...
    
    # Write `corpus` to `fname` in Matrix Market format.
    @staticmethod
    def save_corpus(fname, corpus, id2word=None, labels=None, comments=None, metadata=False): ...
    # Save via `serializer` and also build a byte-offset index for random access.
    @staticmethod
    def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): ...
    
    def __iter__(self): ...
    def __len__(self): ...
    # Read the single document starting at byte `offset` (random access).
    def docbyoffset(self, offset): ...

class BleiCorpus:
    """David Blei's LDA-C format corpus."""
    
    # Stream documents lazily from an existing LDA-C file.
    def __init__(self, fname): ...
    
    # Write `corpus` to `fname` in LDA-C format.
    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...
    
    def __iter__(self): ...
    def __len__(self): ...

class SvmLightCorpus:
    """SVMlight format corpus."""
    
    # store_labels=True keeps the per-document SVMlight target labels.
    def __init__(self, fname, store_labels=True): ...
    
    # Write `corpus` to `fname` in SVMlight format (labels optional).
    @staticmethod
    def save_corpus(fname, corpus, id2word=None, labels=None, metadata=False): ...
    
    def __iter__(self): ...
    def __len__(self): ...

class LowCorpus:
    """GibbsLDA++ format corpus."""
    
    # Stream documents lazily from an existing GibbsLDA++ file.
    def __init__(self, fname): ...
    
    # Write `corpus` to `fname` in GibbsLDA++ format.
    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...
    
    # NOTE: no __len__ documented here, unlike the other corpus formats.
    def __iter__(self): ...

class UciCorpus:
    """UCI Bag-of-Words format corpus."""
    
    # Stream documents lazily from an existing UCI bag-of-words file.
    def __init__(self, fname): ...
    
    # Write `corpus` to `fname` in UCI bag-of-words format.
    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...
    
    def __iter__(self): ...
    def __len__(self): ...

class MalletCorpus:
    """Mallet format corpus."""
    
    # Stream documents lazily from an existing Mallet file.
    def __init__(self, fname): ...
    
    # Write `corpus` to `fname` in Mallet format.
    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False): ...
    
    def __iter__(self): ...

class OpinosisCorpus:
    """Opinosis dataset corpus format."""
    
    # Read-only: no save_corpus counterpart is documented for this format.
    def __init__(self, fname): ...
    
    def __iter__(self): ...
    def __len__(self): ...

Text Corpus Processing

Specialized corpus classes for processing text documents with built-in preprocessing and tokenization.

class TextCorpus:
    """Generic text corpus with preprocessing."""
    
    # character_filters / tokenizer / token_filters customize the pipeline
    # applied by preprocess_text; None presumably selects gensim's defaults
    # (lowercasing + simple tokenization — confirm against gensim docs).
    def __init__(
        self,
        input=None,
        dictionary=None,
        metadata=False,
        character_filters=None,
        tokenizer=None,
        token_filters=None
    ): ...
    
    # Run the configured filter/tokenize pipeline on one raw text.
    def preprocess_text(self, text): ...
    # Randomly sample n documents (seedable), restricted to length_range.
    def sample_texts(self, n, seed=None, length_range=(10, 500)): ...
    def __iter__(self): ...
    def __len__(self): ...
    # Yield raw, unprocessed texts from the underlying input.
    def getstream(self): ...

class TextDirectoryCorpus(TextCorpus):
    """Corpus from directory of text files."""
    
    # Walks `input` recursively between min_depth and max_depth; pattern /
    # exclude_pattern select filenames. lines_are_documents=True treats each
    # line, rather than each file, as one document. Remaining kwargs go to
    # TextCorpus (dictionary, filters, tokenizer).
    def __init__(
        self,
        input,
        dictionary=None,
        metadata=False,
        min_depth=0,
        max_depth=None,
        pattern=None,
        exclude_pattern=None,
        lines_are_documents=False,
        **kwargs
    ): ...
    
    # Yield the paths of files selected by the depth/pattern filters.
    def iter_filepaths(self): ...

Specialized Corpus Types

Domain-specific corpus processors for particular data sources like Wikipedia.

class WikiCorpus:
    """Wikipedia dump corpus processor."""
    
    # Streams articles out of a (typically bz2-compressed) Wikipedia XML dump.
    # processes: worker count for parallel parsing; articles with fewer than
    # article_min_tokens tokens, and tokens outside [token_min_len,
    # token_max_len], are dropped. NOTE(review): `lemmatize` was removed in
    # gensim 4.x — confirm which gensim version this page documents.
    def __init__(
        self,
        fname,
        processes=None,
        lemmatize=True,
        dictionary=None,
        filter_namespaces=('0',),
        tokenizer_func=tokenize,
        article_min_tokens=50,
        token_min_len=2,
        token_max_len=15,
        lower=True
    ): ...
    
    # Yield tokenized article texts.
    def get_texts(self): ...
    # Write extracted article text to `out`, optionally compressed.
    def extract_pages(self, out, compress=True): ...
    
    def __iter__(self): ...
    def __len__(self): ...

class IndexedCorpus:
    """Base class for indexed corpora with random access.

    Pairs a serialized corpus file with an index of per-document byte
    offsets so individual documents can be fetched by position without
    scanning the whole file.
    """
    
    # index_fname defaults to a name derived from fname — TODO confirm
    # exact convention against the gensim docs.
    def __init__(self, fname, index_fname=None): ...
    
    # Random access by document number, using the offset index.
    def __getitem__(self, docno): ...
    def __iter__(self): ...
    def __len__(self): ...
    # Persist the object; arrays above sep_limit bytes may be stored separately.
    def save(self, fname_or_handle, separately=None, sep_limit=10485760, ignore=frozenset(), pickle_protocol=2): ...
    # Fixed: in gensim, load is a classmethod (inherited from utils.SaveLoad),
    # not an instance method — it is called as IndexedCorpus.load(fname)
    # without constructing an instance first. mmap enables memory-mapping
    # of large arrays on load.
    @classmethod
    def load(cls, fname, mmap=None): ...

Usage Examples

Creating and Using Dictionaries

from gensim import corpora
from gensim.test.utils import common_texts

# Create a word <-> integer-id mapping from tokenized documents
dictionary = corpora.Dictionary(common_texts)
print(f"Dictionary size: {len(dictionary)}")

# Convert documents to bag-of-words: a list of (token_id, count) pairs per doc
corpus = [dictionary.doc2bow(text) for text in common_texts]
print(f"Corpus: {corpus[0]}")  # Show first document

# Filter extremes: drop tokens in fewer than 2 docs or in more than 80% of docs
dictionary.filter_extremes(no_below=2, no_above=0.8)

# Save and load dictionary
dictionary.save('/tmp/dictionary.dict')
loaded_dict = corpora.Dictionary.load('/tmp/dictionary.dict')

Working with Different Corpus Formats

from gensim.corpora import MmCorpus, SvmLightCorpus

# Save corpus in Matrix Market format
MmCorpus.save_corpus('/tmp/corpus.mm', corpus, id2word=dictionary)

# Load corpus (documents are streamed from disk, not held in memory)
mm_corpus = MmCorpus('/tmp/corpus.mm')
print(f"Corpus length: {len(mm_corpus)}")

# Convert to SVMlight format
SvmLightCorpus.save_corpus('/tmp/corpus.svmlight', corpus, id2word=dictionary)
svm_corpus = SvmLightCorpus('/tmp/corpus.svmlight')

# Iterate over documents
for doc in mm_corpus:
    print(doc)
    break  # Just show first document

Processing Text Directories

from gensim.corpora import TextDirectoryCorpus

# Create corpus from text files in directory (min_depth=1 skips the top level)
text_corpus = TextDirectoryCorpus('/path/to/text/files', min_depth=1)

# The dictionary is built automatically while the directory is scanned
dictionary = text_corpus.dictionary

# Convert to bag-of-words
bow_corpus = [dictionary.doc2bow(doc) for doc in text_corpus.get_texts()]

Working with Wikipedia Dumps

from gensim.corpora import WikiCorpus

# Process Wikipedia dump
# NOTE(review): the `lemmatize` argument was removed in gensim 4.x — this
# call only works on older gensim versions; confirm the target version.
wiki_corpus = WikiCorpus('/path/to/wikipedia/dump.xml.bz2', 
                        lemmatize=True, 
                        processes=4)

# Extract articles as text
wiki_corpus.extract_pages('/tmp/wiki_articles', compress=True)

# Create dictionary from wiki corpus
dictionary = wiki_corpus.dictionary

# Convert to bag-of-words
bow_corpus = [dictionary.doc2bow(article) for article in wiki_corpus.get_texts()]

Dictionary Filtering and Manipulation

# Filter extremes: remove words that appear in fewer than 5 documents
# or in more than 50% of documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

# REMOVE the 10000 most frequent words (filter_n_most_frequent drops the
# top-n tokens, e.g. stop-word-like terms — it does not keep them)
dictionary.filter_n_most_frequent(10000)

# Merge dictionaries
other_dict = corpora.Dictionary(other_documents)
dictionary.merge_with(other_dict)

# Get word frequencies (cfs: total collection frequency per token id)
word_freq = dictionary.cfs
most_common = dictionary.most_common(10)
print(f"Most common words: {most_common}")

# Check if word exists
if 'computer' in dictionary.token2id:
    word_id = dictionary.token2id['computer']
    print(f"'computer' has ID: {word_id}")

Corpus Statistics and Analysis

# Get corpus statistics
num_docs = len(corpus)
# Total token count: sum the per-token frequencies across all documents
num_tokens = sum(sum(freq for _, freq in doc) for doc in corpus)
print(f"Corpus: {num_docs} documents, {num_tokens} tokens")

# Get document lengths (tokens per document, counting repeats)
doc_lengths = [sum(freq for _, freq in doc) for doc in corpus]
avg_length = sum(doc_lengths) / len(doc_lengths)
print(f"Average document length: {avg_length:.2f} tokens")

# Find sparse documents — len(doc) is the number of *unique* tokens in a BoW doc
sparse_docs = [i for i, doc in enumerate(corpus) if len(doc) < 10]
print(f"Sparse documents (< 10 unique tokens): {len(sparse_docs)}")

Install with Tessl CLI

npx tessl i tessl/pypi-gensim

docs

corpus-management.md

data-downloading.md

index.md

mathematical-utilities.md

nlp-models.md

similarity-computations.md

text-preprocessing.md

tile.json