CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-keras-preprocessing

Easy data preprocessing and data augmentation for deep learning models

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/text-processing.md

Text Processing

Text tokenization, vocabulary management, and text-to-sequence conversion utilities for natural language processing. These tools handle the transformation of raw text into numerical representations suitable for neural network training.

Capabilities

Text Tokenization

The Tokenizer class provides comprehensive text tokenization and vocabulary management with configurable preprocessing, filtering, and encoding options.

class Tokenizer:
    """
    Text tokenization utility class for vectorizing text corpus.
    
    Converts text to sequences of integers or other vectorized representations.
    Maintains internal vocabulary and word-to-index mappings.

    Typical workflow (shown in the usage examples later in this document):
    construct the tokenizer, call `fit_on_texts` on a corpus to build the
    vocabulary, then convert text with `texts_to_sequences` or
    `texts_to_matrix`. Word indexes start at 1 in the examples below;
    index 0 appears to be reserved and never assigned to a word.
    """
    
    def __init__(self, num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
                 lower=True, split=' ', char_level=False, oov_token=None, 
                 document_count=0, **kwargs):
        """
        Initialize tokenizer.
        
        Parameters:
        - num_words (int, optional): Maximum number of words to keep based on frequency.
          Less frequent words are dropped when converting to sequences/matrices.
        - filters (str): Characters to filter out from texts (defaults to punctuation
          plus tab and newline)
        - lower (bool): Whether to convert texts to lowercase
        - split (str): Separator for word splitting
        - char_level (bool): Whether to use character-level tokenization
        - oov_token (str, optional): Token to replace out-of-vocabulary words.
          When set, it is added to the vocabulary (it takes index 1 in the
          example below — verify against the implementation).
        - document_count (int): Count of documents processed (for statistics)
        """
    
    def fit_on_texts(self, texts):
        """
        Update internal vocabulary based on a list of texts.
        
        Must be called before any of the texts_to_* / sequences_to_* methods,
        as those rely on the vocabulary built here.
        
        Parameters:
        - texts (list): List of texts to fit on
        """
    
    def texts_to_sequences(self, texts):
        """
        Transform each text to a sequence of integers.
        
        Only words known to the fitted vocabulary (and within the num_words
        limit, if set) are represented; out-of-vocabulary words are mapped to
        oov_token's index when one was configured — presumably dropped
        otherwise (verify against the implementation).
        
        Parameters:
        - texts (list): List of texts to transform
        
        Returns:
        - list: List of sequences (lists of integers)
        """
    
    def texts_to_sequences_generator(self, texts):
        """
        Generator version of texts_to_sequences.
        
        Yields one sequence at a time instead of materializing the whole list.
        
        Parameters:
        - texts (list): List of texts to transform
        
        Yields:
        - list: Sequence (list of integers) for each text
        """
    
    def sequences_to_texts(self, sequences):
        """
        Transform sequences back to texts.
        
        Inverse direction of texts_to_sequences, using the fitted
        index-to-word mapping.
        
        Parameters:
        - sequences (list): List of sequences to transform
        
        Returns:
        - list: List of texts
        """
    
    def sequences_to_texts_generator(self, sequences):
        """
        Generator version of sequences_to_texts.
        
        Parameters:
        - sequences (list): List of sequences to transform
        
        Yields:
        - str: Text for each sequence
        """
    
    def texts_to_matrix(self, texts, mode='binary'):
        """
        Convert texts to a matrix representation.
        
        One row per text; the matrix width matches the vocabulary size
        (num_words when set — see the (3, 1000) shape in the example below).
        
        Parameters:
        - texts (list): List of texts to convert
        - mode (str): 'binary', 'count', 'tfidf', 'freq'
        
        Returns:
        - numpy.ndarray: Matrix representation of texts
        """
    
    def sequences_to_matrix(self, sequences, mode='binary'):
        """
        Convert sequences to a matrix representation.
        
        Same output layout as texts_to_matrix, but starting from
        already-encoded integer sequences.
        
        Parameters:
        - sequences (list): List of sequences to convert
        - mode (str): 'binary', 'count', 'tfidf', 'freq'
        
        Returns:
        - numpy.ndarray: Matrix representation of sequences
        """
    
    def fit_on_sequences(self, sequences):
        """
        Update internal vocabulary based on a list of sequences.
        
        Alternative to fit_on_texts when the data is already integer-encoded.
        
        Parameters:
        - sequences (list): List of sequences to fit on
        """
    
    def get_config(self):
        """
        Return tokenizer configuration as dictionary.
        
        Returns:
        - dict: Configuration dictionary
        """
    
    def to_json(self, **kwargs):
        """
        Return JSON string containing tokenizer configuration.
        
        The resulting string can be fed to tokenizer_from_json (documented
        below) to reconstruct the tokenizer.
        
        Returns:
        - str: JSON string of tokenizer configuration
        """

Text Preprocessing Functions

Utility functions for basic text preprocessing operations.

def text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
                          lower=True, split=" "):
    """
    Convert text to a sequence of words (or tokens).
    
    Every character listed in `filters` is replaced by `split` before the
    text is split, and empty fragments are discarded, so punctuation or
    repeated separators never produce empty-string tokens.
    
    Parameters:
    - text (str): Input text
    - filters (str): Characters to filter out (punctuation, etc.)
    - lower (bool): Whether to convert to lowercase
    - split (str): Separator for word splitting
    
    Returns:
    - list: List of words/tokens
    """
    if lower:
        text = text.lower()
    # Map every filtered character to the split string in a single
    # C-level translate() pass instead of chained replace() calls.
    translate_map = {ord(ch): split for ch in filters}
    text = text.translate(translate_map)
    # Drop empty fragments produced by leading/trailing/consecutive separators.
    return [token for token in text.split(split) if token]

Text Encoding Functions

Functions for encoding text using hashing and one-hot techniques.

def one_hot(text, n, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
            lower=True, split=' '):
    """
    One-hot encode text into list of word indexes using hashing.
    
    NOTE: uses Python's built-in hash(), whose string hashing is salted per
    interpreter run (PYTHONHASHSEED) — indexes are stable within a process
    but not across runs. Use hashing_trick with hash_function='md5' for
    run-to-run stability. Index 0 is reserved; results fall in [1, n).
    
    Parameters:
    - text (str): Input text
    - n (int): Size of vocabulary (hashing space)
    - filters (str): Characters to filter out
    - lower (bool): Whether to convert to lowercase
    - split (str): Separator for word splitting
    
    Returns:
    - list: List of integers (word indexes)
    """
    if lower:
        text = text.lower()
    # Tokenize: replace filtered characters with the separator, then split,
    # discarding empty fragments.
    translate_map = {ord(ch): split for ch in filters}
    words = [w for w in text.translate(translate_map).split(split) if w]
    # Modulo (n - 1) then +1 keeps indexes in [1, n), reserving index 0.
    return [(hash(w) % (n - 1)) + 1 for w in words]

def hashing_trick(text, n, hash_function=None, 
                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
                  lower=True, split=' '):
    """
    Convert text to sequence of indexes in fixed-size hashing space.
    
    Index 0 is reserved, so returned indexes fall in [1, n). Distinct words
    may collide in the hashing space; collisions become more likely as the
    vocabulary grows relative to n.
    
    Parameters:
    - text (str): Input text
    - n (int): Size of hashing space
    - hash_function (callable, optional): Hash function to use (default: hash()).
      The string 'md5' selects an MD5-based hash that is stable across
      interpreter runs, unlike the built-in hash() which is salted per run.
    - filters (str): Characters to filter out
    - lower (bool): Whether to convert to lowercase
    - split (str): Separator for word splitting
    
    Returns:
    - list: List of integers (hashed word indexes)
    """
    if hash_function is None:
        hash_function = hash
    elif hash_function == 'md5':
        # Local import keeps this documentation snippet self-contained.
        import hashlib

        def hash_function(w):
            # MD5 digest as a big integer: deterministic across runs.
            return int(hashlib.md5(w.encode('utf-8')).hexdigest(), 16)
    if lower:
        text = text.lower()
    # Tokenize: replace filtered characters with the separator, then split,
    # discarding empty fragments.
    translate_map = {ord(ch): split for ch in filters}
    words = [w for w in text.translate(translate_map).split(split) if w]
    # Modulo (n - 1) then +1 keeps indexes in [1, n), reserving index 0.
    return [(hash_function(w) % (n - 1)) + 1 for w in words]

Serialization

def tokenizer_from_json(json_string):
    """
    Parse JSON tokenizer configuration and return tokenizer instance.
    
    Inverse of Tokenizer.to_json: feed the string produced there back in to
    reconstruct an equivalent tokenizer (presumably including the fitted
    vocabulary — verify against the implementation).
    
    Parameters:
    - json_string (str): JSON string containing tokenizer configuration
    
    Returns:
    - Tokenizer: Tokenizer instance with loaded configuration
    """

Usage Examples

Basic Tokenization

from keras_preprocessing.text import Tokenizer

# Create and fit tokenizer
tokenizer = Tokenizer(num_words=1000, oov_token='<OOV>')
texts = [
    'The quick brown fox',
    'jumps over the lazy dog',
    'The dog was lazy'
]

tokenizer.fit_on_texts(texts)

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(texts)
print(sequences)
# [[2, 5, 6, 7], [8, 9, 2, 3, 4], [2, 4, 10, 3]]
# (index 1 is taken by the '<OOV>' token, so real words start at index 2,
#  ranked by corpus frequency: 'the' appears 3x, 'lazy' and 'dog' 2x, ...)

# Get word index
print(tokenizer.word_index)
# {'<OOV>': 1, 'the': 2, 'lazy': 3, 'dog': 4, 'quick': 5, ...}

Text to Matrix Conversion

# Convert to binary matrix: one row per text, one column per word index;
# the width equals num_words (1000) from the tokenizer above
binary_matrix = tokenizer.texts_to_matrix(texts, mode='binary')
print(binary_matrix.shape)  # (3, 1000)

# Convert to TF-IDF matrix (same shape, TF-IDF weights instead of 0/1)
tfidf_matrix = tokenizer.texts_to_matrix(texts, mode='tfidf')

Simple Text Preprocessing

from keras_preprocessing.text import text_to_word_sequence, one_hot

# Basic word tokenization (punctuation filtered out, lowercased by default)
words = text_to_word_sequence('Hello, world! How are you?')
print(words)  # ['hello', 'world', 'how', 'are', 'you']

# One-hot encoding with hashing
encoded = one_hot('Hello world', n=1000)
print(encoded)  # [123, 456]  # illustrative hash-based word indexes; actual values vary

Install with Tessl CLI

npx tessl i tessl/pypi-keras-preprocessing

docs

image-processing.md

index.md

sequence-processing.md

text-processing.md

tile.json