Easy data preprocessing and data augmentation for deep learning models
Quality: Pending — a best-practices review has not yet been performed.
Impact: Pending — no eval scenarios have been run.
Text tokenization, vocabulary management, and text-to-sequence conversion utilities for natural language processing. These tools handle the transformation of raw text into numerical representations suitable for neural network training.
The Tokenizer class provides comprehensive text tokenization and vocabulary management with configurable preprocessing, filtering, and encoding options.
class Tokenizer:
    """
    Text tokenization utility class for vectorizing a text corpus.
    Converts text to sequences of integers or other vectorized representations.
    Maintains an internal vocabulary and word-to-index mappings.

    NOTE(review): this is an API-documentation stub — method bodies are
    intentionally empty (docstring only) and return None.
    """

    def __init__(self, num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                 lower=True, split=' ', char_level=False, oov_token=None,
                 document_count=0, **kwargs):
        """
        Initialize tokenizer.

        Parameters:
        - num_words (int, optional): Maximum number of words to keep, based on frequency
        - filters (str): Characters to filter out from texts
        - lower (bool): Whether to convert texts to lowercase
        - split (str): Separator for word splitting
        - char_level (bool): Whether to use character-level tokenization
        - oov_token (str, optional): Token to replace out-of-vocabulary words
        - document_count (int): Count of documents processed (for statistics)
        """

    def fit_on_texts(self, texts):
        """
        Update internal vocabulary based on a list of texts.

        Parameters:
        - texts (list): List of texts to fit on
        """

    def texts_to_sequences(self, texts):
        """
        Transform each text to a sequence of integers.

        Parameters:
        - texts (list): List of texts to transform

        Returns:
        - list: List of sequences (lists of integers)
        """

    def texts_to_sequences_generator(self, texts):
        """
        Generator version of texts_to_sequences.

        Parameters:
        - texts (list): List of texts to transform

        Yields:
        - list: Sequence (list of integers) for each text
        """

    def sequences_to_texts(self, sequences):
        """
        Transform sequences back to texts.

        Parameters:
        - sequences (list): List of sequences to transform

        Returns:
        - list: List of texts
        """

    def sequences_to_texts_generator(self, sequences):
        """
        Generator version of sequences_to_texts.

        Parameters:
        - sequences (list): List of sequences to transform

        Yields:
        - str: Text for each sequence
        """

    def texts_to_matrix(self, texts, mode='binary'):
        """
        Convert texts to a matrix representation.

        Parameters:
        - texts (list): List of texts to convert
        - mode (str): 'binary', 'count', 'tfidf', or 'freq'

        Returns:
        - numpy.ndarray: Matrix representation of texts
        """

    def sequences_to_matrix(self, sequences, mode='binary'):
        """
        Convert sequences to a matrix representation.

        Parameters:
        - sequences (list): List of sequences to convert
        - mode (str): 'binary', 'count', 'tfidf', or 'freq'

        Returns:
        - numpy.ndarray: Matrix representation of sequences
        """

    def fit_on_sequences(self, sequences):
        """
        Update internal vocabulary based on a list of sequences.

        Parameters:
        - sequences (list): List of sequences to fit on
        """

    def get_config(self):
        """
        Return tokenizer configuration as a dictionary.

        Returns:
        - dict: Configuration dictionary
        """

    def to_json(self, **kwargs):
        """
        Return a JSON string containing the tokenizer configuration.

        Returns:
        - str: JSON string of tokenizer configuration
        """


# Utility functions for basic text preprocessing operations.
def text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    """
    Convert text to a sequence of words (or tokens).

    Parameters:
    - text (str): Input text
    - filters (str): Characters to filter out (punctuation, etc.)
    - lower (bool): Whether to convert to lowercase
    - split (str): Separator for word splitting

    Returns:
    - list: List of words/tokens; empty tokens are dropped
    """
    if lower:
        text = text.lower()
    # Map every filtered character to the split separator in a single
    # C-level pass instead of chained .replace() calls.
    translate_map = str.maketrans({c: split for c in filters})
    text = text.translate(translate_map)
    # Runs of separators produce empty strings; filter them out.
    return [token for token in text.split(split) if token]


# Functions for encoding text using hashing and one-hot techniques.
def one_hot(text, n, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True, split=' '):
    """
    One-hot encode text into a list of word indexes using hashing.

    NOTE: "one-hot" refers only to the hashing-based index assignment;
    two distinct words may collide on the same index since the builtin
    hash() is used (which is also salted per process for str inputs).

    Parameters:
    - text (str): Input text
    - n (int): Size of vocabulary (hashing space)
    - filters (str): Characters to filter out
    - lower (bool): Whether to convert to lowercase
    - split (str): Separator for word splitting

    Returns:
    - list: List of integers (word indexes), each in [1, n - 1]
    """
    if lower:
        text = text.lower()
    text = text.translate(str.maketrans({c: split for c in filters}))
    words = [w for w in text.split(split) if w]
    # Index 0 is reserved (conventionally used for padding), so indexes
    # are mapped into the range [1, n - 1].
    return [(hash(w) % (n - 1)) + 1 for w in words]
def hashing_trick(text, n, hash_function=None,
                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                  lower=True, split=' '):
    """
    Convert text to a sequence of indexes in a fixed-size hashing space.

    Parameters:
    - text (str): Input text
    - n (int): Size of hashing space
    - hash_function (callable, optional): Hash function to use (default: hash())
    - filters (str): Characters to filter out
    - lower (bool): Whether to convert to lowercase
    - split (str): Separator for word splitting

    Returns:
    - list: List of integers (hashed word indexes), each in [1, n - 1]
    """
    if hash_function is None:
        # NOTE: builtin hash() on str is salted per process (PYTHONHASHSEED),
        # so indexes are only stable within a single run.
        hash_function = hash
    if lower:
        text = text.lower()
    text = text.translate(str.maketrans({c: split for c in filters}))
    words = [w for w in text.split(split) if w]
    # Index 0 is reserved, so hashed indexes fall in [1, n - 1].
    return [(hash_function(w) % (n - 1)) + 1 for w in words]


def tokenizer_from_json(json_string):
    """
    Parse a JSON tokenizer configuration and return a tokenizer instance.

    Parameters:
    - json_string (str): JSON string containing tokenizer configuration,
      as produced by Tokenizer.to_json()

    Returns:
    - Tokenizer: Tokenizer instance with the loaded configuration
    """
    import json  # local import keeps this documentation module dependency-free
    payload = json.loads(json_string)
    # Assumes to_json() nests the constructor kwargs under a 'config' key
    # (Keras convention) — falls back to the top-level dict otherwise.
    config = payload.get('config', payload)
    return Tokenizer(**config)


# Example usage (in client code, import from the installed package):
# from keras_preprocessing.text import Tokenizer
# Create and fit tokenizer
tokenizer = Tokenizer(num_words=1000, oov_token='<OOV>')
texts = [
    'The quick brown fox',
    'jumps over the lazy dog',
    'The dog was lazy'
]
tokenizer.fit_on_texts(texts)

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(texts)
print(sequences)
# [[1, 4, 5, 6], [7, 8, 1, 2, 3], [1, 3, 9, 2]]

# Get word index
print(tokenizer.word_index)
# {'the': 1, 'lazy': 2, 'dog': 3, 'quick': 4, ...}

# Convert to binary matrix
binary_matrix = tokenizer.texts_to_matrix(texts, mode='binary')
print(binary_matrix.shape)  # (3, 1000)

# Convert to TF-IDF matrix
tfidf_matrix = tokenizer.texts_to_matrix(texts, mode='tfidf')

from keras_preprocessing.text import text_to_word_sequence, one_hot

# Basic word tokenization
words = text_to_word_sequence('Hello, world! How are you?')
print(words)  # ['hello', 'world', 'how', 'are', 'you']

# One-hot encoding with hashing
encoded = one_hot('Hello world', n=1000)
print(encoded)  # [123, 456]  # hash-based word indexes

# Install with the Tessl CLI:
#   npx tessl i tessl/pypi-keras-preprocessing