# fastai simplifies training fast and accurate neural nets using modern best practices.
#
# Comprehensive text processing and NLP capabilities, including language models,
# text classification, tokenization, and specialized data processing for text tasks.
# Main entry points for creating text models (language models and classifiers).
def language_model_learner(dls, arch, config=None, drop_mult=1.0, pretrained=True,
                           pretrained_fnames=None, **kwargs):
    """
    Create a learner configured for language modeling.

    Parameters:
    - dls: text DataLoaders set up for language modeling
    - arch: model architecture (e.g. AWD_LSTM)
    - config: optional model configuration dictionary
    - drop_mult: multiplier applied to the model's dropout probabilities
    - pretrained: whether to use pre-trained weights
    - pretrained_fnames: custom filenames for pre-trained weights
    - **kwargs: additional arguments forwarded to the learner

    Returns:
    - Learner instance for language modeling
    """
def text_classifier_learner(dls, arch, seq_len=72, config=None, backwards=False,
                            pretrained=True, drop_mult=1.0, n_out=None, lin_ftrs=None,
                            ps=None, max_len=1400, y_range=None, **kwargs):
    """
    Create a learner configured for text classification.

    Parameters:
    - dls: text DataLoaders set up for classification
    - arch: model architecture (e.g. AWD_LSTM)
    - seq_len: sequence length used for classification
    - config: optional model configuration dictionary
    - backwards: use a backwards language model
    - pretrained: whether to use a pre-trained language model
    - drop_mult: multiplier applied to the model's dropout probabilities
    - n_out: number of output classes
    - lin_ftrs: sizes of the linear head's hidden layers
    - ps: dropout probabilities for the linear layers
    - max_len: maximum sequence length processed
    - y_range: output range for regression targets
    - **kwargs: additional arguments forwarded to the learner

    Returns:
    - Learner instance for text classification
    """
class TextLearner(Learner):
    """Base learner class for text tasks."""

    def predict(self, text, n_words=1, no_unk=True, temperature=1.0,
                min_p=None, no_bar=False, decoder=decode_spec_tokens):
        """Predict the next `n_words` continuing `text`.

        `no_unk` avoids sampling the unknown token, `temperature` scales the
        sampling distribution, `min_p` filters low-probability tokens,
        `no_bar` hides the progress bar, and `decoder` post-processes the
        generated tokens back into text.
        """

    def show_results(self, ds_idx=1, dl=None, max_n=10, **kwargs):
        """Show up to `max_n` model predictions on a dataset or dataloader."""
class LMLearner(TextLearner):
    """Language-model learner with encoder save/load for transfer learning."""

    def save_encoder(self, file):
        """Save the model's encoder weights to `file` for transfer learning."""

    def load_encoder(self, file, device=None):
        """Load encoder weights (saved from a language model) from `file`,
        optionally mapping them onto `device`."""


# Specialized data loaders and processing for text datasets.
class TextDataLoaders(DataLoaders):
    """DataLoaders for text datasets."""

    @classmethod
    def from_folder(cls, path, train='train', valid='valid', valid_pct=None,
                    seed=None, vocab=None, tok_tfm=None, seq_len=72,
                    backwards=False, **kwargs):
        """
        Create TextDataLoaders from a folder structure.

        Parameters:
        - path: path to the text data
        - train: training folder name
        - valid: validation folder name
        - valid_pct: validation percentage (used instead of the folder split)
        - seed: random seed
        - vocab: vocabulary object
        - tok_tfm: tokenization transform
        - seq_len: sequence length
        - backwards: process text backwards
        - **kwargs: additional arguments forwarded to the constructor

        Returns:
        - TextDataLoaders instance
        """

    @classmethod
    def from_csv(cls, path, csv_name='texts.csv', header='infer', delimiter=None,
                 text_col='text', label_col='label', valid_col=None, **kwargs):
        """Create TextDataLoaders from a CSV file; `text_col`/`label_col` name
        the columns, `valid_col` optionally marks validation rows."""

    @classmethod
    def from_df(cls, df, path='.', text_col='text', label_col='label',
                valid_col=None, **kwargs):
        """Create TextDataLoaders from a pandas DataFrame."""
class TextBlock(TransformBlock):
    """Transform block for text data."""

    # tok_tfm: tokenization transform; vocab: optional pre-built vocabulary;
    # is_lm: build for language modeling; min_freq/max_vocab: vocabulary limits.
    def __init__(self, tok_tfm, vocab=None, is_lm=False, seq_len=72,
                 backwards=False, min_freq=3, max_vocab=60000): ...
# NOTE(review): the original header was invalid Python
# (`def TextDataLoaders.from_dsets(...)`). It documents the alternate
# constructor `TextDataLoaders.from_dsets`; kept here as a valid stub.
def from_dsets(train_ds, valid_ds, path='.', **kwargs):
    """Create TextDataLoaders from already-built text datasets."""


# Comprehensive tokenization support for different text processing approaches.
class Tokenizer:
    """Base tokenizer class."""

    # tok_func: underlying tokenization function; rules: preprocessing rules;
    # sep: separator used when joining tokens.
    def __init__(self, tok_func, rules=None, counter=None, lengths=None,
                 mode=None, sep=' '): ...

    # Tokenize a collection of items.
    def __call__(self, items): ...
class WordTokenizer:
    """Word-level tokenization."""

    # lang: language code; split_char: character used to split words.
    def __init__(self, lang='en', rules=None, split_char=' ', **kwargs): ...
class SubwordTokenizer:
    """Subword tokenization (BPE, WordPiece, etc.)."""

    # cache_dir: where tokenizer artifacts are cached; model_path: pre-trained model.
    def __init__(self, lang='en', cache_dir=None, model_path=None, **kwargs): ...
class SentencePieceTokenizer:
    """SentencePiece tokenizer integration."""

    # Same parameter meanings as SubwordTokenizer; backed by SentencePiece.
    def __init__(self, lang='en', cache_dir=None, model_path=None, **kwargs): ...
# NOTE(review): PascalCase name on a `def` — in fastai this is a callable
# class; declared here as a function stub. Confirm against the library.
def TokenizeWithRules(tok, rules, post_rules=None):
    """Apply tokenizer `tok` with preprocessing `rules` and optional `post_rules`."""
# Tokenization rules
def fix_html(x):
    """Fix HTML entities and formatting artifacts in text `x`."""
def replace_rep(x):
    """Replace character repetitions in `x` with special tokens (see TK_REP)."""
def replace_wrep(x):
    """Replace word repetitions in `x` with special tokens (see TK_WREP)."""
def spec_add_spaces(x):
    """Add spaces around special characters in `x` so they tokenize separately."""
def rm_useless_spaces(x):
    """Remove unnecessary (repeated) spaces from `x`."""
def replace_all_caps(x):
    """Replace all-caps words in `x` with a special token marker (see TK_UP)."""
def replace_maj(x):
    """Replace capitalized (majority-caps) words in `x` with a marker (see TK_MAJ)."""
def lowercase(x, add_bos=True, add_eos=False):
    """Convert `x` to lowercase, optionally adding beginning-of-sequence
    (`add_bos`) and end-of-sequence (`add_eos`) special tokens."""


# Core model architectures for text processing tasks.
class AWD_LSTM(nn.Module):
    """AWD-LSTM language model implementation."""

    # vocab_sz/emb_sz: vocabulary and embedding sizes; n_hid/n_layers: LSTM
    # dimensions; *_p arguments are dropout probabilities for the respective
    # components; bidir: bidirectional LSTM.
    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token=1,
                 hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5,
                 bidir=False): ...

    # from_embeddings: treat `input` as pre-computed embeddings — TODO confirm.
    def forward(self, input, from_embeddings=False): ...

    # Reset recurrent hidden state.
    def reset(self): ...
class LinearDecoder(nn.Module):
    """Linear decoder for classification."""

    # tie_encoder: optional encoder whose weights are tied to the decoder.
    def __init__(self, n_out, n_hid, output_p, tie_encoder=None, bias=True): ...
class SentenceEncoder(nn.Module):
    """Encode sentences for classification."""

    # bptt: backprop-through-time length; max_len: maximum sequence length;
    # module: the wrapped encoder module.
    def __init__(self, bptt, max_len, module): ...
def get_language_model(arch, vocab_sz, config=None, drop_mult=1):
    """Create a language model for `arch` with vocabulary size `vocab_sz`;
    `drop_mult` scales the configuration's dropout probabilities."""
def get_text_classifier(arch, vocab_sz, n_class, seq_len=72, config=None,
                        drop_mult=1, lin_ftrs=None, ps=None, y_range=None):
    """Create a text classifier model for `arch`.

    `n_class` is the number of output classes, `lin_ftrs`/`ps` configure the
    linear head and its dropout, and `y_range` bounds regression outputs.
    """


# Specialized tensor classes for text data.
class TensorText(TensorBase):
    """Tensor subclass for text sequences."""

    def __init__(self, x, **kwargs): ...

    # Display the decoded text, optionally in a plotting/show context `ctx`.
    def show(self, ctx=None, **kwargs): ...
class LMTensorText(TensorText):
    """Tensor subclass for language-model text targets."""

    # Display the decoded text, optionally in a show context `ctx`.
    def show(self, ctx=None, **kwargs): ...


# Data processing transforms specific to text.
class Numericalize(Transform):
    """Convert text tokens to numeric IDs."""

    # vocab: pre-built vocabulary (built during setup when None);
    # min_freq/max_vocab: vocabulary construction limits.
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, special_toks=None): ...

    # Build the vocabulary from `items` when needed.
    def setup(self, items=None, train_setup=True): ...
class Categorize(Transform):
    """Convert text labels to categories."""

    # sort: sort category vocabulary; add_na: include a not-applicable category.
    def __init__(self, vocab=None, sort=True, add_na=False): ...
def make_vocab(count, min_freq=3, max_vocab=None, special_toks=None):
    """Create a vocabulary from token counts in `count`, keeping tokens with
    frequency >= `min_freq` up to `max_vocab` entries, plus `special_toks`."""


# Special tokens and constants used in text processing.
# Special tokens
UNK = 'xxunk'      # Unknown token
PAD = 'xxpad'      # Padding token
BOS = 'xxbos'      # Beginning of sequence
EOS = 'xxeos'      # End of sequence
FLD = 'xxfld'      # Field separator
TK_REP = 'xxrep'   # Repetition token
TK_WREP = 'xxwrep' # Word repetition token
TK_UP = 'xxup'     # Uppercase token
TK_MAJ = 'xxmaj'   # Majority case token

# Default special tokens list
defaults.text_spec_tok = [UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ]

# Text processing rules applied, in order, before tokenization.
text_rules_L = [fix_html, replace_rep, replace_wrep, spec_add_spaces,
                rm_useless_spaces, replace_all_caps, replace_maj, lowercase]


# Utilities for working with language models and transfer learning.
# NOTE(review): duplicate of `language_model_learner` defined earlier in this
# file — at import time this later definition shadows the earlier one.
def language_model_learner(dls, arch, config=None, drop_mult=1., pretrained=True,
                           pretrained_fnames=None, **kwargs):
    """Create a language model learner with pre-training support."""
def fine_tune_text_classifier_learner(dls, path, model_name='classifier',
                                      arch=AWD_LSTM, **kwargs):
    """Fine-tune a text classifier starting from a language model found at
    `path`; `model_name` names the saved classifier."""
class LanguageModelLoader:
    """Load pre-trained language model weights."""

    # path: location of the saved weights; backwards: backwards LM variant;
    # model_cls: architecture class used to instantiate the model.
    def __init__(self, path, backwards=False, model_cls=AWD_LSTM): ...
def convert_weights(wgts, stoi_wgts, itos_new):
    """Convert pre-trained weights `wgts` (indexed by the `stoi_wgts`
    string-to-index map) to the new vocabulary `itos_new`."""
def lm_config(arch):
    """Return the default language-model configuration for architecture `arch`."""


# Specialized metrics for text tasks.
class Perplexity(Metric):
    """Perplexity metric for language models."""

    # dim: dimension over which the loss is reduced — TODO confirm.
    def __init__(self, dim=-1): ...

    # Clear accumulated state between epochs.
    def reset(self): ...

    # Accumulate statistics from the learner's current batch.
    def accumulate(self, learn): ...

    @property
    def value(self): ...
class BLEU:
    """BLEU score for text generation."""

    # n_gram: maximum n-gram order; weights: optional per-order weights.
    def __init__(self, n_gram=4, weights=None): ...

    # Score predicted tokens against target tokens.
    def __call__(self, pred_tokens, targ_tokens): ...
# Install with the Tessl CLI:
#   npx tessl i tessl/pypi-fastai