Tessl Tile for pypi/fastai@2.8.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

callbacks.md collaborative-filtering.md core-training.md data-loading.md index.md interpretation.md medical.md metrics-losses.md tabular.md text.md vision.md

docs/text.md

0
# Natural Language Processing
1

2
Comprehensive text processing and NLP capabilities including language models, text classification, tokenization, and specialized data processing for text tasks.
3

4
## Capabilities
5

6
### Text Learners
7

8
Main entry points for creating text models including language models and classifiers.
9

10
```python { .api }
11
def language_model_learner(dls, arch, config=None, drop_mult=1.0, pretrained=True, 
12
                          pretrained_fnames=None, **kwargs):
13
    """
14
    Create a language model learner.
15
    
16
    Parameters:
17
    - dls: Text DataLoaders with language modeling setup
18
    - arch: Model architecture (AWD_LSTM, etc.)
19
    - config: Model configuration dictionary
20
    - drop_mult: Dropout multiplier
21
    - pretrained: Use pre-trained weights
22
    - pretrained_fnames: Custom pre-trained filenames
23
    - **kwargs: Additional learner arguments
24
    
25
    Returns:
26
    - Learner instance for language modeling
27
    """
28

29
def text_classifier_learner(dls, arch, seq_len=72, config=None, backwards=False, 
30
                           pretrained=True, drop_mult=1.0, n_out=None, lin_ftrs=None, 
31
                           ps=None, max_len=1400, y_range=None, **kwargs):
32
    """
33
    Create a text classifier learner.
34
    
35
    Parameters:
36
    - dls: Text DataLoaders with classification setup
37
    - arch: Model architecture (AWD_LSTM, etc.)
38
    - seq_len: Sequence length for classification
39
    - config: Model configuration
40
    - backwards: Use backwards language model
41
    - pretrained: Use pre-trained language model
42
    - drop_mult: Dropout multiplier
43
    - n_out: Number of output classes
44
    - lin_ftrs: Linear layer features
45
    - ps: Dropout probabilities for linear layers
46
    - max_len: Maximum sequence length
47
    - y_range: Range for regression outputs
48
    
49
    Returns:
50
    - Learner instance for text classification
51
    """
52

53
class TextLearner(Learner):
54
    """Base learner class for text tasks."""
55
    
56
    def predict(self, text, n_words=1, no_unk=True, temperature=1.0, 
57
                min_p=None, no_bar=False, decoder=decode_spec_tokens):
58
        """Predict next words in text."""
59
    
60
    def show_results(self, ds_idx=1, dl=None, max_n=10, **kwargs):
61
        """Show model predictions on dataset."""
62

63
class LMLearner(TextLearner):
64
    """Language model learner with specialized methods."""
65
    
66
    def save_encoder(self, file):
67
        """Save encoder for transfer learning."""
68
    
69
    def load_encoder(self, file, device=None):
70
        """Load encoder from language model."""
71
```
72

73
### Text Data Processing
74

75
Specialized data loaders and processing for text datasets.
76

77
```python { .api }
78
class TextDataLoaders(DataLoaders):
79
    """DataLoaders for text datasets."""
80
    
81
    @classmethod
82
    def from_folder(cls, path, train='train', valid='valid', valid_pct=None, 
83
                    seed=None, vocab=None, tok_tfm=None, seq_len=72, 
84
                    backwards=False, **kwargs):
85
        """
86
        Create TextDataLoaders from folder structure.
87
        
88
        Parameters:
89
        - path: Path to text data
90
        - train: Training folder name
91
        - valid: Validation folder name
92
        - valid_pct: Validation percentage
93
        - seed: Random seed
94
        - vocab: Vocabulary object
95
        - tok_tfm: Tokenization transform
96
        - seq_len: Sequence length
97
        - backwards: Process text backwards
98
        
99
        Returns:
100
        - TextDataLoaders instance
101
        """
102
    
103
    @classmethod
104
    def from_csv(cls, path, csv_name='texts.csv', header='infer', delimiter=None, 
105
                 text_col='text', label_col='label', valid_col=None, **kwargs):
106
        """Create from CSV file."""
107
    
108
    @classmethod
109
    def from_df(cls, df, path='.', text_col='text', label_col='label', 
110
                valid_col=None, **kwargs):
111
        """Create from pandas DataFrame."""
112

113
class TextBlock(TransformBlock):
114
    """Transform block for text data."""
115
    
116
    def __init__(self, tok_tfm, vocab=None, is_lm=False, seq_len=72, 
117
                 backwards=False, min_freq=3, max_vocab=60000): ...
118

119
def from_dsets(train_ds, valid_ds, path='.', **kwargs):  # classmethod; call as TextDataLoaders.from_dsets(...)
120
    """Create from text datasets."""
121
```
122

123
### Tokenization
124

125
Comprehensive tokenization support for different text processing approaches.
126

127
```python { .api }
128
class Tokenizer:
129
    """Base tokenizer class."""
130
    
131
    def __init__(self, tok_func, rules=None, counter=None, lengths=None, 
132
                 mode=None, sep=' '): ...
133
    
134
    def __call__(self, items): ...
135

136
class WordTokenizer:
137
    """Word-level tokenization."""
138
    
139
    def __init__(self, lang='en', rules=None, split_char=' ', **kwargs): ...
140

141
class SubwordTokenizer:
142
    """Subword tokenization (BPE, WordPiece, etc.)."""
143
    
144
    def __init__(self, lang='en', cache_dir=None, model_path=None, **kwargs): ...
145

146
class SentencePieceTokenizer:
147
    """SentencePiece tokenizer integration."""
148
    
149
    def __init__(self, lang='en', cache_dir=None, model_path=None, **kwargs): ...
150

151
def TokenizeWithRules(tok, rules, post_rules=None):
152
    """Apply tokenization with preprocessing rules."""
153

154
# Tokenization rules
155
def fix_html(x):
156
    """Fix HTML entities and formatting."""
157

158
def replace_rep(x):
159
    """Replace character-level repetitions (e.g. `cccc`) with the TK_REP token, the repeat count, and the character."""
160

161
def replace_wrep(x):
162
    """Replace word repetitions."""
163

164
def spec_add_spaces(x):
165
    """Add spaces around `/` and `#` characters."""
166

167
def rm_useless_spaces(x):
168
    """Remove unnecessary spaces."""
169

170
def replace_all_caps(x):
171
    """Replace all-caps words with special tokens."""
172

173
def replace_maj(x):
174
    """Replace capitalized (Sentence Case) words with their lowercase form, preceded by the TK_MAJ token."""
176
def lowercase(x, add_bos=True, add_eos=False):
177
    """Convert to lowercase with optional special tokens."""
178
```
179

180
### Text Models
181

182
Core model architectures for text processing tasks.
183

184
```python { .api }
185
class AWD_LSTM(nn.Module):
186
    """AWD-LSTM language model implementation."""
187
    
188
    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token=1, 
189
                 hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5, 
190
                 bidir=False): ...
191
    
192
    def forward(self, input, from_embeddings=False): ...
193
    
194
    def reset(self): ...
195

196
class LinearDecoder(nn.Module):
197
    """Linear decoder for classification."""
198
    
199
    def __init__(self, n_out, n_hid, output_p, tie_encoder=None, bias=True): ...
200

201
class SentenceEncoder(nn.Module):
202
    """Encode sentences for classification."""
203
    
204
    def __init__(self, bptt, max_len, module): ...
205

206
def get_language_model(arch, vocab_sz, config=None, drop_mult=1):
207
    """Create language model."""
208

209
def get_text_classifier(arch, vocab_sz, n_class, seq_len=72, config=None, 
210
                       drop_mult=1, lin_ftrs=None, ps=None, y_range=None):
211
    """Create text classifier model."""
212
```
213

214
### Text Tensor Classes
215

216
Specialized tensor classes for text data.
217

218
```python { .api }
219
class TensorText(TensorBase):
220
    """Tensor subclass for text sequences."""
221
    
222
    def __init__(self, x, **kwargs): ...
223
    
224
    def show(self, ctx=None, **kwargs): ...
225

226
class LMTensorText(TensorText):
227
    """Tensor subclass for language model text."""
228
    
229
    def show(self, ctx=None, **kwargs): ...
230
```
231

232
### Text Transforms
233

234
Data processing transforms specific to text.
235

236
```python { .api }
237
class Numericalize(Transform):
238
    """Convert text tokens to numeric IDs."""
239
    
240
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, special_toks=None): ...
241
    
242
    def setup(self, items=None, train_setup=True): ...
243

244
class Categorize(Transform):
245
    """Convert text labels to categories."""
246
    
247
    def __init__(self, vocab=None, sort=True, add_na=False): ...
248

249
def make_vocab(count, min_freq=3, max_vocab=None, special_toks=None):
250
    """Create vocabulary from token counts."""
251
```
252

253
### Text Constants
254

255
Special tokens and constants used in text processing.
256

257
```python { .api }
258
# Special tokens
259
UNK = 'xxunk'      # Unknown token
260
PAD = 'xxpad'      # Padding token
261
BOS = 'xxbos'      # Beginning of sequence
262
EOS = 'xxeos'      # End of sequence
263
FLD = 'xxfld'      # Field separator
264
TK_REP = 'xxrep'   # Repetition token
265
TK_WREP = 'xxwrep' # Word repetition token
266
TK_UP = 'xxup'     # Uppercase token
267
TK_MAJ = 'xxmaj'   # Capitalization token (the next word began with a capital letter)
268

269
# Default special tokens list
270
defaults.text_spec_tok = [UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ]
271

272
# Text processing rules
273
text_rules_L = [fix_html, replace_rep, replace_wrep, spec_add_spaces, 
274
                rm_useless_spaces, replace_all_caps, replace_maj, lowercase]
275
```
276

277
### Language Model Utilities
278

279
Utilities for working with language models and transfer learning.
280

281
```python { .api }
282
def language_model_learner(dls, arch, config=None, drop_mult=1., pretrained=True, 
283
                          pretrained_fnames=None, **kwargs):
284
    """Create language model learner with pre-training support."""
285

286
def fine_tune_text_classifier_learner(dls, path, model_name='classifier', 
287
                                     arch=AWD_LSTM, **kwargs):
288
    """Fine-tune text classifier from language model."""
289

290
class LanguageModelLoader:
291
    """Load pre-trained language model weights."""
292
    
293
    def __init__(self, path, backwards=False, model_cls=AWD_LSTM): ...
294

295
def convert_weights(wgts, stoi_wgts, itos_new):
296
    """Convert pre-trained weights to new vocabulary."""
297

298
def lm_config(arch):
299
    """Get default language model configuration for architecture."""
300
```
301

302
### Text Metrics
303

304
Specialized metrics for text tasks.
305

306
```python { .api }
307
class Perplexity(Metric):
308
    """Perplexity metric for language models."""
309
    
310
    def __init__(self, dim=-1): ...
311
    
312
    def reset(self): ...
313
    def accumulate(self, learn): ...
314
    @property
315
    def value(self): ...
316

317
class BLEU:
318
    """BLEU score for text generation."""
319
    
320
    def __init__(self, n_gram=4, weights=None): ...
321
    
322
    def __call__(self, pred_tokens, targ_tokens): ...
323
```

Version

Tile

Files

docs/text.md

docs/text.md