0
# Natural Language Processing
1
2
Comprehensive text processing and NLP capabilities, including language models, text classification, tokenization, and specialized data processing for text tasks.
3
4
## Capabilities
5
6
### Text Learners
7
8
Main entry points for creating text learners, including language-model learners and text classifiers.
9
10
```python { .api }
11
def language_model_learner(dls, arch, config=None, drop_mult=1.0, pretrained=True,
12
pretrained_fnames=None, **kwargs):
13
"""
14
Create a language model learner.
15
16
Parameters:
17
- dls: Text DataLoaders with language modeling setup
18
- arch: Model architecture (AWD_LSTM, etc.)
19
- config: Model configuration dictionary
20
- drop_mult: Dropout multiplier
21
- pretrained: Use pre-trained weights
22
- pretrained_fnames: Custom pre-trained filenames
23
- **kwargs: Additional learner arguments
24
25
Returns:
26
- Learner instance for language modeling
27
"""
28
29
def text_classifier_learner(dls, arch, seq_len=72, config=None, backwards=False,
30
pretrained=True, drop_mult=1.0, n_out=None, lin_ftrs=None,
31
ps=None, max_len=1400, y_range=None, **kwargs):
32
"""
33
Create a text classifier learner.
34
35
Parameters:
36
- dls: Text DataLoaders with classification setup
37
- arch: Model architecture (AWD_LSTM, etc.)
38
- seq_len: Sequence length for classification
39
- config: Model configuration
40
- backwards: Use backwards language model
41
- pretrained: Use pre-trained language model
42
- drop_mult: Dropout multiplier
43
- n_out: Number of output classes
44
- lin_ftrs: Linear layer features
45
- ps: Dropout probabilities for linear layers
46
- max_len: Maximum sequence length
47
- y_range: Range for regression outputs
48
49
Returns:
50
- Learner instance for text classification
51
"""
52
53
class TextLearner(Learner):
54
"""Base learner class for text tasks."""
55
56
def predict(self, text, n_words=1, no_unk=True, temperature=1.0,
57
min_p=None, no_bar=False, decoder=decode_spec_tokens):
58
"""Predict next words in text."""
59
60
def show_results(self, ds_idx=1, dl=None, max_n=10, **kwargs):
61
"""Show model predictions on dataset."""
62
63
class LMLearner(TextLearner):
64
"""Language model learner with specialized methods."""
65
66
def save_encoder(self, file):
67
"""Save encoder for transfer learning."""
68
69
def load_encoder(self, file, device=None):
70
"""Load encoder from language model."""
71
```
72
73
### Text Data Processing
74
75
Specialized data loaders and processing for text datasets.
76
77
```python { .api }
78
class TextDataLoaders(DataLoaders):
79
"""DataLoaders for text datasets."""
80
81
@classmethod
82
def from_folder(cls, path, train='train', valid='valid', valid_pct=None,
83
seed=None, vocab=None, tok_tfm=None, seq_len=72,
84
backwards=False, **kwargs):
85
"""
86
Create TextDataLoaders from folder structure.
87
88
Parameters:
89
- path: Path to text data
90
- train: Training folder name
91
- valid: Validation folder name
92
- valid_pct: Validation percentage
93
- seed: Random seed
94
- vocab: Vocabulary object
95
- tok_tfm: Tokenization transform
96
- seq_len: Sequence length
97
- backwards: Process text backwards
98
99
Returns:
100
- TextDataLoaders instance
101
"""
102
103
@classmethod
104
def from_csv(cls, path, csv_name='texts.csv', header='infer', delimiter=None,
105
text_col='text', label_col='label', valid_col=None, **kwargs):
106
"""Create from CSV file."""
107
108
@classmethod
109
def from_df(cls, df, path='.', text_col='text', label_col='label',
110
valid_col=None, **kwargs):
111
"""Create from pandas DataFrame."""
112
113
class TextBlock(TransformBlock):
114
"""Transform block for text data."""
115
116
def __init__(self, tok_tfm, vocab=None, is_lm=False, seq_len=72,
117
backwards=False, min_freq=3, max_vocab=60000): ...
118
119
# Called as TextDataLoaders.from_dsets(...); a dotted name is not valid in a `def` line
def from_dsets(train_ds, valid_ds, path='.', **kwargs):
120
"""Create from text datasets."""
121
```
122
123
### Tokenization
124
125
Comprehensive tokenization support for different text processing approaches.
126
127
```python { .api }
128
class Tokenizer:
129
"""Base tokenizer class."""
130
131
def __init__(self, tok_func, rules=None, counter=None, lengths=None,
132
mode=None, sep=' '): ...
133
134
def __call__(self, items): ...
135
136
class WordTokenizer:
137
"""Word-level tokenization."""
138
139
def __init__(self, lang='en', rules=None, split_char=' ', **kwargs): ...
140
141
class SubwordTokenizer:
142
"""Subword tokenization (BPE, WordPiece, etc.)."""
143
144
def __init__(self, lang='en', cache_dir=None, model_path=None, **kwargs): ...
145
146
class SentencePieceTokenizer:
147
"""SentencePiece tokenizer integration."""
148
149
def __init__(self, lang='en', cache_dir=None, model_path=None, **kwargs): ...
150
151
def TokenizeWithRules(tok, rules, post_rules=None):
152
"""Apply tokenization with preprocessing rules."""
153
154
# Tokenization rules
155
def fix_html(x):
156
"""Fix HTML entities and formatting."""
157
158
def replace_rep(x):
159
"""Replace repetitions with special tokens."""
160
161
def replace_wrep(x):
161
"""Replace word repetitions with special tokens."""
163
164
def spec_add_spaces(x):
165
"""Add spaces around special characters."""
166
167
def rm_useless_spaces(x):
168
"""Remove unnecessary spaces."""
169
170
def replace_all_caps(x):
171
"""Replace all-caps words with special tokens."""
172
173
def replace_maj(x):
173
"""Lowercase capitalized words and mark them with the TK_MAJ token."""
175
176
def lowercase(x, add_bos=True, add_eos=False):
177
"""Convert to lowercase with optional special tokens."""
178
```
179
180
### Text Models
181
182
Core model architectures for text processing tasks.
183
184
```python { .api }
185
class AWD_LSTM(nn.Module):
186
"""AWD-LSTM language model implementation."""
187
188
def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token=1,
189
hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5,
190
bidir=False): ...
191
192
def forward(self, input, from_embeddings=False): ...
193
194
def reset(self): ...
195
196
class LinearDecoder(nn.Module):
196
"""Linear decoder placed on top of the encoder to build a language model."""
198
199
def __init__(self, n_out, n_hid, output_p, tie_encoder=None, bias=True): ...
200
201
class SentenceEncoder(nn.Module):
202
"""Encode sentences for classification."""
203
204
def __init__(self, bptt, max_len, module): ...
205
206
def get_language_model(arch, vocab_sz, config=None, drop_mult=1):
207
"""Create language model."""
208
209
def get_text_classifier(arch, vocab_sz, n_class, seq_len=72, config=None,
210
drop_mult=1, lin_ftrs=None, ps=None, y_range=None):
211
"""Create text classifier model."""
212
```
213
214
### Text Tensor Classes
215
216
Specialized tensor classes for text data.
217
218
```python { .api }
219
class TensorText(TensorBase):
220
"""Tensor subclass for text sequences."""
221
222
def __init__(self, x, **kwargs): ...
223
224
def show(self, ctx=None, **kwargs): ...
225
226
class LMTensorText(TensorText):
227
"""Tensor subclass for language model text."""
228
229
def show(self, ctx=None, **kwargs): ...
230
```
231
232
### Text Transforms
233
234
Data processing transforms specific to text.
235
236
```python { .api }
237
class Numericalize(Transform):
238
"""Convert text tokens to numeric IDs."""
239
240
def __init__(self, vocab=None, min_freq=3, max_vocab=60000, special_toks=None): ...
241
242
def setup(self, items=None, train_setup=True): ...
243
244
class Categorize(Transform):
245
"""Convert text labels to categories."""
246
247
def __init__(self, vocab=None, sort=True, add_na=False): ...
248
249
def make_vocab(count, min_freq=3, max_vocab=None, special_toks=None):
250
"""Create vocabulary from token counts."""
251
```
252
253
### Text Constants
254
255
Special tokens and constants used in text processing.
256
257
```python { .api }
258
# Special tokens
259
UNK = 'xxunk' # Unknown token
260
PAD = 'xxpad' # Padding token
261
BOS = 'xxbos' # Beginning of sequence
262
EOS = 'xxeos' # End of sequence
263
FLD = 'xxfld' # Field separator
264
TK_REP = 'xxrep' # Repetition token
265
TK_WREP = 'xxwrep' # Word repetition token
266
TK_UP = 'xxup' # Uppercase token
267
TK_MAJ = 'xxmaj' # Capitalized-word token (first letter uppercase)
268
269
# Default special tokens list
270
defaults.text_spec_tok = [UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ]
271
272
# Text processing rules
273
text_rules_L = [fix_html, replace_rep, replace_wrep, spec_add_spaces,
274
rm_useless_spaces, replace_all_caps, replace_maj, lowercase]
275
```
276
277
### Language Model Utilities
278
279
Utilities for working with language models and transfer learning.
280
281
```python { .api }
282
def language_model_learner(dls, arch, config=None, drop_mult=1., pretrained=True,
283
pretrained_fnames=None, **kwargs):
284
"""Create language model learner with pre-training support."""
285
286
def fine_tune_text_classifier_learner(dls, path, model_name='classifier',
287
arch=AWD_LSTM, **kwargs):
288
"""Fine-tune text classifier from language model."""
289
290
class LanguageModelLoader:
291
"""Load pre-trained language model weights."""
292
293
def __init__(self, path, backwards=False, model_cls=AWD_LSTM): ...
294
295
def convert_weights(wgts, stoi_wgts, itos_new):
296
"""Convert pre-trained weights to new vocabulary."""
297
298
def lm_config(arch):
299
"""Get default language model configuration for architecture."""
300
```
301
302
### Text Metrics
303
304
Specialized metrics for text tasks.
305
306
```python { .api }
307
class Perplexity(Metric):
308
"""Perplexity metric for language models."""
309
310
def __init__(self, dim=-1): ...
311
312
def reset(self): ...
313
def accumulate(self, learn): ...
314
@property
315
def value(self): ...
316
317
class BLEU:
318
"""BLEU score for text generation."""
319
320
def __init__(self, n_gram=4, weights=None): ...
321
322
def __call__(self, pred_tokens, targ_tokens): ...
323
```