or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

callbacks.md collaborative-filtering.md core-training.md data-loading.md index.md interpretation.md medical.md metrics-losses.md tabular.md text.md vision.md

docs/text.md

0

# Natural Language Processing

1

2

Comprehensive text processing and NLP capabilities including language models, text classification, tokenization, and specialized data processing for text tasks.

3

4

## Capabilities

5

6

### Text Learners

7

8

Main entry points for creating text models including language models and classifiers.

9

10

```python { .api }

11

def language_model_learner(dls, arch, config=None, drop_mult=1.0, pretrained=True,

12

pretrained_fnames=None, **kwargs):

13

"""

14

Create a language model learner.

15

16

Parameters:

17

- dls: Text DataLoaders with language modeling setup

18

- arch: Model architecture (AWD_LSTM, etc.)

19

- config: Model configuration dictionary

20

- drop_mult: Dropout multiplier

21

- pretrained: Use pre-trained weights

22

- pretrained_fnames: Custom pre-trained filenames

23

- **kwargs: Additional learner arguments

24

25

Returns:

26

- Learner instance for language modeling

27

"""

28

29

def text_classifier_learner(dls, arch, seq_len=72, config=None, backwards=False,

30

pretrained=True, drop_mult=0.5, n_out=None, lin_ftrs=None,

31

ps=None, max_len=72*20, y_range=None, **kwargs):

32

"""

33

Create a text classifier learner.

34

35

Parameters:

36

- dls: Text DataLoaders with classification setup

37

- arch: Model architecture (AWD_LSTM, etc.)

38

- seq_len: Sequence length for classification

39

- config: Model configuration

40

- backwards: Use backwards language model

41

- pretrained: Use pre-trained language model

42

- drop_mult: Dropout multiplier

43

- n_out: Number of output classes

44

- lin_ftrs: Linear layer features

45

- ps: Dropout probabilities for linear layers

46

- max_len: Maximum sequence length

47

- y_range: Range for regression outputs

48

49

Returns:

50

- Learner instance for text classification

51

"""

52

53

class TextLearner(Learner):

54

"""Base learner class for text tasks."""

55

56

def predict(self, text, n_words=1, no_unk=True, temperature=1.0,

57

min_p=None, no_bar=False, decoder=decode_spec_tokens):

58

"""Predict next words in text."""

59

60

def show_results(self, ds_idx=1, dl=None, max_n=10, **kwargs):

61

"""Show model predictions on dataset."""

62

63

class LMLearner(TextLearner):

64

"""Language model learner with specialized methods."""

65

66

def save_encoder(self, file):

67

"""Save encoder for transfer learning."""

68

69

def load_encoder(self, file, device=None):

70

"""Load encoder from language model."""

71

```

72

73

### Text Data Processing

74

75

Specialized data loaders and processing for text datasets.

76

77

```python { .api }

78

class TextDataLoaders(DataLoaders):

79

"""DataLoaders for text datasets."""

80

81

@classmethod

82

def from_folder(cls, path, train='train', valid='valid', valid_pct=None,

83

seed=None, vocab=None, tok_tfm=None, seq_len=72,

84

backwards=False, **kwargs):

85

"""

86

Create TextDataLoaders from folder structure.

87

88

Parameters:

89

- path: Path to text data

90

- train: Training folder name

91

- valid: Validation folder name

92

- valid_pct: Validation percentage

93

- seed: Random seed

94

- vocab: Vocabulary object

95

- tok_tfm: Tokenization transform

96

- seq_len: Sequence length

97

- backwards: Process text backwards

98

99

Returns:

100

- TextDataLoaders instance

101

"""

102

103

@classmethod

104

def from_csv(cls, path, csv_name='texts.csv', header='infer', delimiter=None,

105

text_col='text', label_col='label', valid_col=None, **kwargs):

106

"""Create from CSV file."""

107

108

@classmethod

109

def from_df(cls, df, path='.', text_col='text', label_col='label',

110

valid_col=None, **kwargs):

111

"""Create from pandas DataFrame."""

112

113

class TextBlock(TransformBlock):

114

"""Transform block for text data."""

115

116

def __init__(self, tok_tfm, vocab=None, is_lm=False, seq_len=72,

117

backwards=False, min_freq=3, max_vocab=60000): ...

118

119

def TextDataLoaders.from_dsets(train_ds, valid_ds, path='.', **kwargs):

120

"""Create from text datasets."""

121

```

122

123

### Tokenization

124

125

Comprehensive tokenization support for different text processing approaches.

126

127

```python { .api }

128

class Tokenizer:

129

"""Base tokenizer class."""

130

131

def __init__(self, tok_func, rules=None, counter=None, lengths=None,

132

mode=None, sep=' '): ...

133

134

def __call__(self, items): ...

135

136

class WordTokenizer:

137

"""Word-level tokenization."""

138

139

def __init__(self, lang='en', rules=None, split_char=' ', **kwargs): ...

140

141

class SubwordTokenizer:

142

"""Subword tokenization (BPE, WordPiece, etc.)."""

143

144

def __init__(self, lang='en', cache_dir=None, model_path=None, **kwargs): ...

145

146

class SentencePieceTokenizer:

147

"""SentencePiece tokenizer integration."""

148

149

def __init__(self, lang='en', cache_dir=None, model_path=None, **kwargs): ...

150

151

def TokenizeWithRules(tok, rules, post_rules=None):

152

"""Apply tokenization with preprocessing rules."""

153

154

# Tokenization rules

155

def fix_html(x):

156

"""Fix HTML entities and formatting."""

157

158

def replace_rep(x):

159

"""Replace repetitions with special tokens."""

160

161

def replace_wrep(x):

162

"""Replace word repetitions."""

163

164

def spec_add_spaces(x):

165

"""Add spaces around special characters."""

166

167

def rm_useless_spaces(x):

168

"""Remove unnecessary spaces."""

169

170

def replace_all_caps(x):

171

"""Replace all-caps words with special tokens."""

172

173

def replace_maj(x):

174

"""Replace capitalized words with their lowercase form preceded by the TK_MAJ token."""

175

176

def lowercase(x, add_bos=True, add_eos=False):

177

"""Convert to lowercase with optional special tokens."""

178

```

179

180

### Text Models

181

182

Core model architectures for text processing tasks.

183

184

```python { .api }

185

class AWD_LSTM(nn.Module):

186

"""AWD-LSTM language model implementation."""

187

188

def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token=1,

189

hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5,

190

bidir=False): ...

191

192

def forward(self, input, from_embeddings=False): ...

193

194

def reset(self): ...

195

196

class LinearDecoder(nn.Module):

197

"""Linear decoder placed on top of an RNN encoder to form a language model."""

198

199

def __init__(self, n_out, n_hid, output_p, tie_encoder=None, bias=True): ...

200

201

class SentenceEncoder(nn.Module):

202

"""Encode sentences for classification."""

203

204

def __init__(self, bptt, max_len, module): ...

205

206

def get_language_model(arch, vocab_sz, config=None, drop_mult=1):

207

"""Create language model."""

208

209

def get_text_classifier(arch, vocab_sz, n_class, seq_len=72, config=None,

210

drop_mult=1, lin_ftrs=None, ps=None, y_range=None):

211

"""Create text classifier model."""

212

```

213

214

### Text Tensor Classes

215

216

Specialized tensor classes for text data.

217

218

```python { .api }

219

class TensorText(TensorBase):

220

"""Tensor subclass for text sequences."""

221

222

def __init__(self, x, **kwargs): ...

223

224

def show(self, ctx=None, **kwargs): ...

225

226

class LMTensorText(TensorText):

227

"""Tensor subclass for language model text."""

228

229

def show(self, ctx=None, **kwargs): ...

230

```

231

232

### Text Transforms

233

234

Data processing transforms specific to text.

235

236

```python { .api }

237

class Numericalize(Transform):

238

"""Convert text tokens to numeric IDs."""

239

240

def __init__(self, vocab=None, min_freq=3, max_vocab=60000, special_toks=None): ...

241

242

def setup(self, items=None, train_setup=True): ...

243

244

class Categorize(Transform):

245

"""Convert text labels to categories."""

246

247

def __init__(self, vocab=None, sort=True, add_na=False): ...

248

249

def make_vocab(count, min_freq=3, max_vocab=None, special_toks=None):

250

"""Create vocabulary from token counts."""

251

```

252

253

### Text Constants

254

255

Special tokens and constants used in text processing.

256

257

```python { .api }

258

# Special tokens

259

UNK = 'xxunk' # Unknown token

260

PAD = 'xxpad' # Padding token

261

BOS = 'xxbos' # Beginning of sequence

262

EOS = 'xxeos' # End of sequence

263

FLD = 'xxfld' # Field separator

264

TK_REP = 'xxrep' # Repetition token

265

TK_WREP = 'xxwrep' # Word repetition token

266

TK_UP = 'xxup' # Uppercase token

267

TK_MAJ = 'xxmaj' # Capitalization token (next word starts with a capital letter)

268

269

# Default special tokens list

270

defaults.text_spec_tok = [UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ]

271

272

# Text processing rules

273

defaults.text_proc_rules = [fix_html, replace_rep, replace_wrep, spec_add_spaces,

274

rm_useless_spaces, replace_all_caps, replace_maj, lowercase]

275

```

276

277

### Language Model Utilities

278

279

Utilities for working with language models and transfer learning.

280

281

```python { .api }

282

def language_model_learner(dls, arch, config=None, drop_mult=1., pretrained=True,

283

pretrained_fnames=None, **kwargs):

284

"""Create language model learner with pre-training support."""

285

286

def fine_tune_text_classifier_learner(dls, path, model_name='classifier',

287

arch=AWD_LSTM, **kwargs):

288

"""Fine-tune text classifier from language model."""

289

290

class LanguageModelLoader:

291

"""Load pre-trained language model weights."""

292

293

def __init__(self, path, backwards=False, model_cls=AWD_LSTM): ...

294

295

def convert_weights(wgts, stoi_wgts, itos_new):

296

"""Convert pre-trained weights to new vocabulary."""

297

298

def lm_config(arch):

299

"""Get default language model configuration for architecture."""

300

```

301

302

### Text Metrics

303

304

Specialized metrics for text tasks.

305

306

```python { .api }

307

class Perplexity(Metric):

308

"""Perplexity metric for language models."""

309

310

def __init__(self, dim=-1): ...

311

312

def reset(self): ...

313

def accumulate(self, learn): ...

314

@property

315

def value(self): ...

316

317

class BLEU:

318

"""BLEU score for text generation."""

319

320

def __init__(self, n_gram=4, weights=None): ...

321

322

def __call__(self, pred_tokens, targ_tokens): ...

323

```