or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md · clustering.md · evaluation-tuning.md · feature-engineering.md · index.md · linear-algebra.md · pipeline-components.md · recommendation.md · regression.md

docs/feature-engineering.md

# Feature Engineering

Comprehensive data preprocessing and feature transformation utilities for preparing raw data for machine learning algorithms, including text processing, categorical encoding, and numerical scaling.

## Capabilities

### Vector Assembly and Manipulation

Core utilities for assembling feature vectors and manipulating vector data structures.

```scala { .api }
/**
 * Combines multiple columns into a single vector column
 */
class VectorAssembler extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
  def setHandleInvalid(value: String): this.type
}

/**
 * Extracts elements from vectors by indices or names
 */
class VectorSlicer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setIndices(value: Array[Int]): this.type
  def setNames(value: Array[String]): this.type
}

/**
 * Indexes categorical features in vector columns
 */
class VectorIndexer extends Estimator[VectorIndexerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMaxCategories(value: Int): this.type
  def setHandleInvalid(value: String): this.type
}

class VectorIndexerModel extends Model[VectorIndexerModel] with VectorIndexerParams {
  def numFeatures: Int
  def categoryMaps: Map[Int, Map[Double, Int]]
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.feature.VectorAssembler

val assembler = new VectorAssembler()
  .setInputCols(Array("hour", "mobile", "userFeatures"))
  .setOutputCol("features")

val output = assembler.transform(dataset)
output.select("features").show()
```

### Scaling and Normalization

Feature scaling transformations for normalizing numerical data distributions.

```scala { .api }
/**
 * Standardizes features by removing mean and scaling to unit variance
 */
class StandardScaler extends Estimator[StandardScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setWithMean(value: Boolean): this.type
  def setWithStd(value: Boolean): this.type
}

class StandardScalerModel extends Model[StandardScalerModel] with StandardScalerParams {
  def mean: Vector
  def std: Vector
}

/**
 * Rescales features to a common range [min, max]
 */
class MinMaxScaler extends Estimator[MinMaxScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMin(value: Double): this.type
  def setMax(value: Double): this.type
}

class MinMaxScalerModel extends Model[MinMaxScalerModel] with MinMaxScalerParams {
  def originalMin: Vector
  def originalMax: Vector
}

/**
 * Scales features by the maximum absolute value
 */
class MaxAbsScaler extends Estimator[MaxAbsScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class MaxAbsScalerModel extends Model[MaxAbsScalerModel] with MaxAbsScalerParams {
  def maxAbs: Vector
}

/**
 * Normalizes vectors to have unit norm
 */
class Normalizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setP(value: Double): this.type
}

/**
 * Robust scaling using quantiles, less sensitive to outliers
 */
class RobustScaler extends Estimator[RobustScalerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLower(value: Double): this.type
  def setUpper(value: Double): this.type
  def setWithCentering(value: Boolean): this.type
  def setWithScaling(value: Boolean): this.type
  def setRelativeError(value: Double): this.type
}

class RobustScalerModel extends Model[RobustScalerModel] with RobustScalerParams {
  def median: Vector
  def range: Vector
}
```

### Categorical Feature Processing

Transformations for handling categorical data including encoding and indexing.

```scala { .api }
/**
 * Maps string labels to numeric indices
 */
class StringIndexer extends Estimator[StringIndexerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setHandleInvalid(value: String): this.type
  def setStringOrderType(value: String): this.type
}

class StringIndexerModel extends Model[StringIndexerModel] with StringIndexerParams {
  def labels: Array[String]
  def labelsArray: Array[Array[String]]
}

/**
 * Maps numeric indices back to string labels
 */
class IndexToString extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setLabels(value: Array[String]): this.type
}

/**
 * One-hot encoding for categorical features
 */
class OneHotEncoder extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setDropLast(value: Boolean): this.type
  def setHandleInvalid(value: String): this.type
}
```

**Usage Example:**

```scala
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoder}
import org.apache.spark.ml.Pipeline

val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

val encoder = new OneHotEncoder()
  .setInputCols(Array("categoryIndex"))
  .setOutputCols(Array("categoryVec"))

val pipeline = new Pipeline()
  .setStages(Array(indexer, encoder))

val model = pipeline.fit(df)
model.transform(df).show()
```

### Text Processing

Comprehensive text processing utilities for natural language processing tasks.

```scala { .api }
/**
 * Tokenizes text into individual words
 */
class Tokenizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

/**
 * Advanced tokenization using regular expressions
 */
class RegexTokenizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setPattern(value: String): this.type
  def setGaps(value: Boolean): this.type
  def setToLowercase(value: Boolean): this.type
  def setMinTokenLength(value: Int): this.type
}

/**
 * Removes common stop words from text
 */
class StopWordsRemover extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setStopWords(value: Array[String]): this.type
  def setCaseSensitive(value: Boolean): this.type
  def setLocale(value: String): this.type
}

/**
 * Generates n-grams from sequences of tokens
 */
class NGram extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setN(value: Int): this.type
}

/**
 * Term frequency using hashing trick
 */
class HashingTF extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setNumFeatures(value: Int): this.type
  def setBinary(value: Boolean): this.type
}

/**
 * Inverse document frequency weighting
 */
class IDF extends Estimator[IDFModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setMinDocFreq(value: Int): this.type
}

class IDFModel extends Model[IDFModel] with IDFParams {
  def idf: Vector
}

/**
 * Count-based vectorization of text documents
 */
class CountVectorizer extends Estimator[CountVectorizerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVocabSize(value: Int): this.type
  def setMinDF(value: Double): this.type
  def setMaxDF(value: Double): this.type
  def setMinTF(value: Double): this.type
  def setBinary(value: Boolean): this.type
}

class CountVectorizerModel extends Model[CountVectorizerModel] with CountVectorizerParams {
  def vocabulary: Array[String]
}

/**
 * Word2Vec for learning vector representations of words
 */
class Word2Vec extends Estimator[Word2VecModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setVectorSize(value: Int): this.type
  def setMinCount(value: Int): this.type
  def setNumPartitions(value: Int): this.type
  def setStepSize(value: Double): this.type
  def setMaxIter(value: Int): this.type
  def setSeed(value: Long): this.type
  def setWindowSize(value: Int): this.type
  def setMaxSentenceLength(value: Int): this.type
}

class Word2VecModel extends Model[Word2VecModel] with Word2VecParams {
  def getVectors: DataFrame
  def findSynonyms(word: String, num: Int): DataFrame
  def findSynonymsArray(word: String, num: Int): Array[(String, Double)]
  def transform(word: String): Vector
}
```

### Feature Selection

Statistical methods for selecting most relevant features for machine learning models.

```scala { .api }
/**
 * Univariate feature selection using statistical tests
 */
class UnivariateFeatureSelector extends Estimator[UnivariateFeatureSelectorModel] {
  def setFeatureType(value: String): this.type
  def setLabelType(value: String): this.type
  def setSelectionMode(value: String): this.type
  def setSelectionThreshold(value: Double): this.type
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class UnivariateFeatureSelectorModel extends Model[UnivariateFeatureSelectorModel] with UnivariateFeatureSelectorParams {
  def selectedFeatures: Array[Int]
}

/**
 * Feature selection based on variance threshold
 */
class VarianceThresholdSelector extends Estimator[VarianceThresholdSelectorModel] {
  def setVarianceThreshold(value: Double): this.type
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class VarianceThresholdSelectorModel extends Model[VarianceThresholdSelectorModel] with VarianceThresholdSelectorParams {
  def selectedFeatures: Array[Int]
}

/**
 * Chi-square feature selection for categorical data
 */
class ChiSqSelector extends Estimator[ChiSqSelectorModel] {
  def setNumTopFeatures(value: Int): this.type
  def setPercentile(value: Double): this.type
  def setFpr(value: Double): this.type
  def setFdr(value: Double): this.type
  def setFwe(value: Double): this.type
  def setSelectorType(value: String): this.type
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
}

class ChiSqSelectorModel extends Model[ChiSqSelectorModel] with ChiSqSelectorParams {
  def selectedFeatures: Array[Int]
}
```

### Dimensionality Reduction

Algorithms for reducing the number of features while preserving important information.

```scala { .api }
/**
 * Principal Component Analysis for dimensionality reduction
 */
class PCA extends Estimator[PCAModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setK(value: Int): this.type
}

class PCAModel extends Model[PCAModel] with PCAParams {
  def pc: Matrix
  def explainedVariance: Vector
}
```

### Missing Value Handling

Utilities for handling missing or invalid data values.

```scala { .api }
/**
 * Imputes missing values using mean, median, or mode
 */
class Imputer extends Estimator[ImputerModel] {
  def setInputCols(value: Array[String]): this.type
  def setOutputCols(value: Array[String]): this.type
  def setStrategy(value: String): this.type
  def setMissingValue(value: Double): this.type
  def setRelativeError(value: Double): this.type
}

class ImputerModel extends Model[ImputerModel] with ImputerParams {
  def surrogateDF: DataFrame
}
```

### Advanced Transformations

Specialized transformations for complex feature engineering tasks.

```scala { .api }
/**
 * Applies SQL transformation to DataFrames
 */
class SQLTransformer extends Transformer {
  def setStatement(value: String): this.type
}

/**
 * Quantile-based discretization of continuous features
 */
class QuantileDiscretizer extends Estimator[QuantileDiscretizerModel] {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setNumBuckets(value: Int): this.type
  def setRelativeError(value: Double): this.type
  def setHandleInvalid(value: String): this.type
}

class QuantileDiscretizerModel extends Model[QuantileDiscretizerModel] with QuantileDiscretizerParams {
  def getSplits: Array[Double]
}

/**
 * Maps continuous features to categorical buckets
 */
class Bucketizer extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setSplits(value: Array[Double]): this.type
  def setHandleInvalid(value: String): this.type
}

/**
 * Feature hashing for high-dimensional categorical data
 */
class FeatureHasher extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
  def setNumFeatures(value: Int): this.type
  def setCategoricalCols(value: Array[String]): this.type
}

/**
 * Creates interaction features between input columns
 */
class Interaction extends Transformer {
  def setInputCols(value: Array[String]): this.type
  def setOutputCol(value: String): this.type
}

/**
 * Expands features to polynomial space
 */
class PolynomialExpansion extends Transformer {
  def setInputCol(value: String): this.type
  def setOutputCol(value: String): this.type
  def setDegree(value: Int): this.type
}
```

## Types

```scala { .api }
// Feature engineering imports
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.{Vector, Matrix}
import org.apache.spark.sql.{DataFrame, Dataset}

// Parameter traits
import org.apache.spark.ml.param.shared._

// Text processing utilities
import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer, StopWordsRemover}

// Vector utilities
import org.apache.spark.ml.feature.{VectorAssembler, VectorSlicer, VectorIndexer}

// Scaling utilities
import org.apache.spark.ml.feature.{StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer, RobustScaler}
```