or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-context-rdds.md index.md legacy-mllib.md machine-learning.md pandas-api.md resource-management.md sql-dataframes.md streaming.md

docs/machine-learning.md

0

# Machine Learning (ML)

1

2

Modern machine learning pipeline API providing estimators, transformers, and comprehensive algorithms for classification, regression, clustering, and feature processing. The ML package provides a high-level API built on DataFrames for constructing ML pipelines.

3

4

## Capabilities

5

6

### Pipeline Components

7

8

Core abstractions for building machine learning workflows.

9

10

```python { .api }

11

class Pipeline:

12

"""A simple pipeline that chains multiple Transformers and Estimators together."""

13

14

def __init__(self, stages=None):

15

"""

16

Initialize Pipeline.

17

18

Parameters:

19

- stages (list): List of pipeline stages (Transformers and Estimators)

20

"""

21

22

def fit(self, dataset, params=None):

23

"""

24

Fit the pipeline to training data.

25

26

Parameters:

27

- dataset (DataFrame): Training dataset

28

- params (dict): Additional parameters

29

30

Returns:

31

PipelineModel: Fitted pipeline model

32

"""

33

34

def setStages(self, value):

35

"""Set pipeline stages."""

36

37

class PipelineModel:

38

"""A fitted pipeline model."""

39

40

def transform(self, dataset):

41

"""

42

Transform the dataset using the fitted pipeline.

43

44

Parameters:

45

- dataset (DataFrame): Dataset to transform

46

47

Returns:

48

DataFrame: Transformed dataset

49

"""

50

51

def save(self, path):

52

"""Save the pipeline model to the given path."""

53

54

@classmethod

55

def load(cls, path):

56

"""Load a pipeline model from the given path."""

57

58

class Estimator:

59

"""Abstract class for estimators that can be fit on a DataFrame to produce a Model."""

60

61

def fit(self, dataset, params=None):

62

"""

63

Fit model to training data.

64

65

Parameters:

66

- dataset (DataFrame): Training dataset

67

- params (dict): Additional parameters

68

69

Returns:

70

Model: Fitted model

71

"""

72

73

class Transformer:

74

"""Abstract class for transformers that transform DataFrames into DataFrames."""

75

76

def transform(self, dataset):

77

"""

78

Transform the dataset.

79

80

Parameters:

81

- dataset (DataFrame): Dataset to transform

82

83

Returns:

84

DataFrame: Transformed dataset

85

"""

86

87

class Model:

88

"""Abstract class for models that are fitted by estimators."""

89

90

def transform(self, dataset):

91

"""

92

Transform the dataset using the fitted model.

93

94

Parameters:

95

- dataset (DataFrame): Dataset to transform

96

97

Returns:

98

DataFrame: Transformed dataset

99

"""

100

101

class Predictor(Estimator):

102

"""Base class for predictors that make predictions on feature vectors."""

103

104

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction"):

105

"""

106

Initialize Predictor.

107

108

Parameters:

109

- featuresCol (str): Features column name

110

- labelCol (str): Label column name

111

- predictionCol (str): Prediction column name

112

"""

113

114

class PredictionModel(Model):

115

"""Base class for prediction models."""

116

117

def predict(self, value):

118

"""Make a prediction on a single feature vector."""

119

120

def transform(self, dataset):

121

"""Transform dataset to include predictions."""

122

```

123

124

### Classification Algorithms

125

126

Supervised learning algorithms for classification tasks.

127

128

```python { .api }

129

class LogisticRegression(Predictor):

130

"""Logistic regression classifier."""

131

132

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

133

maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,

134

threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction",

135

standardization=True, weightCol=None, aggregationDepth=2, family="auto",

136

lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,

137

lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None, maxBlockSizeInMB=0.0):

138

"""

139

Initialize LogisticRegression.

140

141

Parameters:

142

- featuresCol (str): Features column name

143

- labelCol (str): Label column name

144

- predictionCol (str): Prediction column name

145

- maxIter (int): Maximum number of iterations

146

- regParam (float): Regularization parameter

147

- elasticNetParam (float): ElasticNet mixing parameter

148

- tol (float): Convergence tolerance

149

- fitIntercept (bool): Whether to fit intercept

150

- threshold (float): Binary classification threshold

151

- thresholds (list): Thresholds for multiclass classification

152

- probabilityCol (str): Probability column name

153

- rawPredictionCol (str): Raw prediction column name

154

- standardization (bool): Whether to standardize features

155

- weightCol (str): Weight column name

156

- aggregationDepth (int): Aggregation depth for treeAggregate

157

- family (str): Label distribution family used by the model ("auto", "binomial", or "multinomial")

158

- lowerBoundsOnCoefficients (Matrix): Lower bounds on coefficients

159

- upperBoundsOnCoefficients (Matrix): Upper bounds on coefficients

160

- lowerBoundsOnIntercepts (Vector): Lower bounds on intercepts

161

- upperBoundsOnIntercepts (Vector): Upper bounds on intercepts

162

- maxBlockSizeInMB (float): Maximum memory for stacking input data

163

"""

164

165

class DecisionTreeClassifier(Predictor):

166

"""Decision tree classifier."""

167

168

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

169

probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5,

170

maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,

171

cacheNodeIds=False, checkpointInterval=10, impurity="gini", seed=None,

172

weightCol=None, leafCol="", minWeightFractionPerNode=0.0):

173

"""

174

Initialize DecisionTreeClassifier.

175

176

Parameters:

177

- maxDepth (int): Maximum depth of tree

178

- maxBins (int): Maximum number of bins for discretizing continuous features

179

- minInstancesPerNode (int): Minimum number of instances each child must have

180

- minInfoGain (float): Minimum information gain for split

181

- maxMemoryInMB (int): Maximum memory in MB allocated to histogram aggregation

182

- cacheNodeIds (bool): Whether to cache node IDs

183

- checkpointInterval (int): Checkpoint interval

184

- impurity (str): Impurity measure ("gini" or "entropy")

185

- seed (int): Random seed

186

- weightCol (str): Weight column name

187

- leafCol (str): Leaf index column name

188

- minWeightFractionPerNode (float): Minimum weighted fraction of total weight

189

"""

190

191

class RandomForestClassifier(Predictor):

192

"""Random forest classifier."""

193

194

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

195

probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5,

196

maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,

197

cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20,

198

featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0, weightCol=None,

199

leafCol="", minWeightFractionPerNode=0.0, bootstrap=True):

200

"""

201

Initialize RandomForestClassifier.

202

203

Parameters:

204

- numTrees (int): Number of trees in the forest

205

- featureSubsetStrategy (str): Number of features to consider for splits

206

- subsamplingRate (float): Fraction of training data used for learning

207

- bootstrap (bool): Whether bootstrap samples are used when building trees

208

"""

209

210

class GBTClassifier(Predictor):

211

"""Gradient-boosted tree classifier."""

212

213

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

214

maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,

215

maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,

216

lossType="logistic", maxIter=20, stepSize=0.1, seed=None,

217

subsamplingRate=1.0, featureSubsetStrategy="all", validationTol=0.01,

218

validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0,

219

weightCol=None):

220

"""

221

Initialize GBTClassifier.

222

223

Parameters:

224

- lossType (str): Loss function type

225

- maxIter (int): Maximum number of iterations

226

- stepSize (float): Step size (learning rate) in (0, 1] for shrinking the contribution of each tree

227

- subsamplingRate (float): Fraction of training data used for learning

228

- featureSubsetStrategy (str): Number of features to consider for splits

229

- validationTol (float): Validation tolerance for early stopping

230

- validationIndicatorCol (str): Validation indicator column name

231

"""

232

233

class NaiveBayes(Predictor):

234

"""Naive Bayes classifier."""

235

236

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

237

probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0,

238

modelType="multinomial", thresholds=None, weightCol=None):

239

"""

240

Initialize NaiveBayes.

241

242

Parameters:

243

- smoothing (float): Smoothing parameter

244

- modelType (str): Model type ("multinomial", "complement", "bernoulli", or "gaussian")

245

- thresholds (list): Per-class thresholds for multiclass classification, used to adjust the probability of predicting each class

246

- weightCol (str): Weight column name

247

"""

248

249

class LinearSVC(Predictor):

250

"""Linear Support Vector Classifier."""

251

252

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

253

rawPredictionCol="rawPrediction", maxIter=100, regParam=0.0, tol=1e-6,

254

fitIntercept=True, standardization=True, threshold=0.0, weightCol=None,

255

aggregationDepth=2, blockSize=1):

256

"""

257

Initialize LinearSVC.

258

259

Parameters:

260

- maxIter (int): Maximum number of iterations

261

- regParam (float): Regularization parameter

262

- tol (float): Convergence tolerance

263

- fitIntercept (bool): Whether to fit intercept

264

- standardization (bool): Whether to standardize features

265

- threshold (float): Classification threshold

266

- weightCol (str): Weight column name

267

- aggregationDepth (int): Aggregation depth for treeAggregate

268

- blockSize (int): Block size for stacking input data

269

"""

270

271

class MultilayerPerceptronClassifier(Predictor):

272

"""Multilayer perceptron classifier."""

273

274

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

275

maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03,

276

solver="l-bfgs", initialWeights=None, probabilityCol="probability",

277

rawPredictionCol="rawPrediction"):

278

"""

279

Initialize MultilayerPerceptronClassifier.

280

281

Parameters:

282

- maxIter (int): Maximum number of iterations

283

- tol (float): Convergence tolerance

284

- seed (int): Random seed

285

- layers (list): Sizes of layers from input to output

286

- blockSize (int): Block size for stacking input data

287

- stepSize (float): Step size for gradient descent

288

- solver (str): Solver algorithm ("l-bfgs" or "gd")

289

- initialWeights (Vector): Initial weights

290

- probabilityCol (str): Probability column name

291

- rawPredictionCol (str): Raw prediction column name

292

"""

293

```

294

295

### Regression Algorithms

296

297

Supervised learning algorithms for regression tasks.

298

299

```python { .api }

300

class LinearRegression(Predictor):

301

"""Linear regression."""

302

303

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

304

maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,

305

standardization=True, solver="auto", weightCol=None, aggregationDepth=2,

306

loss="squaredError", epsilon=1.35):

307

"""

308

Initialize LinearRegression.

309

310

Parameters:

311

- maxIter (int): Maximum number of iterations

312

- regParam (float): Regularization parameter

313

- elasticNetParam (float): ElasticNet mixing parameter

314

- tol (float): Convergence tolerance

315

- fitIntercept (bool): Whether to fit intercept

316

- standardization (bool): Whether to standardize features

317

- solver (str): Solver algorithm ("auto", "normal", "l-bfgs")

318

- weightCol (str): Weight column name

319

- aggregationDepth (int): Aggregation depth for treeAggregate

320

- loss (str): Loss function ("squaredError" or "huber")

321

- epsilon (float): Shape parameter for Huber loss

322

"""

323

324

class DecisionTreeRegressor(Predictor):

325

"""Decision tree regressor."""

326

327

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

328

maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,

329

maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,

330

impurity="variance", seed=None, varianceCol=None, weightCol=None,

331

leafCol="", minWeightFractionPerNode=0.0):

332

"""

333

Initialize DecisionTreeRegressor.

334

335

Parameters:

336

- impurity (str): Impurity measure ("variance")

337

- varianceCol (str): Variance column name

338

"""

339

340

class RandomForestRegressor(Predictor):

341

"""Random forest regressor."""

342

343

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

344

maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,

345

maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,

346

impurity="variance", numTrees=20, featureSubsetStrategy="auto", seed=None,

347

subsamplingRate=1.0, leafCol="", minWeightFractionPerNode=0.0,

348

weightCol=None, bootstrap=True):

349

"""Initialize RandomForestRegressor."""

350

351

class GBTRegressor(Predictor):

352

"""Gradient-boosted tree regressor."""

353

354

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

355

maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,

356

maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,

357

lossType="squared", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0,

358

featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None,

359

leafCol="", minWeightFractionPerNode=0.0, weightCol=None):

360

"""Initialize GBTRegressor."""

361

362

class IsotonicRegression(Estimator):

363

"""Isotonic regression."""

364

365

def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

366

weightCol=None, isotonic=True, featureIndex=0):

367

"""

368

Initialize IsotonicRegression.

369

370

Parameters:

371

- isotonic (bool): Whether the output sequence should be isotonic/increasing (True) or antitonic/decreasing (False)

372

- featureIndex (int): Index of the feature to use if featuresCol is a vector

373

"""

374

```

375

376

### Clustering Algorithms

377

378

Unsupervised learning algorithms for clustering tasks.

379

380

```python { .api }

381

class KMeans(Estimator):

382

"""K-means clustering."""

383

384

def __init__(self, featuresCol="features", predictionCol="prediction", k=2,

385

initMode="k-means||", initSteps=2, tol=1e-4, maxIter=100, seed=None,

386

distanceMeasure="euclidean", weightCol=None):

387

"""

388

Initialize KMeans.

389

390

Parameters:

391

- k (int): Number of clusters

392

- initMode (str): Initialization algorithm ("k-means||" or "random")

393

- initSteps (int): Number of steps for k-means|| initialization

394

- tol (float): Convergence tolerance

395

- maxIter (int): Maximum number of iterations

396

- seed (int): Random seed

397

- distanceMeasure (str): Distance measure ("euclidean" or "cosine")

398

- weightCol (str): Weight column name

399

"""

400

401

class BisectingKMeans(Estimator):

402

"""Bisecting k-means clustering."""

403

404

def __init__(self, featuresCol="features", predictionCol="prediction", k=4,

405

maxIter=20, seed=None, minDivisibleClusterSize=1.0, distanceMeasure="euclidean"):

406

"""

407

Initialize BisectingKMeans.

408

409

Parameters:

410

- k (int): Number of clusters

411

- maxIter (int): Maximum number of iterations

412

- seed (int): Random seed

413

- minDivisibleClusterSize (float): Minimum divisible cluster size

414

- distanceMeasure (str): Distance measure

415

"""

416

417

class GaussianMixture(Estimator):

418

"""Gaussian Mixture Model."""

419

420

def __init__(self, featuresCol="features", predictionCol="prediction", k=2,

421

probabilityCol="probability", tol=0.01, maxIter=100, seed=None,

422

aggregationDepth=2, weightCol=None):

423

"""

424

Initialize GaussianMixture.

425

426

Parameters:

427

- k (int): Number of components

428

- probabilityCol (str): Probability column name

429

- tol (float): Convergence tolerance

430

- maxIter (int): Maximum number of iterations

431

- seed (int): Random seed

432

- aggregationDepth (int): Aggregation depth for treeAggregate

433

- weightCol (str): Weight column name

434

"""

435

436

class LDA(Estimator):

437

"""Latent Dirichlet Allocation."""

438

439

def __init__(self, featuresCol="features", maxIter=100, seed=None, checkpointInterval=10,

440

k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,

441

subsamplingRate=0.05, optimizeDocConcentration=True, docConcentration=None,

442

topicConcentration=None, topicDistributionCol="topicDistribution",

443

keepLastCheckpoint=True):

444

"""

445

Initialize LDA.

446

447

Parameters:

448

- k (int): Number of topics

449

- optimizer (str): Optimizer ("online" or "em")

450

- learningOffset (float): Learning offset for online optimizer

451

- learningDecay (float): Learning decay rate

452

- subsamplingRate (float): Subsampling rate for online optimizer

453

- optimizeDocConcentration (bool): Whether to optimize document concentration

454

- docConcentration (Vector): Document concentration parameters

455

- topicConcentration (float): Topic concentration parameter

456

- topicDistributionCol (str): Topic distribution column name

457

- keepLastCheckpoint (bool): Whether to keep last checkpoint

458

"""

459

```

460

461

### Feature Processing

462

463

Transformers for feature engineering and preprocessing.

464

465

```python { .api }

466

class VectorAssembler(Transformer):

467

"""Combine multiple columns into a vector column."""

468

469

def __init__(self, inputCols=None, outputCol=None, handleInvalid="error"):

470

"""

471

Initialize VectorAssembler.

472

473

Parameters:

474

- inputCols (list): Input column names

475

- outputCol (str): Output column name

476

- handleInvalid (str): How to handle invalid data ("error", "skip", "keep")

477

"""

478

479

class StandardScaler(Estimator):

480

"""Standardize features by removing mean and scaling to unit variance."""

481

482

def __init__(self, inputCol=None, outputCol=None, withMean=False, withStd=True):

483

"""

484

Initialize StandardScaler.

485

486

Parameters:

487

- inputCol (str): Input column name

488

- outputCol (str): Output column name

489

- withMean (bool): Whether to center data with mean

490

- withStd (bool): Whether to scale to unit standard deviation

491

"""

492

493

class MinMaxScaler(Estimator):

494

"""Transform features by scaling to a given range."""

495

496

def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None):

497

"""

498

Initialize MinMaxScaler.

499

500

Parameters:

501

- min (float): Lower bound after transformation

502

- max (float): Upper bound after transformation

503

- inputCol (str): Input column name

504

- outputCol (str): Output column name

505

"""

506

507

class StringIndexer(Estimator):

508

"""Encode string labels to label indices."""

509

510

def __init__(self, inputCol=None, outputCol=None, inputCols=None, outputCols=None,

511

handleInvalid="error", stringOrderType="frequencyDesc"):

512

"""

513

Initialize StringIndexer.

514

515

Parameters:

516

- inputCol (str): Input column name

517

- outputCol (str): Output column name

518

- inputCols (list): Input column names

519

- outputCols (list): Output column names

520

- handleInvalid (str): How to handle invalid data

521

- stringOrderType (str): How to order labels ("frequencyDesc", "frequencyAsc", "alphabetDesc", "alphabetAsc")

522

"""

523

524

class IndexToString(Transformer):

525

"""Map label indices back to label strings."""

526

527

def __init__(self, inputCol=None, outputCol=None, labels=None, inputCols=None,

528

outputCols=None):

529

"""

530

Initialize IndexToString.

531

532

Parameters:

533

- inputCol (str): Input column name

534

- outputCol (str): Output column name

535

- labels (list): Ordered list of labels

536

- inputCols (list): Input column names

537

- outputCols (list): Output column names

538

"""

539

540

class OneHotEncoder(Estimator):

541

"""One-hot encode categorical features."""

542

543

def __init__(self, inputCols=None, outputCols=None, dropLast=True, handleInvalid="error",

544

inputCol=None, outputCol=None):

545

"""

546

Initialize OneHotEncoder.

547

548

Parameters:

549

- inputCols (list): Input column names

550

- outputCols (list): Output column names

551

- dropLast (bool): Whether to drop the last category

552

- handleInvalid (str): How to handle invalid data

553

- inputCol (str): Input column name (deprecated)

554

- outputCol (str): Output column name (deprecated)

555

"""

556

557

class PCA(Estimator):

558

"""Principal component analysis dimensionality reduction."""

559

560

def __init__(self, k=None, inputCol=None, outputCol=None):

561

"""

562

Initialize PCA.

563

564

Parameters:

565

- k (int): Number of principal components

566

- inputCol (str): Input column name

567

- outputCol (str): Output column name

568

"""

569

570

class Word2Vec(Estimator):

571

"""Word2Vec transforms a dataset of text documents to vectors."""

572

573

def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,

574

maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5,

575

maxSentenceLength=1000):

576

"""

577

Initialize Word2Vec.

578

579

Parameters:

580

- vectorSize (int): Dimension of the vector that each word is mapped to

581

- minCount (int): Minimum number of times a token must appear

582

- numPartitions (int): Number of partitions for sentences

583

- stepSize (float): Step size for gradient descent

584

- maxIter (int): Maximum number of iterations

585

- seed (int): Random seed

586

- inputCol (str): Input column name

587

- outputCol (str): Output column name

588

- windowSize (int): Context window size (words within [-windowSize, windowSize] of the target word)

589

- maxSentenceLength (int): Maximum sentence length

590

"""

591

592

class CountVectorizer(Estimator):

593

"""Convert text documents to vectors of token counts."""

594

595

def __init__(self, inputCol=None, outputCol=None, vocabSize=1 << 18, minDF=1.0,

596

maxDF=None, minTF=1.0, binary=False):

597

"""

598

Initialize CountVectorizer.

599

600

Parameters:

601

- inputCol (str): Input column name

602

- outputCol (str): Output column name

603

- vocabSize (int): Maximum vocabulary size

604

- minDF (float): Minimum document frequency

605

- maxDF (float): Maximum document frequency

606

- minTF (float): Minimum term frequency

607

- binary (bool): Binary toggle to control term frequency counts

608

"""

609

610

class IDF(Estimator):

611

"""Compute Inverse Document Frequency (IDF) for TF-IDF."""

612

613

def __init__(self, inputCol=None, outputCol=None, minDocFreq=0):

614

"""

615

Initialize IDF.

616

617

Parameters:

618

- inputCol (str): Input column name

619

- outputCol (str): Output column name

620

- minDocFreq (int): Minimum document frequency

621

"""

622

```

623

624

### Model Evaluation

625

626

Evaluation metrics for assessing model performance.

627

628

```python { .api }

629

class Evaluator:

630

"""Base class for evaluators."""

631

632

def evaluate(self, dataset, params=None):

633

"""

634

Evaluate the dataset and return a scalar metric.

635

636

Parameters:

637

- dataset (DataFrame): Dataset to evaluate

638

- params (dict): Additional parameters

639

640

Returns:

641

float: Evaluation metric

642

"""

643

644

class BinaryClassificationEvaluator(Evaluator):

645

"""Evaluator for binary classification."""

646

647

def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",

648

metricName="areaUnderROC", weightCol=None, numBins=1000):

649

"""

650

Initialize BinaryClassificationEvaluator.

651

652

Parameters:

653

- rawPredictionCol (str): Raw prediction column name

654

- labelCol (str): Label column name

655

- metricName (str): Metric name ("areaUnderROC" or "areaUnderPR")

656

- weightCol (str): Weight column name

657

- numBins (int): Number of bins used to down-sample the ROC and PR curves when computing the area

658

"""

659

660

class MulticlassClassificationEvaluator(Evaluator):

661

"""Evaluator for multiclass classification."""

662

663

def __init__(self, predictionCol="prediction", labelCol="label", metricName="f1",

664

metricLabel=0.0, beta=1.0, probabilityCol="probability", eps=1e-15):

665

"""

666

Initialize MulticlassClassificationEvaluator.

667

668

Parameters:

669

- predictionCol (str): Prediction column name

670

- labelCol (str): Label column name

671

- metricName (str): Metric name ("f1", "accuracy", "weightedPrecision", etc.)

672

- metricLabel (float): Label for metric calculation

673

- beta (float): Beta value for F-beta score

674

- probabilityCol (str): Probability column name

675

- eps (float): Epsilon value to avoid division by zero

676

"""

677

678

class RegressionEvaluator(Evaluator):

679

"""Evaluator for regression."""

680

681

def __init__(self, predictionCol="prediction", labelCol="label", metricName="rmse",

682

weightCol=None, throughOrigin=False):

683

"""

684

Initialize RegressionEvaluator.

685

686

Parameters:

687

- predictionCol (str): Prediction column name

688

- labelCol (str): Label column name

689

- metricName (str): Metric name ("rmse", "mse", "r2", "mae", "var")

690

- weightCol (str): Weight column name

691

- throughOrigin (bool): Whether the regression is through the origin (affects how r2 is computed)

692

"""

693

694

class ClusteringEvaluator(Evaluator):

695

"""Evaluator for clustering."""

696

697

def __init__(self, predictionCol="prediction", featuresCol="features",

698

metricName="silhouette", distanceMeasure="squaredEuclidean",

699

weightCol=None):

700

"""

701

Initialize ClusteringEvaluator.

702

703

Parameters:

704

- predictionCol (str): Prediction column name

705

- featuresCol (str): Features column name

706

- metricName (str): Metric name ("silhouette")

707

- distanceMeasure (str): Distance measure ("squaredEuclidean" or "cosine")

708

- weightCol (str): Weight column name

709

"""

710

```

711

712

### Hyperparameter Tuning

713

714

Tools for hyperparameter optimization and model selection.

715

716

```python { .api }

717

class ParamGridBuilder:

718

"""Builder for a param grid used in grid search-based model selection."""

719

720

def __init__(self):

721

"""Initialize ParamGridBuilder."""

722

723

def addGrid(self, param, values):

724

"""

725

Add parameter values to the grid.

726

727

Parameters:

728

- param (Param): Parameter to tune

729

- values (list): List of parameter values

730

731

Returns:

732

ParamGridBuilder

733

"""

734

735

def build(self):

736

"""

737

Build and return the parameter grid.

738

739

Returns:

740

list: List of parameter maps

741

"""

742

743

class CrossValidator(Estimator):

744

"""K-fold cross validation."""

745

746

def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None,

747

numFolds=3, seed=None, parallelism=1, collectSubModels=False,

748

foldCol=""):

749

"""

750

Initialize CrossValidator.

751

752

Parameters:

753

- estimator (Estimator): Estimator to cross-validate

754

- estimatorParamMaps (list): Parameter maps to evaluate

755

- evaluator (Evaluator): Evaluator for model selection

756

- numFolds (int): Number of folds for cross validation

757

- seed (int): Random seed

758

- parallelism (int): Number of threads to use for fitting models

759

- collectSubModels (bool): Whether to collect sub-models

760

- foldCol (str): Fold column name

761

"""

762

763

class TrainValidationSplit(Estimator):

764

"""Train-validation split for model selection."""

765

766

def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None,

767

trainRatio=0.75, seed=None, parallelism=1, collectSubModels=False):

768

"""

769

Initialize TrainValidationSplit.

770

771

Parameters:

772

- estimator (Estimator): Estimator to tune

773

- estimatorParamMaps (list): Parameter maps to evaluate

774

- evaluator (Evaluator): Evaluator for model selection

775

- trainRatio (float): Ratio of training data

776

- seed (int): Random seed

777

- parallelism (int): Number of threads to use for fitting models

778

- collectSubModels (bool): Whether to collect sub-models

779

"""

780

```

781

782

## Types

783

784

```python { .api }

785

from pyspark.ml.linalg import Vector, DenseVector, SparseVector, Vectors

786

from pyspark.ml.linalg import Matrix, DenseMatrix, SparseMatrix, Matrices

787

788

class Vector:

789

"""Abstract base class for ML vector types."""

790

791

def toArray(self):

792

"""Convert to numpy array."""

793

794

class DenseVector(Vector):

795

"""Dense vector representation."""

796

797

def __init__(self, ar):

798

"""Create from array-like object."""

799

800

class SparseVector(Vector):

801

"""Sparse vector representation."""

802

803

def __init__(self, size, *args):

804

"""Create sparse vector."""

805

806

class Vectors:

807

"""Factory methods for creating vectors."""

808

809

@staticmethod

810

def dense(*values):

811

"""Create dense vector."""

812

813

@staticmethod

814

def sparse(size, *args):

815

"""Create sparse vector."""

816

817

class Matrix:

818

"""Abstract base class for ML matrix types."""

819

820

def numRows(self):

821

"""Number of rows."""

822

823

def numCols(self):

824

"""Number of columns."""

825

826

class DenseMatrix(Matrix):

827

"""Dense matrix representation."""

828

829

class SparseMatrix(Matrix):

830

"""Sparse matrix representation in CSC format."""

831

832

class Matrices:

833

"""Factory methods for creating matrices."""

834

835

@staticmethod

836

def dense(numRows, numCols, values):

837

"""Create dense matrix."""

838

839

@staticmethod

840

def sparse(numRows, numCols, colPtrs, rowIndices, values):

841

"""Create sparse matrix."""

842

```