0
# Machine Learning (ML)
1
2
Modern machine learning pipeline API providing estimators, transformers, and comprehensive algorithms for classification, regression, clustering, and feature processing. The ML package provides a high-level API built on DataFrames for constructing ML pipelines.
3
4
## Capabilities
5
6
### Pipeline Components
7
8
Core abstractions for building machine learning workflows.
9
10
```python { .api }
11
class Pipeline:
12
"""A simple pipeline that chains multiple Transformers and Estimators together."""
13
14
def __init__(self, stages=None):
15
"""
16
Initialize Pipeline.
17
18
Parameters:
19
- stages (list): List of pipeline stages (Transformers and Estimators)
20
"""
21
22
def fit(self, dataset, params=None):
23
"""
24
Fit the pipeline to training data.
25
26
Parameters:
27
- dataset (DataFrame): Training dataset
28
- params (dict): Additional parameters
29
30
Returns:
31
PipelineModel: Fitted pipeline model
32
"""
33
34
def setStages(self, value):
35
"""Set pipeline stages."""
36
37
class PipelineModel:
38
"""A fitted pipeline model."""
39
40
def transform(self, dataset):
41
"""
42
Transform the dataset using the fitted pipeline.
43
44
Parameters:
45
- dataset (DataFrame): Dataset to transform
46
47
Returns:
48
DataFrame: Transformed dataset
49
"""
50
51
def save(self, path):
52
"""Save the pipeline model to the given path."""
53
54
@classmethod
55
def load(cls, path):
56
"""Load a pipeline model from the given path."""
57
58
class Estimator:
59
"""Abstract class for estimators that can be fit on a DataFrame to produce a Model."""
60
61
def fit(self, dataset, params=None):
62
"""
63
Fit model to training data.
64
65
Parameters:
66
- dataset (DataFrame): Training dataset
67
- params (dict): Additional parameters
68
69
Returns:
70
Model: Fitted model
71
"""
72
73
class Transformer:
74
"""Abstract class for transformers that transform DataFrames into DataFrames."""
75
76
def transform(self, dataset):
77
"""
78
Transform the dataset.
79
80
Parameters:
81
- dataset (DataFrame): Dataset to transform
82
83
Returns:
84
DataFrame: Transformed dataset
85
"""
86
87
class Model:
88
"""Abstract class for models that are fitted by estimators."""
89
90
def transform(self, dataset):
91
"""
92
Transform the dataset using the fitted model.
93
94
Parameters:
95
- dataset (DataFrame): Dataset to transform
96
97
Returns:
98
DataFrame: Transformed dataset
99
"""
100
101
class Predictor(Estimator):
102
"""Base class for predictors that make predictions on feature vectors."""
103
104
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction"):
105
"""
106
Initialize Predictor.
107
108
Parameters:
109
- featuresCol (str): Features column name
110
- labelCol (str): Label column name
111
- predictionCol (str): Prediction column name
112
"""
113
114
class PredictionModel(Model):
115
"""Base class for prediction models."""
116
117
def predict(self, value):
118
"""Make a prediction on a single feature vector."""
119
120
def transform(self, dataset):
121
"""Transform dataset to include predictions."""
122
```
123
124
### Classification Algorithms
125
126
Supervised learning algorithms for classification tasks.
127
128
```python { .api }
129
class LogisticRegression(Predictor):
130
"""Logistic regression classifier."""
131
132
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
133
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
134
threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction",
135
standardization=True, weightCol=None, aggregationDepth=2, family="auto",
136
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
137
lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None, maxBlockSizeInMB=0.0):
138
"""
139
Initialize LogisticRegression.
140
141
Parameters:
142
- featuresCol (str): Features column name
143
- labelCol (str): Label column name
144
- predictionCol (str): Prediction column name
145
- maxIter (int): Maximum number of iterations
146
- regParam (float): Regularization parameter
147
- elasticNetParam (float): ElasticNet mixing parameter
148
- tol (float): Convergence tolerance
149
- fitIntercept (bool): Whether to fit intercept
150
- threshold (float): Binary classification threshold
151
- thresholds (list): Thresholds for multiclass classification
152
- probabilityCol (str): Probability column name
153
- rawPredictionCol (str): Raw prediction column name
154
- standardization (bool): Whether to standardize features
155
- weightCol (str): Weight column name
156
- aggregationDepth (int): Aggregation depth for treeAggregate
157
- family (str): Label distribution family used by the model ("auto", "binomial", or "multinomial")
158
- lowerBoundsOnCoefficients (Matrix): Lower bounds on coefficients
159
- upperBoundsOnCoefficients (Matrix): Upper bounds on coefficients
160
- lowerBoundsOnIntercepts (Vector): Lower bounds on intercepts
161
- upperBoundsOnIntercepts (Vector): Upper bounds on intercepts
162
- maxBlockSizeInMB (float): Maximum memory for stacking input data
163
"""
164
165
class DecisionTreeClassifier(Predictor):
166
"""Decision tree classifier."""
167
168
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
169
probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5,
170
maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
171
cacheNodeIds=False, checkpointInterval=10, impurity="gini", seed=None,
172
weightCol=None, leafCol="", minWeightFractionPerNode=0.0):
173
"""
174
Initialize DecisionTreeClassifier.
175
176
Parameters:
177
- maxDepth (int): Maximum depth of tree
178
- maxBins (int): Maximum number of bins for discretizing continuous features
179
- minInstancesPerNode (int): Minimum number of instances each child must have
180
- minInfoGain (float): Minimum information gain for split
181
- maxMemoryInMB (int): Maximum memory in MB allocated to histogram aggregation
182
- cacheNodeIds (bool): Whether to cache node IDs
183
- checkpointInterval (int): Checkpoint interval
184
- impurity (str): Impurity measure ("gini" or "entropy")
185
- seed (int): Random seed
186
- weightCol (str): Weight column name
187
- leafCol (str): Leaf index column name
188
- minWeightFractionPerNode (float): Minimum weighted fraction of total weight
189
"""
190
191
class RandomForestClassifier(Predictor):
192
"""Random forest classifier."""
193
194
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
195
probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5,
196
maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
197
cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20,
198
featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0, weightCol=None,
199
leafCol="", minWeightFractionPerNode=0.0, bootstrap=True):
200
"""
201
Initialize RandomForestClassifier.
202
203
Parameters:
204
- numTrees (int): Number of trees in the forest
205
- featureSubsetStrategy (str): Number of features to consider for splits
206
- subsamplingRate (float): Fraction of training data used for learning
207
- bootstrap (bool): Whether bootstrap samples are used when building trees
208
"""
209
210
class GBTClassifier(Predictor):
211
"""Gradient-boosted tree classifier."""
212
213
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
214
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
215
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
216
lossType="logistic", maxIter=20, stepSize=0.1, seed=None,
217
subsamplingRate=1.0, featureSubsetStrategy="all", validationTol=0.01,
218
validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0,
219
weightCol=None):
220
"""
221
Initialize GBTClassifier.
222
223
Parameters:
224
- lossType (str): Loss function type
225
- maxIter (int): Maximum number of iterations
226
- stepSize (float): Step size (learning rate) in (0, 1] used to shrink the contribution of each tree
227
- subsamplingRate (float): Fraction of training data used for learning
228
- featureSubsetStrategy (str): Number of features to consider for splits
229
- validationTol (float): Validation tolerance for early stopping
230
- validationIndicatorCol (str): Validation indicator column name
231
"""
232
233
class NaiveBayes(Predictor):
234
"""Naive Bayes classifier."""
235
236
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
237
probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0,
238
modelType="multinomial", thresholds=None, weightCol=None):
239
"""
240
Initialize NaiveBayes.
241
242
Parameters:
243
- smoothing (float): Smoothing parameter
244
- modelType (str): Model type ("multinomial", "bernoulli", "gaussian", or "complement"; case-sensitive)
245
- thresholds (list): Per-class thresholds for multiclass classification, used to adjust the probability of predicting each class
246
- weightCol (str): Weight column name
247
"""
248
249
class LinearSVC(Predictor):
250
"""Linear Support Vector Classifier."""
251
252
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
253
rawPredictionCol="rawPrediction", maxIter=100, regParam=0.0, tol=1e-6,
254
fitIntercept=True, standardization=True, threshold=0.0, weightCol=None,
255
aggregationDepth=2, blockSize=1):
256
"""
257
Initialize LinearSVC.
258
259
Parameters:
260
- maxIter (int): Maximum number of iterations
261
- regParam (float): Regularization parameter
262
- tol (float): Convergence tolerance
263
- fitIntercept (bool): Whether to fit intercept
264
- standardization (bool): Whether to standardize features
265
- threshold (float): Classification threshold
266
- weightCol (str): Weight column name
267
- aggregationDepth (int): Aggregation depth for treeAggregate
268
- blockSize (int): Block size for stacking input data
269
"""
270
271
class MultilayerPerceptronClassifier(Predictor):
272
"""Multilayer perceptron classifier."""
273
274
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
275
maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03,
276
solver="l-bfgs", initialWeights=None, probabilityCol="probability",
277
rawPredictionCol="rawPrediction"):
278
"""
279
Initialize MultilayerPerceptronClassifier.
280
281
Parameters:
282
- maxIter (int): Maximum number of iterations
283
- tol (float): Convergence tolerance
284
- seed (int): Random seed
285
- layers (list): Sizes of layers from input to output
286
- blockSize (int): Block size for stacking input data
287
- stepSize (float): Step size for gradient descent
288
- solver (str): Solver algorithm ("l-bfgs" or "gd")
289
- initialWeights (Vector): Initial weights
290
- probabilityCol (str): Probability column name
291
- rawPredictionCol (str): Raw prediction column name
292
"""
293
```
294
295
### Regression Algorithms
296
297
Supervised learning algorithms for regression tasks.
298
299
```python { .api }
300
class LinearRegression(Predictor):
301
"""Linear regression."""
302
303
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
304
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
305
standardization=True, solver="auto", weightCol=None, aggregationDepth=2,
306
loss="squaredError", epsilon=1.35):
307
"""
308
Initialize LinearRegression.
309
310
Parameters:
311
- maxIter (int): Maximum number of iterations
312
- regParam (float): Regularization parameter
313
- elasticNetParam (float): ElasticNet mixing parameter
314
- tol (float): Convergence tolerance
315
- fitIntercept (bool): Whether to fit intercept
316
- standardization (bool): Whether to standardize features
317
- solver (str): Solver algorithm ("auto", "normal", "l-bfgs")
318
- weightCol (str): Weight column name
319
- aggregationDepth (int): Aggregation depth for treeAggregate
320
- loss (str): Loss function ("squaredError" or "huber")
321
- epsilon (float): Shape parameter for Huber loss
322
"""
323
324
class DecisionTreeRegressor(Predictor):
325
"""Decision tree regressor."""
326
327
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
328
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
329
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
330
impurity="variance", seed=None, varianceCol=None, weightCol=None,
331
leafCol="", minWeightFractionPerNode=0.0):
332
"""
333
Initialize DecisionTreeRegressor.
334
335
Parameters:
336
- impurity (str): Impurity measure ("variance")
337
- varianceCol (str): Variance column name
338
"""
339
340
class RandomForestRegressor(Predictor):
341
"""Random forest regressor."""
342
343
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
344
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
345
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
346
impurity="variance", numTrees=20, featureSubsetStrategy="auto", seed=None,
347
subsamplingRate=1.0, leafCol="", minWeightFractionPerNode=0.0,
348
weightCol=None, bootstrap=True):
349
"""Initialize RandomForestRegressor."""
350
351
class GBTRegressor(Predictor):
352
"""Gradient-boosted tree regressor."""
353
354
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
355
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
356
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
357
lossType="squared", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0,
358
featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None,
359
leafCol="", minWeightFractionPerNode=0.0, weightCol=None):
360
"""Initialize GBTRegressor."""
361
362
class IsotonicRegression(Estimator):
363
"""Isotonic regression."""
364
365
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
366
weightCol=None, isotonic=True, featureIndex=0):
367
"""
368
Initialize IsotonicRegression.
369
370
Parameters:
371
- isotonic (bool): Whether the output sequence should be isotonic/increasing
372
- featureIndex (int): Index of the feature to use if featuresCol is a vector
373
"""
374
```
375
376
### Clustering Algorithms
377
378
Unsupervised learning algorithms for clustering tasks.
379
380
```python { .api }
381
class KMeans(Estimator):
382
"""K-means clustering."""
383
384
def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
385
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=100, seed=None,
386
distanceMeasure="euclidean", weightCol=None):
387
"""
388
Initialize KMeans.
389
390
Parameters:
391
- k (int): Number of clusters
392
- initMode (str): Initialization algorithm ("k-means||" or "random")
393
- initSteps (int): Number of steps for k-means|| initialization
394
- tol (float): Convergence tolerance
395
- maxIter (int): Maximum number of iterations
396
- seed (int): Random seed
397
- distanceMeasure (str): Distance measure ("euclidean" or "cosine")
398
- weightCol (str): Weight column name
399
"""
400
401
class BisectingKMeans(Estimator):
402
"""Bisecting k-means clustering."""
403
404
def __init__(self, featuresCol="features", predictionCol="prediction", k=4,
405
maxIter=20, seed=None, minDivisibleClusterSize=1.0, distanceMeasure="euclidean"):
406
"""
407
Initialize BisectingKMeans.
408
409
Parameters:
410
- k (int): Number of clusters
411
- maxIter (int): Maximum number of iterations
412
- seed (int): Random seed
413
- minDivisibleClusterSize (float): Minimum number of points (if >= 1.0) or minimum proportion of points (if < 1.0) a cluster must have to be divisible
414
- distanceMeasure (str): Distance measure
415
"""
416
417
class GaussianMixture(Estimator):
418
"""Gaussian Mixture Model."""
419
420
def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
421
probabilityCol="probability", tol=0.01, maxIter=100, seed=None,
422
aggregationDepth=2, weightCol=None):
423
"""
424
Initialize GaussianMixture.
425
426
Parameters:
427
- k (int): Number of components
428
- probabilityCol (str): Probability column name
429
- tol (float): Convergence tolerance
430
- maxIter (int): Maximum number of iterations
431
- seed (int): Random seed
432
- aggregationDepth (int): Aggregation depth for treeAggregate
433
- weightCol (str): Weight column name
434
"""
435
436
class LDA(Estimator):
437
"""Latent Dirichlet Allocation."""
438
439
def __init__(self, featuresCol="features", maxIter=100, seed=None, checkpointInterval=10,
440
k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,
441
subsamplingRate=0.05, optimizeDocConcentration=True, docConcentration=None,
442
topicConcentration=None, topicDistributionCol="topicDistribution",
443
keepLastCheckpoint=True):
444
"""
445
Initialize LDA.
446
447
Parameters:
448
- k (int): Number of topics
449
- optimizer (str): Optimizer ("online" or "em")
450
- learningOffset (float): Learning offset for online optimizer
451
- learningDecay (float): Learning decay rate
452
- subsamplingRate (float): Subsampling rate for online optimizer
453
- optimizeDocConcentration (bool): Whether to optimize document concentration
454
- docConcentration (Vector): Document concentration parameters
455
- topicConcentration (float): Topic concentration parameter
456
- topicDistributionCol (str): Topic distribution column name
457
- keepLastCheckpoint (bool): Whether to keep last checkpoint
458
"""
459
```
460
461
### Feature Processing
462
463
Transformers for feature engineering and preprocessing.
464
465
```python { .api }
466
class VectorAssembler(Transformer):
467
"""Combine multiple columns into a vector column."""
468
469
def __init__(self, inputCols=None, outputCol=None, handleInvalid="error"):
470
"""
471
Initialize VectorAssembler.
472
473
Parameters:
474
- inputCols (list): Input column names
475
- outputCol (str): Output column name
476
- handleInvalid (str): How to handle invalid data ("error", "skip", "keep")
477
"""
478
479
class StandardScaler(Estimator):
480
"""Standardize features by removing mean and scaling to unit variance."""
481
482
def __init__(self, inputCol=None, outputCol=None, withMean=False, withStd=True):
483
"""
484
Initialize StandardScaler.
485
486
Parameters:
487
- inputCol (str): Input column name
488
- outputCol (str): Output column name
489
- withMean (bool): Whether to center data with mean
490
- withStd (bool): Whether to scale to unit standard deviation
491
"""
492
493
class MinMaxScaler(Estimator):
494
"""Transform features by scaling to a given range."""
495
496
def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
497
"""
498
Initialize MinMaxScaler.
499
500
Parameters:
501
- min (float): Lower bound after transformation
502
- max (float): Upper bound after transformation
503
- inputCol (str): Input column name
504
- outputCol (str): Output column name
505
"""
506
507
class StringIndexer(Estimator):
508
"""Encode string labels to label indices."""
509
510
def __init__(self, inputCol=None, outputCol=None, inputCols=None, outputCols=None,
511
handleInvalid="error", stringOrderType="frequencyDesc"):
512
"""
513
Initialize StringIndexer.
514
515
Parameters:
516
- inputCol (str): Input column name
517
- outputCol (str): Output column name
518
- inputCols (list): Input column names
519
- outputCols (list): Output column names
520
- handleInvalid (str): How to handle invalid data
521
- stringOrderType (str): How to order labels ("frequencyDesc", "frequencyAsc", "alphabetDesc", "alphabetAsc")
522
"""
523
524
class IndexToString(Transformer):
525
"""Map label indices back to label strings."""
526
527
def __init__(self, inputCol=None, outputCol=None, labels=None, inputCols=None,
528
outputCols=None):
529
"""
530
Initialize IndexToString.
531
532
Parameters:
533
- inputCol (str): Input column name
534
- outputCol (str): Output column name
535
- labels (list): Ordered list of labels
536
- inputCols (list): Input column names
537
- outputCols (list): Output column names
538
"""
539
540
class OneHotEncoder(Estimator):
541
"""One-hot encode categorical features."""
542
543
def __init__(self, inputCols=None, outputCols=None, dropLast=True, handleInvalid="error",
544
inputCol=None, outputCol=None):
545
"""
546
Initialize OneHotEncoder.
547
548
Parameters:
549
- inputCols (list): Input column names
550
- outputCols (list): Output column names
551
- dropLast (bool): Whether to drop the last category
552
- handleInvalid (str): How to handle invalid data
553
- inputCol (str): Input column name (single-column alternative to inputCols)
554
- outputCol (str): Output column name (single-column alternative to outputCols)
555
"""
556
557
class PCA(Estimator):
558
"""Principal component analysis dimensionality reduction."""
559
560
def __init__(self, k=None, inputCol=None, outputCol=None):
561
"""
562
Initialize PCA.
563
564
Parameters:
565
- k (int): Number of principal components
566
- inputCol (str): Input column name
567
- outputCol (str): Output column name
568
"""
569
570
class Word2Vec(Estimator):
571
"""Word2Vec transforms a dataset of text documents to vectors."""
572
573
def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,
574
maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5,
575
maxSentenceLength=1000):
576
"""
577
Initialize Word2Vec.
578
579
Parameters:
580
- vectorSize (int): Dimension of the vector that each word is mapped to
581
- minCount (int): Minimum number of times a token must appear
582
- numPartitions (int): Number of partitions for sentences
583
- stepSize (float): Step size for gradient descent
584
- maxIter (int): Maximum number of iterations
585
- seed (int): Random seed
586
- inputCol (str): Input column name
587
- outputCol (str): Output column name
588
- windowSize (int): Context window size, i.e. the number of words considered on each side of the target word
589
- maxSentenceLength (int): Maximum sentence length
590
"""
591
592
class CountVectorizer(Estimator):
593
"""Convert text documents to vectors of token counts."""
594
595
def __init__(self, inputCol=None, outputCol=None, vocabSize=1 << 18, minDF=1.0,
596
maxDF=None, minTF=1.0, binary=False):
597
"""
598
Initialize CountVectorizer.
599
600
Parameters:
601
- inputCol (str): Input column name
602
- outputCol (str): Output column name
603
- vocabSize (int): Maximum vocabulary size
604
- minDF (float): Minimum document frequency
605
- maxDF (float): Maximum document frequency
606
- minTF (float): Minimum term frequency
607
- binary (bool): Binary toggle to control term frequency counts
608
"""
609
610
class IDF(Estimator):
611
"""Compute Inverse Document Frequency (IDF) for TF-IDF."""
612
613
def __init__(self, inputCol=None, outputCol=None, minDocFreq=0):
614
"""
615
Initialize IDF.
616
617
Parameters:
618
- inputCol (str): Input column name
619
- outputCol (str): Output column name
620
- minDocFreq (int): Minimum document frequency
621
"""
622
```
623
624
### Model Evaluation
625
626
Evaluation metrics for assessing model performance.
627
628
```python { .api }
629
class Evaluator:
630
"""Base class for evaluators."""
631
632
def evaluate(self, dataset, params=None):
633
"""
634
Evaluate the dataset and return a scalar metric.
635
636
Parameters:
637
- dataset (DataFrame): Dataset to evaluate
638
- params (dict): Additional parameters
639
640
Returns:
641
float: Evaluation metric
642
"""
643
644
class BinaryClassificationEvaluator(Evaluator):
645
"""Evaluator for binary classification."""
646
647
def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
648
metricName="areaUnderROC", weightCol=None, numBins=1000):
649
"""
650
Initialize BinaryClassificationEvaluator.
651
652
Parameters:
653
- rawPredictionCol (str): Raw prediction column name
654
- labelCol (str): Label column name
655
- metricName (str): Metric name ("areaUnderROC" or "areaUnderPR")
656
- weightCol (str): Weight column name
657
- numBins (int): Number of bins used to down-sample the ROC and PR curves when computing the area
658
"""
659
660
class MulticlassClassificationEvaluator(Evaluator):
661
"""Evaluator for multiclass classification."""
662
663
def __init__(self, predictionCol="prediction", labelCol="label", metricName="f1",
664
metricLabel=0.0, beta=1.0, probabilityCol="probability", eps=1e-15):
665
"""
666
Initialize MulticlassClassificationEvaluator.
667
668
Parameters:
669
- predictionCol (str): Prediction column name
670
- labelCol (str): Label column name
671
- metricName (str): Metric name ("f1", "accuracy", "weightedPrecision", etc.)
672
- metricLabel (float): Class label whose metric is computed for the by-label metrics (e.g. "precisionByLabel", "recallByLabel", "fMeasureByLabel")
673
- beta (float): Beta value for F-beta score
674
- probabilityCol (str): Probability column name
675
- eps (float): Clipping bound for the "logLoss" metric; probabilities are clipped to [eps, 1 - eps] so the logarithm stays defined
676
"""
677
678
class RegressionEvaluator(Evaluator):
679
"""Evaluator for regression."""
680
681
def __init__(self, predictionCol="prediction", labelCol="label", metricName="rmse",
682
weightCol=None, throughOrigin=False):
683
"""
684
Initialize RegressionEvaluator.
685
686
Parameters:
687
- predictionCol (str): Prediction column name
688
- labelCol (str): Label column name
689
- metricName (str): Metric name ("rmse", "mse", "r2", "mae", "var")
690
- weightCol (str): Weight column name
691
- throughOrigin (bool): Whether to fit line through origin for r2
692
"""
693
694
class ClusteringEvaluator(Evaluator):
695
"""Evaluator for clustering."""
696
697
def __init__(self, predictionCol="prediction", featuresCol="features",
698
metricName="silhouette", distanceMeasure="squaredEuclidean",
699
weightCol=None):
700
"""
701
Initialize ClusteringEvaluator.
702
703
Parameters:
704
- predictionCol (str): Prediction column name
705
- featuresCol (str): Features column name
706
- metricName (str): Metric name ("silhouette")
707
- distanceMeasure (str): Distance measure
708
- weightCol (str): Weight column name
709
"""
710
```
711
712
### Hyperparameter Tuning
713
714
Tools for hyperparameter optimization and model selection.
715
716
```python { .api }
717
class ParamGridBuilder:
718
"""Builder for a param grid used in grid search-based model selection."""
719
720
def __init__(self):
721
"""Initialize ParamGridBuilder."""
722
723
def addGrid(self, param, values):
724
"""
725
Add parameter values to the grid.
726
727
Parameters:
728
- param (Param): Parameter to tune
729
- values (list): List of parameter values
730
731
Returns:
732
ParamGridBuilder
733
"""
734
735
def build(self):
736
"""
737
Build and return the parameter grid.
738
739
Returns:
740
list: List of parameter maps
741
"""
742
743
class CrossValidator(Estimator):
744
"""K-fold cross validation."""
745
746
def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None,
747
numFolds=3, seed=None, parallelism=1, collectSubModels=False,
748
foldCol=""):
749
"""
750
Initialize CrossValidator.
751
752
Parameters:
753
- estimator (Estimator): Estimator to cross-validate
754
- estimatorParamMaps (list): Parameter maps to evaluate
755
- evaluator (Evaluator): Evaluator for model selection
756
- numFolds (int): Number of folds for cross validation
757
- seed (int): Random seed
758
- parallelism (int): Number of threads to use for fitting models
759
- collectSubModels (bool): Whether to collect sub-models
760
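- foldCol (str): Name of a column holding user-specified fold numbers in [0, numFolds); when set, no random k-fold split is performed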
761
"""
762
763
class TrainValidationSplit(Estimator):
764
"""Train-validation split for model selection."""
765
766
def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None,
767
trainRatio=0.75, seed=None, parallelism=1, collectSubModels=False):
768
"""
769
Initialize TrainValidationSplit.
770
771
Parameters:
772
- estimator (Estimator): Estimator to tune
773
- estimatorParamMaps (list): Parameter maps to evaluate
774
- evaluator (Evaluator): Evaluator for model selection
775
- trainRatio (float): Ratio of training data
776
- seed (int): Random seed
777
- parallelism (int): Number of threads to use for fitting models
778
- collectSubModels (bool): Whether to collect sub-models
779
"""
780
```
781
782
## Types
783
784
```python { .api }
785
from pyspark.ml.linalg import Vector, DenseVector, SparseVector, Vectors
786
from pyspark.ml.linalg import Matrix, DenseMatrix, SparseMatrix, Matrices
787
788
class Vector:
789
"""Abstract base class for ML vector types."""
790
791
def toArray(self):
792
"""Convert to numpy array."""
793
794
class DenseVector(Vector):
795
"""Dense vector representation."""
796
797
def __init__(self, ar):
798
"""Create from array-like object."""
799
800
class SparseVector(Vector):
801
"""Sparse vector representation."""
802
803
def __init__(self, size, *args):
804
"""Create sparse vector."""
805
806
class Vectors:
807
"""Factory methods for creating vectors."""
808
809
@staticmethod
810
def dense(*values):
811
"""Create dense vector."""
812
813
@staticmethod
814
def sparse(size, *args):
815
"""Create sparse vector."""
816
817
class Matrix:
818
"""Abstract base class for ML matrix types."""
819
820
def numRows(self):
821
"""Number of rows."""
822
823
def numCols(self):
824
"""Number of columns."""
825
826
class DenseMatrix(Matrix):
827
"""Dense matrix representation."""
828
829
class SparseMatrix(Matrix):
830
"""Sparse matrix representation in CSC format."""
831
832
class Matrices:
833
"""Factory methods for creating matrices."""
834
835
@staticmethod
836
def dense(numRows, numCols, values):
837
"""Create dense matrix."""
838
839
@staticmethod
840
def sparse(numRows, numCols, colPtrs, rowIndices, values):
841
"""Create sparse matrix."""
842
```