Tessl Tile for pypi/pyspark@4.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

core-context-rdds.md index.md legacy-mllib.md machine-learning.md pandas-api.md resource-management.md sql-dataframes.md streaming.md

docs/machine-learning.md

0
# Machine Learning (ML)
1

2
The `pyspark.ml` package is Spark's modern, DataFrame-based machine learning API. It provides estimators, transformers, and a comprehensive set of algorithms for classification, regression, clustering, and feature processing, all of which can be chained together into ML pipelines.
3

4
## Capabilities
5

6
### Pipeline Components
7

8
Core abstractions for building machine learning workflows.
9

10
```python { .api }
11
class Pipeline:
12
    """A simple pipeline that chains multiple Transformers and Estimators together."""
13
    
14
    def __init__(self, stages=None):
15
        """
16
        Initialize Pipeline.
17
        
18
        Parameters:
19
        - stages (list): List of pipeline stages (Transformers and Estimators)
20
        """
21
    
22
    def fit(self, dataset, params=None):
23
        """
24
        Fit the pipeline to training data.
25
        
26
        Parameters:
27
        - dataset (DataFrame): Training dataset
28
        - params (dict): Additional parameters
29
        
30
        Returns:
31
        PipelineModel: Fitted pipeline model
32
        """
33
    
34
    def setStages(self, value):
35
        """Set pipeline stages."""
36

37
class PipelineModel:
38
    """A fitted pipeline model."""
39
    
40
    def transform(self, dataset):
41
        """
42
        Transform the dataset using the fitted pipeline.
43
        
44
        Parameters:
45
        - dataset (DataFrame): Dataset to transform
46
        
47
        Returns:
48
        DataFrame: Transformed dataset
49
        """
50
    
51
    def save(self, path):
52
        """Save the pipeline model to the given path."""
53
    
54
    @classmethod
55
    def load(cls, path):
56
        """Load a pipeline model from the given path."""
57

58
class Estimator:
59
    """Abstract class for estimators that can be fit on a DataFrame to produce a Model."""
60
    
61
    def fit(self, dataset, params=None):
62
        """
63
        Fit model to training data.
64
        
65
        Parameters:
66
        - dataset (DataFrame): Training dataset
67
        - params (dict): Additional parameters
68
        
69
        Returns:
70
        Model: Fitted model
71
        """
72

73
class Transformer:
74
    """Abstract class for transformers that transform DataFrames into DataFrames."""
75
    
76
    def transform(self, dataset):
77
        """
78
        Transform the dataset.
79
        
80
        Parameters:
81
        - dataset (DataFrame): Dataset to transform
82
        
83
        Returns:
84
        DataFrame: Transformed dataset
85
        """
86

87
class Model:
88
    """Abstract class for models that are fitted by estimators."""
89
    
90
    def transform(self, dataset):
91
        """
92
        Transform the dataset using the fitted model.
93
        
94
        Parameters:
95
        - dataset (DataFrame): Dataset to transform
96
        
97
        Returns:
98
        DataFrame: Transformed dataset
99
        """
100

101
class Predictor(Estimator):
102
    """Base class for predictors that make predictions on feature vectors."""
103
    
104
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction"):
105
        """
106
        Initialize Predictor.
107
        
108
        Parameters:
109
        - featuresCol (str): Features column name
110
        - labelCol (str): Label column name  
111
        - predictionCol (str): Prediction column name
112
        """
113

114
class PredictionModel(Model):
115
    """Base class for prediction models."""
116
    
117
    def predict(self, value):
118
        """Make a prediction on a single feature vector."""
119
    
120
    def transform(self, dataset):
121
        """Transform dataset to include predictions."""
122
```
123

124
### Classification Algorithms
125

126
Supervised learning algorithms for classification tasks.
127

128
```python { .api }
129
class LogisticRegression(Predictor):
130
    """Logistic regression classifier."""
131
    
132
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
133
                 maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
134
                 threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction",
135
                 standardization=True, weightCol=None, aggregationDepth=2, family="auto",
136
                 lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
137
                 lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None, maxBlockSizeInMB=0.0):
138
        """
139
        Initialize LogisticRegression.
140
        
141
        Parameters:
142
        - featuresCol (str): Features column name
143
        - labelCol (str): Label column name
144
        - predictionCol (str): Prediction column name
145
        - maxIter (int): Maximum number of iterations
146
        - regParam (float): Regularization parameter
147
        - elasticNetParam (float): ElasticNet mixing parameter
148
        - tol (float): Convergence tolerance
149
        - fitIntercept (bool): Whether to fit intercept
150
        - threshold (float): Binary classification threshold
151
        - thresholds (list): Thresholds for multiclass classification
152
        - probabilityCol (str): Probability column name
153
        - rawPredictionCol (str): Raw prediction column name
154
        - standardization (bool): Whether to standardize features
155
        - weightCol (str): Weight column name
156
        - aggregationDepth (int): Aggregation depth for treeAggregate
157
        - family (str): Label distribution family used by the model ("auto", "binomial", or "multinomial")
158
        - lowerBoundsOnCoefficients (Matrix): Lower bounds on coefficients
159
        - upperBoundsOnCoefficients (Matrix): Upper bounds on coefficients
160
        - lowerBoundsOnIntercepts (Vector): Lower bounds on intercepts
161
        - upperBoundsOnIntercepts (Vector): Upper bounds on intercepts
162
        - maxBlockSizeInMB (float): Maximum memory for stacking input data
163
        """
164

165
class DecisionTreeClassifier(Predictor):
166
    """Decision tree classifier."""
167
    
168
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
169
                 probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5,
170
                 maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
171
                 cacheNodeIds=False, checkpointInterval=10, impurity="gini", seed=None,
172
                 weightCol=None, leafCol="", minWeightFractionPerNode=0.0):
173
        """
174
        Initialize DecisionTreeClassifier.
175
        
176
        Parameters:
177
        - maxDepth (int): Maximum depth of tree
178
        - maxBins (int): Maximum number of bins for discretizing continuous features
179
        - minInstancesPerNode (int): Minimum number of instances each child must have
180
        - minInfoGain (float): Minimum information gain for split
181
        - maxMemoryInMB (int): Maximum memory in MB allocated to histogram aggregation
182
        - cacheNodeIds (bool): Whether to cache node IDs
183
        - checkpointInterval (int): Checkpoint interval
184
        - impurity (str): Impurity measure ("gini" or "entropy")
185
        - seed (int): Random seed
186
        - weightCol (str): Weight column name
187
        - leafCol (str): Leaf index column name
188
        - minWeightFractionPerNode (float): Minimum weighted fraction of total weight
189
        """
190

191
class RandomForestClassifier(Predictor):
192
    """Random forest classifier."""
193
    
194
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
195
                 probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5,
196
                 maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
197
                 cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20,
198
                 featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0, weightCol=None,
199
                 leafCol="", minWeightFractionPerNode=0.0, bootstrap=True):
200
        """
201
        Initialize RandomForestClassifier.
202
        
203
        Parameters:
204
        - numTrees (int): Number of trees in the forest
205
        - featureSubsetStrategy (str): Number of features to consider for splits
206
        - subsamplingRate (float): Fraction of training data used for learning
207
        - bootstrap (bool): Whether bootstrap samples are used when building trees
208
        """
209

210
class GBTClassifier(Predictor):
211
    """Gradient-boosted tree classifier."""
212
    
213
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
214
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
215
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
216
                 lossType="logistic", maxIter=20, stepSize=0.1, seed=None,
217
                 subsamplingRate=1.0, featureSubsetStrategy="all", validationTol=0.01,
218
                 validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0,
219
                 weightCol=None):
220
        """
221
        Initialize GBTClassifier.
222
        
223
        Parameters:
224
        - lossType (str): Loss function type
225
        - maxIter (int): Maximum number of iterations
226
        - stepSize (float): Step size (learning rate) in the interval (0, 1] for shrinking the contribution of each tree
227
        - subsamplingRate (float): Fraction of training data used for learning
228
        - featureSubsetStrategy (str): Number of features to consider for splits
229
        - validationTol (float): Validation tolerance for early stopping
230
        - validationIndicatorCol (str): Validation indicator column name
231
        """
232

233
class NaiveBayes(Predictor):
234
    """Naive Bayes classifier."""
235
    
236
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
237
                 probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0,
238
                 modelType="multinomial", thresholds=None, weightCol=None):
239
        """
240
        Initialize NaiveBayes.
241
        
242
        Parameters:
243
        - smoothing (float): Smoothing parameter
244
        - modelType (str): Model type, case-sensitive ("multinomial", "bernoulli", "gaussian", or "complement")
245
        - thresholds (list): Per-class thresholds for multiclass classification, used to adjust the probability of predicting each class
246
        - weightCol (str): Weight column name
247
        """
248

249
class LinearSVC(Predictor):
250
    """Linear Support Vector Classifier."""
251
    
252
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
253
                 rawPredictionCol="rawPrediction", maxIter=100, regParam=0.0, tol=1e-6,
254
                 fitIntercept=True, standardization=True, threshold=0.0, weightCol=None,
255
                 aggregationDepth=2, blockSize=1):
256
        """
257
        Initialize LinearSVC.
258
        
259
        Parameters:
260
        - maxIter (int): Maximum number of iterations
261
        - regParam (float): Regularization parameter
262
        - tol (float): Convergence tolerance
263
        - fitIntercept (bool): Whether to fit intercept
264
        - standardization (bool): Whether to standardize features
265
        - threshold (float): Classification threshold
266
        - weightCol (str): Weight column name
267
        - aggregationDepth (int): Aggregation depth for treeAggregate
268
        - blockSize (int): Block size for stacking input data
269
        """
270

271
class MultilayerPerceptronClassifier(Predictor):
272
    """Multilayer perceptron classifier."""
273
    
274
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
275
                 maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03,
276
                 solver="l-bfgs", initialWeights=None, probabilityCol="probability",
277
                 rawPredictionCol="rawPrediction"):
278
        """
279
        Initialize MultilayerPerceptronClassifier.
280
        
281
        Parameters:
282
        - maxIter (int): Maximum number of iterations
283
        - tol (float): Convergence tolerance
284
        - seed (int): Random seed
285
        - layers (list): Sizes of layers from input to output
286
        - blockSize (int): Block size for stacking input data
287
        - stepSize (float): Step size for gradient descent
288
        - solver (str): Solver algorithm ("l-bfgs" or "gd")
289
        - initialWeights (Vector): Initial weights
290
        - probabilityCol (str): Probability column name
291
        - rawPredictionCol (str): Raw prediction column name
292
        """
293
```
294

295
### Regression Algorithms
296

297
Supervised learning algorithms for regression tasks.
298

299
```python { .api }
300
class LinearRegression(Predictor):
301
    """Linear regression."""
302
    
303
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
304
                 maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
305
                 standardization=True, solver="auto", weightCol=None, aggregationDepth=2,
306
                 loss="squaredError", epsilon=1.35):
307
        """
308
        Initialize LinearRegression.
309
        
310
        Parameters:
311
        - maxIter (int): Maximum number of iterations
312
        - regParam (float): Regularization parameter
313
        - elasticNetParam (float): ElasticNet mixing parameter
314
        - tol (float): Convergence tolerance
315
        - fitIntercept (bool): Whether to fit intercept
316
        - standardization (bool): Whether to standardize features
317
        - solver (str): Solver algorithm ("auto", "normal", "l-bfgs")
318
        - weightCol (str): Weight column name
319
        - aggregationDepth (int): Aggregation depth for treeAggregate
320
        - loss (str): Loss function ("squaredError" or "huber")
321
        - epsilon (float): Shape parameter for Huber loss
322
        """
323

324
class DecisionTreeRegressor(Predictor):
325
    """Decision tree regressor."""
326
    
327
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
328
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
329
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
330
                 impurity="variance", seed=None, varianceCol=None, weightCol=None,
331
                 leafCol="", minWeightFractionPerNode=0.0):
332
        """
333
        Initialize DecisionTreeRegressor.
334
        
335
        Parameters:
336
        - impurity (str): Impurity measure ("variance")
337
        - varianceCol (str): Variance column name
338
        """
339

340
class RandomForestRegressor(Predictor):
341
    """Random forest regressor."""
342
    
343
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
344
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
345
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
346
                 impurity="variance", numTrees=20, featureSubsetStrategy="auto", seed=None,
347
                 subsamplingRate=1.0, leafCol="", minWeightFractionPerNode=0.0,
348
                 weightCol=None, bootstrap=True):
349
        """Initialize RandomForestRegressor."""
350

351
class GBTRegressor(Predictor):
352
    """Gradient-boosted tree regressor."""
353
    
354
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
355
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
356
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
357
                 lossType="squared", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0,
358
                 featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None,
359
                 leafCol="", minWeightFractionPerNode=0.0, weightCol=None):
360
        """Initialize GBTRegressor."""
361

362
class IsotonicRegression(Estimator):
363
    """Isotonic regression."""
364
    
365
    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
366
                 weightCol=None, isotonic=True, featureIndex=0):
367
        """
368
        Initialize IsotonicRegression.
369
        
370
        Parameters:
371
        - isotonic (bool): Whether the output sequence should be isotonic/increasing (True) or antitonic/decreasing (False)
372
        - featureIndex (int): Index of the feature to use if featuresCol is a vector
373
        """
374
```
375

376
### Clustering Algorithms
377

378
Unsupervised learning algorithms for clustering tasks.
379

380
```python { .api }
381
class KMeans(Estimator):
382
    """K-means clustering."""
383
    
384
    def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
385
                 initMode="k-means||", initSteps=2, tol=1e-4, maxIter=100, seed=None,
386
                 distanceMeasure="euclidean", weightCol=None):
387
        """
388
        Initialize KMeans.
389
        
390
        Parameters:
391
        - k (int): Number of clusters
392
        - initMode (str): Initialization algorithm ("k-means||" or "random")
393
        - initSteps (int): Number of steps for k-means|| initialization
394
        - tol (float): Convergence tolerance
395
        - maxIter (int): Maximum number of iterations
396
        - seed (int): Random seed
397
        - distanceMeasure (str): Distance measure ("euclidean" or "cosine")
398
        - weightCol (str): Weight column name
399
        """
400

401
class BisectingKMeans(Estimator):
402
    """Bisecting k-means clustering."""
403
    
404
    def __init__(self, featuresCol="features", predictionCol="prediction", k=4,
405
                 maxIter=20, seed=None, minDivisibleClusterSize=1.0, distanceMeasure="euclidean"):
406
        """
407
        Initialize BisectingKMeans.
408
        
409
        Parameters:
410
        - k (int): Number of clusters
411
        - maxIter (int): Maximum number of iterations
412
        - seed (int): Random seed
413
        - minDivisibleClusterSize (float): Minimum number of points (if >= 1.0) or minimum proportion of points (if < 1.0) a cluster must have to be divisible
414
        - distanceMeasure (str): Distance measure
415
        """
416

417
class GaussianMixture(Estimator):
418
    """Gaussian Mixture Model."""
419
    
420
    def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
421
                 probabilityCol="probability", tol=0.01, maxIter=100, seed=None,
422
                 aggregationDepth=2, weightCol=None):
423
        """
424
        Initialize GaussianMixture.
425
        
426
        Parameters:
427
        - k (int): Number of components
428
        - probabilityCol (str): Probability column name
429
        - tol (float): Convergence tolerance
430
        - maxIter (int): Maximum number of iterations
431
        - seed (int): Random seed
432
        - aggregationDepth (int): Aggregation depth for treeAggregate
433
        - weightCol (str): Weight column name
434
        """
435

436
class LDA(Estimator):
437
    """Latent Dirichlet Allocation."""
438
    
439
    def __init__(self, featuresCol="features", maxIter=100, seed=None, checkpointInterval=10,
440
                 k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,
441
                 subsamplingRate=0.05, optimizeDocConcentration=True, docConcentration=None,
442
                 topicConcentration=None, topicDistributionCol="topicDistribution",
443
                 keepLastCheckpoint=True):
444
        """
445
        Initialize LDA.
446
        
447
        Parameters:
448
        - k (int): Number of topics
449
        - optimizer (str): Optimizer ("online" or "em")
450
        - learningOffset (float): Learning offset for online optimizer
451
        - learningDecay (float): Learning decay rate
452
        - subsamplingRate (float): Subsampling rate for online optimizer
453
        - optimizeDocConcentration (bool): Whether to optimize document concentration
454
        - docConcentration (Vector): Document concentration parameters
455
        - topicConcentration (float): Topic concentration parameter
456
        - topicDistributionCol (str): Topic distribution column name
457
        - keepLastCheckpoint (bool): Whether to keep last checkpoint
458
        """
459
```
460

461
### Feature Processing
462

463
Transformers for feature engineering and preprocessing.
464

465
```python { .api }
466
class VectorAssembler(Transformer):
467
    """Combine multiple columns into a vector column."""
468
    
469
    def __init__(self, inputCols=None, outputCol=None, handleInvalid="error"):
470
        """
471
        Initialize VectorAssembler.
472
        
473
        Parameters:
474
        - inputCols (list): Input column names
475
        - outputCol (str): Output column name
476
        - handleInvalid (str): How to handle invalid data ("error", "skip", "keep")
477
        """
478

479
class StandardScaler(Estimator):
480
    """Standardize features by removing mean and scaling to unit variance."""
481
    
482
    def __init__(self, inputCol=None, outputCol=None, withMean=False, withStd=True):
483
        """
484
        Initialize StandardScaler.
485
        
486
        Parameters:
487
        - inputCol (str): Input column name
488
        - outputCol (str): Output column name
489
        - withMean (bool): Whether to center data with mean
490
        - withStd (bool): Whether to scale to unit standard deviation
491
        """
492

493
class MinMaxScaler(Estimator):
494
    """Transform features by scaling to a given range."""
495
    
496
    def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
497
        """
498
        Initialize MinMaxScaler.
499
        
500
        Parameters:
501
        - min (float): Lower bound after transformation
502
        - max (float): Upper bound after transformation
503
        - inputCol (str): Input column name
504
        - outputCol (str): Output column name
505
        """
506

507
class StringIndexer(Estimator):
508
    """Encode string labels to label indices."""
509
    
510
    def __init__(self, inputCol=None, outputCol=None, inputCols=None, outputCols=None,
511
                 handleInvalid="error", stringOrderType="frequencyDesc"):
512
        """
513
        Initialize StringIndexer.
514
        
515
        Parameters:
516
        - inputCol (str): Input column name
517
        - outputCol (str): Output column name
518
        - inputCols (list): Input column names
519
        - outputCols (list): Output column names
520
        - handleInvalid (str): How to handle invalid data
521
        - stringOrderType (str): How to order labels ("frequencyDesc", "frequencyAsc", "alphabetDesc", "alphabetAsc")
522
        """
523

524
class IndexToString(Transformer):
525
    """Map label indices back to label strings."""
526
    
527
    def __init__(self, inputCol=None, outputCol=None, labels=None, inputCols=None,
528
                 outputCols=None):
529
        """
530
        Initialize IndexToString.
531
        
532
        Parameters:
533
        - inputCol (str): Input column name
534
        - outputCol (str): Output column name
535
        - labels (list): Ordered list of labels
536
        - inputCols (list): Input column names
537
        - outputCols (list): Output column names
538
        """
539

540
class OneHotEncoder(Estimator):
541
    """One-hot encode categorical features."""
542
    
543
    def __init__(self, inputCols=None, outputCols=None, dropLast=True, handleInvalid="error",
544
                 inputCol=None, outputCol=None):
545
        """
546
        Initialize OneHotEncoder.
547
        
548
        Parameters:
549
        - inputCols (list): Input column names
550
        - outputCols (list): Output column names
551
        - dropLast (bool): Whether to drop the last category
552
        - handleInvalid (str): How to handle invalid data
553
        - inputCol (str): Input column name (single-column alternative to inputCols)
554
        - outputCol (str): Output column name (single-column alternative to outputCols)
555
        """
556

557
class PCA(Estimator):
558
    """Principal component analysis dimensionality reduction."""
559
    
560
    def __init__(self, k=None, inputCol=None, outputCol=None):
561
        """
562
        Initialize PCA.
563
        
564
        Parameters:
565
        - k (int): Number of principal components
566
        - inputCol (str): Input column name
567
        - outputCol (str): Output column name
568
        """
569

570
class Word2Vec(Estimator):
571
    """Word2Vec transforms a dataset of text documents to vectors."""
572
    
573
    def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,
574
                 maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5,
575
                 maxSentenceLength=1000):
576
        """
577
        Initialize Word2Vec.
578
        
579
        Parameters:
580
        - vectorSize (int): Dimension of the vector that each word is mapped to
581
        - minCount (int): Minimum number of times a token must appear
582
        - numPartitions (int): Number of partitions for sentences
583
        - stepSize (float): Step size for gradient descent
584
        - maxIter (int): Maximum number of iterations
585
        - seed (int): Random seed
586
        - inputCol (str): Input column name
587
        - outputCol (str): Output column name
588
        - windowSize (int): Context window size, i.e. the number of words on each side of the target word taken as context ([-window, window])
589
        - maxSentenceLength (int): Maximum sentence length
590
        """
591

592
class CountVectorizer(Estimator):
593
    """Convert text documents to vectors of token counts."""
594
    
595
    def __init__(self, inputCol=None, outputCol=None, vocabSize=1 << 18, minDF=1.0,
596
                 maxDF=None, minTF=1.0, binary=False):
597
        """
598
        Initialize CountVectorizer.
599
        
600
        Parameters:
601
        - inputCol (str): Input column name
602
        - outputCol (str): Output column name
603
        - vocabSize (int): Maximum vocabulary size
604
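        - minDF (float): Minimum number of documents (if >= 1.0) or fraction of documents (if < 1.0) a term must appear in to be included in the vocabulary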
605
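        - maxDF (float): Maximum number of documents (if >= 1.0) or fraction of documents (if < 1.0) a term may appear in to be included in the vocabulary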
606
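        - minTF (float): Minimum count (if >= 1.0) or fraction of the document's tokens (if < 1.0) a term must reach within a document; rarer terms are ignored for that document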
607
        - binary (bool): Binary toggle to control term frequency counts
608
        """
609

610
class IDF(Estimator):
611
    """Compute Inverse Document Frequency (IDF) for TF-IDF."""
612
    
613
    def __init__(self, inputCol=None, outputCol=None, minDocFreq=0):
614
        """
615
        Initialize IDF.
616
        
617
        Parameters:
618
        - inputCol (str): Input column name
619
        - outputCol (str): Output column name
620
        - minDocFreq (int): Minimum document frequency
621
        """
622
```
623

624
### Model Evaluation
625

626
Evaluation metrics for assessing model performance.
627

628
```python { .api }
629
class Evaluator:
630
    """Base class for evaluators."""
631
    
632
    def evaluate(self, dataset, params=None):
633
        """
634
        Evaluate the dataset and return a scalar metric.
635
        
636
        Parameters:
637
        - dataset (DataFrame): Dataset to evaluate
638
        - params (dict): Additional parameters
639
        
640
        Returns:
641
        float: Evaluation metric
642
        """
643

644
class BinaryClassificationEvaluator(Evaluator):
645
    """Evaluator for binary classification."""
646
    
647
    def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
648
                 metricName="areaUnderROC", weightCol=None, numBins=1000):
649
        """
650
        Initialize BinaryClassificationEvaluator.
651
        
652
        Parameters:
653
        - rawPredictionCol (str): Raw prediction column name
654
        - labelCol (str): Label column name
655
        - metricName (str): Metric name ("areaUnderROC" or "areaUnderPR")
656
        - weightCol (str): Weight column name
657
        - numBins (int): Number of bins used to down-sample the ROC and PR curves when computing the area
658
        """
659

660
class MulticlassClassificationEvaluator(Evaluator):
661
    """Evaluator for multiclass classification."""
662
    
663
    def __init__(self, predictionCol="prediction", labelCol="label", metricName="f1",
664
                 metricLabel=0.0, beta=1.0, probabilityCol="probability", eps=1e-15):
665
        """
666
        Initialize MulticlassClassificationEvaluator.
667
        
668
        Parameters:
669
        - predictionCol (str): Prediction column name
670
        - labelCol (str): Label column name
671
        - metricName (str): Metric name ("f1", "accuracy", "weightedPrecision", etc.)
672
        - metricLabel (float): Class whose metric is computed for by-label metrics (e.g. "precisionByLabel", "recallByLabel", "fMeasureByLabel")
673
        - beta (float): Beta value for F-beta score
674
        - probabilityCol (str): Probability column name
675
        - eps (float): Clipping bound for the "logLoss" metric; probabilities are clipped to [eps, 1 - eps] because log-loss is undefined at 0 and 1
676
        """
677

678
class RegressionEvaluator(Evaluator):
679
    """Evaluator for regression."""
680
    
681
    def __init__(self, predictionCol="prediction", labelCol="label", metricName="rmse",
682
                 weightCol=None, throughOrigin=False):
683
        """
684
        Initialize RegressionEvaluator.
685
        
686
        Parameters:
687
        - predictionCol (str): Prediction column name
688
        - labelCol (str): Label column name
689
        - metricName (str): Metric name ("rmse", "mse", "r2", "mae", "var")
690
        - weightCol (str): Weight column name
691
        - throughOrigin (bool): Whether to fit line through origin for r2
692
        """
693

694
class ClusteringEvaluator(Evaluator):
695
    """Evaluator for clustering."""
696
    
697
    def __init__(self, predictionCol="prediction", featuresCol="features",
698
                 metricName="silhouette", distanceMeasure="squaredEuclidean",
699
                 weightCol=None):
700
        """
701
        Initialize ClusteringEvaluator.
702
        
703
        Parameters:
704
        - predictionCol (str): Prediction column name
705
        - featuresCol (str): Features column name
706
        - metricName (str): Metric name ("silhouette")
707
        - distanceMeasure (str): Distance measure
708
        - weightCol (str): Weight column name
709
        """
710
```
711

712
### Hyperparameter Tuning
713

714
Tools for hyperparameter optimization and model selection.
715

716
```python { .api }
717
class ParamGridBuilder:
718
    """Builder for a param grid used in grid search-based model selection."""
719
    
720
    def __init__(self):
721
        """Initialize ParamGridBuilder."""
722
    
723
    def addGrid(self, param, values):
724
        """
725
        Add parameter values to the grid.
726
        
727
        Parameters:
728
        - param (Param): Parameter to tune
729
        - values (list): List of parameter values
730
        
731
        Returns:
732
        ParamGridBuilder
733
        """
734
    
735
    def build(self):
736
        """
737
        Build and return the parameter grid.
738
        
739
        Returns:
740
        list: List of parameter maps
741
        """
742

743
class CrossValidator(Estimator):
744
    """K-fold cross validation."""
745
    
746
    def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None,
747
                 numFolds=3, seed=None, parallelism=1, collectSubModels=False,
748
                 foldCol=""):
749
        """
750
        Initialize CrossValidator.
751
        
752
        Parameters:
753
        - estimator (Estimator): Estimator to cross-validate
754
        - estimatorParamMaps (list): Parameter maps to evaluate
755
        - evaluator (Evaluator): Evaluator for model selection
756
        - numFolds (int): Number of folds for cross validation
757
        - seed (int): Random seed
758
        - parallelism (int): Number of threads to use for fitting models
759
        - collectSubModels (bool): Whether to collect sub-models
760
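        - foldCol (str): Name of a column holding user-specified fold numbers (integers in [0, numFolds)); when set, no random k-fold split is performed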
761
        """
762

763
class TrainValidationSplit(Estimator):
764
    """Train-validation split for model selection."""
765
    
766
    def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None,
767
                 trainRatio=0.75, seed=None, parallelism=1, collectSubModels=False):
768
        """
769
        Initialize TrainValidationSplit.
770
        
771
        Parameters:
772
        - estimator (Estimator): Estimator to tune
773
        - estimatorParamMaps (list): Parameter maps to evaluate
774
        - evaluator (Evaluator): Evaluator for model selection
775
        - trainRatio (float): Ratio of training data
776
        - seed (int): Random seed
777
        - parallelism (int): Number of threads to use for fitting models
778
        - collectSubModels (bool): Whether to collect sub-models
779
        """
780
```
781

782
## Types
783

784
```python { .api }
785
from pyspark.ml.linalg import Vector, DenseVector, SparseVector, Vectors
786
from pyspark.ml.linalg import Matrix, DenseMatrix, SparseMatrix, Matrices
787

788
class Vector:
789
    """Abstract base class for ML vector types."""
790
    
791
    def toArray(self):
792
        """Convert to numpy array."""
793

794
class DenseVector(Vector):
795
    """Dense vector representation."""
796
    
797
    def __init__(self, ar):
798
        """Create from array-like object."""
799

800
class SparseVector(Vector):
801
    """Sparse vector representation."""
802
    
803
    def __init__(self, size, *args):
804
        """Create sparse vector."""
805

806
class Vectors:
807
    """Factory methods for creating vectors."""
808
    
809
    @staticmethod
810
    def dense(*values):
811
        """Create dense vector."""
812
    
813
    @staticmethod
814
    def sparse(size, *args):
815
        """Create sparse vector."""
816

817
class Matrix:
818
    """Abstract base class for ML matrix types."""
819
    
820
    def numRows(self):
821
        """Number of rows."""
822
    
823
    def numCols(self):
824
        """Number of columns."""
825

826
class DenseMatrix(Matrix):
827
    """Dense matrix representation."""
828

829
class SparseMatrix(Matrix):
830
    """Sparse matrix representation in CSC format."""
831

832
class Matrices:
833
    """Factory methods for creating matrices."""
834
    
835
    @staticmethod
836
    def dense(numRows, numCols, values):
837
        """Create dense matrix."""
838
    
839
    @staticmethod
840
    def sparse(numRows, numCols, colPtrs, rowIndices, values):
841
        """Create sparse matrix."""
842
```

Version

Tile

Files

docs/machine-learning.md

docs/machine-learning.md