0
# Legacy MLlib
1
2
Legacy machine learning library providing RDD-based algorithms for classification, regression, clustering, and collaborative filtering, along with supporting linear algebra types and feature extraction utilities. This library is maintained for backward compatibility, but new applications should use the DataFrame-based ML package.
3
4
## Capabilities
5
6
### Classification
7
8
RDD-based classification algorithms.
9
10
```python { .api }
11
class LogisticRegressionWithSGD:
    """Logistic regression using stochastic gradient descent."""

    @classmethod
    def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
              initialWeights=None, regParam=0.01, regType="l2", intercept=False,
              validateData=True, convergenceTol=0.001):
        """
        Train a logistic regression model.

        Parameters:
        - data: Training data RDD of LabeledPoint
        - iterations (int): Number of iterations
        - step (float): Step size for SGD
        - miniBatchFraction (float): Fraction of data for mini-batch
        - initialWeights (Vector): Initial weights
        - regParam (float): Regularization parameter
        - regType (str): Regularization type ("l1", "l2", "none")
        - intercept (bool): Whether to add intercept
        - validateData (bool): Whether to validate input data
        - convergenceTol (float): Convergence tolerance

        Returns:
        LogisticRegressionModel
        """
36
37
class LogisticRegressionWithLBFGS:
    """Logistic regression using L-BFGS optimizer."""

    @classmethod
    def train(cls, data, iterations=100, initialWeights=None, regParam=0.0,
              regType="l2", intercept=False, corrections=10, tolerance=1e-6,
              validateData=True, numClasses=2):
        """Train a logistic regression model using L-BFGS."""
45
46
class SVMWithSGD:
    """Support Vector Machine using stochastic gradient descent."""

    @classmethod
    def train(cls, data, iterations=100, step=1.0, regParam=0.01,
              miniBatchFraction=1.0, initialWeights=None, regType="l2",
              intercept=False, validateData=True, convergenceTol=0.001):
        """Train an SVM model using SGD."""
54
55
class NaiveBayes:
    """Naive Bayes classifier."""

    @classmethod
    def train(cls, data, lambda_=1.0, modelType="multinomial"):
        """
        Train a Naive Bayes model.

        Parameters:
        - data: Training data RDD of LabeledPoint
        - lambda_ (float): Smoothing parameter
        - modelType (str): Model type ("multinomial" or "bernoulli")

        Returns:
        NaiveBayesModel
        """
71
```
72
73
### Regression
74
75
RDD-based regression algorithms.
76
77
```python { .api }
78
class LinearRegressionWithSGD:
    """Linear regression using stochastic gradient descent."""

    @classmethod
    def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
              initialWeights=None, regParam=0.0, regType=None, intercept=False,
              validateData=True, convergenceTol=0.001):
        """Train a linear regression model using SGD."""
86
87
class LassoWithSGD:
    """Lasso regression using stochastic gradient descent."""

    @classmethod
    def train(cls, data, iterations=100, step=1.0, regParam=0.01,
              miniBatchFraction=1.0, initialWeights=None, intercept=False,
              validateData=True, convergenceTol=0.001):
        """Train a Lasso regression model using SGD."""
95
96
class RidgeRegressionWithSGD:
    """Ridge regression using stochastic gradient descent."""

    @classmethod
    def train(cls, data, iterations=100, step=1.0, regParam=0.01,
              miniBatchFraction=1.0, initialWeights=None, intercept=False,
              validateData=True, convergenceTol=0.001):
        """Train a Ridge regression model using SGD."""
104
```
105
106
### Clustering
107
108
RDD-based clustering algorithms.
109
110
```python { .api }
111
class KMeans:
    """K-means clustering algorithm."""

    @classmethod
    def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||",
              seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None):
        """
        Train a k-means model.

        Parameters:
        - rdd: Training data RDD of vectors
        - k (int): Number of clusters
        - maxIterations (int): Maximum number of iterations
        - runs (int): Number of runs to execute
        - initializationMode (str): Initialization algorithm
        - seed (int): Random seed
        - initializationSteps (int): Number of steps for k-means|| initialization
        - epsilon (float): Convergence tolerance
        - initialModel: Initial cluster centers

        Returns:
        KMeansModel
        """
134
135
class GaussianMixture:
    """Gaussian Mixture Model clustering."""

    @classmethod
    def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None,
              initialModel=None):
        """
        Train a Gaussian Mixture Model.

        Parameters:
        - rdd: Training data RDD of vectors
        - k (int): Number of components
        - convergenceTol (float): Convergence tolerance
        - maxIterations (int): Maximum number of iterations
        - seed (int): Random seed
        - initialModel: Initial model

        Returns:
        GaussianMixtureModel
        """
155
156
class LDA:
    """Latent Dirichlet Allocation."""

    @classmethod
    def train(cls, rdd, k=10, maxIterations=20, docConcentration=-1.0,
              topicConcentration=-1.0, seed=None, checkpointInterval=10,
              optimizer="em"):
        """
        Train an LDA model.

        Parameters:
        - rdd: Training data RDD of (document ID, word counts) pairs
        - k (int): Number of topics
        - maxIterations (int): Maximum number of iterations
        - docConcentration (float): Document concentration parameter
        - topicConcentration (float): Topic concentration parameter
        - seed (int): Random seed
        - checkpointInterval (int): Checkpoint interval
        - optimizer (str): Optimizer ("em" or "online")

        Returns:
        LDAModel
        """
179
```
180
181
### Linear Algebra
182
183
Vector and matrix operations for MLlib.
184
185
```python { .api }
186
class Vector:
    """Abstract base class for vectors."""

    def toArray(self):
        """Convert to numpy array."""
191
192
class DenseVector(Vector):
    """Dense vector."""

    def __init__(self, ar):
        """Create dense vector from array."""
197
198
class SparseVector(Vector):
    """Sparse vector."""

    def __init__(self, size, *args):
        """Create sparse vector."""
203
204
class Vectors:
    """Factory methods for vectors."""

    @staticmethod
    def dense(*values):
        """Create dense vector."""

    @staticmethod
    def sparse(size, *args):
        """Create sparse vector."""

    @staticmethod
    def zeros(size):
        """Create zero vector."""
218
219
class Matrix:
    """Abstract base class for matrices."""

    def numRows(self):
        """Number of rows."""

    def numCols(self):
        """Number of columns."""
227
228
class DenseMatrix(Matrix):
    """Dense matrix."""

    def __init__(self, numRows, numCols, values, isTransposed=False):
        """Create dense matrix."""
233
234
class Matrices:
    """Factory methods for matrices."""

    @staticmethod
    def dense(numRows, numCols, values):
        """Create dense matrix."""
240
241
class LabeledPoint:
    """Labeled data point for supervised learning."""

    def __init__(self, label, features):
        """
        Create labeled point.

        Parameters:
        - label (float): Point label
        - features (Vector): Feature vector
        """
252
```
253
254
### Recommendation
255
256
Collaborative filtering for recommendation systems.
257
258
```python { .api }
259
class ALS:
    """Alternating Least Squares matrix factorization."""

    @classmethod
    def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False,
              seed=None):
        """
        Train an ALS model.

        Parameters:
        - ratings: RDD of Rating objects
        - rank (int): Number of latent factors
        - iterations (int): Number of iterations
        - lambda_ (float): Regularization parameter
        - blocks (int): Number of blocks for parallel computation
        - nonnegative (bool): Whether to enforce non-negative constraints
        - seed (int): Random seed

        Returns:
        MatrixFactorizationModel
        """

    @classmethod
    def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1,
                      alpha=0.01, nonnegative=False, seed=None):
        """Train ALS model for implicit feedback data."""
285
286
class Rating:
    """Rating for collaborative filtering."""

    def __init__(self, user, product, rating):
        """
        Create rating.

        Parameters:
        - user (int): User ID
        - product (int): Product ID
        - rating (float): Rating value
        """
298
```
299
300
### Feature Extraction
301
302
Feature extraction and transformation utilities.
303
304
```python { .api }
305
class HashingTF:
    """Hashing Term Frequency."""

    def __init__(self, numFeatures=1048576):
        """
        Initialize HashingTF.

        Parameters:
        - numFeatures (int): Number of features/buckets
        """

    def transform(self, document):
        """Transform document to TF vector."""
318
319
class IDF:
    """Inverse Document Frequency."""

    def __init__(self, minDocFreq=0):
        """
        Initialize IDF.

        Parameters:
        - minDocFreq (int): Minimum document frequency
        """

    def fit(self, dataset):
        """
        Compute IDF from dataset.

        Parameters:
        - dataset: RDD of TF vectors

        Returns:
        IDFModel
        """
340
341
class StandardScaler:
    """Feature standardization."""

    def __init__(self, withMean=False, withStd=True):
        """
        Initialize StandardScaler.

        Parameters:
        - withMean (bool): Whether to center data
        - withStd (bool): Whether to scale to unit variance
        """

    def fit(self, data):
        """
        Compute statistics for scaling.

        Parameters:
        - data: RDD of vectors

        Returns:
        StandardScalerModel
        """
363
364
class Word2Vec:
    """Word2Vec model for word embeddings."""

    def __init__(self, vectorSize=100, learningRate=0.025, numPartitions=1,
                 numIterations=1, seed=None, minCount=5, windowSize=5):
        """
        Initialize Word2Vec.

        Parameters:
        - vectorSize (int): Size of word vectors
        - learningRate (float): Learning rate
        - numPartitions (int): Number of partitions
        - numIterations (int): Number of iterations
        - seed (int): Random seed
        - minCount (int): Minimum word frequency
        - windowSize (int): Context window size
        """

    def fit(self, data):
        """
        Train Word2Vec model.

        Parameters:
        - data: RDD of sentences (lists of words)

        Returns:
        Word2VecModel
        """
392
```
393
394
## Types
395
396
```python { .api }
397
class LabeledPoint:
    """Labeled data point for supervised learning."""

    def __init__(self, label, features):
        """
        Create labeled point.

        Parameters:
        - label (float): Point label
        - features (Vector): Feature vector
        """
408
409
class Rating:
    """Rating for collaborative filtering."""

    def __init__(self, user, product, rating):
        """
        Create rating.

        Parameters:
        - user (int): User ID
        - product (int): Product ID
        - rating (float): Rating value
        """
421
```