or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-context-rdds.md index.md legacy-mllib.md machine-learning.md pandas-api.md resource-management.md sql-dataframes.md streaming.md

docs/legacy-mllib.md

0

# Legacy MLlib

1

2

Legacy machine learning library providing RDD-based algorithms for classification, regression, clustering, and collaborative filtering. This library is maintained for backward compatibility, but new applications should use the DataFrame-based ML package (`pyspark.ml`).

3

4

## Capabilities

5

6

### Classification

7

8

RDD-based classification algorithms.

9

10

```python { .api }

11

class LogisticRegressionWithSGD:

12

"""Logistic regression using stochastic gradient descent."""

13

14

@classmethod

15

def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,

16

initialWeights=None, regParam=0.01, regType="l2", intercept=False,

17

validateData=True, convergenceTol=0.001):

18

"""

19

Train a logistic regression model.

20

21

Parameters:

22

- data: Training data RDD of LabeledPoint

23

- iterations (int): Number of iterations

24

- step (float): Step size for SGD

25

- miniBatchFraction (float): Fraction of data for mini-batch

26

- initialWeights (Vector): Initial weights

27

- regParam (float): Regularization parameter

28

- regType (str): Regularization type ("l1", "l2", "none")

29

- intercept (bool): Whether to add intercept

30

- validateData (bool): Whether to validate input data

31

- convergenceTol (float): Convergence tolerance

32

33

Returns:

34

LogisticRegressionModel

35

"""

36

37

class LogisticRegressionWithLBFGS:

38

"""Logistic regression using L-BFGS optimizer."""

39

40

@classmethod

41

def train(cls, data, iterations=100, initialWeights=None, regParam=0.0,

42

regType="l2", intercept=False, corrections=10, tolerance=1e-6,

43

validateData=True, numClasses=2):

44

"""Train a logistic regression model using L-BFGS."""

45

46

class SVMWithSGD:

47

"""Support Vector Machine using stochastic gradient descent."""

48

49

@classmethod

50

def train(cls, data, iterations=100, step=1.0, regParam=0.01,

51

miniBatchFraction=1.0, initialWeights=None, regType="l2",

52

intercept=False, validateData=True, convergenceTol=0.001):

53

"""Train an SVM model using SGD."""

54

55

class NaiveBayes:

56

"""Naive Bayes classifier."""

57

58

@classmethod

59

def train(cls, data, lambda_=1.0, modelType="multinomial"):

60

"""

61

Train a Naive Bayes model.

62

63

Parameters:

64

- data: Training data RDD of LabeledPoint

65

- lambda_ (float): Smoothing parameter

66

- modelType (str): Model type ("multinomial" or "bernoulli")

67

68

Returns:

69

NaiveBayesModel

70

"""

71

```

72

73

### Regression

74

75

RDD-based regression algorithms.

76

77

```python { .api }

78

class LinearRegressionWithSGD:

79

"""Linear regression using stochastic gradient descent."""

80

81

@classmethod

82

def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,

83

initialWeights=None, regParam=0.0, regType=None, intercept=False,

84

validateData=True, convergenceTol=0.001):

85

"""Train a linear regression model using SGD."""

86

87

class LassoWithSGD:

88

"""Lasso regression using stochastic gradient descent."""

89

90

@classmethod

91

def train(cls, data, iterations=100, step=1.0, regParam=0.01,

92

miniBatchFraction=1.0, initialWeights=None, intercept=False,

93

validateData=True, convergenceTol=0.001):

94

"""Train a Lasso regression model using SGD."""

95

96

class RidgeRegressionWithSGD:

97

"""Ridge regression using stochastic gradient descent."""

98

99

@classmethod

100

def train(cls, data, iterations=100, step=1.0, regParam=0.01,

101

miniBatchFraction=1.0, initialWeights=None, intercept=False,

102

validateData=True, convergenceTol=0.001):

103

"""Train a Ridge regression model using SGD."""

104

```

105

106

### Clustering

107

108

RDD-based clustering algorithms.

109

110

```python { .api }

111

class KMeans:

112

"""K-means clustering algorithm."""

113

114

@classmethod

115

def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||",

116

seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None):

117

"""

118

Train a k-means model.

119

120

Parameters:

121

- rdd: Training data RDD of vectors

122

- k (int): Number of clusters

123

- maxIterations (int): Maximum number of iterations

124

- runs (int): Number of runs to execute

125

- initializationMode (str): Initialization algorithm

126

- seed (int): Random seed

127

- initializationSteps (int): Number of steps for k-means|| initialization

128

- epsilon (float): Convergence tolerance

129

- initialModel: Initial cluster centers

130

131

Returns:

132

KMeansModel

133

"""

134

135

class GaussianMixture:

136

"""Gaussian Mixture Model clustering."""

137

138

@classmethod

139

def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None,

140

initialModel=None):

141

"""

142

Train a Gaussian Mixture Model.

143

144

Parameters:

145

- rdd: Training data RDD of vectors

146

- k (int): Number of components

147

- convergenceTol (float): Convergence tolerance

148

- maxIterations (int): Maximum number of iterations

149

- seed (int): Random seed

150

- initialModel: Initial model

151

152

Returns:

153

GaussianMixtureModel

154

"""

155

156

class LDA:

157

"""Latent Dirichlet Allocation."""

158

159

@classmethod

160

def train(cls, rdd, k=10, maxIterations=20, docConcentration=-1.0,

161

topicConcentration=-1.0, seed=None, checkpointInterval=10,

162

optimizer="em"):

163

"""

164

Train an LDA model.

165

166

Parameters:

167

- rdd: Training data RDD of (document ID, word counts) pairs

168

- k (int): Number of topics

169

- maxIterations (int): Maximum number of iterations

170

- docConcentration (float): Document concentration parameter

171

- topicConcentration (float): Topic concentration parameter

172

- seed (int): Random seed

173

- checkpointInterval (int): Checkpoint interval

174

- optimizer (str): Optimizer ("em" or "online")

175

176

Returns:

177

LDAModel

178

"""

179

```

180

181

### Linear Algebra

182

183

Vector and matrix operations for MLlib.

184

185

```python { .api }

186

class Vector:

187

"""Abstract base class for vectors."""

188

189

def toArray(self):

190

"""Convert to numpy array."""

191

192

class DenseVector(Vector):

193

"""Dense vector."""

194

195

def __init__(self, ar):

196

"""Create dense vector from array."""

197

198

class SparseVector(Vector):

199

"""Sparse vector."""

200

201

def __init__(self, size, *args):

202

"""Create sparse vector."""

203

204

class Vectors:

205

"""Factory methods for vectors."""

206

207

@staticmethod

208

def dense(*values):

209

"""Create dense vector."""

210

211

@staticmethod

212

def sparse(size, *args):

213

"""Create sparse vector."""

214

215

@staticmethod

216

def zeros(size):

217

"""Create zero vector."""

218

219

class Matrix:

220

"""Abstract base class for matrices."""

221

222

def numRows(self):

223

"""Number of rows."""

224

225

def numCols(self):

226

"""Number of columns."""

227

228

class DenseMatrix(Matrix):

229

"""Dense matrix."""

230

231

def __init__(self, numRows, numCols, values, isTransposed=False):

232

"""Create dense matrix."""

233

234

class Matrices:

235

"""Factory methods for matrices."""

236

237

@staticmethod

238

def dense(numRows, numCols, values):

239

"""Create dense matrix."""

240

241

class LabeledPoint:

242

"""Labeled data point for supervised learning."""

243

244

def __init__(self, label, features):

245

"""

246

Create labeled point.

247

248

Parameters:

249

- label (float): Point label

250

- features (Vector): Feature vector

251

"""

252

```

253

254

### Recommendation

255

256

Collaborative filtering for recommendation systems.

257

258

```python { .api }

259

class ALS:

260

"""Alternating Least Squares matrix factorization."""

261

262

@classmethod

263

def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False,

264

seed=None):

265

"""

266

Train an ALS model.

267

268

Parameters:

269

- ratings: RDD of Rating objects

270

- rank (int): Number of latent factors

271

- iterations (int): Number of iterations

272

- lambda_ (float): Regularization parameter

273

- blocks (int): Number of blocks for parallel computation

274

- nonnegative (bool): Whether to enforce non-negative constraints

275

- seed (int): Random seed

276

277

Returns:

278

MatrixFactorizationModel

279

"""

280

281

@classmethod

282

def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1,

283

alpha=0.01, nonnegative=False, seed=None):

284

"""Train ALS model for implicit feedback data."""

285

286

class Rating:

287

"""Rating for collaborative filtering."""

288

289

def __init__(self, user, product, rating):

290

"""

291

Create rating.

292

293

Parameters:

294

- user (int): User ID

295

- product (int): Product ID

296

- rating (float): Rating value

297

"""

298

```

299

300

### Feature Extraction

301

302

Feature extraction and transformation utilities.

303

304

```python { .api }

305

class HashingTF:

306

"""Hashing Term Frequency."""

307

308

def __init__(self, numFeatures=1048576):

309

"""

310

Initialize HashingTF.

311

312

Parameters:

313

- numFeatures (int): Number of features/buckets

314

"""

315

316

def transform(self, document):

317

"""Transform document to TF vector."""

318

319

class IDF:

320

"""Inverse Document Frequency."""

321

322

def __init__(self, minDocFreq=0):

323

"""

324

Initialize IDF.

325

326

Parameters:

327

- minDocFreq (int): Minimum document frequency

328

"""

329

330

def fit(self, dataset):

331

"""

332

Compute IDF from dataset.

333

334

Parameters:

335

- dataset: RDD of TF vectors

336

337

Returns:

338

IDFModel

339

"""

340

341

class StandardScaler:

342

"""Feature standardization."""

343

344

def __init__(self, withMean=False, withStd=True):

345

"""

346

Initialize StandardScaler.

347

348

Parameters:

349

- withMean (bool): Whether to center data

350

- withStd (bool): Whether to scale to unit variance

351

"""

352

353

def fit(self, data):

354

"""

355

Compute statistics for scaling.

356

357

Parameters:

358

- data: RDD of vectors

359

360

Returns:

361

StandardScalerModel

362

"""

363

364

class Word2Vec:

365

"""Word2Vec model for word embeddings."""

366

367

def __init__(self, vectorSize=100, learningRate=0.025, numPartitions=1,

368

numIterations=1, seed=None, minCount=5, windowSize=5):

369

"""

370

Initialize Word2Vec.

371

372

Parameters:

373

- vectorSize (int): Size of word vectors

374

- learningRate (float): Learning rate

375

- numPartitions (int): Number of partitions

376

- numIterations (int): Number of iterations

377

- seed (int): Random seed

378

- minCount (int): Minimum word frequency

379

- windowSize (int): Context window size

380

"""

381

382

def fit(self, data):

383

"""

384

Train Word2Vec model.

385

386

Parameters:

387

- data: RDD of sentences (lists of words)

388

389

Returns:

390

Word2VecModel

391

"""

392

```

393

394

## Types

395

396

```python { .api }

397

class LabeledPoint:

398

"""Labeled data point for supervised learning."""

399

400

def __init__(self, label, features):

401

"""

402

Create labeled point.

403

404

Parameters:

405

- label (float): Point label

406

- features (Vector): Feature vector

407

"""

408

409

class Rating:

410

"""Rating for collaborative filtering."""

411

412

def __init__(self, user, product, rating):

413

"""

414

Create rating.

415

416

Parameters:

417

- user (int): User ID

418

- product (int): Product ID

419

- rating (float): Rating value

420

"""

421

```