Apache Spark's scalable machine learning library providing comprehensive ML algorithms and utilities for large-scale data processing
npx @tessl/cli install tessl/maven-org-apache-spark--spark-mllib_2-13@4.0.0

Apache Spark's scalable machine learning library providing comprehensive ML algorithms and utilities for large-scale data processing. MLlib delivers high-performance distributed machine learning that scales from single machines to large clusters.
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.13</artifactId>
<version>4.0.0</version>
</dependency>

libraryDependencies += "org.apache.spark" %% "spark-mllib" % "4.0.0"

implementation 'org.apache.spark:spark-mllib_2.13:4.0.0'

// Modern DataFrame-based API (recommended)
import org.apache.spark.ml._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.regression._
import org.apache.spark.ml.clustering._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.fpm._
// Legacy RDD-based API (maintained for compatibility)
import org.apache.spark.mllib.classification._
import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.clustering._

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer}
import org.apache.spark.ml.Pipeline
// Initialize Spark session
val spark = SparkSession.builder()
  .appName("MLlib Example")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._
// Build a small labelled dataset with raw numeric columns.
// (Data read with format("libsvm") already exposes `label` and `features` columns,
// so it cannot be passed through an assembler that reads `feature1`/`feature2`
// and writes `features`; the raw columns are created explicitly here instead.)
val data = Seq(
  (1.0, 2.0, 1.3),
  (0.0, 0.1, -0.5),
  (1.0, 1.8, 1.0),
  (0.0, -0.2, 0.4),
  (1.0, 2.5, 0.9),
  (0.0, 0.3, -1.1)
).toDF("label", "feature1", "feature2")
// Prepare features: combine the raw columns into the single vector column the classifier reads
val assembler = new VectorAssembler()
  .setInputCols(Array("feature1", "feature2"))
  .setOutputCol("features")
// Create classifier (uses the default `features` and `label` columns)
val lr = new LogisticRegression()
  .setMaxIter(20)
  .setRegParam(0.3)
  .setElasticNetParam(0.8)
// Create pipeline: stages run in order, so the assembler feeds the classifier
val pipeline = new Pipeline()
  .setStages(Array(assembler, lr))
// Train model
val model = pipeline.fit(data)
// Make predictions
val predictions = model.transform(data)
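predictions.show()

MLlib provides two complementary APIs built on Spark's distributed computing engine: the modern DataFrame-based API (`org.apache.spark.ml`, recommended) and the legacy RDD-based API (`org.apache.spark.mllib`, maintained for compatibility).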
Supervised learning algorithms for predicting categorical outcomes, including binary and multiclass classification with probabilistic predictions and comprehensive model evaluation.
// All classifiers listed here produce class probabilities, so they are declared against the
// probabilistic base classes (ProbabilisticClassifier is itself a Classifier).
// Logistic Regression
class LogisticRegression extends ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel]
class LogisticRegressionModel extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel]
// Decision Trees
class DecisionTreeClassifier extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel]
class DecisionTreeClassificationModel extends ProbabilisticClassificationModel[Vector, DecisionTreeClassificationModel]
// Random Forest
class RandomForestClassifier extends ProbabilisticClassifier[Vector, RandomForestClassifier, RandomForestClassificationModel]
class RandomForestClassificationModel extends ProbabilisticClassificationModel[Vector, RandomForestClassificationModel]

Supervised learning algorithms for predicting continuous numerical values, including linear models, tree-based methods, and survival analysis with comprehensive residual analysis.
// Each regressor is declared as Regressor[Vector, <itself>, <its model>]; the model class
// named in the third type argument is declared on the following line.
// Linear Regression
class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegressionModel]
class LinearRegressionModel extends RegressionModel[Vector, LinearRegressionModel]
// Decision Tree Regression
class DecisionTreeRegressor extends Regressor[Vector, DecisionTreeRegressor, DecisionTreeRegressionModel]
class DecisionTreeRegressionModel extends RegressionModel[Vector, DecisionTreeRegressionModel]
// Random Forest Regression
class RandomForestRegressor extends Regressor[Vector, RandomForestRegressor, RandomForestRegressionModel]
class RandomForestRegressionModel extends RegressionModel[Vector, RandomForestRegressionModel]

Unsupervised learning algorithms for discovering hidden patterns and grouping similar data points, including partitioning, hierarchical, and probabilistic clustering methods.
// An Estimator is parameterized by the Model it fits (Estimator[M <: Model[M]]),
// so each clustering estimator names its model class, not itself.
// K-Means Clustering
class KMeans extends Estimator[KMeansModel] with KMeansParams
class KMeansModel extends Model[KMeansModel] with KMeansParams
// Gaussian Mixture Model
class GaussianMixture extends Estimator[GaussianMixtureModel] with GaussianMixtureParams
class GaussianMixtureModel extends Model[GaussianMixtureModel] with GaussianMixtureParams
// Latent Dirichlet Allocation
class LDA extends Estimator[LDAModel] with LDAParams
abstract class LDAModel extends Model[LDAModel] with LDAParams

Comprehensive data preprocessing and feature transformation utilities for preparing raw data for machine learning algorithms, including text processing, categorical encoding, and numerical scaling.
// Transformers are applied directly; Estimators must be fit first and yield the
// model type named in their type argument.
// Vector Assembly and Manipulation
class VectorAssembler extends Transformer
class VectorSlicer extends Transformer
class VectorIndexer extends Estimator[VectorIndexerModel]
// Scaling and Normalization
class StandardScaler extends Estimator[StandardScalerModel]
class MinMaxScaler extends Estimator[MinMaxScalerModel]
class Normalizer extends Transformer
// Categorical Features
class StringIndexer extends Estimator[StringIndexerModel]
// Since Spark 3.0 OneHotEncoder is an Estimator (it must be fit to learn category sizes)
class OneHotEncoder extends Estimator[OneHotEncoderModel]
class IndexToString extends Transformer

Comprehensive model evaluation metrics and automated hyperparameter tuning capabilities for assessing model performance and optimizing ML pipelines.
// Evaluators
// Evaluator is the abstract base; the three concrete evaluators below extend it.
abstract class Evaluator extends Params
class BinaryClassificationEvaluator extends Evaluator
class MulticlassClassificationEvaluator extends Evaluator
class RegressionEvaluator extends Evaluator
// Model Selection
// Both tuning classes are Estimators parameterized by their own *Model result type.
class CrossValidator extends Estimator[CrossValidatorModel]
class TrainValidationSplit extends Estimator[TrainValidationSplitModel]
class ParamGridBuilder

Collaborative filtering algorithms for building recommendation engines, including matrix factorization techniques optimized for large-scale user-item interaction datasets.
// Alternating Least Squares
// ALS is an Estimator parameterized by ALSModel, the model type it produces.
class ALS extends Estimator[ALSModel] with ALSParams
class ALSModel extends Model[ALSModel] with ALSParams

Core abstractions and utilities for building composable machine learning workflows with automated parameter management, model persistence, and metadata handling.
// Core Pipeline Classes
// Estimator and Transformer are both PipelineStages; a Model is itself a Transformer.
// The bound M <: Model[M] ties every Estimator to the Model type it is parameterized by.
abstract class Estimator[M <: Model[M]] extends PipelineStage
abstract class Transformer extends PipelineStage
abstract class Model[M <: Model[M]] extends Transformer
// A Pipeline is an Estimator parameterized by PipelineModel, which is a Model (and so a Transformer).
class Pipeline extends Estimator[PipelineModel]
class PipelineModel extends Model[PipelineModel]
// Parameter System
trait Params
class Param[T]
class ParamMap

Distributed linear algebra operations and data structures optimized for large-scale numerical computations across cluster nodes.
// Vector Types
// Vector is a sealed trait: DenseVector and SparseVector are its only implementations,
// and instances are normally created through the Vectors factory object.
sealed trait Vector
class DenseVector extends Vector
class SparseVector extends Vector
object Vectors
// Matrix Types
// Matrix is likewise a sealed trait with dense and sparse implementations.
sealed trait Matrix
class DenseMatrix extends Matrix
class SparseMatrix extends Matrix
object Matrices

Algorithms for discovering frequent patterns, association rules, and sequences in large datasets, essential for market basket analysis and recommendation systems.
// FP-Growth Algorithm
class FPGrowth extends Estimator[FPGrowthModel] with FPGrowthParams
class FPGrowthModel extends Model[FPGrowthModel] with FPGrowthParams
// PrefixSpan Algorithm
// In the DataFrame-based API PrefixSpan is a Params holder, not an Estimator: there is no
// fitted model class, and the frequent sequential patterns are returned directly as a DataFrame.
final class PrefixSpan extends Params {
  def findFrequentSequentialPatterns(dataset: Dataset[_]): DataFrame
}

Note: Frequent Pattern Mining capabilities are included in the core MLlib package.
// Core ML Types
import org.apache.spark.ml.linalg.{Vector, DenseVector, SparseVector, Matrix, DenseMatrix, SparseMatrix}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
// Pipeline Parameter Types
import org.apache.spark.ml.param.{Param, ParamMap, Params}
import org.apache.spark.ml.util.{Identifiable, MLWritable, MLReadable}
// Algorithm-Specific Types
import org.apache.spark.ml.classification.{ClassificationModel, Classifier, ProbabilisticClassifier}
import org.apache.spark.ml.regression.{RegressionModel, Regressor}
// org.apache.spark.ml.clustering has no shared ClusteringModel base class; import the concrete models
import org.apache.spark.ml.clustering.{GaussianMixtureModel, KMeansModel, LDAModel}
// org.apache.spark.ml.fpm provides PrefixSpan but no PrefixSpanModel
import org.apache.spark.ml.fpm.{FPGrowth, FPGrowthModel, PrefixSpan}
// Parameter Traits
// NOTE(review): these are simplified placeholders; the corresponding Spark traits are believed
// to be internal and named differently (e.g. ClassifierParams) — confirm before relying on them.
trait LogisticRegressionParams extends Params
trait ClassificationParams extends Params
trait RegressionParams extends Params