Spark ML Local Library providing linear algebra and statistical utilities for local machine learning operations without requiring a distributed Spark cluster
npx @tessl/cli install tessl/maven-org-apache-spark--spark-mllib-local-2-11@2.2.0

Spark MLlib Local is a lightweight, dependency-minimal package that provides fundamental linear algebra operations and basic statistical utilities for machine learning tasks that can be executed locally without requiring a distributed Spark cluster framework.
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib-local_2.11</artifactId>
<version>2.2.3</version>
</dependency>

For SBT:
libraryDependencies += "org.apache.spark" %% "spark-mllib-local" % "2.2.3"

import org.apache.spark.ml.linalg.{Vector, DenseVector, SparseVector, Vectors}
import org.apache.spark.ml.linalg.{Matrix, DenseMatrix, SparseMatrix, Matrices}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

import org.apache.spark.ml.linalg.{Vectors, Matrices}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian
// Create vectors. Both have three elements so they can be combined in the
// element-wise operations below.
val dense = Vectors.dense(1.0, 2.0, 3.0)
val sparse = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))

// Create matrices
// 2 x 3 dense matrix; the values array is read in column-major order.
val denseMatrix = Matrices.dense(2, 3, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0))
// 3 x 3 sparse matrix given as column pointers, row indices and non-zero values.
val sparseMatrix = Matrices.sparse(3, 3, Array(0, 2, 3, 6), Array(0, 2, 1, 0, 1, 2), Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0))

// Vector operations
val vectorCopy = dense.copy
// Dot product computed on the plain arrays; zip would silently truncate if the
// sizes differed, which is why both vectors above have the same length.
val dotProduct = sparse.toArray.zip(dense.toArray).map { case (a, b) => a * b }.sum
val l2Norm = Vectors.norm(dense, 2.0)
// sqdist requires both vectors to have the same size.
val distance = Vectors.sqdist(dense, sparse)

// Matrix operations
// Densify the transposed operand before multiplying: in Spark 2.2.x the
// matrix-matrix multiply expects a DenseMatrix argument.
val matrixProduct = denseMatrix.multiply(sparseMatrix.transpose.toDense)
val transposed = denseMatrix.transpose
val compressed = sparseMatrix.compressed

// Statistical distribution
// Standard bivariate normal: zero mean, identity covariance.
val mean = Vectors.dense(0.0, 0.0)
val cov = Matrices.eye(2)
val gaussian = new MultivariateGaussian(mean, cov)
val density = gaussian.pdf(Vectors.dense(1.0, 1.0))

Spark MLlib Local is organized around several key components:
The library emphasizes performance through:
Core vector functionality providing dense and sparse representations with unified operations. Essential for feature representations and mathematical computations.
// Factory methods and vector-level utilities (API summary; signatures only).
object Vectors {
/** Creates a dense vector from the given values (varargs form). */
def dense(firstValue: Double, otherValues: Double*): Vector
/** Creates a dense vector from an array of values. */
def dense(values: Array[Double]): Vector
/**
 * Creates a sparse vector of length `size` from parallel arrays: `indices(k)` is the
 * position of the stored value `values(k)`.
 */
def sparse(size: Int, indices: Array[Int], values: Array[Double]): Vector
/** Creates a sparse vector of length `size` from (index, value) pairs. */
def sparse(size: Int, elements: Seq[(Int, Double)]): Vector
/** Creates a vector of length `size` whose elements are all zero. */
def zeros(size: Int): Vector
/** Returns the p-norm of `vector` (e.g. `p = 2.0` for the Euclidean norm). */
def norm(vector: Vector, p: Double): Double
/** Returns the squared distance between `v1` and `v2`; both must have the same size. */
def sqdist(v1: Vector, v2: Vector): Double
}
// Vector trait: operations shared by the dense and sparse representations.
trait Vector {
  /** Number of elements in the vector. */
  def size: Int

  /** Returns the vector's values as an array of doubles. */
  def toArray: Array[Double]

  /** Returns the value at index `i`. */
  def apply(i: Int): Double

  /** Returns a copy of this vector. */
  def copy: Vector

  /** Applies `f` to the (index, value) pair of every active entry. */
  def foreachActive(f: (Int, Double) => Unit): Unit

  /** Number of active (explicitly stored) entries. */
  def numActives: Int

  /** Number of entries whose value is non-zero. */
  def numNonzeros: Int

  /** Converts this vector to its sparse representation. */
  def toSparse: SparseVector

  /** Converts this vector to its dense representation. */
  def toDense: DenseVector

  /** Returns this vector in whichever of the dense/sparse forms uses less storage. */
  def compressed: Vector

  /** Index of the maximal element. */
  def argmax: Int
}

Comprehensive matrix functionality supporting both dense and sparse matrices with efficient storage formats and mathematical operations.
// Factory methods for creating matrices (API summary; signatures only).
object Matrices {
/** Creates a dense `numRows` x `numCols` matrix; `values` is read in column-major order. */
def dense(numRows: Int, numCols: Int, values: Array[Double]): Matrix
/**
 * Creates a sparse matrix in compressed sparse column form: `colPtrs` marks where each
 * column starts, `rowIndices` gives the row of each stored entry, `values` the entries.
 */
def sparse(numRows: Int, numCols: Int, colPtrs: Array[Int], rowIndices: Array[Int], values: Array[Double]): Matrix
/** Creates a `numRows` x `numCols` matrix filled with zeros. */
def zeros(numRows: Int, numCols: Int): Matrix
/** Creates a `numRows` x `numCols` matrix filled with ones. */
def ones(numRows: Int, numCols: Int): Matrix
/** Creates a dense `n` x `n` identity matrix. */
def eye(n: Int): Matrix
/** Creates a sparse `n` x `n` identity matrix. */
def speye(n: Int): Matrix
/** Creates a square matrix with the elements of `vector` on the diagonal. */
def diag(vector: Vector): Matrix
/** Concatenates the given matrices horizontally (side by side). */
def horzcat(matrices: Array[Matrix]): Matrix
/** Concatenates the given matrices vertically (stacked). */
def vertcat(matrices: Array[Matrix]): Matrix
}
// Matrix trait: operations shared by the dense and sparse representations.
trait Matrix {
  /** Number of rows. */
  def numRows: Int

  /** Number of columns. */
  def numCols: Int

  /** Returns the element at row `i`, column `j`. */
  def apply(i: Int, j: Int): Double

  /** Returns the transpose of this matrix. */
  def transpose: Matrix

  /**
   * Matrix-matrix product. The right-hand operand must be a `DenseMatrix`;
   * convert other matrices with `toDense` first.
   */
  def multiply(y: DenseMatrix): DenseMatrix

  /** Matrix-vector product. */
  def multiply(y: Vector): DenseVector

  /** Converts this matrix to its sparse representation. */
  def toSparse: SparseMatrix

  /** Converts this matrix to its dense representation. */
  def toDense: DenseMatrix

  /** Returns this matrix in whichever of the dense/sparse forms uses less storage. */
  def compressed: Matrix
}

Statistical distribution implementations for probability computations and machine learning algorithms.
/**
 * Multivariate Gaussian (normal) distribution.
 *
 * @param mean mean vector of the distribution
 * @param cov  covariance matrix of the distribution
 */
class MultivariateGaussian(val mean: Vector, val cov: Matrix) {
  /** Returns the density of this distribution at the point `x`. */
  def pdf(x: Vector): Double

  /** Returns the log-density of this distribution at the point `x`. */
  def logpdf(x: Vector): Double
}

Numerical testing utilities with tolerance-based comparisons for vectors, matrices, and doubles, essential for testing numerical algorithms.
/**
 * Tolerance-based comparison syntax for doubles, vectors and matrices.
 *
 * Each wrapper follows the same pattern: `absTol` / `relTol` build the right-hand
 * side of a comparison with an absolute or relative tolerance `eps`, and `~=` / `~==`
 * apply that comparison to the wrapped value.
 */
object TestingUtils {
  implicit class DoubleWithAlmostEquals(val x: Double) {
    /** Returns whether `x` is within tolerance of the right-hand side. */
    def ~=(r: CompareDoubleRightSide): Boolean
    /** Asserting form of `~=`: fails if the values are not within tolerance. */
    def ~==(r: CompareDoubleRightSide): Boolean
    /** Right-hand side comparing against `x` with absolute tolerance `eps`. */
    def absTol(eps: Double): CompareDoubleRightSide
    /** Right-hand side comparing against `x` with relative tolerance `eps`. */
    def relTol(eps: Double): CompareDoubleRightSide
  }

  implicit class VectorWithAlmostEquals(val x: Vector) {
    /** Returns whether `x` is within tolerance of the right-hand side. */
    def ~=(r: CompareVectorRightSide): Boolean
    /** Asserting form of `~=`: fails if the vectors are not within tolerance. */
    def ~==(r: CompareVectorRightSide): Boolean
    /** Right-hand side comparing against `x` with absolute tolerance `eps`. */
    def absTol(eps: Double): CompareVectorRightSide
    /** Right-hand side comparing against `x` with relative tolerance `eps`. */
    def relTol(eps: Double): CompareVectorRightSide
  }

  // Matrix counterpart of the wrappers above; pairs with CompareMatrixRightSide.
  implicit class MatrixWithAlmostEquals(val x: Matrix) {
    /** Returns whether `x` is within tolerance of the right-hand side. */
    def ~=(r: CompareMatrixRightSide): Boolean
    /** Asserting form of `~=`: fails if the matrices are not within tolerance. */
    def ~==(r: CompareMatrixRightSide): Boolean
    /** Right-hand side comparing against `x` with absolute tolerance `eps`. */
    def absTol(eps: Double): CompareMatrixRightSide
    /** Right-hand side comparing against `x` with relative tolerance `eps`. */
    def relTol(eps: Double): CompareMatrixRightSide
  }
}

// Vector hierarchy
// Common supertype of the dense and sparse vector implementations.
sealed trait Vector extends Serializable
// Dense vector: stores every element in `values`.
class DenseVector(val values: Array[Double]) extends Vector
// Sparse vector: `indices` and `values` are parallel arrays of the stored entries.
class SparseVector(override val size: Int, val indices: Array[Int], val values: Array[Double]) extends Vector

// Matrix hierarchy
// Common supertype of the dense and sparse matrix implementations.
sealed trait Matrix extends Serializable
class DenseMatrix(val numRows: Int, val numCols: Int, val values: Array[Double], override val isTransposed: Boolean) extends Matrix
class SparseMatrix(val numRows: Int, val numCols: Int, val colPtrs: Array[Int], val rowIndices: Array[Int], val values: Array[Double], override val isTransposed: Boolean) extends Matrix

// Right-hand sides of tolerance comparisons: `fun` is the comparison applied to
// (left value, `y`, `eps`); `method` names the kind of tolerance used.
case class CompareDoubleRightSide(fun: (Double, Double, Double) => Boolean, y: Double, eps: Double, method: String)
case class CompareVectorRightSide(fun: (Vector, Vector, Double) => Boolean, y: Vector, eps: Double, method: String)
case class CompareMatrixRightSide(fun: (Matrix, Matrix, Double) => Boolean, y: Matrix, eps: Double, method: String)