Linear algebra data structures and operations, both local and distributed, optimized for large-scale numerical computation across cluster nodes.
Core vector data structures and operations for representing feature vectors and model parameters.
/**
* Abstract base class for vectors
*/
abstract class Vector extends Serializable {
def size: Int
def apply(i: Int): Double
def copy: Vector
def foreachActive(f: (Int, Double) => Unit): Unit
def numActives: Int
def numNonzeros: Int
def toArray: Array[Double]
def toSparse: SparseVector
def toDense: DenseVector
def compressed: Vector
def argmax: Int
def dot(v: Vector): Double
def equals(other: Any): Boolean
def hashCode(): Int
def toString: String
}
/**
* Dense vector implementation storing all values
*/
class DenseVector(val values: Array[Double]) extends Vector {
def size: Int = values.length
def apply(i: Int): Double = values(i)
def copy: DenseVector = new DenseVector(values.clone())
def update(i: Int, value: Double): Unit = values(i) = value
def dot(other: Vector): Double
def norm(p: Double): Double
}
/**
* Sparse vector implementation storing only non-zero values
*/
class SparseVector(
override val size: Int,
val indices: Array[Int],
val values: Array[Double]
) extends Vector {
def apply(i: Int): Double = {
  // indices must be sorted in strictly increasing order for binary search to work
  val idx = java.util.Arrays.binarySearch(indices, i)
  if (idx >= 0) values(idx) else 0.0
}
def copy: SparseVector = new SparseVector(size, indices.clone(), values.clone())
def dot(other: Vector): Double
def norm(p: Double): Double
}
/**
* Vector factory methods and utilities
*/
object Vectors {
def dense(firstValue: Double, otherValues: Double*): DenseVector
def dense(values: Array[Double]): DenseVector
def sparse(size: Int, elements: Seq[(Int, Double)]): SparseVector
def sparse(size: Int, indices: Array[Int], values: Array[Double]): SparseVector
def zeros(size: Int): DenseVector
def norm(vector: Vector, p: Double): Double
def sqdist(v1: Vector, v2: Vector): Double
def fromML(v: org.apache.spark.mllib.linalg.Vector): Vector
def fromBreeze(bv: breeze.linalg.Vector[Double]): Vector
}

Usage Example:
import org.apache.spark.ml.linalg.{Vector, Vectors}
// Create dense vector
val denseVec = Vectors.dense(1.0, 2.0, 3.0, 4.0)
println(s"Dense vector: $denseVec")
// Create sparse vector
val sparseVec = Vectors.sparse(10, Array(0, 2, 9), Array(1.0, 3.0, 5.0))
println(s"Sparse vector: $sparseVec")
// Vector operations
val norm = Vectors.norm(denseVec, 2.0)
println(s"L2 norm: $norm")
// Squared Euclidean distance requires vectors of the same size
val otherVec = Vectors.dense(2.0, 0.0, 1.0, 3.0)
val distance = Vectors.sqdist(denseVec, otherVec)
println(s"Squared distance: $distance")

Matrix data structures and operations for representing datasets and model parameters.
/**
* Abstract base class for matrices
*/
abstract class Matrix extends Serializable {
def numRows: Int
def numCols: Int
def apply(i: Int, j: Int): Double
def copy: Matrix
def foreachActive(f: (Int, Int, Double) => Unit): Unit
def numActives: Int
def numNonzeros: Int
def toArray: Array[Double]
def isTransposed: Boolean
def asML: org.apache.spark.mllib.linalg.Matrix
def toSparse: SparseMatrix
def toDense: DenseMatrix
def transpose: Matrix
def multiply(y: DenseVector): DenseVector
def multiply(y: DenseMatrix): DenseMatrix
def equals(other: Any): Boolean
def hashCode(): Int
def toString: String
}
/**
* Dense matrix implementation storing all values in column-major order
*/
class DenseMatrix(
val numRows: Int,
val numCols: Int,
val values: Array[Double],
val isTransposed: Boolean = false
) extends Matrix {
def apply(i: Int, j: Int): Double = {
  // column-major storage; when isTransposed, values are laid out row-major
  if (isTransposed) values(i * numCols + j)
  else values(i + j * numRows)
}
def copy: DenseMatrix = new DenseMatrix(numRows, numCols, values.clone(), isTransposed)
def update(i: Int, j: Int, value: Double): Unit = {
  if (isTransposed) values(i * numCols + j) = value
  else values(i + j * numRows) = value
}
}
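
To make the column-major convention concrete, here is a small illustrative snippet (it assumes only the DenseMatrix class sketched above):

// 3 x 2 matrix with columns (1, 3, 5) and (2, 4, 6), stored column-major
val m = new DenseMatrix(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
println(m(1, 1)) // 4.0 = values(1 + 1 * 3)
println(m(2, 0)) // 5.0 = values(2 + 0 * 3)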
/**
* Sparse matrix implementation storing only non-zero values in compressed sparse column format
*/
class SparseMatrix(
val numRows: Int,
val numCols: Int,
val colPtrs: Array[Int],
val rowIndices: Array[Int],
val values: Array[Double],
val isTransposed: Boolean = false
) extends Matrix {
def apply(i: Int, j: Int): Double = {
  // CSC layout; when isTransposed the arrays describe CSR order instead,
  // so colPtrs indexes rows and rowIndices holds column indices
  val (ptr, key) = if (isTransposed) (i, j) else (j, i)
  val startIdx = colPtrs(ptr)
  val endIdx = colPtrs(ptr + 1)
  val idx = java.util.Arrays.binarySearch(rowIndices, startIdx, endIdx, key)
  if (idx >= 0) values(idx) else 0.0
}
def copy: SparseMatrix = new SparseMatrix(
numRows, numCols, colPtrs.clone(), rowIndices.clone(), values.clone(), isTransposed
)
}
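
As a concrete picture of the CSC layout (values grouped column by column, with colPtrs marking where each column begins), a small sketch using the class above:

// The 3 x 2 matrix with 9.0 at (0, 0) and 6.0 at (2, 1), all other entries zero
val sm = new SparseMatrix(
  3, 2,
  Array(0, 1, 2), // colPtrs: column j occupies values(colPtrs(j) until colPtrs(j + 1))
  Array(0, 2),    // rowIndices of the stored values
  Array(9.0, 6.0) // stored values
)
println(sm(0, 0)) // 9.0
println(sm(1, 1)) // 0.0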
/**
* Matrix factory methods and utilities
*/
object Matrices {
def dense(numRows: Int, numCols: Int, values: Array[Double]): DenseMatrix
def sparse(numRows: Int, numCols: Int, entries: Seq[(Int, Int, Double)]): SparseMatrix
def sparse(
numRows: Int,
numCols: Int,
colPtrs: Array[Int],
rowIndices: Array[Int],
values: Array[Double]
): SparseMatrix
def eye(n: Int): DenseMatrix
def zeros(numRows: Int, numCols: Int): DenseMatrix
def ones(numRows: Int, numCols: Int): DenseMatrix
def diag(vector: Vector): DenseMatrix
def rand(numRows: Int, numCols: Int, rng: java.util.Random): DenseMatrix
def randn(numRows: Int, numCols: Int, rng: java.util.Random): DenseMatrix
def horzcat(matrices: Array[Matrix]): Matrix
def vertcat(matrices: Array[Matrix]): Matrix
def fromML(m: org.apache.spark.mllib.linalg.Matrix): Matrix
def fromBreeze(bm: breeze.linalg.Matrix[Double]): Matrix
}

Usage Example:
import org.apache.spark.ml.linalg.{Matrix, Matrices, Vectors}
// Create dense matrix
val denseMatrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
println(s"Dense matrix:\n$denseMatrix")
// Create sparse matrix
val sparseMatrix = Matrices.sparse(3, 2, Seq((0, 0, 9.0), (2, 1, 6.0)))
println(s"Sparse matrix:\n$sparseMatrix")
// Matrix operations
val identity = Matrices.eye(3)
val vector = Vectors.dense(1.0, 2.0, 3.0)
val result = identity.multiply(vector.toDense)
println(s"Matrix-vector multiplication: $result")Extended operations for complex numerical computations and transformations.
/**
* BLAS (Basic Linear Algebra Subprograms) operations
*/
object BLAS {
/**
* Vector dot product: x^T * y
*/
def dot(x: Vector, y: Vector): Double
/**
* Vector L2 norm: ||x||_2
*/
def nrm2(x: Vector): Double
/**
* Scalar-vector multiplication: a * x
*/
def scal(a: Double, x: Vector): Unit
/**
* Vector addition: y := a * x + y
*/
def axpy(a: Double, x: Vector, y: Vector): Unit
/**
* Matrix-vector multiplication: y := alpha * A * x + beta * y
*/
def gemv(
alpha: Double, A: Matrix, x: Vector, beta: Double, y: Vector
): Unit
/**
* Matrix-matrix multiplication: C := alpha * A * B + beta * C
*/
def gemm(
alpha: Double, A: Matrix, B: Matrix, beta: Double, C: Matrix
): Unit
/**
* Symmetric matrix-vector multiplication
*/
def symv(
alpha: Double, A: Matrix, x: Vector, beta: Double, y: Vector
): Unit
/**
* Rank-1 update: A := alpha * x * y^T + A
*/
def ger(alpha: Double, x: Vector, y: Vector, A: Matrix): Unit
/**
* Symmetric rank-1 update: A := alpha * x * x^T + A
*/
def syr(alpha: Double, x: Vector, A: Matrix): Unit
}
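
The routines above follow BLAS conventions and mutate their output argument in place. Note that the BLAS object is package-private in stock Spark, so the sketch below only illustrates what axpy and gemv compute, using plain Scala arrays rather than Spark's internal implementation:

// Illustrative only; not Spark's internal BLAS wrapper.
object BlasSketch {
  // axpy: y := a * x + y, updating y in place
  def axpy(a: Double, x: Array[Double], y: Array[Double]): Unit = {
    var i = 0
    while (i < x.length) { y(i) += a * x(i); i += 1 }
  }
  // gemv: y := alpha * A * x + beta * y, with A stored column-major (numRows x numCols)
  def gemv(alpha: Double, numRows: Int, numCols: Int, a: Array[Double],
           x: Array[Double], beta: Double, y: Array[Double]): Unit = {
    var i = 0
    while (i < numRows) { y(i) *= beta; i += 1 }
    var j = 0
    while (j < numCols) {
      val axj = alpha * x(j)
      var r = 0
      while (r < numRows) { y(r) += a(j * numRows + r) * axj; r += 1 }
      j += 1
    }
  }
}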
/**
* LAPACK (Linear Algebra Package) operations
*/
object LAPACK {
/**
* Cholesky decomposition
*/
def potrf(A: DenseMatrix): Int
/**
* Solve linear system using Cholesky decomposition
*/
def potrs(A: DenseMatrix, B: DenseMatrix): Int
/**
* QR decomposition
*/
def geqrf(A: DenseMatrix, tau: Array[Double]): Int
/**
* Singular Value Decomposition
*/
def gesvd(
A: DenseMatrix,
U: DenseMatrix,
s: Array[Double],
Vt: DenseMatrix
): Int
/**
* Eigenvalue decomposition
*/
def syev(
A: DenseMatrix,
w: Array[Double]
): Int
}
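
These LAPACK wrappers are likewise not part of Spark's public API. To show what a Cholesky factorization (potrf) produces, here is a minimal standalone sketch on a column-major array, with no blocking, pivoting, or error handling:

// Illustrative, unblocked Cholesky: overwrites the lower triangle of the
// column-major n x n array `a` with L such that A = L * L^T.
// Assumes A is symmetric positive-definite.
def cholesky(n: Int, a: Array[Double]): Unit = {
  var j = 0
  while (j < n) {
    var d = a(j + j * n)
    var k = 0
    while (k < j) { d -= a(j + k * n) * a(j + k * n); k += 1 }
    val ljj = math.sqrt(d)
    a(j + j * n) = ljj
    var i = j + 1
    while (i < n) {
      var s = a(i + j * n)
      var k2 = 0
      while (k2 < j) { s -= a(i + k2 * n) * a(j + k2 * n); k2 += 1 }
      a(i + j * n) = s / ljj
      i += 1
    }
    j += 1
  }
}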
Utilities for converting between different vector and matrix representations.

/**
* Conversion utilities between MLlib and ML linear algebra types
*/
object LinearAlgebraUtils {
/**
* Convert ML vector to MLlib vector
*/
def toMLlib(v: org.apache.spark.ml.linalg.Vector): org.apache.spark.mllib.linalg.Vector
/**
* Convert MLlib vector to ML vector
*/
def fromMLlib(v: org.apache.spark.mllib.linalg.Vector): org.apache.spark.ml.linalg.Vector
/**
* Convert ML matrix to MLlib matrix
*/
def toMLlib(m: org.apache.spark.ml.linalg.Matrix): org.apache.spark.mllib.linalg.Matrix
/**
* Convert MLlib matrix to ML matrix
*/
def fromMLlib(m: org.apache.spark.mllib.linalg.Matrix): org.apache.spark.ml.linalg.Matrix
/**
* Convert Breeze vector to ML vector
*/
def fromBreeze(bv: breeze.linalg.Vector[Double]): Vector
/**
* Convert ML vector to Breeze vector
*/
def toBreeze(v: Vector): breeze.linalg.Vector[Double]
/**
* Convert Breeze matrix to ML matrix
*/
def fromBreeze(bm: breeze.linalg.Matrix[Double]): Matrix
/**
* Convert ML matrix to Breeze matrix
*/
def toBreeze(m: Matrix): breeze.linalg.Matrix[Double]
}
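
In stock Spark these conversions are exposed as asML on the legacy mllib types and as fromML on the legacy companion objects; a short sketch:

import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
import org.apache.spark.ml.linalg.{Vector => NewVector}

val oldVec = OldVectors.dense(1.0, 2.0, 3.0)
val newVec: NewVector = oldVec.asML        // mllib -> ml
val backAgain = OldVectors.fromML(newVec)  // ml -> mllib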
Large-scale distributed matrix operations from the legacy RDD-based API. The classes below live in the org.apache.spark.mllib.linalg.distributed package.

/**
* Base class for distributed matrices
*/
abstract class DistributedMatrix {
def numRows(): Long
def numCols(): Long
}
/**
* Row-oriented distributed matrix
*/
class RowMatrix(
val rows: RDD[org.apache.spark.mllib.linalg.Vector]
) extends DistributedMatrix {
def computeColumnSummaryStatistics(): MultivariateStatisticalSummary
def computeCovariance(): org.apache.spark.mllib.linalg.Matrix
def computeGramianMatrix(): org.apache.spark.mllib.linalg.Matrix
def computePrincipalComponents(k: Int): org.apache.spark.mllib.linalg.Matrix
def computeSVD(
k: Int,
computeU: Boolean = false,
rCond: Double = 1e-9
): SingularValueDecomposition[RowMatrix, org.apache.spark.mllib.linalg.Matrix]
def multiply(B: org.apache.spark.mllib.linalg.Matrix): RowMatrix
def columnSimilarities(): CoordinateMatrix
}
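
A brief usage sketch for RowMatrix (assumes an active SparkContext named sc):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val rows = sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0),
  Vectors.dense(3.0, 4.0),
  Vectors.dense(5.0, 6.0)
))
val mat = new RowMatrix(rows)
println(s"Dimensions: ${mat.numRows()} x ${mat.numCols()}")
val svd = mat.computeSVD(2, computeU = true)
println(s"Singular values: ${svd.s}")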
/**
* Indexed row matrix for matrices with meaningful row indices
*/
class IndexedRowMatrix(
val rows: RDD[IndexedRow]
) extends DistributedMatrix {
def toRowMatrix(): RowMatrix
def toCoordinateMatrix(): CoordinateMatrix
def toBlockMatrix(): BlockMatrix
def multiply(B: org.apache.spark.mllib.linalg.Matrix): IndexedRowMatrix
def computeGramianMatrix(): org.apache.spark.mllib.linalg.Matrix
}
/**
* Coordinate matrix for matrices stored as (row, col, value) triplets
*/
class CoordinateMatrix(
val entries: RDD[MatrixEntry]
) extends DistributedMatrix {
def toRowMatrix(): RowMatrix
def toIndexedRowMatrix(): IndexedRowMatrix
def toBlockMatrix(): BlockMatrix
def transpose(): CoordinateMatrix
}
/**
* Block matrix for matrices partitioned into blocks
*/
class BlockMatrix(
val blocks: RDD[((Int, Int), org.apache.spark.mllib.linalg.Matrix)],
val rowsPerBlock: Int,
val colsPerBlock: Int
) extends DistributedMatrix {
def add(other: BlockMatrix): BlockMatrix
def subtract(other: BlockMatrix): BlockMatrix
def multiply(other: BlockMatrix): BlockMatrix
def transpose: BlockMatrix
def toLocalMatrix(): org.apache.spark.mllib.linalg.Matrix
def toIndexedRowMatrix(): IndexedRowMatrix
def toCoordinateMatrix(): CoordinateMatrix
}
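
A sketch of building a CoordinateMatrix from entries, converting it to blocks, and multiplying (again assuming an active SparkContext named sc):

import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}

val entries = sc.parallelize(Seq(
  MatrixEntry(0, 0, 1.0), MatrixEntry(1, 1, 2.0), MatrixEntry(2, 0, 3.0)
))
val coordMat = new CoordinateMatrix(entries)                                  // 3 x 2
val blockMat = coordMat.toBlockMatrix(rowsPerBlock = 2, colsPerBlock = 2).cache()
val gram = blockMat.transpose.multiply(blockMat)                              // 2 x 2 Gramian
println(gram.toLocalMatrix())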
Statistical operations on vectors and matrices for data analysis.

/**
* Multivariate statistical summary
*/
trait MultivariateStatisticalSummary {
def mean: org.apache.spark.mllib.linalg.Vector
def variance: org.apache.spark.mllib.linalg.Vector
def count: Long
def numNonzeros: org.apache.spark.mllib.linalg.Vector
def max: org.apache.spark.mllib.linalg.Vector
def min: org.apache.spark.mllib.linalg.Vector
def normL1: org.apache.spark.mllib.linalg.Vector
def normL2: org.apache.spark.mllib.linalg.Vector
}
/**
* Online multivariate summarizer for streaming statistics
*/
class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary {
def add(sample: org.apache.spark.mllib.linalg.Vector): this.type
def add(sample: org.apache.spark.mllib.linalg.Vector, weight: Double): this.type
def merge(other: MultivariateOnlineSummarizer): this.type
}
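
Typical local use of the online summarizer: samples are folded in one at a time with add, and partial summaries from different partitions are combined with merge inside an aggregation. A minimal sketch:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

val summarizer = new MultivariateOnlineSummarizer()
summarizer.add(Vectors.dense(1.0, 10.0))
summarizer.add(Vectors.dense(3.0, 30.0))
println(summarizer.mean)     // [2.0, 20.0]
println(summarizer.variance) // per-dimension sample variance
println(summarizer.count)    // 2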
// Core linear algebra imports
import org.apache.spark.ml.linalg._
// Vector types
import org.apache.spark.ml.linalg.{Vector, DenseVector, SparseVector, Vectors}
// Matrix types
import org.apache.spark.ml.linalg.{Matrix, DenseMatrix, SparseMatrix, Matrices}
// BLAS and LAPACK operations (both are package-private in stock Spark)
import org.apache.spark.ml.linalg.{BLAS, LAPACK}
// Legacy distributed linear algebra (from mllib)
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, MultivariateOnlineSummarizer}
// Conversion utilities
import org.apache.spark.mllib.linalg.{Vector => OldVector, Matrix => OldMatrix}
import breeze.linalg.{Vector => BreezeVector, Matrix => BreezeMatrix}
// Supporting types
case class IndexedRow(index: Long, vector: org.apache.spark.mllib.linalg.Vector)
case class MatrixEntry(i: Long, j: Long, value: Double)
case class SingularValueDecomposition[RowType, MatrixType](
U: RowType,
s: org.apache.spark.mllib.linalg.Vector,
V: MatrixType
)