0
# Linear Algebra
1
2
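Local and distributed linear algebra data structures and operations: local vectors and matrices (`org.apache.spark.ml.linalg`) for representing feature vectors and model parameters, plus the legacy RDD-based distributed matrices (`org.apache.spark.mllib.linalg.distributed`) for large-scale numerical computations across cluster nodes.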
3
4
## Capabilities
5
6
### Vector Operations
7
8
Core vector data structures and operations for representing feature vectors and model parameters.
9
10
```scala { .api }
11
/**
12
* Abstract base class for vectors
13
*/
14
abstract class Vector extends Serializable {
15
def size: Int
16
def apply(i: Int): Double
17
def copy: Vector
18
def foreachActive(f: (Int, Double) => Unit): Unit
19
def numActives: Int
20
def numNonzeros: Int
21
def toArray: Array[Double]
22
def toSparse: SparseVector
23
def toDense: DenseVector
24
def compressed: Vector
25
def argmax: Int
26
def dot(v: Vector): Double
27
def equals(other: Any): Boolean
28
def hashCode(): Int
29
def toString: String
30
}
31
32
/**
 * Dense vector: every entry is held explicitly in the backing `values` array.
 *
 * @param values backing array; entry `i` of the vector is `values(i)`
 */
class DenseVector(val values: Array[Double]) extends Vector {
  /** Number of entries, i.e. the length of the backing array. */
  def size: Int = {
    values.length
  }

  /** Entry at position `i`, read directly from the backing array. */
  def apply(i: Int): Double = {
    values(i)
  }

  /** New vector backed by its own copy of the array, so later writes do not alias. */
  def copy: DenseVector = {
    val duplicated = values.clone()
    new DenseVector(duplicated)
  }

  /** Overwrites entry `i` of the backing array in place. */
  def update(i: Int, value: Double): Unit = {
    values(i) = value
  }

  def dot(other: Vector): Double
  def norm(p: Double): Double
}
43
44
/**
 * Sparse vector implementation storing only the explicitly listed entries.
 *
 * @param size    logical length of the vector
 * @param indices positions of the stored entries; `apply` binary-searches this
 *                array, so it must be sorted in ascending order
 * @param values  stored entries, parallel to `indices`
 */
class SparseVector(
    override val size: Int,
    val indices: Array[Int],
    val values: Array[Double]
) extends Vector {
  /**
   * Returns the entry at position `i`, or 0.0 when `i` is not a stored index.
   *
   * @throws IndexOutOfBoundsException if `i` is outside `[0, size)`
   */
  def apply(i: Int): Double = {
    // Without this guard an out-of-range index is simply "not found" by the
    // binary search and would silently read as 0.0.
    if (i < 0 || i >= size) {
      throw new IndexOutOfBoundsException(s"Index $i out of bounds [0, $size)")
    }
    val idx = java.util.Arrays.binarySearch(indices, i)
    if (idx >= 0) values(idx) else 0.0
  }

  /** New vector with its own copies of the index and value arrays. */
  def copy: SparseVector = new SparseVector(size, indices.clone(), values.clone())

  def dot(other: Vector): Double
  def norm(p: Double): Double
}
60
61
/**
62
* Vector factory methods and utilities
63
*/
64
object Vectors {
65
def dense(firstValue: Double, otherValues: Double*): DenseVector
66
def dense(values: Array[Double]): DenseVector
67
def sparse(size: Int, elements: Seq[(Int, Double)]): SparseVector
68
def sparse(size: Int, indices: Array[Int], values: Array[Double]): SparseVector
69
def zeros(size: Int): DenseVector
70
def norm(vector: Vector, p: Double): Double
71
def sqdist(v1: Vector, v2: Vector): Double
72
def fromML(v: org.apache.spark.mllib.linalg.Vector): Vector
73
def fromBreeze(bv: breeze.linalg.Vector[Double]): Vector
74
}
75
```
76
77
**Usage Example:**
78
79
```scala
80
import org.apache.spark.ml.linalg.{Vector, Vectors}
81
82
// Create dense vector
val denseVec = Vectors.dense(1.0, 2.0, 3.0, 4.0)
println(s"Dense vector: $denseVec")

// Create sparse vector of the same size as denseVec (4) so the two can be
// compared below: entries 0, 2 and 3 are set, entry 1 is an implicit zero
val sparseVec = Vectors.sparse(4, Array(0, 2, 3), Array(1.0, 3.0, 5.0))
println(s"Sparse vector: $sparseVec")

// Vector operations
val norm = Vectors.norm(denseVec, 2.0)
println(s"L2 norm: $norm")

// sqdist requires both vectors to have the same size
val distance = Vectors.sqdist(denseVec, sparseVec.toDense)
println(s"Squared distance: $distance")
96
```
97
98
### Matrix Operations
99
100
Matrix data structures and operations for representing datasets and model parameters.
101
102
```scala { .api }
103
/**
104
* Abstract base class for matrices
105
*/
106
abstract class Matrix extends Serializable {
107
def numRows: Int
108
def numCols: Int
109
def apply(i: Int, j: Int): Double
110
def copy: Matrix
111
def foreachActive(f: (Int, Int, Double) => Unit): Unit
112
def numActives: Int
113
def numNonzeros: Int
114
def toArray: Array[Double]
115
def isTransposed: Boolean
116
def asML: org.apache.spark.mllib.linalg.Matrix
117
def toSparse: SparseMatrix
118
def toDense: DenseMatrix
119
def transpose: Matrix
120
def multiply(y: DenseVector): DenseVector
121
def multiply(y: DenseMatrix): DenseMatrix
122
def equals(other: Any): Boolean
123
def hashCode(): Int
124
def toString: String
125
}
126
127
/**
 * Dense matrix implementation storing all values in a single array.
 *
 * Entry (i, j) is stored at `values(i + numRows * j)` (column-major) when
 * `isTransposed` is false, and at `values(j + numCols * i)` (row-major) when
 * `isTransposed` is true.
 *
 * @param numRows      number of rows
 * @param numCols      number of columns
 * @param values       matrix entries in the layout described above
 * @param isTransposed whether `values` is laid out in row-major order
 */
class DenseMatrix(
    val numRows: Int,
    val numCols: Int,
    val values: Array[Double],
    val isTransposed: Boolean = false
) extends Matrix {
  /** Returns the entry at row `i`, column `j`. */
  def apply(i: Int, j: Int): Double = values(index(i, j))

  /** New matrix with its own copy of the values array and the same layout. */
  def copy: DenseMatrix = new DenseMatrix(numRows, numCols, values.clone(), isTransposed)

  /** Overwrites the entry at row `i`, column `j` in place. */
  def update(i: Int, j: Int, value: Double): Unit = {
    values(index(i, j)) = value
  }

  /**
   * Offset of entry (i, j) inside `values`.
   *
   * The bounds checks matter: an out-of-range `i` or `j` can still map to a
   * valid array offset and would otherwise address a different entry.
   */
  private def index(i: Int, j: Int): Int = {
    require(i >= 0 && i < numRows, s"Expected 0 <= i < $numRows, got i = $i.")
    require(j >= 0 && j < numCols, s"Expected 0 <= j < $numCols, got j = $j.")
    // Row-major when transposed: the stride between rows is numCols, not numRows.
    if (isTransposed) j + numCols * i else i + numRows * j
  }
}
146
147
/**
 * Sparse matrix implementation storing only the explicitly listed entries.
 *
 * When `isTransposed` is false the layout is compressed sparse column (CSC):
 * `colPtrs` has one slot per column (plus a terminator) and `rowIndices` holds
 * row indices. When `isTransposed` is true the same arrays describe a
 * compressed sparse row (CSR) layout: `colPtrs` is indexed by row and
 * `rowIndices` holds column indices.
 *
 * @param numRows      number of rows
 * @param numCols      number of columns
 * @param colPtrs      start offset of each column (row, if transposed) in
 *                     `rowIndices` / `values`
 * @param rowIndices   row (column, if transposed) index of each stored entry;
 *                     `apply` binary-searches each slice, so it must be sorted
 *                     within a slice
 * @param values       stored entries, parallel to `rowIndices`
 * @param isTransposed whether the arrays describe the CSR layout
 */
class SparseMatrix(
    val numRows: Int,
    val numCols: Int,
    val colPtrs: Array[Int],
    val rowIndices: Array[Int],
    val values: Array[Double],
    val isTransposed: Boolean = false
) extends Matrix {
  /** Returns the entry at row `i`, column `j`, or 0.0 when it is not stored. */
  def apply(i: Int, j: Int): Double = {
    require(i >= 0 && i < numRows, s"Expected 0 <= i < $numRows, got i = $i.")
    require(j >= 0 && j < numCols, s"Expected 0 <= j < $numCols, got j = $j.")
    // `major` selects the slice via colPtrs; `minor` is searched inside it.
    // In the transposed (CSR) layout the roles of i and j are swapped.
    val (major, minor) = if (isTransposed) (i, j) else (j, i)
    val startIdx = colPtrs(major)
    val endIdx = colPtrs(major + 1)
    val idx = java.util.Arrays.binarySearch(rowIndices, startIdx, endIdx, minor)
    if (idx >= 0) values(idx) else 0.0
  }

  /** New matrix with its own copies of all three arrays and the same layout. */
  def copy: SparseMatrix = new SparseMatrix(
    numRows, numCols, colPtrs.clone(), rowIndices.clone(), values.clone(), isTransposed
  )
}
168
169
/**
170
* Matrix factory methods and utilities
171
*/
172
object Matrices {
173
def dense(numRows: Int, numCols: Int, values: Array[Double]): DenseMatrix
174
def sparse(numRows: Int, numCols: Int, entries: Seq[(Int, Int, Double)]): SparseMatrix
175
def sparse(
176
numRows: Int,
177
numCols: Int,
178
colPtrs: Array[Int],
179
rowIndices: Array[Int],
180
values: Array[Double]
181
): SparseMatrix
182
def eye(n: Int): DenseMatrix
183
def zeros(numRows: Int, numCols: Int): DenseMatrix
184
def ones(numRows: Int, numCols: Int): DenseMatrix
185
def diag(vector: Vector): DenseMatrix
186
def rand(numRows: Int, numCols: Int, rng: Random): DenseMatrix
187
def randn(numRows: Int, numCols: Int, rng: Random): DenseMatrix
188
def horzcat(matrices: Array[Matrix]): Matrix
189
def vertcat(matrices: Array[Matrix]): Matrix
190
def fromML(m: org.apache.spark.mllib.linalg.Matrix): Matrix
191
def fromBreeze(bm: breeze.linalg.Matrix[Double]): Matrix
192
}
193
```
194
195
**Usage Example:**
196
197
```scala
198
import org.apache.spark.ml.linalg.{Matrix, Matrices, Vectors}
199
200
// Create dense matrix
201
val denseMatrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
202
println(s"Dense matrix:\n$denseMatrix")
203
204
// Create sparse matrix
205
val sparseMatrix = Matrices.sparse(3, 2, Seq((0, 0, 9.0), (2, 1, 6.0)))
206
println(s"Sparse matrix:\n$sparseMatrix")
207
208
// Matrix operations
209
val identity = Matrices.eye(3)
210
val vector = Vectors.dense(1.0, 2.0, 3.0)
211
val result = identity.multiply(vector.toDense)
212
println(s"Matrix-vector multiplication: $result")
213
```
214
215
### Advanced Linear Algebra Operations
216
217
Extended operations for complex numerical computations and transformations.
218
219
```scala { .api }
220
/**
221
* BLAS (Basic Linear Algebra Subprograms) operations
222
*/
223
object BLAS {
224
/**
225
* Vector dot product: x^T * y
226
*/
227
def dot(x: Vector, y: Vector): Double
228
229
/**
230
* Vector L2 norm: ||x||_2
231
*/
232
def nrm2(x: Vector): Double
233
234
/**
235
* Scalar-vector multiplication, performed in place: x := a * x
236
*/
237
def scal(a: Double, x: Vector): Unit
238
239
/**
240
* Vector addition: y := a * x + y
241
*/
242
def axpy(a: Double, x: Vector, y: Vector): Unit
243
244
/**
245
* Matrix-vector multiplication: y := alpha * A * x + beta * y
246
*/
247
def gemv(
248
alpha: Double, A: Matrix, x: Vector, beta: Double, y: Vector
249
): Unit
250
251
/**
252
* Matrix-matrix multiplication: C := alpha * A * B + beta * C
253
*/
254
def gemm(
255
alpha: Double, A: Matrix, B: Matrix, beta: Double, C: Matrix
256
): Unit
257
258
/**
259
* Symmetric matrix-vector multiplication
260
*/
261
def symv(
262
alpha: Double, A: Matrix, x: Vector, beta: Double, y: Vector
263
): Unit
264
265
/**
266
* Rank-1 update: A := alpha * x * y^T + A
267
*/
268
def ger(alpha: Double, x: Vector, y: Vector, A: Matrix): Unit
269
270
/**
271
* Symmetric rank-1 update: A := alpha * x * x^T + A
272
*/
273
def syr(alpha: Double, x: Vector, A: Matrix): Unit
274
}
275
276
/**
277
* LAPACK (Linear Algebra Package) operations
278
*/
279
object LAPACK {
280
/**
281
* Cholesky decomposition
282
*/
283
def potrf(A: DenseMatrix): Int
284
285
/**
286
* Solve linear system using Cholesky decomposition
287
*/
288
def potrs(A: DenseMatrix, B: DenseMatrix): Int
289
290
/**
291
* QR decomposition
292
*/
293
def geqrf(A: DenseMatrix, tau: Array[Double]): Int
294
295
/**
296
* Singular Value Decomposition
297
*/
298
def gesvd(
299
A: DenseMatrix,
300
U: DenseMatrix,
301
s: Array[Double],
302
Vt: DenseMatrix
303
): Int
304
305
/**
306
* Eigenvalue decomposition
307
*/
308
def syev(
309
A: DenseMatrix,
310
w: Array[Double]
311
): Int
312
}
313
```
314
315
### Vector and Matrix Conversions
316
317
Utilities for converting between different vector and matrix representations.
318
319
```scala { .api }
320
/**
321
* Conversion utilities between MLlib and ML linear algebra types
322
*/
323
object LinearAlgebraUtils {
324
/**
325
* Convert ML vector to MLlib vector
326
*/
327
def toMLlib(v: org.apache.spark.ml.linalg.Vector): org.apache.spark.mllib.linalg.Vector
328
329
/**
330
* Convert MLlib vector to ML vector
331
*/
332
def fromMLlib(v: org.apache.spark.mllib.linalg.Vector): org.apache.spark.ml.linalg.Vector
333
334
/**
335
* Convert ML matrix to MLlib matrix
336
*/
337
def toMLlib(m: org.apache.spark.ml.linalg.Matrix): org.apache.spark.mllib.linalg.Matrix
338
339
/**
340
* Convert MLlib matrix to ML matrix
341
*/
342
def fromMLlib(m: org.apache.spark.mllib.linalg.Matrix): org.apache.spark.ml.linalg.Matrix
343
344
/**
345
* Convert Breeze vector to ML vector
346
*/
347
def fromBreeze(bv: breeze.linalg.Vector[Double]): Vector
348
349
/**
350
* Convert ML vector to Breeze vector
351
*/
352
def toBreeze(v: Vector): breeze.linalg.Vector[Double]
353
354
/**
355
* Convert Breeze matrix to ML matrix
356
*/
357
def fromBreeze(bm: breeze.linalg.Matrix[Double]): Matrix
358
359
/**
360
* Convert ML matrix to Breeze matrix
361
*/
362
def toBreeze(m: Matrix): breeze.linalg.Matrix[Double]
363
}
364
```
365
366
### Distributed Linear Algebra (Legacy MLlib)
367
368
Large-scale distributed matrix operations from the legacy RDD-based API.
369
370
```scala { .api }
371
/**
372
* Base class for distributed matrices
373
*/
374
abstract class org.apache.spark.mllib.linalg.distributed.DistributedMatrix {
375
def numRows(): Long
376
def numCols(): Long
377
}
378
379
/**
380
* Row-oriented distributed matrix
381
*/
382
class org.apache.spark.mllib.linalg.distributed.RowMatrix(
383
val rows: RDD[org.apache.spark.mllib.linalg.Vector]
384
) extends DistributedMatrix {
385
def computeColumnSummaryStatistics(): MultivariateStatisticalSummary
386
def computeCovariance(): org.apache.spark.mllib.linalg.Matrix
387
def computeGramianMatrix(): org.apache.spark.mllib.linalg.Matrix
388
def computePrincipalComponents(k: Int): org.apache.spark.mllib.linalg.Matrix
389
def computeSVD(
390
k: Int,
391
computeU: Boolean = false,
392
rCond: Double = 1e-9
393
): SingularValueDecomposition[RowMatrix, org.apache.spark.mllib.linalg.Matrix]
394
def multiply(B: org.apache.spark.mllib.linalg.Matrix): RowMatrix
395
def columnSimilarities(): CoordinateMatrix
396
}
397
398
/**
399
* Indexed row matrix for matrices with meaningful row indices
400
*/
401
class org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix(
402
val rows: RDD[IndexedRow]
403
) extends DistributedMatrix {
404
def toRowMatrix(): RowMatrix
405
def toCoordinateMatrix(): CoordinateMatrix
406
def toBlockMatrix(): BlockMatrix
407
def multiply(B: org.apache.spark.mllib.linalg.Matrix): IndexedRowMatrix
408
def computeGramianMatrix(): org.apache.spark.mllib.linalg.Matrix
409
}
410
411
/**
412
* Coordinate matrix for matrices stored as (row, col, value) triplets
413
*/
414
class org.apache.spark.mllib.linalg.distributed.CoordinateMatrix(
415
val entries: RDD[MatrixEntry]
416
) extends DistributedMatrix {
417
def toRowMatrix(): RowMatrix
418
def toIndexedRowMatrix(): IndexedRowMatrix
419
def toBlockMatrix(): BlockMatrix
420
def transpose(): CoordinateMatrix
421
}
422
423
/**
424
* Block matrix for matrices partitioned into blocks
425
*/
426
class org.apache.spark.mllib.linalg.distributed.BlockMatrix(
427
val blocks: RDD[((Int, Int), org.apache.spark.mllib.linalg.Matrix)],
428
val rowsPerBlock: Int,
429
val colsPerBlock: Int
430
) extends DistributedMatrix {
431
def add(other: BlockMatrix): BlockMatrix
432
def subtract(other: BlockMatrix): BlockMatrix
433
def multiply(other: BlockMatrix): BlockMatrix
434
def transpose: BlockMatrix
435
def toLocalMatrix(): org.apache.spark.mllib.linalg.Matrix
436
def toIndexedRowMatrix(): IndexedRowMatrix
437
def toCoordinateMatrix(): CoordinateMatrix
438
}
439
```
440
441
### Statistical Summary
442
443
Statistical operations on vectors and matrices for data analysis.
444
445
```scala { .api }
446
/**
447
* Multivariate statistical summary
448
*/
449
trait MultivariateStatisticalSummary {
450
def mean: org.apache.spark.mllib.linalg.Vector
451
def variance: org.apache.spark.mllib.linalg.Vector
452
def count: Long
453
def numNonzeros: org.apache.spark.mllib.linalg.Vector
454
def max: org.apache.spark.mllib.linalg.Vector
455
def min: org.apache.spark.mllib.linalg.Vector
456
def normL1: org.apache.spark.mllib.linalg.Vector
457
def normL2: org.apache.spark.mllib.linalg.Vector
458
}
459
460
/**
461
* Online multivariate summarizer for streaming statistics
462
*/
463
class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary {
464
def add(sample: org.apache.spark.mllib.linalg.Vector): this.type
465
def add(sample: org.apache.spark.mllib.linalg.Vector, weight: Double): this.type
466
def merge(other: MultivariateOnlineSummarizer): this.type
467
}
468
```
469
470
## Types
471
472
```scala { .api }
473
// Core linear algebra imports
474
import org.apache.spark.ml.linalg._
475
476
// Vector types
477
import org.apache.spark.ml.linalg.{Vector, DenseVector, SparseVector, Vectors}
478
479
// Matrix types
480
import org.apache.spark.ml.linalg.{Matrix, DenseMatrix, SparseMatrix, Matrices}
481
482
// BLAS and LAPACK operations
483
import org.apache.spark.ml.linalg.{BLAS, LAPACK}
484
485
// Legacy distributed linear algebra (from mllib)
486
import org.apache.spark.mllib.linalg.distributed._
487
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, MultivariateOnlineSummarizer}
488
489
// Conversion utilities
490
import org.apache.spark.mllib.linalg.{Vector => OldVector, Matrix => OldMatrix}
491
import breeze.linalg.{Vector => BreezeVector, Matrix => BreezeMatrix}
492
493
// Supporting types
494
case class IndexedRow(index: Long, vector: org.apache.spark.mllib.linalg.Vector)
495
case class MatrixEntry(i: Long, j: Long, value: Double)
496
case class SingularValueDecomposition[RowType, MatrixType](
497
U: RowType,
498
s: org.apache.spark.mllib.linalg.Vector,
499
V: MatrixType
500
)
501
```