# Statistical Operations

Statistical utilities including multivariate Gaussian distributions for probabilistic machine learning applications. These operations provide essential statistical functionality for ML algorithms.

## Capabilities

### MultivariateGaussian Class

Multivariate Gaussian (Normal) distribution implementation providing probability density function calculations for high-dimensional data.

```java { .api }
11
/**
12
* Multivariate Gaussian (Normal) distribution implementation
13
*/
14
public class MultivariateGaussian {
15
16
/**
17
* Constructor with mean vector and covariance matrix
18
* @param mean Mean vector of the distribution
19
* @param cov Covariance matrix of the distribution
20
*/
21
public MultivariateGaussian(DenseVector mean, DenseMatrix cov);
22
23
/**
24
* Compute probability density function value
25
* @param x Input vector
26
* @return Probability density at point x
27
*/
28
public double pdf(Vector x);
29
30
/**
31
* Compute log probability density function value
32
* @param x Input vector
33
* @return Log probability density at point x
34
*/
35
public double logpdf(Vector x);
36
}
37
```

**Usage Examples:**

```java
42
import org.apache.flink.ml.common.statistics.basicstatistic.MultivariateGaussian;
43
import org.apache.flink.ml.common.linalg.DenseVector;
44
import org.apache.flink.ml.common.linalg.DenseMatrix;
45
46
// Create multivariate Gaussian distribution
47
DenseVector mean = new DenseVector(new double[]{0.0, 0.0});
48
DenseMatrix covariance = DenseMatrix.eye(2); // Identity covariance matrix
49
50
MultivariateGaussian gaussian = new MultivariateGaussian(mean, covariance);
51
52
// Evaluate probability density
53
DenseVector point = new DenseVector(new double[]{1.0, 1.0});
54
double probability = gaussian.pdf(point);
55
double logProbability = gaussian.logpdf(point);
56
57
System.out.println("PDF at point (1,1): " + probability);
58
System.out.println("Log PDF at point (1,1): " + logProbability);
59
```

### Statistical Computation Patterns

Common patterns for using multivariate Gaussian distributions in machine learning contexts.

**Usage Examples:**

```java
68
// Gaussian Mixture Model component example
69
public class GaussianComponent {
70
private MultivariateGaussian gaussian;
71
private double weight;
72
73
public GaussianComponent(DenseVector mean, DenseMatrix covariance, double weight) {
74
this.gaussian = new MultivariateGaussian(mean, covariance);
75
this.weight = weight;
76
}
77
78
public double computeWeightedProbability(Vector x) {
79
return weight * gaussian.pdf(x);
80
}
81
82
public double computeLogLikelihood(Vector x) {
83
return Math.log(weight) + gaussian.logpdf(x);
84
}
85
}
86
87
// Anomaly detection using Gaussian distribution
88
public class GaussianAnomalyDetector {
89
private MultivariateGaussian normalDistribution;
90
private double threshold;
91
92
public GaussianAnomalyDetector(DenseVector mean, DenseMatrix covariance, double threshold) {
93
this.normalDistribution = new MultivariateGaussian(mean, covariance);
94
this.threshold = threshold;
95
}
96
97
public boolean isAnomaly(Vector point) {
98
double probability = normalDistribution.pdf(point);
99
return probability < threshold;
100
}
101
102
public double getAnomalyScore(Vector point) {
103
// Lower probability = higher anomaly score
104
return -normalDistribution.logpdf(point);
105
}
106
}
107
108
// Usage examples
109
DenseVector trainingMean = new DenseVector(new double[]{5.0, 10.0});
110
DenseMatrix trainingCov = new DenseMatrix(new double[][]{{2.0, 0.5}, {0.5, 3.0}});
111
112
// Anomaly detection
113
GaussianAnomalyDetector detector = new GaussianAnomalyDetector(
114
trainingMean, trainingCov, 0.01);
115
116
DenseVector testPoint = new DenseVector(new double[]{5.1, 9.8});
117
boolean isAnomalous = detector.isAnomaly(testPoint);
118
double anomalyScore = detector.getAnomalyScore(testPoint);
119
120
// Gaussian mixture component
121
GaussianComponent component = new GaussianComponent(trainingMean, trainingCov, 0.3);
122
double weightedProb = component.computeWeightedProbability(testPoint);
123
```

### Probability Calculations

Advanced probability calculations and statistical analysis using multivariate Gaussian distributions.

**Usage Examples:**

```java
132
// Maximum likelihood estimation helper
133
public class GaussianMLEstimator {
134
135
public static MultivariateGaussian estimate(List<DenseVector> data) {
136
int n = data.size();
137
int dimensions = data.get(0).size();
138
139
// Compute sample mean
140
DenseVector mean = DenseVector.zeros(dimensions);
141
for (DenseVector point : data) {
142
mean.plusEqual(point);
143
}
144
mean.scaleEqual(1.0 / n);
145
146
// Compute sample covariance
147
DenseMatrix covariance = DenseMatrix.zeros(dimensions, dimensions);
148
for (DenseVector point : data) {
149
DenseVector centered = point.minus(mean);
150
DenseMatrix outer = centered.outer();
151
covariance.plusEquals(outer);
152
}
153
covariance.scaleEqual(1.0 / (n - 1));
154
155
return new MultivariateGaussian(mean, covariance);
156
}
157
}
158
159
// Probability comparison and classification
160
public class GaussianClassifier {
161
private MultivariateGaussian[] classDistributions;
162
private double[] classPriors;
163
164
public GaussianClassifier(MultivariateGaussian[] distributions, double[] priors) {
165
this.classDistributions = distributions;
166
this.classPriors = priors;
167
}
168
169
public int classify(Vector point) {
170
double maxLogPosterior = Double.NEGATIVE_INFINITY;
171
int bestClass = -1;
172
173
for (int i = 0; i < classDistributions.length; i++) {
174
double logPosterior = Math.log(classPriors[i]) +
175
classDistributions[i].logpdf(point);
176
177
if (logPosterior > maxLogPosterior) {
178
maxLogPosterior = logPosterior;
179
bestClass = i;
180
}
181
}
182
183
return bestClass;
184
}
185
186
public double[] getClassProbabilities(Vector point) {
187
double[] logProbs = new double[classDistributions.length];
188
double maxLogProb = Double.NEGATIVE_INFINITY;
189
190
// Compute log probabilities
191
for (int i = 0; i < classDistributions.length; i++) {
192
logProbs[i] = Math.log(classPriors[i]) + classDistributions[i].logpdf(point);
193
maxLogProb = Math.max(maxLogProb, logProbs[i]);
194
}
195
196
// Convert to probabilities with numerical stability
197
double[] probs = new double[classDistributions.length];
198
double sum = 0.0;
199
200
for (int i = 0; i < logProbs.length; i++) {
201
probs[i] = Math.exp(logProbs[i] - maxLogProb);
202
sum += probs[i];
203
}
204
205
// Normalize
206
for (int i = 0; i < probs.length; i++) {
207
probs[i] /= sum;
208
}
209
210
return probs;
211
}
212
}
213
214
// Usage
215
List<DenseVector> class1Data = getClass1TrainingData();
216
List<DenseVector> class2Data = getClass2TrainingData();
217
218
// Estimate distributions
219
MultivariateGaussian dist1 = GaussianMLEstimator.estimate(class1Data);
220
MultivariateGaussian dist2 = GaussianMLEstimator.estimate(class2Data);
221
222
// Create classifier
223
MultivariateGaussian[] distributions = {dist1, dist2};
224
double[] priors = {0.6, 0.4}; // Class priors
225
GaussianClassifier classifier = new GaussianClassifier(distributions, priors);
226
227
// Classify new point
228
DenseVector newPoint = new DenseVector(new double[]{3.0, 7.0});
229
int predictedClass = classifier.classify(newPoint);
230
double[] classProbabilities = classifier.getClassProbabilities(newPoint);
231
232
System.out.println("Predicted class: " + predictedClass);
233
System.out.println("Class probabilities: " + Arrays.toString(classProbabilities));
234
```

### Numerical Considerations

Important numerical considerations when working with multivariate Gaussian distributions.

**Usage Examples:**

```java
243
// Numerically stable Gaussian operations
244
public class NumericallyStableGaussian {
245
246
public static boolean isPositiveDefinite(DenseMatrix matrix) {
247
// Check if covariance matrix is positive definite
248
// Implementation would use eigenvalue decomposition or Cholesky decomposition
249
try {
250
// Attempt Cholesky decomposition
251
// If successful, matrix is positive definite
252
return true;
253
} catch (Exception e) {
254
return false;
255
}
256
}
257
258
public static DenseMatrix regularizeCovariance(DenseMatrix covariance, double regularization) {
259
// Add regularization to diagonal to ensure positive definiteness
260
DenseMatrix regularized = covariance.clone();
261
for (int i = 0; i < covariance.numRows(); i++) {
262
regularized.add(i, i, regularization);
263
}
264
return regularized;
265
}
266
267
public static MultivariateGaussian createStableGaussian(DenseVector mean, DenseMatrix covariance) {
268
// Ensure numerical stability
269
final double MIN_VARIANCE = 1e-6;
270
271
DenseMatrix stableCovariance = covariance.clone();
272
273
// Regularize if needed
274
if (!isPositiveDefinite(stableCovariance)) {
275
stableCovariance = regularizeCovariance(stableCovariance, MIN_VARIANCE);
276
}
277
278
return new MultivariateGaussian(mean, stableCovariance);
279
}
280
}
281
282
// Safe probability computations
283
public class SafeProbabilityCalculator {
284
285
public static double safeLogPdf(MultivariateGaussian gaussian, Vector point) {
286
try {
287
double logPdf = gaussian.logpdf(point);
288
289
// Check for numerical issues
290
if (Double.isNaN(logPdf) || Double.isInfinite(logPdf)) {
291
return Double.NEGATIVE_INFINITY; // Very low probability
292
}
293
294
return logPdf;
295
} catch (Exception e) {
296
// Handle numerical exceptions
297
return Double.NEGATIVE_INFINITY;
298
}
299
}
300
301
public static double safePdf(MultivariateGaussian gaussian, Vector point) {
302
double logPdf = safeLogPdf(gaussian, point);
303
return logPdf == Double.NEGATIVE_INFINITY ? 0.0 : Math.exp(logPdf);
304
}
305
}
306
307
// Usage with numerical safety
308
DenseVector mean = new DenseVector(new double[]{0.0, 0.0});
309
DenseMatrix covariance = new DenseMatrix(new double[][]{{1e-10, 0}, {0, 1e-10}}); // Very small variance
310
311
// Create numerically stable Gaussian
312
MultivariateGaussian stableGaussian = NumericallyStableGaussian.createStableGaussian(mean, covariance);
313
314
// Safe probability calculations
315
DenseVector testPoint = new DenseVector(new double[]{1.0, 1.0});
316
double safeProbability = SafeProbabilityCalculator.safePdf(stableGaussian, testPoint);
317
double safeLogProbability = SafeProbabilityCalculator.safeLogPdf(stableGaussian, testPoint);
318
```