# Utilities

Data I/O, statistical analysis, filtering, and other utility operations that support machine learning workflows and data processing tasks.

## Capabilities

### Data Input/Output

Functions for loading and saving data in standard machine learning formats.

```python { .api }
def load_libsvm_formatted_data(filename: str) -> tuple:
    """
    Load data in libsvm format.

    Args:
        filename: Path to libsvm format file

    Returns:
        Tuple of (samples, labels) where samples is a list of sparse_vector
        objects and labels is a list of numeric labels
    """

def save_libsvm_formatted_data(filename: str, samples, labels):
    """
    Save data in libsvm format.

    Args:
        filename: Output filename
        samples: List of sample vectors (sparse or dense)
        labels: List of corresponding labels
    """
```
**Usage Example:**
```python
import dlib

# Create sample data
samples = []
labels = []

# Dense vectors
for i in range(100):
    sample = dlib.vector([i * 0.1, i * 0.2, i * 0.3])
    samples.append(sample)
    labels.append(1 if i % 2 == 0 else -1)

# Save in libsvm format
dlib.save_libsvm_formatted_data("dataset.libsvm", samples, labels)

# Load back
loaded_samples, loaded_labels = dlib.load_libsvm_formatted_data("dataset.libsvm")

print(f"Loaded {len(loaded_samples)} samples")
print(f"First sample: {loaded_samples[0]}")
print(f"First label: {loaded_labels[0]}")

# Works with sparse vectors too
sparse_samples = []
for i in range(50):
    sparse_vec = dlib.sparse_vector()
    sparse_vec.extend([
        dlib.pair(0, i * 0.5),
        dlib.pair(5, i * 0.3),
        dlib.pair(10, i * 0.1)
    ])
    sparse_samples.append(sparse_vec)

sparse_labels = [1] * 25 + [-1] * 25
dlib.save_libsvm_formatted_data("sparse_dataset.libsvm", sparse_samples, sparse_labels)
```
### Statistical Analysis

Functions for time series analysis and statistical testing.

```python { .api }
def count_steps_without_decrease(time_series, probability_of_decrease: float = 0.51) -> int:
    """
    Count how many steps the time series has gone without noticeably
    decreasing in value.

    Args:
        time_series: List or array of numeric values
        probability_of_decrease: Probability threshold for the statistical test

    Returns:
        Number of steps without a statistically significant decrease
    """

def count_steps_without_decrease_robust(
    time_series,
    probability_of_decrease: float = 0.51,
    quantile_discard: float = 0.1
) -> int:
    """
    Robust version of count_steps_without_decrease that discards outliers
    before testing.

    Args:
        time_series: List or array of numeric values
        probability_of_decrease: Probability threshold for the statistical test
        quantile_discard: Fraction of extreme values to discard

    Returns:
        Number of steps without a statistically significant decrease (robust estimate)
    """

def probability_that_sequence_is_increasing(time_series) -> float:
    """
    Statistical test for an increasing sequence.

    Args:
        time_series: List or array of numeric values

    Returns:
        Probability that the sequence is increasing, in [0, 1]
    """
```
**Usage Example:**
```python
import dlib
import numpy as np

# Generate time series data
np.random.seed(42)

# Increasing trend with noise
trend_data = []
for i in range(100):
    trend_value = i * 0.1 + np.random.normal(0, 0.5)
    trend_data.append(trend_value)

# Analyze time series
steps_no_decrease = dlib.count_steps_without_decrease(trend_data)
steps_robust = dlib.count_steps_without_decrease_robust(trend_data, quantile_discard=0.2)
increasing_prob = dlib.probability_that_sequence_is_increasing(trend_data)

print(f"Steps without decrease: {steps_no_decrease}")
print(f"Steps without decrease (robust): {steps_robust}")
print(f"Probability of increasing: {increasing_prob:.3f}")

# Test with different data patterns
flat_data = [1.0] * 50 + [np.random.normal(1.0, 0.1) for _ in range(50)]
decreasing_data = [10.0 - i * 0.1 + np.random.normal(0, 0.2) for i in range(100)]

print(f"Flat data increasing probability: {dlib.probability_that_sequence_is_increasing(flat_data):.3f}")
print(f"Decreasing data increasing probability: {dlib.probability_that_sequence_is_increasing(decreasing_data):.3f}")
```
### Filtering and Signal Processing

Kalman filtering and signal processing utilities for tracking and noise reduction.

```python { .api }
class momentum_filter:
    """Kalman-filter-based tool for filtering a single scalar value that
    tracks the position of a moving object with momentum. For 2D tracks,
    use one filter per coordinate."""

    def __init__(
        self,
        measurement_noise: float,
        typical_acceleration: float,
        max_measurement_deviation: float
    ):
        """
        Initialize momentum filter.

        Args:
            measurement_noise: Expected measurement noise level
            typical_acceleration: Expected acceleration magnitude
            max_measurement_deviation: Maximum allowed measurement deviation
        """

    def measurement_noise(self) -> float:
        """Get measurement noise parameter."""

    def typical_acceleration(self) -> float:
        """Get typical acceleration parameter."""

    def max_measurement_deviation(self) -> float:
        """Get max measurement deviation parameter."""

    def __call__(self, measurement: float) -> float:
        """
        Filter the next measurement through the Kalman filter.

        Args:
            measurement: New scalar measurement

        Returns:
            Filtered estimate
        """

def find_optimal_momentum_filter(
    sequence: list,
    smoothness: float = 1.0
) -> momentum_filter:
    """
    Find optimal momentum filter parameters for a recorded sequence of
    scalar measurements.

    Args:
        sequence: Sequence of measurements to analyze
        smoothness: Smoothness parameter (higher = smoother filtering)

    Returns:
        Optimally configured momentum filter
    """
```
**Usage Example:**
```python
import dlib
import numpy as np

# Generate noisy scalar position measurements. momentum_filter operates on
# a single scalar stream; track x and y with separate filters if needed.
np.random.seed(42)
true_positions = []
noisy_measurements = []

for t in range(100):
    # True position with some acceleration
    true_pos = t + 0.01 * t**2
    true_positions.append(true_pos)

    # Add measurement noise
    noisy_measurements.append(true_pos + np.random.normal(0, 3.0))

# Create momentum filter
position_filter = dlib.momentum_filter(
    measurement_noise=3.0,
    typical_acceleration=0.1,
    max_measurement_deviation=2.0
)

# Filter measurements
filtered_positions = []
for measurement in noisy_measurements:
    filtered = position_filter(measurement)
    filtered_positions.append(filtered)

# Or find optimal parameters automatically
optimal_filter = dlib.find_optimal_momentum_filter(noisy_measurements, smoothness=2.0)

optimal_filtered = []
for measurement in noisy_measurements:
    filtered = optimal_filter(measurement)
    optimal_filtered.append(filtered)

print(f"Original filter noise param: {position_filter.measurement_noise()}")
print(f"Optimal filter noise param: {optimal_filter.measurement_noise()}")
```
### Assignment and Optimization Utilities

Utility functions for assignment problems and optimization tasks.

```python { .api }
def assignment_cost(cost_matrix, assignment: list) -> float:
    """
    Calculate the total cost of an assignment.

    Args:
        cost_matrix: 2D matrix of assignment costs (a dlib.matrix)
        assignment: List of assignments (row to column mapping)

    Returns:
        Total assignment cost
    """

def max_cost_assignment(cost_matrix) -> list:
    """
    Solve the maximum cost assignment problem using the Hungarian algorithm.

    Args:
        cost_matrix: Square dlib.matrix where cost_matrix[i][j] is the cost
            of assigning row i to column j; the costs must be integers

    Returns:
        List where result[i] is the column assigned to row i
    """
```
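**Usage Example** (a minimal sketch mirroring dlib's own `max_cost_assignment` example; the solver expects a square `dlib.matrix` of integer costs):
```python
import dlib

# cost[i][j] is the value of assigning worker i to job j
cost = dlib.matrix([[1, 2, 6],
                    [5, 3, 6],
                    [4, 5, 0]])

# assignment[i] is the column (job) chosen for row (worker) i
assignment = dlib.max_cost_assignment(cost)

print(f"Optimal assignment: {assignment}")
print(f"Optimal cost: {dlib.assignment_cost(cost, assignment)}")
```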
### Sparse Vector Utilities

Helper functions for working with sparse vectors.

```python { .api }
def make_sparse_vector(sparse_vec: sparse_vector) -> None:
    """
    Sort and deduplicate a sparse vector in place.

    Args:
        sparse_vec: Input sparse vector (may have unsorted or duplicate
            indices); modified so its indices are sorted and unique
    """
```
**Usage Example:**
```python
import dlib

# Create sparse vector with potential issues
sparse_vec = dlib.sparse_vector()
sparse_vec.extend([
    dlib.pair(5, 2.5),
    dlib.pair(1, 1.0),
    dlib.pair(5, 3.0),  # Duplicate index
    dlib.pair(3, 1.5),
    dlib.pair(1, 0.5)   # Another duplicate
])

print("Original sparse vector:")
for pair in sparse_vec:
    print(f"  Index {pair.first}: {pair.second}")

# Clean up the sparse vector (sorted and deduplicated in place)
dlib.make_sparse_vector(sparse_vec)

print("Cleaned sparse vector:")
for pair in sparse_vec:
    print(f"  Index {pair.first}: {pair.second}")
```
### Interactive Utilities

Simple utilities for interactive use and debugging.

```python { .api }
def hit_enter_to_continue():
    """
    Interactive pause utility that waits for the user to press Enter.
    Useful for debugging and interactive scripts.
    """
```

**Usage Example:**
```python
import dlib

print("Starting data processing...")

# Process some data
data = list(range(1000))
processed = [x * 2 for x in data]

print("Processing complete. Press Enter to continue...")
dlib.hit_enter_to_continue()

print("Continuing with analysis...")
```
### Image Dataset Metadata

Functions for working with image dataset XML metadata files (as used by dlib's object detection training tools). These live in the `dlib.image_dataset_metadata` submodule.

```python { .api }
def load_image_dataset_metadata(filename: str):
    """
    Load image dataset metadata from an XML file.

    Args:
        filename: Path to XML metadata file

    Returns:
        Dataset metadata structure containing image paths and annotations
    """

def save_image_dataset_metadata(metadata, filename: str):
    """
    Save image dataset metadata to an XML file.

    Args:
        metadata: Dataset metadata structure
        filename: Output XML filename
    """
```
**Usage Example:**
```python
import dlib.image_dataset_metadata as metadata

# Load existing dataset metadata
try:
    dataset = metadata.load_image_dataset_metadata("training_dataset.xml")
    print("Loaded dataset metadata successfully")

    # Process or modify dataset
    # ... modify dataset structure ...

    # Save modified dataset
    metadata.save_image_dataset_metadata(dataset, "modified_dataset.xml")

except Exception as e:
    print(f"Error loading dataset: {e}")
```
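To inspect what was loaded, a short sketch (this assumes the `dataset.images` / `image.boxes` attribute layout of `dlib.image_dataset_metadata`; verify the names against your dlib version):
```python
import dlib.image_dataset_metadata as metadata

dataset = metadata.load_image_dataset_metadata("training_dataset.xml")

# Walk the annotation tree: one entry per image, each with its boxes
for image in dataset.images:
    print(f"{image.filename}: {len(image.boxes)} annotated boxes")
```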
### Advanced Filtering Options

Additional filtering utilities for specific use cases.

```python { .api }
def create_kalman_filter(
    initial_state,
    measurement_noise: float,
    process_noise: float
):
    """
    Create a generic Kalman filter.

    Args:
        initial_state: Initial state estimate
        measurement_noise: Measurement noise variance
        process_noise: Process noise variance

    Returns:
        Configured Kalman filter
    """

def apply_temporal_smoothing(
    measurements: list,
    window_size: int = 5,
    method: str = "gaussian"
):
    """
    Apply temporal smoothing to a measurement sequence.

    Args:
        measurements: List of measurements over time
        window_size: Size of smoothing window
        method: Smoothing method ("gaussian", "uniform", "exponential")

    Returns:
        Smoothed measurement sequence
    """
```
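If these helpers are not present in your dlib build, the uniform-window case is easy to sketch with plain NumPy (a hedged equivalent, not dlib's implementation; `window_size` mirrors the parameter documented above):
```python
import numpy as np

def uniform_smooth(measurements, window_size=5):
    """Moving-average smoothing: each output value is the mean of a
    window_size-wide neighborhood of the input sequence."""
    kernel = np.ones(window_size) / window_size
    # mode="same" keeps the output the same length as the input;
    # values near the edges are effectively zero-padded
    return np.convolve(measurements, kernel, mode="same")

noisy = [np.sin(t * 0.1) + np.random.normal(0, 0.2) for t in range(100)]
smoothed = uniform_smooth(noisy, window_size=7)
print(f"raw[50]={noisy[50]:.3f}  smoothed[50]={smoothed[50]:.3f}")
```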
### Performance and Debugging Utilities

Helper functions for performance monitoring and debugging.

```python { .api }
def benchmark_function(func, args: tuple, num_iterations: int = 100) -> float:
    """
    Benchmark function execution time.

    Args:
        func: Function to benchmark
        args: Arguments to pass to function
        num_iterations: Number of iterations to run

    Returns:
        Average execution time in seconds
    """

def memory_usage_estimate(data_structure) -> int:
    """
    Estimate memory usage of a dlib data structure.

    Args:
        data_structure: Dlib object (matrix, vector, etc.)

    Returns:
        Estimated memory usage in bytes
    """
```
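A minimal stopwatch equivalent using only the standard library, for builds where `benchmark_function` is unavailable (`time.perf_counter` is Python's high-resolution timer):
```python
import time
import dlib

def simple_benchmark(func, args=(), num_iterations=100):
    """Average wall-clock time of func(*args) over num_iterations runs."""
    start = time.perf_counter()
    for _ in range(num_iterations):
        func(*args)
    return (time.perf_counter() - start) / num_iterations

# Example: time dlib's increasing-sequence test on a 1000-point series
series = [float(i) for i in range(1000)]
avg = simple_benchmark(dlib.probability_that_sequence_is_increasing, (series,))
print(f"Average time: {avg * 1e6:.1f} microseconds per call")
```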
**Complete Utilities Usage Example:**
```python
import dlib
import numpy as np

def comprehensive_utilities_demo():
    """Demonstrate various utility functions."""

    print("=== Data I/O Demo ===")

    # Create and save dataset
    samples = [dlib.vector([i, i * 2, i * 3]) for i in range(100)]
    labels = [1 if i % 2 == 0 else -1 for i in range(100)]

    dlib.save_libsvm_formatted_data("demo_dataset.libsvm", samples, labels)
    loaded_samples, loaded_labels = dlib.load_libsvm_formatted_data("demo_dataset.libsvm")
    print(f"Saved and loaded {len(loaded_samples)} samples")

    print("\n=== Statistical Analysis Demo ===")

    # Generate time series with trend
    time_series = [i + np.random.normal(0, 0.5) for i in range(50)]

    steps = dlib.count_steps_without_decrease(time_series)
    increasing_prob = dlib.probability_that_sequence_is_increasing(time_series)

    print(f"Steps without decrease: {steps}")
    print(f"Increasing probability: {increasing_prob:.3f}")

    print("\n=== Filtering Demo ===")

    # Create noisy scalar position data (momentum_filter tracks one
    # scalar stream; use one filter per coordinate for 2D tracks)
    true_trajectory = [50 + 20 * np.sin(t * 0.1) for t in range(100)]
    noisy_trajectory = [p + np.random.normal(0, 3) for p in true_trajectory]

    # Apply filtering
    position_filter = dlib.momentum_filter(3.0, 0.1, 2.0)
    filtered_trajectory = [position_filter(p) for p in noisy_trajectory]

    print(f"Filtered {len(filtered_trajectory)} position measurements")

    print("\n=== Assignment Problem Demo ===")

    # Solve assignment problem (integer costs in a dlib.matrix)
    cost_matrix = dlib.matrix([[9, 2, 7, 8],
                               [6, 4, 3, 7],
                               [5, 8, 1, 8],
                               [7, 6, 9, 4]])

    assignment = dlib.max_cost_assignment(cost_matrix)
    total_cost = dlib.assignment_cost(cost_matrix, assignment)

    print(f"Optimal assignment: {assignment}")
    print(f"Total cost: {total_cost}")

    print("\n=== Sparse Vector Demo ===")

    # Create a sparse vector, then sort and deduplicate it in place
    sparse_vec = dlib.sparse_vector()
    sparse_vec.extend([
        dlib.pair(10, 1.0),
        dlib.pair(2, 2.0),
        dlib.pair(10, 3.0),  # Duplicate
        dlib.pair(5, 1.5)
    ])

    dlib.make_sparse_vector(sparse_vec)
    print(f"Cleaned sparse vector with {len(sparse_vec)} unique elements")

    print("\n=== Interactive Demo ===")
    print("Demonstration complete. Press Enter to finish...")
    dlib.hit_enter_to_continue()
    print("Demo finished!")

if __name__ == "__main__":
    comprehensive_utilities_demo()
```

These utility functions provide essential support for machine learning workflows, data processing, and interactive development with dlib.