Fast numerical expression evaluator for NumPy that accelerates array operations through optimized implementations and multi-threading
—
Integration with Intel's Vector Math Library (VML) for hardware-accelerated transcendental functions when available. VML provides optimized implementations of mathematical functions that can significantly improve performance for expressions containing trigonometric, exponential, and logarithmic operations.
Control VML library behavior including accuracy modes and threading for optimal performance based on application requirements.
def get_vml_version():
    """
    Return the version string of the VML/MKL library linked into NumExpr.

    Queries the Intel Vector Math Library (or Math Kernel Library) that
    NumExpr was compiled against, when such support was built in.

    Returns:
        str or None: VML/MKL version string if available; ``None`` when
        NumExpr was built without VML support.
    """
def set_vml_accuracy_mode(mode):
    """
    Set the accuracy mode for VML operations.

    Controls the trade-off between computational speed and numerical accuracy
    for VML-accelerated functions. Different modes provide different guarantees
    about precision and performance.

    Parameters:
    - mode (str or None): Accuracy mode setting
        - 'high': High accuracy mode (HA), <1 least significant bit error
        - 'low': Low accuracy mode (LA), typically 1-2 LSB error
        - 'fast': Enhanced performance mode (EP), fastest with relaxed accuracy
        - None: Use VML default mode settings

    Returns:
        str or None: Previous accuracy mode setting

    Raises:
        ValueError: If mode is not one of the supported values
"""Usage Examples:
import numexpr as ne
import numpy as np
# Check VML availability and version
if ne.use_vml:
print(f"VML Version: {ne.get_vml_version()}")
# Set accuracy mode for performance-critical code
old_mode = ne.set_vml_accuracy_mode('fast')
# Perform VML-accelerated computations
x = np.linspace(0, 10, 1000000)
result = ne.evaluate("sin(x) * exp(-x/5) + log(x + 1)")
# Restore previous accuracy mode
ne.set_vml_accuracy_mode(old_mode)
else:
print("VML not available - using standard implementations")Manage threading specifically for VML operations, which may have different optimal settings than general NumExpr threading.
def set_vml_num_threads(nthreads):
    """
    Set the number of threads for VML operations.

    Suggests a maximum number of threads for VML library operations.
    This is independent of NumExpr's general threading (``set_num_threads``)
    and allows fine-tuning of VML performance characteristics.

    Parameters:
    - nthreads (int): Maximum number of threads for VML operations

    Note:
        This function is equivalent to
        ``mkl_domain_set_num_threads(nthreads, MKL_DOMAIN_VML)``
        in the Intel MKL library.
"""Usage Examples:
# Configure VML threading independently
if ne.use_vml:
# Note: get_vml_num_threads() is not available in public API
print(f"Current NumExpr threads: {ne.get_num_threads()}")
# Set VML to use fewer threads than NumExpr
ne.set_num_threads(8) # NumExpr uses 8 threads
ne.set_vml_num_threads(4) # VML uses 4 threads
# Benchmark VML-heavy expression
data = np.random.random(1000000)
result = ne.evaluate("sin(data) + cos(data) + exp(data) + log(data + 1)")Runtime detection of VML availability and capabilities.
# VML availability flag
use_vml: bool  # True if VML support is available and enabled

Usage Examples:
# Conditional logic based on VML availability
if ne.use_vml:
# Use VML-optimized expressions
expression = "sin(a) * cos(b) + exp(c) * log(d + 1)"
ne.set_vml_accuracy_mode('fast') # Prioritize speed
else:
# Fallback to simpler expressions or warn user
print("Warning: VML not available, performance may be limited")
expression = "a * 0.8414 + b * 0.5403 + c * 2.718 + d * 0.693" # ApproximationsWhen VML is available, the following functions receive hardware acceleration:
Trigonometric Functions:
sin, cos, tan
arcsin, arccos, arctan, arctan2
sinh, cosh, tanh
arcsinh, arccosh, arctanh

Exponential and Logarithmic:
exp, expm1
log, log1p, log10

Power Functions:
sqrt
pow (power operations)

Other Functions:
absolute/abs
conjugate
ceil, floor
fmod
div, inv (division and inverse)

Speed Improvements:
Accuracy Modes:
VML support requires Intel MKL to be available during NumExpr compilation:
# Install NumExpr with MKL support via conda (recommended)
conda install numexpr
# Or compile from source with MKL
# 1. Install Intel MKL
# 2. Copy site.cfg.example to site.cfg
# 3. Edit site.cfg to point to MKL libraries
# 4. Build: python setup.py build

import numexpr as ne
# Check if VML is available
print(f"VML available: {ne.use_vml}")
if ne.use_vml:
print(f"VML version: {ne.get_vml_version()}")
# VML threading information not available via public API
# Test VML acceleration
import numpy as np
import time
x = np.random.random(1000000)
# Time VML-accelerated expression
start = time.time()
result_vml = ne.evaluate("sin(x) + cos(x) + exp(x)")
vml_time = time.time() - start
# Time equivalent NumPy expression
start = time.time()
result_numpy = np.sin(x) + np.cos(x) + np.exp(x)
numpy_time = time.time() - start
print(f"VML time: {vml_time:.4f}s")
print(f"NumPy time: {numpy_time:.4f}s")
print(f"Speedup: {numpy_time/vml_time:.2f}x")import numpy as np
import numexpr as ne
def benchmark_vml_modes(expression, data_dict):
    """
    Benchmark VML accuracy modes ('high', 'low', 'fast') for an expression.

    Parameters:
    - expression (str): NumExpr expression string to evaluate.
    - data_dict (dict): Mapping of variable names to arrays, passed as
      ``local_dict`` to ``ne.evaluate``.

    Returns:
        dict: Per-mode results, each entry holding the mean evaluation
        'time' in seconds and a 'sample_result' (first five values) for
        accuracy comparison. Empty dict when VML is unavailable.
    """
    import time  # local import keeps this documentation example self-contained

    if not ne.use_vml:
        print("VML not available")
        return {}  # empty dict so callers can still iterate .items() safely

    modes = ['high', 'low', 'fast']
    results = {}
    # Remember the caller's accuracy mode so benchmarking has no lasting
    # side effects; set_vml_accuracy_mode returns the previous setting.
    previous_mode = ne.set_vml_accuracy_mode(modes[0])
    try:
        for mode in modes:
            ne.set_vml_accuracy_mode(mode)
            # Warm up so one-time expression-compilation cost is not timed.
            ne.evaluate(expression, local_dict=data_dict)
            # Time multiple evaluations and report the average.
            start = time.time()
            for _ in range(100):
                result = ne.evaluate(expression, local_dict=data_dict)
            elapsed = time.time() - start
            results[mode] = {
                'time': elapsed / 100,
                'sample_result': result[:5],  # first few values for comparison
            }
    finally:
        # Restore whatever accuracy mode was active before the benchmark.
        ne.set_vml_accuracy_mode(previous_mode)
    return results
# Example usage
data = {'x': np.linspace(0.1, 10, 100000)}
results = benchmark_vml_modes("sin(x) + cos(x) + log(x)", data)
for mode, info in results.items():
print(f"{mode}: {info['time']:.6f}s, sample: {info['sample_result']}")# Strategy 1: Match VML threads to NumExpr threads
ne.set_num_threads(4)
ne.set_vml_num_threads(4)
# Strategy 2: Use fewer VML threads for memory-bound operations
ne.set_num_threads(8)
ne.set_vml_num_threads(2) # Reduce VML threading to avoid memory bandwidth limits
# Strategy 3: Disable VML threading for small arrays
if array_size < 10000:
ne.set_vml_num_threads(1)
else:
ne.set_vml_num_threads(ne.get_num_threads())

# VML-friendly expression patterns
vml_optimized = "sin(a) * cos(b) + exp(c/10) * sqrt(d)" # Uses VML functions
# Less VML-friendly (uses non-VML operations)
mixed_expression = "where(a > 0, sin(a), cos(a)) + b**3" # where() not VML-accelerated
# Consider rewriting for better VML utilization
# Instead of: where(x > 0, sin(x), 0)
# Use: (x > 0) * sin(x)  # Better VML utilization

VML integration provides substantial performance improvements for mathematical expressions, particularly those involving transcendental functions. The key is balancing accuracy requirements with performance needs and properly configuring threading for your specific use case.
Install with Tessl CLI
npx tessl i tessl/pypi-numexpr