Python wrapper for Nvidia CUDA parallel computation API with object cleanup, automatic error checking, and convenient abstractions.
Pre-built, optimized kernels for common parallel operations including element-wise operations, reductions, and prefix scans with automatic type handling. These kernels provide high-performance implementations of frequently used parallel algorithms.
Generate kernels for element-wise operations on GPU arrays with automatic type handling and optimized memory access patterns.
class ElementwiseKernel:
    """Compiled kernel that applies a scalar operation to every array element."""

    def __init__(self, arguments: str, operation: str, name: str = "kernel",
                 keep: bool = False, options: list = None, preamble: str = "",
                 loop_prep: str = "", after_loop: str = ""):
        """Compile an element-wise operation kernel.

        Args:
            arguments: Kernel argument specification.
            operation: Element-wise operation code (runs once per element).
            name: Name given to the generated kernel function.
            keep: Keep the generated source files on disk.
            options: Extra compiler options.
            preamble: Code emitted before the kernel.
            loop_prep: Code emitted before the operation loop.
            after_loop: Code emitted after the operation loop.
        """

    def __call__(self, *args, **kwargs) -> None:
        """Launch the kernel on the given arguments.

        Args:
            *args: Kernel arguments matching the argument specification.
            **kwargs: Optional ``range`` (slice of elements to process),
                ``slice`` (deprecated alias for ``range``) and ``stream``
                (CUDA stream to launch on).
        """
def get_elwise_kernel(arguments: str, operation: str, name: str = "kernel", **kwargs) -> ElementwiseKernel:
    """Return a cached, compiled element-wise kernel.

    Args:
        arguments: Argument specification.
        operation: Operation code.
        name: Kernel name.
        **kwargs: Additional kernel options.

    Returns:
        ElementwiseKernel: compiled kernel function.
    """

# Pre-built element-wise operation kernels for common operations.
def get_binary_op_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,
                         operator: str, x_is_scalar: bool = False, y_is_scalar: bool = False) -> ElementwiseKernel:
    """Build a kernel applying a binary operator element-wise.

    Args:
        dtype_x: Data type of the first operand.
        dtype_y: Data type of the second operand.
        dtype_z: Data type of the result.
        operator: Binary operator to apply (``+``, ``-``, ``*``, ``/``, etc.).
        x_is_scalar: Treat the first operand as a scalar.
        y_is_scalar: Treat the second operand as a scalar.

    Returns:
        ElementwiseKernel: binary operation kernel.
    """
def get_axpbyz_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,
                      x_is_scalar: bool = False, y_is_scalar: bool = False) -> ElementwiseKernel:
    """Build an AXPBYZ kernel computing ``z = a*x + b*y``.

    Args:
        dtype_x: Data type of ``x``.
        dtype_y: Data type of ``y``.
        dtype_z: Data type of ``z``.
        x_is_scalar: Treat ``x`` as a scalar.
        y_is_scalar: Treat ``y`` as a scalar.

    Returns:
        ElementwiseKernel: AXPBYZ operation kernel.
    """
def get_axpbz_kernel(dtype_x: np.dtype, dtype_z: np.dtype) -> ElementwiseKernel:
    """Build an AXPBZ kernel computing ``z = a*x + b``.

    Note: the original doc said ``z = a*x + b*z``; pycuda's axpbz kernel is a
    scalar multiply-add, ``z[i] = a*x[i] + b`` (both ``a`` and ``b`` scalars).

    Args:
        dtype_x: Data type of ``x``.
        dtype_z: Data type of ``z``.

    Returns:
        ElementwiseKernel: AXPBZ operation kernel.
    """
def get_linear_combination_kernel(summand_descriptors: list, dtype_z: np.dtype) -> ElementwiseKernel:
    """Build a kernel computing a linear combination of several arrays.

    Args:
        summand_descriptors: List of ``(coeff_dtype, var_dtype)`` tuples,
            one per summand.
        dtype_z: Data type of the result.

    Returns:
        ElementwiseKernel: linear combination kernel.
    """
def get_copy_kernel(dtype_dest: np.dtype, dtype_src: np.dtype) -> ElementwiseKernel:
    """Build an array-copy kernel that also converts between data types.

    Args:
        dtype_dest: Destination data type.
        dtype_src: Source data type.

    Returns:
        ElementwiseKernel: copy kernel.
    """
def get_fill_kernel(dtype: np.dtype) -> ElementwiseKernel:
    """Build a kernel that fills an array with a constant value.

    Args:
        dtype: Array data type.

    Returns:
        ElementwiseKernel: fill kernel.
    """
def get_reverse_kernel(dtype: np.dtype) -> ElementwiseKernel:
    """Build a kernel that reverses the order of array elements.

    Args:
        dtype: Array data type.

    Returns:
        ElementwiseKernel: reverse kernel.
    """
def get_arange_kernel(dtype: np.dtype) -> ElementwiseKernel:
    """Build a kernel that fills an array with a sequential range of values.

    Args:
        dtype: Array data type.

    Returns:
        ElementwiseKernel: arange kernel.
    """
def get_pow_array_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,
                         is_base_array: bool, is_exp_array: bool) -> ElementwiseKernel:
    """Build a kernel computing element-wise powers.

    Args:
        dtype_x: Data type of the base.
        dtype_y: Data type of the exponent.
        dtype_z: Data type of the result.
        is_base_array: Base is an array (rather than a scalar).
        is_exp_array: Exponent is an array (rather than a scalar).

    Returns:
        ElementwiseKernel: power operation kernel.
    """
def get_unary_func_kernel(func_name: str, in_dtype: np.dtype, out_dtype: np.dtype = None) -> ElementwiseKernel:
    """Build a kernel applying a unary math function element-wise.

    Args:
        func_name: Function name (``sin``, ``cos``, ``exp``, etc.).
        in_dtype: Input data type.
        out_dtype: Output data type (defaults to ``in_dtype``).

    Returns:
        ElementwiseKernel: unary function kernel.
    """

# Kernels for advanced array indexing operations.
def get_take_kernel(dtype: np.dtype, idx_dtype: np.dtype, vec_count: int = 1) -> ElementwiseKernel:
    """Build a take (fancy-indexing gather) kernel.

    Args:
        dtype: Array element data type.
        idx_dtype: Index array data type.
        vec_count: Number of vector components per element.

    Returns:
        ElementwiseKernel: take kernel.
    """
def get_take_put_kernel(dtype: np.dtype, idx_dtype: np.dtype,
                        with_offsets: bool, vec_count: int = 1) -> ElementwiseKernel:
    """Build a take-put kernel performing gather followed by indexed store.

    Args:
        dtype: Array element data type.
        idx_dtype: Index array data type.
        with_offsets: Use offset indexing.
        vec_count: Number of vector components per element.

    Returns:
        ElementwiseKernel: take-put kernel.
    """
def get_put_kernel(dtype: np.dtype, idx_dtype: np.dtype, vec_count: int = 1) -> ElementwiseKernel:
    """Build a put (indexed-assignment scatter) kernel.

    Args:
        dtype: Array element data type.
        idx_dtype: Index array data type.
        vec_count: Number of vector components per element.

    Returns:
        ElementwiseKernel: put kernel.
    """

# Parallel reduction operations for computing aggregate values.
class ReductionKernel:
    """Compiled kernel that reduces an array to a single value."""

    def __init__(self, dtype: np.dtype, neutral: str, reduce_expr: str,
                 map_expr: str = None, arguments: str = None, name: str = "reduce_kernel",
                 keep: bool = False, options: list = None, preamble: str = ""):
        """Compile a reduction kernel.

        Args:
            dtype: Data type of the reduction.
            neutral: Neutral element of the reduction.
            reduce_expr: Expression combining two partial results.
            map_expr: Expression mapped over the input before reduction.
            arguments: Additional kernel arguments.
            name: Kernel function name.
            keep: Keep the generated source files on disk.
            options: Extra compiler options.
            preamble: Code emitted before the kernel.
        """

    def __call__(self, input_array: GPUArray, stream: Stream = None,
                 allocator=None) -> GPUArray:
        """Run the reduction over an array.

        Args:
            input_array: Array to reduce.
            stream: Optional CUDA stream to run on.
            allocator: Optional memory allocator.

        Returns:
            GPUArray: the reduction result as a scalar array.
        """
def get_sum_kernel(dtype_out: np.dtype, dtype_in: np.dtype) -> ReductionKernel:
    """Build a sum-reduction kernel.

    Args:
        dtype_out: Output data type.
        dtype_in: Input data type.

    Returns:
        ReductionKernel: sum reduction kernel.
    """
def get_dot_kernel(dtype_out: np.dtype, dtype_a: np.dtype, dtype_b: np.dtype = None) -> ReductionKernel:
    """Build a dot-product reduction kernel.

    Args:
        dtype_out: Output data type.
        dtype_a: Data type of the first array.
        dtype_b: Data type of the second array (defaults to ``dtype_a``).

    Returns:
        ReductionKernel: dot product kernel.
    """
def get_minmax_kernel(what: str, dtype: np.dtype) -> ReductionKernel:
    """Build a minimum or maximum reduction kernel.

    Args:
        what: Either ``"min"`` or ``"max"``.
        dtype: Array data type.

    Returns:
        ReductionKernel: min/max reduction kernel.
    """
def get_subset_sum_kernel(dtype_out: np.dtype, dtype_subset: np.dtype, dtype_in: np.dtype) -> ReductionKernel:
    """Build a masked (subset) sum-reduction kernel.

    Args:
        dtype_out: Output data type.
        dtype_subset: Data type of the mask array.
        dtype_in: Data type of the input array.

    Returns:
        ReductionKernel: subset sum kernel.
    """
def get_subset_dot_kernel(dtype_out: np.dtype, dtype_subset: np.dtype,
                          dtype_a: np.dtype = None, dtype_b: np.dtype = None) -> ReductionKernel:
    """Build a masked (subset) dot-product reduction kernel.

    Args:
        dtype_out: Output data type.
        dtype_subset: Data type of the mask array.
        dtype_a: Data type of the first array.
        dtype_b: Data type of the second array.

    Returns:
        ReductionKernel: subset dot product kernel.
    """

# Parallel prefix scan (cumulative) operations.
class InclusiveScanKernel:
    """Parallel inclusive prefix scan: output element i combines inputs 0..i."""

    def __init__(self, dtype: np.dtype, scan_expr: str, neutral: str = None,
                 name_prefix: str = "scan", options: list = None, preamble: str = "",
                 devices: list = None):
        """Compile an inclusive scan kernel.

        Args:
            dtype: Data type scanned.
            scan_expr: Binary expression combining two scan operands.
            neutral: Neutral element of the scan operation.
            name_prefix: Prefix for the generated kernel names.
            options: Extra compiler options.
            preamble: Code emitted before the kernel.
            devices: Devices to target.
        """

    def __call__(self, input_ary: GPUArray, output_ary: GPUArray = None,
                 allocator=None, stream: Stream = None) -> GPUArray:
        """Run the inclusive scan.

        Args:
            input_ary: Array to scan.
            output_ary: Optional array receiving the result.
            allocator: Optional memory allocator.
            stream: Optional CUDA stream to run on.

        Returns:
            GPUArray: the scanned array.
        """
class ExclusiveScanKernel:
    """Parallel exclusive prefix scan: output element i combines inputs 0..i-1."""

    def __init__(self, dtype: np.dtype, scan_expr: str, neutral: str,
                 name_prefix: str = "scan", options: list = None, preamble: str = "",
                 devices: list = None):
        """Compile an exclusive scan kernel.

        Args:
            dtype: Data type scanned.
            scan_expr: Binary expression combining two scan operands.
            neutral: Neutral element of the scan operation (required here,
                unlike the inclusive variant, since it seeds the first output).
            name_prefix: Prefix for the generated kernel names.
            options: Extra compiler options.
            preamble: Code emitted before the kernel.
            devices: Devices to target.
        """

    def __call__(self, input_ary: GPUArray, output_ary: GPUArray = None,
                 allocator=None, stream: Stream = None) -> GPUArray:
        """Run the exclusive scan.

        Args:
            input_ary: Array to scan.
            output_ary: Optional array receiving the result.
            allocator: Optional memory allocator.
            stream: Optional CUDA stream to run on.

        Returns:
            GPUArray: the scanned array.
        """

import pycuda.gpuarray as gpuarray
import numpy as np  # host arrays for the examples below
from pycuda.elementwise import ElementwiseKernel

# Custom element-wise operation: complex magnitude
magnitude_kernel = ElementwiseKernel(
    "pycuda::complex<float> *z, float *out",
    "out[i] = abs(z[i])",
    "magnitude"
)

# Execute kernel
complex_array = gpuarray.to_gpu(np.array([1+2j, 3+4j, 5+6j], dtype=np.complex64))
result = gpuarray.empty(complex_array.shape, dtype=np.float32)
magnitude_kernel(complex_array, result)

from pycuda.reduction import ReductionKernel
# Custom reduction: sum of squares
sum_squares = ReductionKernel(
    np.float32,            # output dtype
    neutral="0",           # neutral element
    reduce_expr="a+b",     # reduction operation
    map_expr="x[i]*x[i]",  # pre-reduction mapping
    arguments="float *x"   # input arguments
)

# Execute reduction
input_array = gpuarray.to_gpu(np.array([1, 2, 3, 4, 5], dtype=np.float32))
result = sum_squares(input_array).get()  # Returns sum of squares

from pycuda.scan import InclusiveScanKernel
# Cumulative sum scan
cumsum_kernel = InclusiveScanKernel(
    np.int32,     # data type
    "a+b",        # scan operation
    neutral="0"   # neutral element
)

# Execute scan
input_array = gpuarray.to_gpu(np.array([1, 2, 3, 4, 5], dtype=np.int32))
cumulative_sum = cumsum_kernel(input_array)
# Result: [1, 3, 6, 10, 15]

Install with Tessl CLI
npx tessl i tessl/pypi-pycudadocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10