or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

array-creation.md, cuda-management.md, fft.md, index.md, kernels.md, linear-algebra.md, math-functions.md, random.md, scipy-extensions.md, sparse.md, statistics.md

docs/kernels.md

# Custom Kernels and Performance

Tools for writing custom CUDA kernels and optimizing GPU performance. CuPy provides high-level interfaces for creating custom GPU operations when built-in functions are insufficient.

## Capabilities

### Element-wise Kernels

Create custom element-wise operations that apply to each array element.

```python { .api }
class ElementwiseKernel:
    """
    Custom element-wise CUDA kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - operation: str, CUDA C++ code for operation
    - name: str, kernel name
    - preamble: str, code inserted before kernel
    - loop_prep: str, code before element loop
    - after_loop: str, code after element loop
    """
    def __init__(self, in_params, out_params, operation, name='kernel', preamble='', loop_prep='', after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """
        Execute kernel on input arrays.

        Parameters:
        - args: input and output arrays matching parameter specification
        - size: int, number of elements to process
        - stream: cupy.cuda.Stream, CUDA stream

        Returns:
        cupy.ndarray: Output array(s)
        """

class ReductionKernel:
    """
    Custom reduction CUDA kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - map_expr: str, expression to map input to intermediate values
    - reduce_expr: str, expression to reduce intermediate values
    - post_map_expr: str, expression to post-process results
    - identity: str, identity value for reduction
    - name: str, kernel name
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr, identity, name='kernel'): ...

    def __call__(self, *args, **kwargs):
        """Execute reduction kernel."""

class RawKernel:
    """
    Raw CUDA kernel from source code.

    Parameters:
    - code: str, complete CUDA C++ kernel source
    - name: str, kernel function name
    - options: tuple, compiler options
    - backend: str, compilation backend
    """
    def __init__(self, code, name, options=(), backend='nvrtc'): ...

    def __call__(self, grid, block, args, *, shared_mem=0, stream=None):
        """
        Launch raw CUDA kernel.

        Parameters:
        - grid: tuple, grid dimensions
        - block: tuple, block dimensions
        - args: tuple, kernel arguments
        - shared_mem: int, shared memory size
        - stream: cupy.cuda.Stream, CUDA stream
        """

class RawModule:
    """
    Raw CUDA module containing multiple kernels.

    Parameters:
    - code: str, complete CUDA module source
    - options: tuple, compiler options
    - backend: str, compilation backend
    """
    def __init__(self, code, options=(), backend='nvrtc'): ...

    def get_function(self, name):
        """
        Get kernel function by name.

        Parameters:
        - name: str, function name

        Returns:
        RawKernel: Kernel function
        """
```

### Performance Utilities

Tools for optimizing GPU performance.

```python { .api }
def fuse(*args, **kwargs):
    """
    Decorator for kernel fusion optimization.

    Parameters:
    - args: positional arguments for fusion
    - kwargs: keyword arguments for fusion

    Returns:
    function: Fused function decorator
    """

def clear_memo():
    """Clear memoization cache."""

def memoize(for_each_device=False):
    """
    Memoization decorator for caching function results.

    Parameters:
    - for_each_device: bool, separate cache per device

    Returns:
    function: Memoization decorator
    """
```

## Usage Examples

### Element-wise Kernel

```python
import cupy as cp

# Custom element-wise operation
multiply_add = cp.ElementwiseKernel(
    'T x, T y, T z',    # Input parameters
    'T w',              # Output parameters
    'w = x * y + z',    # Operation
    'multiply_add'      # Kernel name
)

# Use the kernel
a = cp.random.random((1000, 1000))
b = cp.random.random((1000, 1000))
c = cp.random.random((1000, 1000))
result = multiply_add(a, b, c)
```

### Reduction Kernel

```python
# Custom reduction operation (sum of squares)
sum_of_squares = cp.ReductionKernel(
    'T x',              # Input parameter
    'T y',              # Output parameter
    'x * x',            # Map expression
    'a + b',            # Reduce expression
    'y = a',            # Post-map expression
    '0',                # Identity value
    'sum_of_squares'    # Kernel name
)

# Use the reduction kernel
data = cp.random.random((10000,))
result = sum_of_squares(data, axis=None)
```

### Raw CUDA Kernel

```python
# Raw CUDA kernel for advanced operations
cuda_code = '''
extern "C" __global__
void vector_add(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

kernel = cp.RawKernel(cuda_code, 'vector_add')

# Use raw kernel
n = 1000000
a_gpu = cp.random.random((n,), dtype=cp.float32)
b_gpu = cp.random.random((n,), dtype=cp.float32)
c_gpu = cp.zeros((n,), dtype=cp.float32)

threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

kernel((blocks_per_grid,), (threads_per_block,),
       (a_gpu, b_gpu, c_gpu, n))
```