0
# GPU Computing
1
2
High-performance GPU computing capabilities through CUDA, OpenCL, and associated libraries for parallel processing and acceleration.
3
4
## Capabilities
5
6
### CUDA Runtime Operations
7
8
NVIDIA CUDA runtime API for GPU device management, memory operations, and kernel execution.
9
10
```java { .api }
11
/**
12
* Device management functions
13
*/
14
public static class CUDADevice {
15
/**
16
* Get number of CUDA devices
17
* @param count Output device count
18
* @return CUDA error code
19
*/
20
public static native int cudaGetDeviceCount(IntPointer count);
21
22
/**
23
* Set current CUDA device
24
* @param device Device index to use
25
* @return CUDA error code
26
*/
27
public static native int cudaSetDevice(int device);
28
29
/**
30
* Get current CUDA device
31
* @param device Output current device index
32
* @return CUDA error code
33
*/
34
public static native int cudaGetDevice(IntPointer device);
35
36
/**
37
* Get device properties
38
* @param prop Output device properties
39
* @param device Device index
40
* @return CUDA error code
41
*/
42
public static native int cudaGetDeviceProperties(cudaDeviceProp prop, int device);
43
44
/**
45
* Synchronize with current device
46
* @return CUDA error code
47
*/
48
public static native int cudaDeviceSynchronize();
49
50
/**
51
* Reset current device
52
* @return CUDA error code
53
*/
54
public static native int cudaDeviceReset();
55
}
56
57
/**
58
* Memory management functions
59
*/
60
public static class CUDAMemory {
61
/**
62
* Allocate device memory
63
* @param devPtr Output pointer to allocated memory
64
* @param size Size in bytes to allocate
65
* @return CUDA error code
66
*/
67
public static native int cudaMalloc(PointerPointer devPtr, long size);
68
69
/**
70
* Free device memory
71
* @param devPtr Device pointer to free
72
* @return CUDA error code
73
*/
74
public static native int cudaFree(Pointer devPtr);
75
76
/**
77
* Copy memory between host and device
78
* @param dst Destination pointer
79
* @param src Source pointer
80
* @param count Size in bytes
81
* @param kind Copy direction (cudaMemcpyHostToDevice, etc.)
82
* @return CUDA error code
83
*/
84
public static native int cudaMemcpy(Pointer dst, Pointer src, long count, int kind);
85
86
/**
87
* Asynchronous memory copy
88
* @param dst Destination pointer
89
* @param src Source pointer
90
* @param count Size in bytes
91
* @param kind Copy direction
92
* @param stream CUDA stream
93
* @return CUDA error code
94
*/
95
public static native int cudaMemcpyAsync(Pointer dst, Pointer src, long count,
96
int kind, cudaStream_t stream);
97
98
/**
99
* Set device memory to value
100
* @param devPtr Device pointer
101
* @param value Value to set (byte)
102
* @param count Number of bytes
103
* @return CUDA error code
104
*/
105
public static native int cudaMemset(Pointer devPtr, int value, long count);
106
107
/**
108
* Allocate page-locked host memory
109
* @param ptr Output pointer to allocated host memory
110
* @param size Size in bytes
111
* @return CUDA error code
112
*/
113
public static native int cudaMallocHost(PointerPointer ptr, long size);
114
115
/**
116
* Free page-locked host memory
117
* @param ptr Host pointer to free
118
* @return CUDA error code
119
*/
120
public static native int cudaFreeHost(Pointer ptr);
121
}
122
123
/**
124
* Stream management for asynchronous operations
125
*/
126
public static class CUDAStream {
127
/**
128
* Create CUDA stream
129
* @param pStream Output stream handle
130
* @return CUDA error code
131
*/
132
public static native int cudaStreamCreate(cudaStream_t pStream);
133
134
/**
135
* Destroy CUDA stream
136
* @param stream Stream to destroy
137
* @return CUDA error code
138
*/
139
public static native int cudaStreamDestroy(cudaStream_t stream);
140
141
/**
142
* Synchronize with stream
143
* @param stream Stream to synchronize
144
* @return CUDA error code
145
*/
146
public static native int cudaStreamSynchronize(cudaStream_t stream);
147
148
/**
149
* Query stream status
150
* @param stream Stream to query
151
* @return CUDA error code (cudaSuccess if complete)
152
*/
153
public static native int cudaStreamQuery(cudaStream_t stream);
154
}
155
156
/**
157
* CUDA device properties structure
158
*/
159
public class cudaDeviceProp extends Pointer {
160
/** Device name */
161
public native String name();
162
163
/** Total global memory in bytes */
164
public native long totalGlobalMem();
165
166
/** Shared memory per block */
167
public native long sharedMemPerBlock();
168
169
/** Number of registers per block */
170
public native int regsPerBlock();
171
172
/** Warp size */
173
public native int warpSize();
174
175
/** Maximum threads per block */
176
public native int maxThreadsPerBlock();
177
178
/** Maximum block dimensions */
179
public native IntPointer maxThreadsDim();
180
181
/** Maximum grid dimensions */
182
public native IntPointer maxGridSize();
183
184
/** Compute capability major version */
185
public native int major();
186
187
/** Compute capability minor version */
188
public native int minor();
189
190
/** Number of multiprocessors */
191
public native int multiProcessorCount();
192
}
193
```
194
195
### cuBLAS Operations
196
197
CUDA Basic Linear Algebra Subprograms for GPU-accelerated linear algebra.
198
199
```java { .api }
200
/**
201
* cuBLAS context and initialization
202
*/
203
public static class cuBLASContext {
204
/**
205
* Create cuBLAS handle
206
* @param handle Output handle
207
* @return cuBLAS status
208
*/
209
public static native int cublasCreate_v2(cublasHandle_t handle);
210
211
/**
212
* Destroy cuBLAS handle
213
* @param handle Handle to destroy
214
* @return cuBLAS status
215
*/
216
public static native int cublasDestroy_v2(cublasHandle_t handle);
217
218
/**
219
* Set cuBLAS stream
220
* @param handle cuBLAS handle
221
* @param streamId CUDA stream
222
* @return cuBLAS status
223
*/
224
public static native int cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId);
225
}
226
227
/**
228
* cuBLAS Level 3 operations (matrix-matrix)
229
*/
230
public static class cuBLASLevel3 {
231
/**
232
* Single precision matrix multiplication: C = α*A*B + β*C
233
* @param handle cuBLAS handle
234
* @param transa Transpose operation for A
235
* @param transb Transpose operation for B
236
* @param m Number of rows in A and C
237
* @param n Number of columns in B and C
238
* @param k Number of columns in A and rows in B
239
* @param alpha Scalar α
240
* @param A Matrix A on device
241
* @param lda Leading dimension of A
242
* @param B Matrix B on device
243
* @param ldb Leading dimension of B
244
* @param beta Scalar β
245
* @param C Matrix C on device
246
* @param ldc Leading dimension of C
247
* @return cuBLAS status
248
*/
249
public static native int cublasSgemm_v2(cublasHandle_t handle, int transa, int transb,
250
int m, int n, int k, FloatPointer alpha, FloatPointer A, int lda,
251
FloatPointer B, int ldb, FloatPointer beta, FloatPointer C, int ldc);
252
253
/**
254
* Double precision matrix multiplication
255
*/
256
public static native int cublasDgemm_v2(cublasHandle_t handle, int transa, int transb,
257
int m, int n, int k, DoublePointer alpha, DoublePointer A, int lda,
258
DoublePointer B, int ldb, DoublePointer beta, DoublePointer C, int ldc);
259
260
/**
261
* Batched matrix multiplication
262
* @param handle cuBLAS handle
263
* @param transa Transpose operation for A
264
* @param transb Transpose operation for B
265
* @param m Number of rows in A and C
266
* @param n Number of columns in B and C
267
* @param k Number of columns in A and rows in B
268
* @param alpha Scalar α
269
* @param Aarray Array of pointers to matrices A
270
* @param lda Leading dimension of A
271
* @param Barray Array of pointers to matrices B
272
* @param ldb Leading dimension of B
273
* @param beta Scalar β
274
* @param Carray Array of pointers to matrices C
275
* @param ldc Leading dimension of C
276
* @param batchCount Number of matrices
277
* @return cuBLAS status
278
*/
279
public static native int cublasSgemmBatched(cublasHandle_t handle, int transa, int transb,
280
int m, int n, int k, FloatPointer alpha, PointerPointer Aarray, int lda,
281
PointerPointer Barray, int ldb, FloatPointer beta, PointerPointer Carray,
282
int ldc, int batchCount);
283
}
284
```
285
286
### cuDNN Deep Learning
287
288
CUDA Deep Neural Network library for accelerated deep learning operations.
289
290
```java { .api }
291
/**
292
* cuDNN context management
293
*/
294
public static class cuDNNContext {
295
/**
296
* Create cuDNN handle
297
* @param handle Output handle
298
* @return cuDNN status
299
*/
300
public static native int cudnnCreate(cudnnHandle_t handle);
301
302
/**
303
* Destroy cuDNN handle
304
* @param handle Handle to destroy
305
* @return cuDNN status
306
*/
307
public static native int cudnnDestroy(cudnnHandle_t handle);
308
309
/**
310
* Set cuDNN stream
311
* @param handle cuDNN handle
312
* @param stream CUDA stream
313
* @return cuDNN status
314
*/
315
public static native int cudnnSetStream(cudnnHandle_t handle, cudaStream_t stream);
316
}
317
318
/**
319
* Tensor descriptor management
320
*/
321
public static class cuDNNTensor {
322
/**
323
* Create tensor descriptor
324
* @param tensorDesc Output tensor descriptor
325
* @return cuDNN status
326
*/
327
public static native int cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
328
329
/**
330
* Set tensor descriptor
331
* @param tensorDesc Tensor descriptor
332
*        (its layout is defined by dimA/strideA; this call takes no format argument — cudnnSetTensorNdDescriptorEx accepts NCHW, NHWC, etc.)
333
* @param dataType Data type (float, half, etc.)
334
* @param nbDims Number of dimensions
335
* @param dimA Dimension sizes
336
* @param strideA Stride sizes
337
* @return cuDNN status
338
*/
339
public static native int cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
340
int dataType, int nbDims, IntPointer dimA, IntPointer strideA);
341
342
/**
343
* Destroy tensor descriptor
344
* @param tensorDesc Tensor descriptor to destroy
345
* @return cuDNN status
346
*/
347
public static native int cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
348
}
349
350
/**
351
* Convolution operations
352
*/
353
public static class cuDNNConvolution {
354
/**
355
* Create convolution descriptor
356
* @param convDesc Output convolution descriptor
357
* @return cuDNN status
358
*/
359
public static native int cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc);
360
361
/**
362
* Set convolution descriptor
363
* @param convDesc Convolution descriptor
364
* @param arrayLength Number of dimensions
365
* @param padA Padding for each dimension
366
* @param filterStrideA Stride for each dimension
367
* @param dilationA Dilation for each dimension
368
* @param mode Convolution mode
369
* @param computeType Computation data type
370
* @return cuDNN status
371
*/
372
public static native int cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,
373
int arrayLength, IntPointer padA, IntPointer filterStrideA, IntPointer dilationA,
374
int mode, int computeType);
375
376
/**
377
* Forward convolution
378
* @param handle cuDNN handle
379
* @param alpha Scaling factor for input
380
* @param xDesc Input tensor descriptor
381
* @param x Input tensor data
382
* @param wDesc Filter tensor descriptor
383
* @param w Filter data
384
* @param convDesc Convolution descriptor
385
* @param algo Convolution algorithm
386
* @param workSpace Workspace memory
387
* @param workSpaceSizeInBytes Workspace size
388
* @param beta Scaling factor for output
389
* @param yDesc Output tensor descriptor
390
* @param y Output tensor data
391
* @return cuDNN status
392
*/
393
public static native int cudnnConvolutionForward(cudnnHandle_t handle,
394
Pointer alpha, cudnnTensorDescriptor_t xDesc, Pointer x,
395
cudnnFilterDescriptor_t wDesc, Pointer w, cudnnConvolutionDescriptor_t convDesc,
396
int algo, Pointer workSpace, long workSpaceSizeInBytes, Pointer beta,
397
cudnnTensorDescriptor_t yDesc, Pointer y);
398
}
399
400
/**
401
* Activation functions
402
*/
403
public static class cuDNNActivation {
404
/**
405
* Create activation descriptor
406
* @param activationDesc Output activation descriptor
407
* @return cuDNN status
408
*/
409
public static native int cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
410
411
/**
412
* Set activation descriptor
413
* @param activationDesc Activation descriptor
414
* @param mode Activation mode (sigmoid, relu, tanh, etc.)
415
* @param reluNanOpt NaN propagation mode
416
* @param coef Coefficient for some activation modes
417
* @return cuDNN status
418
*/
419
public static native int cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
420
int mode, int reluNanOpt, double coef);
421
422
/**
423
* Forward activation
424
* @param handle cuDNN handle
425
* @param activationDesc Activation descriptor
426
* @param alpha Scaling factor for input
427
* @param xDesc Input tensor descriptor
428
* @param x Input tensor data
429
* @param beta Scaling factor for output
430
* @param yDesc Output tensor descriptor
431
* @param y Output tensor data
432
* @return cuDNN status
433
*/
434
public static native int cudnnActivationForward(cudnnHandle_t handle,
435
cudnnActivationDescriptor_t activationDesc, Pointer alpha,
436
cudnnTensorDescriptor_t xDesc, Pointer x, Pointer beta,
437
cudnnTensorDescriptor_t yDesc, Pointer y);
438
}
439
```
440
441
### OpenCL Cross-Platform Computing
442
443
OpenCL API for cross-platform parallel computing across CPUs, GPUs, and other devices.
444
445
```java { .api }
446
/**
447
* OpenCL platform and device management
448
*/
449
public static class OpenCLPlatform {
450
/**
451
* Get platform IDs
452
* @param num_entries Number of platform entries
453
* @param platforms Output platform array
454
* @param num_platforms Actual number of platforms found
455
* @return OpenCL error code
456
*/
457
public static native int clGetPlatformIDs(int num_entries, cl_platform_id platforms,
458
IntPointer num_platforms);
459
460
/**
461
* Get platform information
462
* @param platform Platform ID
463
* @param param_name Information parameter
464
* @param param_value_size Size of output buffer
465
* @param param_value Output buffer
466
* @param param_value_size_ret Actual size of information
467
* @return OpenCL error code
468
*/
469
public static native int clGetPlatformInfo(cl_platform_id platform, int param_name,
470
long param_value_size, Pointer param_value, SizeTPointer param_value_size_ret);
471
472
/**
473
* Get device IDs for platform
474
* @param platform Platform ID
475
* @param device_type Device type filter (GPU, CPU, ALL, etc.)
476
* @param num_entries Number of device entries
477
* @param devices Output device array
478
* @param num_devices Actual number of devices found
479
* @return OpenCL error code
480
*/
481
public static native int clGetDeviceIDs(cl_platform_id platform, long device_type,
482
int num_entries, cl_device_id devices, IntPointer num_devices);
483
484
/**
485
* Get device information
486
* @param device Device ID
487
* @param param_name Information parameter
488
* @param param_value_size Size of output buffer
489
* @param param_value Output buffer
490
* @param param_value_size_ret Actual size of information
491
* @return OpenCL error code
492
*/
493
public static native int clGetDeviceInfo(cl_device_id device, int param_name,
494
long param_value_size, Pointer param_value, SizeTPointer param_value_size_ret);
495
}
496
497
/**
498
* OpenCL context and command queue management
499
*/
500
public static class OpenCLContext {
501
/**
502
* Create OpenCL context
503
* @param properties Context properties
504
* @param num_devices Number of devices
505
* @param devices Device array
506
* @param pfn_notify Notification callback
507
* @param user_data User data for callback
508
* @param errcode_ret Error code output
509
* @return Context handle
510
*/
511
public static native cl_context clCreateContext(cl_context_properties properties,
512
int num_devices, cl_device_id devices, CreateContextCallbackFunction pfn_notify,
513
Pointer user_data, IntPointer errcode_ret);
514
515
/**
516
* Release context
517
* @param context Context to release
518
* @return OpenCL error code
519
*/
520
public static native int clReleaseContext(cl_context context);
521
522
/**
523
* Create command queue
524
* @param context OpenCL context
525
* @param device Target device
526
* @param properties Queue properties
527
* @param errcode_ret Error code output
528
* @return Command queue handle
529
*/
530
public static native cl_command_queue clCreateCommandQueue(cl_context context,
531
cl_device_id device, long properties, IntPointer errcode_ret);
532
533
/**
534
* Release command queue
535
* @param command_queue Queue to release
536
* @return OpenCL error code
537
*/
538
public static native int clReleaseCommandQueue(cl_command_queue command_queue);
539
}
540
541
/**
542
* OpenCL memory management
543
*/
544
public static class OpenCLMemory {
545
/**
546
* Create buffer object
547
* @param context OpenCL context
548
* @param flags Memory flags (read/write permissions, etc.)
549
* @param size Buffer size in bytes
550
* @param host_ptr Host memory pointer (optional)
551
* @param errcode_ret Error code output
552
* @return Memory object handle
553
*/
554
public static native cl_mem clCreateBuffer(cl_context context, long flags, long size,
555
Pointer host_ptr, IntPointer errcode_ret);
556
557
/**
558
* Release memory object
559
* @param memobj Memory object to release
560
* @return OpenCL error code
561
*/
562
public static native int clReleaseMemObject(cl_mem memobj);
563
564
/**
565
* Enqueue buffer write operation
566
* @param command_queue Command queue
567
* @param buffer Target buffer
568
* @param blocking_write Blocking operation flag
569
* @param offset Offset in buffer
570
* @param size Size to write
571
* @param ptr Source data pointer
572
* @param num_events_in_wait_list Number of events to wait for
573
* @param event_wait_list Events to wait for
574
* @param event Output event
575
* @return OpenCL error code
576
*/
577
public static native int clEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer,
578
int blocking_write, long offset, long size, Pointer ptr, int num_events_in_wait_list,
579
cl_event event_wait_list, cl_event event);
580
581
/**
582
* Enqueue buffer read operation
583
* @param command_queue Command queue
584
* @param buffer Source buffer
585
* @param blocking_read Blocking operation flag
586
* @param offset Offset in buffer
587
* @param size Size to read
588
* @param ptr Destination data pointer
589
* @param num_events_in_wait_list Number of events to wait for
590
* @param event_wait_list Events to wait for
591
* @param event Output event
592
* @return OpenCL error code
593
*/
594
public static native int clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer,
595
int blocking_read, long offset, long size, Pointer ptr, int num_events_in_wait_list,
596
cl_event event_wait_list, cl_event event);
597
}
598
599
/**
600
* OpenCL kernel execution
601
*/
602
public static class OpenCLKernel {
603
/**
604
* Create program from source
605
* @param context OpenCL context
606
* @param count Number of source strings
607
* @param strings Source code strings
608
* @param lengths String lengths (null for null-terminated)
609
* @param errcode_ret Error code output
610
* @return Program handle
611
*/
612
public static native cl_program clCreateProgramWithSource(cl_context context, int count,
613
PointerPointer strings, SizeTPointer lengths, IntPointer errcode_ret);
614
615
/**
616
* Build program
617
* @param program Program to build
618
* @param num_devices Number of devices
619
* @param device_list Target devices
620
* @param options Build options
621
* @param pfn_notify Build callback
622
* @param user_data User data for callback
623
* @return OpenCL error code
624
*/
625
public static native int clBuildProgram(cl_program program, int num_devices,
626
cl_device_id device_list, String options, BuildProgramCallbackFunction pfn_notify,
627
Pointer user_data);
628
629
/**
630
* Create kernel from program
631
* @param program Compiled program
632
* @param kernel_name Kernel function name
633
* @param errcode_ret Error code output
634
* @return Kernel handle
635
*/
636
public static native cl_kernel clCreateKernel(cl_program program, String kernel_name,
637
IntPointer errcode_ret);
638
639
/**
640
* Set kernel argument
641
* @param kernel Kernel handle
642
* @param arg_index Argument index
643
* @param arg_size Argument size
644
* @param arg_value Argument value pointer
645
* @return OpenCL error code
646
*/
647
public static native int clSetKernelArg(cl_kernel kernel, int arg_index, long arg_size,
648
Pointer arg_value);
649
650
/**
651
* Enqueue kernel execution
652
* @param command_queue Command queue
653
* @param kernel Kernel to execute
654
* @param work_dim Number of work dimensions
655
* @param global_work_offset Global work offset
656
* @param global_work_size Global work size
657
* @param local_work_size Local work size
658
* @param num_events_in_wait_list Number of events to wait for
659
* @param event_wait_list Events to wait for
660
* @param event Output event
661
* @return OpenCL error code
662
*/
663
public static native int clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel,
664
int work_dim, SizeTPointer global_work_offset, SizeTPointer global_work_size,
665
SizeTPointer local_work_size, int num_events_in_wait_list, cl_event event_wait_list,
666
cl_event event);
667
}
668
```
669
670
## Usage Examples
671
672
### CUDA Vector Addition
673
674
```java
675
import org.bytedeco.cuda.cudart.*;
676
import static org.bytedeco.cuda.global.cudart.*;
677
678
public class CUDAVectorAdd {
679
static {
680
Loader.load(cudart.class);
681
}
682
683
public static void vectorAdd() {
684
try (PointerScope scope = new PointerScope()) {
685
int N = 1024;
686
int size = N * 4; // sizeof(float) * N
687
688
// Host arrays
689
float[] h_A = new float[N];
690
float[] h_B = new float[N];
691
float[] h_C = new float[N];
692
693
// Initialize host arrays
694
for (int i = 0; i < N; i++) {
695
h_A[i] = i;
696
h_B[i] = i * 2;
697
}
698
699
// Device pointers
700
Pointer d_A = new Pointer();
701
Pointer d_B = new Pointer();
702
Pointer d_C = new Pointer();
703
704
// Allocate device memory
705
cudaMalloc(d_A, size);
706
cudaMalloc(d_B, size);
707
cudaMalloc(d_C, size);
708
709
// Copy data to device
710
FloatPointer fp_A = new FloatPointer(h_A);
711
FloatPointer fp_B = new FloatPointer(h_B);
712
FloatPointer fp_C = new FloatPointer(h_C);
713
714
cudaMemcpy(d_A, fp_A, size, cudaMemcpyHostToDevice);
715
cudaMemcpy(d_B, fp_B, size, cudaMemcpyHostToDevice);
716
717
// Launch kernel (this would require a compiled CUDA kernel)
718
// For illustration - actual kernel launch would use CUDA driver API
719
// or require JCuda/JCUDA for higher-level kernel launching
720
721
// Copy result back to host
722
cudaMemcpy(fp_C, d_C, size, cudaMemcpyDeviceToHost);
723
724
// Verify results
725
boolean success = true;
726
for (int i = 0; i < N && success; i++) {
727
if (Math.abs(h_C[i] - (h_A[i] + h_B[i])) > 1e-5) {
728
success = false;
729
System.err.println("Verification failed at index " + i);
730
}
731
}
732
733
if (success) {
734
System.out.println("Vector addition completed successfully!");
735
}
736
737
// Free device memory
738
cudaFree(d_A);
739
cudaFree(d_B);
740
cudaFree(d_C);
741
}
742
}
743
744
public static void deviceInfo() {
745
try (PointerScope scope = new PointerScope()) {
746
IntPointer deviceCount = new IntPointer(1);
747
cudaGetDeviceCount(deviceCount);
748
749
System.out.println("Number of CUDA devices: " + deviceCount.get());
750
751
for (int i = 0; i < deviceCount.get(); i++) {
752
cudaDeviceProp prop = new cudaDeviceProp();
753
cudaGetDeviceProperties(prop, i);
754
755
System.out.println("\nDevice " + i + ":");
756
System.out.println(" Name: " + prop.name().getString());
757
System.out.println(" Compute Capability: " + prop.major() + "." + prop.minor());
758
System.out.println(" Total Global Memory: " + prop.totalGlobalMem() / (1024*1024) + " MB");
759
System.out.println(" Multiprocessors: " + prop.multiProcessorCount());
760
System.out.println(" Max Threads per Block: " + prop.maxThreadsPerBlock());
761
System.out.println(" Warp Size: " + prop.warpSize());
762
}
763
}
764
}
765
}
766
```
767
768
### cuBLAS Matrix Multiplication
769
770
```java
771
import org.bytedeco.cuda.cudart.*;
772
import org.bytedeco.cuda.cublas.*;
773
import static org.bytedeco.cuda.global.cudart.*;
774
import static org.bytedeco.cuda.global.cublas.*;
775
776
public class cuBLASExample {
777
static {
778
Loader.load(cudart.class);
779
Loader.load(cublas.class);
780
}
781
782
public static void matrixMultiplication() {
783
try (PointerScope scope = new PointerScope()) {
784
int M = 3, N = 3, K = 3;
785
786
// Host matrices
787
float[] h_A = {1, 2, 3, 4, 5, 6, 7, 8, 9};
788
float[] h_B = {9, 8, 7, 6, 5, 4, 3, 2, 1};
789
float[] h_C = new float[M * N];
790
791
// Device matrices
792
Pointer d_A = new Pointer();
793
Pointer d_B = new Pointer();
794
Pointer d_C = new Pointer();
795
796
int sizeA = M * K * 4; // sizeof(float)
797
int sizeB = K * N * 4;
798
int sizeC = M * N * 4;
799
800
cudaMalloc(d_A, sizeA);
801
cudaMalloc(d_B, sizeB);
802
cudaMalloc(d_C, sizeC);
803
804
// Copy matrices to device
805
FloatPointer fp_A = new FloatPointer(h_A);
806
FloatPointer fp_B = new FloatPointer(h_B);
807
FloatPointer fp_C = new FloatPointer(h_C);
808
809
cudaMemcpy(d_A, fp_A, sizeA, cudaMemcpyHostToDevice);
810
cudaMemcpy(d_B, fp_B, sizeB, cudaMemcpyHostToDevice);
811
812
// Create cuBLAS handle
813
cublasHandle_t handle = new cublasHandle_t();
814
cublasCreate_v2(handle);
815
816
// Scalars for GEMM
817
FloatPointer alpha = new FloatPointer(1.0f);
818
FloatPointer beta = new FloatPointer(0.0f);
819
820
// Perform matrix multiplication: C = α*A*B + β*C
821
cublasSgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N,
822
M, N, K, alpha,
823
new FloatPointer(d_A), M,
824
new FloatPointer(d_B), K,
825
beta, new FloatPointer(d_C), M);
826
827
// Copy result back to host
828
cudaMemcpy(fp_C, d_C, sizeC, cudaMemcpyDeviceToHost);
829
830
// Print result
831
System.out.println("cuBLAS Matrix multiplication result:");
832
for (int i = 0; i < M; i++) {
833
for (int j = 0; j < N; j++) {
834
System.out.printf("%.1f ", h_C[i * N + j]);
835
}
836
System.out.println();
837
}
838
839
// Cleanup
840
cublasDestroy_v2(handle);
841
cudaFree(d_A);
842
cudaFree(d_B);
843
cudaFree(d_C);
844
}
845
}
846
}
847
```
848
849
### OpenCL Vector Addition
850
851
```java
852
import org.bytedeco.opencl.*;
853
import static org.bytedeco.opencl.global.OpenCL.*;
854
855
public class OpenCLExample {
856
static {
857
Loader.load(OpenCL.class);
858
}
859
860
// OpenCL kernel source code
861
static final String kernelSource =
862
"__kernel void vector_add(__global const float* A, __global const float* B, " +
863
"__global float* C) { " +
864
" int i = get_global_id(0); " +
865
" C[i] = A[i] + B[i]; " +
866
"}";
867
868
public static void vectorAdd() {
869
try (PointerScope scope = new PointerScope()) {
870
int N = 1024;
871
872
// Get platform and device
873
cl_platform_id platform = new cl_platform_id();
874
cl_device_id device = new cl_device_id();
875
IntPointer ret = new IntPointer(1);
876
877
clGetPlatformIDs(1, platform, null);
878
clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, device, null);
879
880
// Create context and command queue
881
cl_context context = clCreateContext(null, 1, device, null, null, ret);
882
cl_command_queue queue = clCreateCommandQueue(context, device, 0, ret);
883
884
// Host data
885
float[] h_A = new float[N];
886
float[] h_B = new float[N];
887
float[] h_C = new float[N];
888
889
for (int i = 0; i < N; i++) {
890
h_A[i] = i;
891
h_B[i] = i * 2;
892
}
893
894
// Create device buffers
895
cl_mem d_A = clCreateBuffer(context, CL_MEM_READ_ONLY, N * 4, null, ret);
896
cl_mem d_B = clCreateBuffer(context, CL_MEM_READ_ONLY, N * 4, null, ret);
897
cl_mem d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * 4, null, ret);
898
899
// Copy data to device
900
FloatPointer fp_A = new FloatPointer(h_A);
901
FloatPointer fp_B = new FloatPointer(h_B);
902
903
clEnqueueWriteBuffer(queue, d_A, CL_TRUE, 0, N * 4, fp_A, 0, null, null);
904
clEnqueueWriteBuffer(queue, d_B, CL_TRUE, 0, N * 4, fp_B, 0, null, null);
905
906
// Create and build program
907
PointerPointer kernelSourcePtr = new PointerPointer(kernelSource);
908
cl_program program = clCreateProgramWithSource(context, 1, kernelSourcePtr, null, ret);
909
clBuildProgram(program, 1, device, null, null, null);
910
911
// Create kernel
912
cl_kernel kernel = clCreateKernel(program, "vector_add", ret);
913
914
// Set kernel arguments
915
clSetKernelArg(kernel, 0, Pointer.sizeof(cl_mem.class), d_A);
916
clSetKernelArg(kernel, 1, Pointer.sizeof(cl_mem.class), d_B);
917
clSetKernelArg(kernel, 2, Pointer.sizeof(cl_mem.class), d_C);
918
919
// Execute kernel
920
SizeTPointer globalWorkSize = new SizeTPointer(N);
921
clEnqueueNDRangeKernel(queue, kernel, 1, null, globalWorkSize, null, 0, null, null);
922
923
// Read result
924
FloatPointer fp_C = new FloatPointer(h_C);
925
clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, N * 4, fp_C, 0, null, null);
926
927
// Verify results
928
boolean success = true;
929
for (int i = 0; i < Math.min(N, 10) && success; i++) {
930
if (Math.abs(h_C[i] - (h_A[i] + h_B[i])) > 1e-5) {
931
success = false;
932
}
933
System.out.printf("C[%d] = %.1f (expected %.1f)\n", i, h_C[i], h_A[i] + h_B[i]);
934
}
935
936
System.out.println(success ? "OpenCL vector addition successful!" : "Verification failed");
937
938
// Cleanup (in reverse order of creation)
939
clReleaseKernel(kernel);
940
clReleaseProgram(program);
941
clReleaseMemObject(d_A);
942
clReleaseMemObject(d_B);
943
clReleaseMemObject(d_C);
944
clReleaseCommandQueue(queue);
945
clReleaseContext(context);
946
}
947
}
948
}
949
```