Tessl Tile for maven/org.bytedeco/javacpp-presets-platform@1.5.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

computer-vision.md gpu-computing.md index.md machine-learning.md multimedia.md scientific-computing.md text-processing.md

docs/gpu-computing.md

# GPU Computing

High-performance GPU computing capabilities through CUDA, OpenCL, and associated libraries for parallel processing and acceleration.

## Capabilities

### CUDA Runtime Operations

NVIDIA CUDA runtime API for GPU device management, memory operations, and kernel execution.

```java { .api }
11
/**
12
 * Device management functions
13
 */
14
public static class CUDADevice {
15
    /**
16
     * Get number of CUDA devices
17
     * @param count Output device count
18
     * @return CUDA error code
19
     */
20
    public static native int cudaGetDeviceCount(IntPointer count);
21
    
22
    /**
23
     * Set current CUDA device
24
     * @param device Device index to use
25
     * @return CUDA error code
26
     */
27
    public static native int cudaSetDevice(int device);
28
    
29
    /**
30
     * Get current CUDA device
31
     * @param device Output current device index
32
     * @return CUDA error code
33
     */
34
    public static native int cudaGetDevice(IntPointer device);
35
    
36
    /**
37
     * Get device properties
38
     * @param prop Output device properties
39
     * @param device Device index
40
     * @return CUDA error code
41
     */
42
    public static native int cudaGetDeviceProperties(cudaDeviceProp prop, int device);
43
    
44
    /**
45
     * Synchronize with current device
46
     * @return CUDA error code
47
     */
48
    public static native int cudaDeviceSynchronize();
49
    
50
    /**
51
     * Reset current device
52
     * @return CUDA error code
53
     */
54
    public static native int cudaDeviceReset();
55
}
56

57
/**
58
 * Memory management functions
59
 */
60
public static class CUDAMemory {
61
    /**
62
     * Allocate device memory
63
     * @param devPtr Output pointer to allocated memory
64
     * @param size Size in bytes to allocate
65
     * @return CUDA error code
66
     */
67
    public static native int cudaMalloc(PointerPointer devPtr, long size);
68
    
69
    /**
70
     * Free device memory
71
     * @param devPtr Device pointer to free
72
     * @return CUDA error code
73
     */
74
    public static native int cudaFree(Pointer devPtr);
75
    
76
    /**
77
     * Copy memory between host and device
78
     * @param dst Destination pointer
79
     * @param src Source pointer
80
     * @param count Size in bytes
81
     * @param kind Copy direction (cudaMemcpyHostToDevice, etc.)
82
     * @return CUDA error code
83
     */
84
    public static native int cudaMemcpy(Pointer dst, Pointer src, long count, int kind);
85
    
86
    /**
87
     * Asynchronous memory copy
88
     * @param dst Destination pointer
89
     * @param src Source pointer
90
     * @param count Size in bytes
91
     * @param kind Copy direction
92
     * @param stream CUDA stream
93
     * @return CUDA error code
94
     */
95
    public static native int cudaMemcpyAsync(Pointer dst, Pointer src, long count,
96
        int kind, cudaStream_t stream);
97
    
98
    /**
99
     * Set device memory to value
100
     * @param devPtr Device pointer
101
     * @param value Value to set (byte)
102
     * @param count Number of bytes
103
     * @return CUDA error code
104
     */
105
    public static native int cudaMemset(Pointer devPtr, int value, long count);
106
    
107
    /**
108
     * Allocate page-locked host memory
109
     * @param ptr Output pointer to allocated host memory
110
     * @param size Size in bytes
111
     * @return CUDA error code
112
     */
113
    public static native int cudaMallocHost(PointerPointer ptr, long size);
114
    
115
    /**
116
     * Free page-locked host memory
117
     * @param ptr Host pointer to free
118
     * @return CUDA error code
119
     */
120
    public static native int cudaFreeHost(Pointer ptr);
121
}
122

123
/**
124
 * Stream management for asynchronous operations
125
 */
126
public static class CUDAStream {
127
    /**
128
     * Create CUDA stream
129
     * @param pStream Output stream handle
130
     * @return CUDA error code
131
     */
132
    public static native int cudaStreamCreate(cudaStream_t pStream);
133
    
134
    /**
135
     * Destroy CUDA stream
136
     * @param stream Stream to destroy
137
     * @return CUDA error code
138
     */
139
    public static native int cudaStreamDestroy(cudaStream_t stream);
140
    
141
    /**
142
     * Synchronize with stream
143
     * @param stream Stream to synchronize
144
     * @return CUDA error code
145
     */
146
    public static native int cudaStreamSynchronize(cudaStream_t stream);
147
    
148
    /**
149
     * Query stream status
150
     * @param stream Stream to query
151
     * @return CUDA error code (cudaSuccess if complete)
152
     */
153
    public static native int cudaStreamQuery(cudaStream_t stream);
154
}
155

156
/**
157
 * CUDA device properties structure
158
 */
159
public class cudaDeviceProp extends Pointer {
160
    /** Device name */
161
    public native String name();
162
    
163
    /** Total global memory in bytes */
164
    public native long totalGlobalMem();
165
    
166
    /** Shared memory per block */
167
    public native long sharedMemPerBlock();
168
    
169
    /** Number of registers per block */
170
    public native int regsPerBlock();
171
    
172
    /** Warp size */
173
    public native int warpSize();
174
    
175
    /** Maximum threads per block */
176
    public native int maxThreadsPerBlock();
177
    
178
    /** Maximum block dimensions */
179
    public native IntPointer maxThreadsDim();
180
    
181
    /** Maximum grid dimensions */
182
    public native IntPointer maxGridSize();
183
    
184
    /** Compute capability major version */
185
    public native int major();
186
    
187
    /** Compute capability minor version */
188
    public native int minor();
189
    
190
    /** Number of multiprocessors */
191
    public native int multiProcessorCount();
192
}
193
```

### cuBLAS Operations

CUDA Basic Linear Algebra Subprograms for GPU-accelerated linear algebra.

```java { .api }
200
/**
201
 * cuBLAS context and initialization
202
 */
203
public static class cuBLASContext {
204
    /**
205
     * Create cuBLAS handle
206
     * @param handle Output handle
207
     * @return cuBLAS status
208
     */
209
    public static native int cublasCreate_v2(cublasHandle_t handle);
210
    
211
    /**
212
     * Destroy cuBLAS handle
213
     * @param handle Handle to destroy
214
     * @return cuBLAS status
215
     */
216
    public static native int cublasDestroy_v2(cublasHandle_t handle);
217
    
218
    /**
219
     * Set cuBLAS stream
220
     * @param handle cuBLAS handle
221
     * @param streamId CUDA stream
222
     * @return cuBLAS status
223
     */
224
    public static native int cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId);
225
}
226

227
/**
228
 * cuBLAS Level 3 operations (matrix-matrix)
229
 */
230
public static class cuBLASLevel3 {
231
    /**
232
     * Single precision matrix multiplication: C = α*A*B + β*C
233
     * @param handle cuBLAS handle
234
     * @param transa Transpose operation for A
235
     * @param transb Transpose operation for B
236
     * @param m Number of rows in A and C
237
     * @param n Number of columns in B and C
238
     * @param k Number of columns in A and rows in B
239
     * @param alpha Scalar α
240
     * @param A Matrix A on device
241
     * @param lda Leading dimension of A
242
     * @param B Matrix B on device
243
     * @param ldb Leading dimension of B
244
     * @param beta Scalar β
245
     * @param C Matrix C on device
246
     * @param ldc Leading dimension of C
247
     * @return cuBLAS status
248
     */
249
    public static native int cublasSgemm_v2(cublasHandle_t handle, int transa, int transb,
250
        int m, int n, int k, FloatPointer alpha, FloatPointer A, int lda,
251
        FloatPointer B, int ldb, FloatPointer beta, FloatPointer C, int ldc);
252
    
253
    /**
254
     * Double precision matrix multiplication
255
     */
256
    public static native int cublasDgemm_v2(cublasHandle_t handle, int transa, int transb,
257
        int m, int n, int k, DoublePointer alpha, DoublePointer A, int lda,
258
        DoublePointer B, int ldb, DoublePointer beta, DoublePointer C, int ldc);
259
    
260
    /**
261
     * Batched matrix multiplication
262
     * @param handle cuBLAS handle
263
     * @param transa Transpose operation for A
264
     * @param transb Transpose operation for B
265
     * @param m Number of rows in A and C
266
     * @param n Number of columns in B and C
267
     * @param k Number of columns in A and rows in B
268
     * @param alpha Scalar α
269
     * @param Aarray Array of pointers to matrices A
270
     * @param lda Leading dimension of A
271
     * @param Barray Array of pointers to matrices B
272
     * @param ldb Leading dimension of B
273
     * @param beta Scalar β
274
     * @param Carray Array of pointers to matrices C
275
     * @param ldc Leading dimension of C
276
     * @param batchCount Number of matrices
277
     * @return cuBLAS status
278
     */
279
    public static native int cublasSgemmBatched(cublasHandle_t handle, int transa, int transb,
280
        int m, int n, int k, FloatPointer alpha, PointerPointer Aarray, int lda,
281
        PointerPointer Barray, int ldb, FloatPointer beta, PointerPointer Carray,
282
        int ldc, int batchCount);
283
}
284
```

### cuDNN Deep Learning

CUDA Deep Neural Network library for accelerated deep learning operations.

```java { .api }
291
/**
292
 * cuDNN context management
293
 */
294
public static class cuDNNContext {
295
    /**
296
     * Create cuDNN handle
297
     * @param handle Output handle
298
     * @return cuDNN status
299
     */
300
    public static native int cudnnCreate(cudnnHandle_t handle);
301
    
302
    /**
303
     * Destroy cuDNN handle
304
     * @param handle Handle to destroy
305
     * @return cuDNN status
306
     */
307
    public static native int cudnnDestroy(cudnnHandle_t handle);
308
    
309
    /**
310
     * Set cuDNN stream
311
     * @param handle cuDNN handle
312
     * @param stream CUDA stream
313
     * @return cuDNN status
314
     */
315
    public static native int cudnnSetStream(cudnnHandle_t handle, cudaStream_t stream);
316
}
317

318
/**
319
 * Tensor descriptor management
320
 */
321
public static class cuDNNTensor {
322
    /**
323
     * Create tensor descriptor
324
     * @param tensorDesc Output tensor descriptor
325
     * @return cuDNN status
326
     */
327
    public static native int cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
328
    
329
    /**
330
     * Set tensor descriptor
331
     * @param tensorDesc Tensor descriptor
332
     * @param format Data format (NCHW, NHWC, etc.)
333
     * @param dataType Data type (float, half, etc.)
334
     * @param nbDims Number of dimensions
335
     * @param dimA Dimension sizes
336
     * @param strideA Stride sizes
337
     * @return cuDNN status
338
     */
339
    public static native int cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
340
        int dataType, int nbDims, IntPointer dimA, IntPointer strideA);
341
    
342
    /**
343
     * Destroy tensor descriptor
344
     * @param tensorDesc Tensor descriptor to destroy
345
     * @return cuDNN status
346
     */
347
    public static native int cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
348
}
349

350
/**
351
 * Convolution operations
352
 */
353
public static class cuDNNConvolution {
354
    /**
355
     * Create convolution descriptor
356
     * @param convDesc Output convolution descriptor
357
     * @return cuDNN status
358
     */
359
    public static native int cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc);
360
    
361
    /**
362
     * Set convolution descriptor
363
     * @param convDesc Convolution descriptor  
364
     * @param arrayLength Number of dimensions
365
     * @param padA Padding for each dimension
366
     * @param filterStrideA Stride for each dimension
367
     * @param dilationA Dilation for each dimension
368
     * @param mode Convolution mode
369
     * @param computeType Computation data type
370
     * @return cuDNN status
371
     */
372
    public static native int cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,
373
        int arrayLength, IntPointer padA, IntPointer filterStrideA, IntPointer dilationA,
374
        int mode, int computeType);
375
    
376
    /**
377
     * Forward convolution
378
     * @param handle cuDNN handle
379
     * @param alpha Scaling factor for input
380
     * @param xDesc Input tensor descriptor
381
     * @param x Input tensor data
382
     * @param wDesc Filter tensor descriptor
383
     * @param w Filter data
384
     * @param convDesc Convolution descriptor
385
     * @param algo Convolution algorithm
386
     * @param workSpace Workspace memory
387
     * @param workSpaceSizeInBytes Workspace size
388
     * @param beta Scaling factor for output
389
     * @param yDesc Output tensor descriptor
390
     * @param y Output tensor data
391
     * @return cuDNN status
392
     */
393
    public static native int cudnnConvolutionForward(cudnnHandle_t handle,
394
        Pointer alpha, cudnnTensorDescriptor_t xDesc, Pointer x,
395
        cudnnFilterDescriptor_t wDesc, Pointer w, cudnnConvolutionDescriptor_t convDesc,
396
        int algo, Pointer workSpace, long workSpaceSizeInBytes, Pointer beta,
397
        cudnnTensorDescriptor_t yDesc, Pointer y);
398
}
399

400
/**
401
 * Activation functions
402
 */
403
public static class cuDNNActivation {
404
    /**
405
     * Create activation descriptor
406
     * @param activationDesc Output activation descriptor
407
     * @return cuDNN status
408
     */
409
    public static native int cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
410
    
411
    /**
412
     * Set activation descriptor
413
     * @param activationDesc Activation descriptor
414
     * @param mode Activation mode (sigmoid, relu, tanh, etc.)
415
     * @param reluNanOpt NaN propagation mode
416
     * @param coef Coefficient for some activation modes
417
     * @return cuDNN status
418
     */
419
    public static native int cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
420
        int mode, int reluNanOpt, double coef);
421
    
422
    /**
423
     * Forward activation
424
     * @param handle cuDNN handle
425
     * @param activationDesc Activation descriptor
426
     * @param alpha Scaling factor for input
427
     * @param xDesc Input tensor descriptor
428
     * @param x Input tensor data
429
     * @param beta Scaling factor for output
430
     * @param yDesc Output tensor descriptor
431
     * @param y Output tensor data
432
     * @return cuDNN status
433
     */
434
    public static native int cudnnActivationForward(cudnnHandle_t handle,
435
        cudnnActivationDescriptor_t activationDesc, Pointer alpha,
436
        cudnnTensorDescriptor_t xDesc, Pointer x, Pointer beta,
437
        cudnnTensorDescriptor_t yDesc, Pointer y);
438
}
439
```

### OpenCL Cross-Platform Computing

OpenCL API for cross-platform parallel computing across CPUs, GPUs, and other devices.

```java { .api }
446
/**
447
 * OpenCL platform and device management
448
 */
449
public static class OpenCLPlatform {
450
    /**
451
     * Get platform IDs
452
     * @param num_entries Number of platform entries
453
     * @param platforms Output platform array
454
     * @param num_platforms Actual number of platforms found
455
     * @return OpenCL error code
456
     */
457
    public static native int clGetPlatformIDs(int num_entries, cl_platform_id platforms,
458
        IntPointer num_platforms);
459
    
460
    /**
461
     * Get platform information
462
     * @param platform Platform ID
463
     * @param param_name Information parameter
464
     * @param param_value_size Size of output buffer
465
     * @param param_value Output buffer
466
     * @param param_value_size_ret Actual size of information
467
     * @return OpenCL error code
468
     */
469
    public static native int clGetPlatformInfo(cl_platform_id platform, int param_name,
470
        long param_value_size, Pointer param_value, SizeTPointer param_value_size_ret);
471
    
472
    /**
473
     * Get device IDs for platform
474
     * @param platform Platform ID
475
     * @param device_type Device type filter (GPU, CPU, ALL, etc.)
476
     * @param num_entries Number of device entries
477
     * @param devices Output device array
478
     * @param num_devices Actual number of devices found
479
     * @return OpenCL error code
480
     */
481
    public static native int clGetDeviceIDs(cl_platform_id platform, long device_type,
482
        int num_entries, cl_device_id devices, IntPointer num_devices);
483
    
484
    /**
485
     * Get device information
486
     * @param device Device ID
487
     * @param param_name Information parameter
488
     * @param param_value_size Size of output buffer
489
     * @param param_value Output buffer
490
     * @param param_value_size_ret Actual size of information
491
     * @return OpenCL error code
492
     */
493
    public static native int clGetDeviceInfo(cl_device_id device, int param_name,
494
        long param_value_size, Pointer param_value, SizeTPointer param_value_size_ret);
495
}
496

497
/**
498
 * OpenCL context and command queue management
499
 */
500
public static class OpenCLContext {
501
    /**
502
     * Create OpenCL context
503
     * @param properties Context properties
504
     * @param num_devices Number of devices
505
     * @param devices Device array
506
     * @param pfn_notify Notification callback
507
     * @param user_data User data for callback
508
     * @param errcode_ret Error code output
509
     * @return Context handle
510
     */
511
    public static native cl_context clCreateContext(cl_context_properties properties,
512
        int num_devices, cl_device_id devices, CreateContextCallbackFunction pfn_notify,
513
        Pointer user_data, IntPointer errcode_ret);
514
    
515
    /**
516
     * Release context
517
     * @param context Context to release
518
     * @return OpenCL error code
519
     */
520
    public static native int clReleaseContext(cl_context context);
521
    
522
    /**
523
     * Create command queue
524
     * @param context OpenCL context
525
     * @param device Target device
526
     * @param properties Queue properties
527
     * @param errcode_ret Error code output
528
     * @return Command queue handle
529
     */
530
    public static native cl_command_queue clCreateCommandQueue(cl_context context,
531
        cl_device_id device, long properties, IntPointer errcode_ret);
532
    
533
    /**
534
     * Release command queue
535
     * @param command_queue Queue to release
536
     * @return OpenCL error code
537
     */
538
    public static native int clReleaseCommandQueue(cl_command_queue command_queue);
539
}
540

541
/**
542
 * OpenCL memory management
543
 */
544
public static class OpenCLMemory {
545
    /**
546
     * Create buffer object
547
     * @param context OpenCL context
548
     * @param flags Memory flags (read/write permissions, etc.)
549
     * @param size Buffer size in bytes
550
     * @param host_ptr Host memory pointer (optional)
551
     * @param errcode_ret Error code output
552
     * @return Memory object handle
553
     */
554
    public static native cl_mem clCreateBuffer(cl_context context, long flags, long size,
555
        Pointer host_ptr, IntPointer errcode_ret);
556
    
557
    /**
558
     * Release memory object
559
     * @param memobj Memory object to release
560
     * @return OpenCL error code
561
     */
562
    public static native int clReleaseMemObject(cl_mem memobj);
563
    
564
    /**
565
     * Enqueue buffer write operation
566
     * @param command_queue Command queue
567
     * @param buffer Target buffer
568
     * @param blocking_write Blocking operation flag
569
     * @param offset Offset in buffer
570
     * @param size Size to write
571
     * @param ptr Source data pointer
572
     * @param num_events_in_wait_list Number of events to wait for
573
     * @param event_wait_list Events to wait for
574
     * @param event Output event
575
     * @return OpenCL error code
576
     */
577
    public static native int clEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer,
578
        int blocking_write, long offset, long size, Pointer ptr, int num_events_in_wait_list,
579
        cl_event event_wait_list, cl_event event);
580
    
581
    /**
582
     * Enqueue buffer read operation
583
     * @param command_queue Command queue
584
     * @param buffer Source buffer
585
     * @param blocking_read Blocking operation flag
586
     * @param offset Offset in buffer
587
     * @param size Size to read
588
     * @param ptr Destination data pointer
589
     * @param num_events_in_wait_list Number of events to wait for
590
     * @param event_wait_list Events to wait for
591
     * @param event Output event
592
     * @return OpenCL error code
593
     */
594
    public static native int clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer,
595
        int blocking_read, long offset, long size, Pointer ptr, int num_events_in_wait_list,
596
        cl_event event_wait_list, cl_event event);
597
}
598

599
/**
600
 * OpenCL kernel execution
601
 */
602
public static class OpenCLKernel {
603
    /**
604
     * Create program from source
605
     * @param context OpenCL context
606
     * @param count Number of source strings
607
     * @param strings Source code strings
608
     * @param lengths String lengths (null for null-terminated)
609
     * @param errcode_ret Error code output
610
     * @return Program handle
611
     */
612
    public static native cl_program clCreateProgramWithSource(cl_context context, int count,
613
        PointerPointer strings, SizeTPointer lengths, IntPointer errcode_ret);
614
    
615
    /**
616
     * Build program
617
     * @param program Program to build
618
     * @param num_devices Number of devices
619
     * @param device_list Target devices
620
     * @param options Build options
621
     * @param pfn_notify Build callback
622
     * @param user_data User data for callback
623
     * @return OpenCL error code
624
     */
625
    public static native int clBuildProgram(cl_program program, int num_devices,
626
        cl_device_id device_list, String options, BuildProgramCallbackFunction pfn_notify,
627
        Pointer user_data);
628
    
629
    /**
630
     * Create kernel from program
631
     * @param program Compiled program
632
     * @param kernel_name Kernel function name
633
     * @param errcode_ret Error code output
634
     * @return Kernel handle
635
     */
636
    public static native cl_kernel clCreateKernel(cl_program program, String kernel_name,
637
        IntPointer errcode_ret);
638
    
639
    /**
640
     * Set kernel argument
641
     * @param kernel Kernel handle
642
     * @param arg_index Argument index
643
     * @param arg_size Argument size
644
     * @param arg_value Argument value pointer
645
     * @return OpenCL error code
646
     */
647
    public static native int clSetKernelArg(cl_kernel kernel, int arg_index, long arg_size,
648
        Pointer arg_value);
649
    
650
    /**
651
     * Enqueue kernel execution
652
     * @param command_queue Command queue
653
     * @param kernel Kernel to execute
654
     * @param work_dim Number of work dimensions
655
     * @param global_work_offset Global work offset
656
     * @param global_work_size Global work size
657
     * @param local_work_size Local work size
658
     * @param num_events_in_wait_list Number of events to wait for
659
     * @param event_wait_list Events to wait for
660
     * @param event Output event
661
     * @return OpenCL error code
662
     */
663
    public static native int clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel,
664
        int work_dim, SizeTPointer global_work_offset, SizeTPointer global_work_size,
665
        SizeTPointer local_work_size, int num_events_in_wait_list, cl_event event_wait_list,
666
        cl_event event);
667
}
668
```

## Usage Examples

### CUDA Vector Addition

```java
675
import org.bytedeco.cuda.cudart.*;
676
import static org.bytedeco.cuda.global.cudart.*;
677

678
public class CUDAVectorAdd {
679
    static {
680
        Loader.load(cudart.class);
681
    }
682
    
683
    public static void vectorAdd() {
684
        try (PointerScope scope = new PointerScope()) {
685
            int N = 1024;
686
            int size = N * 4; // sizeof(float) * N
687
            
688
            // Host arrays
689
            float[] h_A = new float[N];
690
            float[] h_B = new float[N];
691
            float[] h_C = new float[N];
692
            
693
            // Initialize host arrays
694
            for (int i = 0; i < N; i++) {
695
                h_A[i] = i;
696
                h_B[i] = i * 2;
697
            }
698
            
699
            // Device pointers
700
            Pointer d_A = new Pointer();
701
            Pointer d_B = new Pointer();
702
            Pointer d_C = new Pointer();
703
            
704
            // Allocate device memory
705
            cudaMalloc(d_A, size);
706
            cudaMalloc(d_B, size);
707
            cudaMalloc(d_C, size);
708
            
709
            // Copy data to device
710
            FloatPointer fp_A = new FloatPointer(h_A);
711
            FloatPointer fp_B = new FloatPointer(h_B);
712
            FloatPointer fp_C = new FloatPointer(h_C);
713
            
714
            cudaMemcpy(d_A, fp_A, size, cudaMemcpyHostToDevice);
715
            cudaMemcpy(d_B, fp_B, size, cudaMemcpyHostToDevice);
716
            
717
            // Launch kernel (this would require a compiled CUDA kernel)
718
            // For illustration - actual kernel launch would use CUDA driver API
719
            // or require JCuda/JCUDA for higher-level kernel launching
720
            
721
            // Copy result back to host
722
            cudaMemcpy(fp_C, d_C, size, cudaMemcpyDeviceToHost);
723
            
724
            // Verify results
725
            boolean success = true;
726
            for (int i = 0; i < N && success; i++) {
727
                if (Math.abs(h_C[i] - (h_A[i] + h_B[i])) > 1e-5) {
728
                    success = false;
729
                    System.err.println("Verification failed at index " + i);
730
                }
731
            }
732
            
733
            if (success) {
734
                System.out.println("Vector addition completed successfully!");
735
            }
736
            
737
            // Free device memory
738
            cudaFree(d_A);
739
            cudaFree(d_B);
740
            cudaFree(d_C);
741
        }
742
    }
743
    
744
    public static void deviceInfo() {
745
        try (PointerScope scope = new PointerScope()) {
746
            IntPointer deviceCount = new IntPointer(1);
747
            cudaGetDeviceCount(deviceCount);
748
            
749
            System.out.println("Number of CUDA devices: " + deviceCount.get());
750
            
751
            for (int i = 0; i < deviceCount.get(); i++) {
752
                cudaDeviceProp prop = new cudaDeviceProp();
753
                cudaGetDeviceProperties(prop, i);
754
                
755
                System.out.println("\nDevice " + i + ":");
756
                System.out.println("  Name: " + prop.name().getString());
757
                System.out.println("  Compute Capability: " + prop.major() + "." + prop.minor());
758
                System.out.println("  Total Global Memory: " + prop.totalGlobalMem() / (1024*1024) + " MB");
759
                System.out.println("  Multiprocessors: " + prop.multiProcessorCount());
760
                System.out.println("  Max Threads per Block: " + prop.maxThreadsPerBlock());
761
                System.out.println("  Warp Size: " + prop.warpSize());
762
            }
763
        }
764
    }
765
}
766
```

### cuBLAS Matrix Multiplication

```java
771
import org.bytedeco.cuda.cudart.*;
772
import org.bytedeco.cuda.cublas.*;
773
import static org.bytedeco.cuda.global.cudart.*;
774
import static org.bytedeco.cuda.global.cublas.*;
775

776
public class cuBLASExample {
777
    static {
778
        Loader.load(cudart.class);
779
        Loader.load(cublas.class);
780
    }
781
    
782
    public static void matrixMultiplication() {
783
        try (PointerScope scope = new PointerScope()) {
784
            int M = 3, N = 3, K = 3;
785
            
786
            // Host matrices
787
            float[] h_A = {1, 2, 3, 4, 5, 6, 7, 8, 9};
788
            float[] h_B = {9, 8, 7, 6, 5, 4, 3, 2, 1};
789
            float[] h_C = new float[M * N];
790
            
791
            // Device matrices
792
            Pointer d_A = new Pointer();
793
            Pointer d_B = new Pointer();
794
            Pointer d_C = new Pointer();
795
            
796
            int sizeA = M * K * 4; // sizeof(float)
797
            int sizeB = K * N * 4;
798
            int sizeC = M * N * 4;
799
            
800
            cudaMalloc(d_A, sizeA);
801
            cudaMalloc(d_B, sizeB);
802
            cudaMalloc(d_C, sizeC);
803
            
804
            // Copy matrices to device
805
            FloatPointer fp_A = new FloatPointer(h_A);
806
            FloatPointer fp_B = new FloatPointer(h_B);
807
            FloatPointer fp_C = new FloatPointer(h_C);
808
            
809
            cudaMemcpy(d_A, fp_A, sizeA, cudaMemcpyHostToDevice);
810
            cudaMemcpy(d_B, fp_B, sizeB, cudaMemcpyHostToDevice);
811
            
812
            // Create cuBLAS handle
813
            cublasHandle_t handle = new cublasHandle_t();
814
            cublasCreate_v2(handle);
815
            
816
            // Scalars for GEMM
817
            FloatPointer alpha = new FloatPointer(1.0f);
818
            FloatPointer beta = new FloatPointer(0.0f);
819
            
820
            // Perform matrix multiplication: C = α*A*B + β*C
821
            cublasSgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N,
822
                M, N, K, alpha,
823
                new FloatPointer(d_A), M,
824
                new FloatPointer(d_B), K,
825
                beta, new FloatPointer(d_C), M);
826
            
827
            // Copy result back to host
828
            cudaMemcpy(fp_C, d_C, sizeC, cudaMemcpyDeviceToHost);
829
            
830
            // Print result
831
            System.out.println("cuBLAS Matrix multiplication result:");
832
            for (int i = 0; i < M; i++) {
833
                for (int j = 0; j < N; j++) {
834
                    System.out.printf("%.1f ", h_C[i * N + j]);
835
                }
836
                System.out.println();
837
            }
838
            
839
            // Cleanup
840
            cublasDestroy_v2(handle);
841
            cudaFree(d_A);
842
            cudaFree(d_B);
843
            cudaFree(d_C);
844
        }
845
    }
846
}
847
```

### OpenCL Vector Addition

```java
852
import org.bytedeco.opencl.*;
853
import static org.bytedeco.opencl.global.OpenCL.*;
854

855
public class OpenCLExample {
856
    static {
857
        Loader.load(OpenCL.class);
858
    }
859
    
860
    // OpenCL kernel source code
861
    static final String kernelSource = 
862
        "__kernel void vector_add(__global const float* A, __global const float* B, " +
863
        "__global float* C) { " +
864
        "    int i = get_global_id(0); " +
865
        "    C[i] = A[i] + B[i]; " +
866
        "}";
867
    
868
    public static void vectorAdd() {
869
        try (PointerScope scope = new PointerScope()) {
870
            int N = 1024;
871
            
872
            // Get platform and device
873
            cl_platform_id platform = new cl_platform_id();
874
            cl_device_id device = new cl_device_id();
875
            IntPointer ret = new IntPointer(1);
876
            
877
            clGetPlatformIDs(1, platform, null);
878
            clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, device, null);
879
            
880
            // Create context and command queue
881
            cl_context context = clCreateContext(null, 1, device, null, null, ret);
882
            cl_command_queue queue = clCreateCommandQueue(context, device, 0, ret);
883
            
884
            // Host data
885
            float[] h_A = new float[N];
886
            float[] h_B = new float[N];
887
            float[] h_C = new float[N];
888
            
889
            for (int i = 0; i < N; i++) {
890
                h_A[i] = i;
891
                h_B[i] = i * 2;
892
            }
893
            
894
            // Create device buffers
895
            cl_mem d_A = clCreateBuffer(context, CL_MEM_READ_ONLY, N * 4, null, ret);
896
            cl_mem d_B = clCreateBuffer(context, CL_MEM_READ_ONLY, N * 4, null, ret);
897
            cl_mem d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * 4, null, ret);
898
            
899
            // Copy data to device
900
            FloatPointer fp_A = new FloatPointer(h_A);
901
            FloatPointer fp_B = new FloatPointer(h_B);
902
            
903
            clEnqueueWriteBuffer(queue, d_A, CL_TRUE, 0, N * 4, fp_A, 0, null, null);
904
            clEnqueueWriteBuffer(queue, d_B, CL_TRUE, 0, N * 4, fp_B, 0, null, null);
905
            
906
            // Create and build program
907
            PointerPointer kernelSourcePtr = new PointerPointer(kernelSource);
908
            cl_program program = clCreateProgramWithSource(context, 1, kernelSourcePtr, null, ret);
909
            clBuildProgram(program, 1, device, null, null, null);
910
            
911
            // Create kernel
912
            cl_kernel kernel = clCreateKernel(program, "vector_add", ret);
913
            
914
            // Set kernel arguments
915
            clSetKernelArg(kernel, 0, Pointer.sizeof(cl_mem.class), d_A);
916
            clSetKernelArg(kernel, 1, Pointer.sizeof(cl_mem.class), d_B);
917
            clSetKernelArg(kernel, 2, Pointer.sizeof(cl_mem.class), d_C);
918
            
919
            // Execute kernel
920
            SizeTPointer globalWorkSize = new SizeTPointer(N);
921
            clEnqueueNDRangeKernel(queue, kernel, 1, null, globalWorkSize, null, 0, null, null);
922
            
923
            // Read result
924
            FloatPointer fp_C = new FloatPointer(h_C);
925
            clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, N * 4, fp_C, 0, null, null);
926
            
927
            // Verify results
928
            boolean success = true;
929
            for (int i = 0; i < Math.min(N, 10) && success; i++) {
930
                if (Math.abs(h_C[i] - (h_A[i] + h_B[i])) > 1e-5) {
931
                    success = false;
932
                }
933
                System.out.printf("C[%d] = %.1f (expected %.1f)\n", i, h_C[i], h_A[i] + h_B[i]);
934
            }
935
            
936
            System.out.println(success ? "OpenCL vector addition successful!" : "Verification failed");
937
            
938
            // Cleanup (in reverse order of creation)
939
            clReleaseKernel(kernel);
940
            clReleaseProgram(program);
941
            clReleaseMemObject(d_A);
942
            clReleaseMemObject(d_B);
943
            clReleaseMemObject(d_C);
944
            clReleaseCommandQueue(queue);
945
            clReleaseContext(context);
946
        }
947
    }
948
}
949
```

Version

Tile

Files

gpu-computing.md docs/

gpu-computing.mddocs/