Cross-platform Java bindings for 60+ native C/C++ libraries, including OpenCV, FFmpeg, PyTorch, TensorFlow, and scientific computing libraries.

High-performance GPU computing capabilities through CUDA, OpenCL, and associated libraries for parallel processing and acceleration.
NVIDIA CUDA runtime API for GPU device management, memory operations, and kernel execution.
/**
 * Device management functions from the CUDA runtime API (cudart).
 * Every method returns a CUDA error code as int; 0 (cudaSuccess) means success,
 * so return values should always be checked by callers.
 */
public static class CUDADevice {
/**
 * Get number of CUDA devices
 * @param count Output device count (IntPointer with capacity of at least 1)
 * @return CUDA error code
 */
public static native int cudaGetDeviceCount(IntPointer count);
/**
 * Set current CUDA device for the calling host thread
 * @param device Device index to use (0-based)
 * @return CUDA error code
 */
public static native int cudaSetDevice(int device);
/**
 * Get current CUDA device for the calling host thread
 * @param device Output current device index
 * @return CUDA error code
 */
public static native int cudaGetDevice(IntPointer device);
/**
 * Get device properties (name, memory sizes, compute capability, limits)
 * @param prop Output device properties
 * @param device Device index
 * @return CUDA error code
 */
public static native int cudaGetDeviceProperties(cudaDeviceProp prop, int device);
/**
 * Block the host until all previously issued work on the current device completes.
 * Also surfaces asynchronous kernel-execution errors.
 * @return CUDA error code
 */
public static native int cudaDeviceSynchronize();
/**
 * Destroy all allocations and reset all state on the current device.
 * Invalidates every device pointer and handle created on it.
 * @return CUDA error code
 */
public static native int cudaDeviceReset();
}
/**
 * Memory management functions from the CUDA runtime API.
 * All methods return a CUDA error code (0 == cudaSuccess).
 */
public static class CUDAMemory {
/**
 * Allocate device memory
 * @param devPtr Output pointer to allocated memory
 * @param size Size in bytes to allocate
 * @return CUDA error code
 */
public static native int cudaMalloc(PointerPointer devPtr, long size);
/**
 * Free device memory previously allocated with cudaMalloc
 * @param devPtr Device pointer to free
 * @return CUDA error code
 */
public static native int cudaFree(Pointer devPtr);
/**
 * Copy memory between host and device (synchronous — blocks until done)
 * @param dst Destination pointer
 * @param src Source pointer
 * @param count Size in bytes
 * @param kind Copy direction (cudaMemcpyHostToDevice, etc.)
 * @return CUDA error code
 */
public static native int cudaMemcpy(Pointer dst, Pointer src, long count, int kind);
/**
 * Asynchronous memory copy enqueued on a stream. Host memory should be
 * page-locked (see cudaMallocHost) for the copy to actually be asynchronous.
 * @param dst Destination pointer
 * @param src Source pointer
 * @param count Size in bytes
 * @param kind Copy direction
 * @param stream CUDA stream the copy is enqueued on
 * @return CUDA error code
 */
public static native int cudaMemcpyAsync(Pointer dst, Pointer src, long count,
int kind, cudaStream_t stream);
/**
 * Set device memory to a value. Byte-wise fill: only the lowest byte of
 * value is used, so this is only suitable for 0-fills of float/int buffers.
 * @param devPtr Device pointer
 * @param value Value to set (byte)
 * @param count Number of bytes
 * @return CUDA error code
 */
public static native int cudaMemset(Pointer devPtr, int value, long count);
/**
 * Allocate page-locked (pinned) host memory — required for truly
 * asynchronous transfers and typically faster for host/device copies
 * @param ptr Output pointer to allocated host memory
 * @param size Size in bytes
 * @return CUDA error code
 */
public static native int cudaMallocHost(PointerPointer ptr, long size);
/**
 * Free page-locked host memory allocated with cudaMallocHost
 * @param ptr Host pointer to free
 * @return CUDA error code
 */
public static native int cudaFreeHost(Pointer ptr);
}
/**
 * Stream management for asynchronous operations. Work enqueued on different
 * streams may overlap; the default stream (0) synchronizes with all others
 * under legacy semantics, so explicit streams are preferred for overlap.
 */
public static class CUDAStream {
/**
 * Create CUDA stream
 * @param pStream Output stream handle
 * @return CUDA error code
 */
public static native int cudaStreamCreate(cudaStream_t pStream);
/**
 * Destroy CUDA stream (pending work in the stream still completes)
 * @param stream Stream to destroy
 * @return CUDA error code
 */
public static native int cudaStreamDestroy(cudaStream_t stream);
/**
 * Block the host until all work enqueued on the stream has completed
 * @param stream Stream to synchronize
 * @return CUDA error code
 */
public static native int cudaStreamSynchronize(cudaStream_t stream);
/**
 * Query stream status without blocking
 * @param stream Stream to query
 * @return CUDA error code (cudaSuccess if all work complete)
 */
public static native int cudaStreamQuery(cudaStream_t stream);
}
/**
 * CUDA device properties structure (read-only view filled in by
 * cudaGetDeviceProperties). Accessors mirror the C struct fields.
 */
public class cudaDeviceProp extends Pointer {
/** Device name. NOTE(review): generated JavaCPP bindings typically expose this as a BytePointer (char[256]); confirm the actual return type. */
public native String name();
/** Total global memory in bytes */
public native long totalGlobalMem();
/** Shared memory available per block, in bytes */
public native long sharedMemPerBlock();
/** Number of 32-bit registers available per block */
public native int regsPerBlock();
/** Warp size in threads (32 on all current architectures) */
public native int warpSize();
/** Maximum threads per block */
public native int maxThreadsPerBlock();
/** Maximum block dimensions — points to a 3-element int array (x, y, z) */
public native IntPointer maxThreadsDim();
/** Maximum grid dimensions — points to a 3-element int array (x, y, z) */
public native IntPointer maxGridSize();
/** Compute capability major version */
public native int major();
/** Compute capability minor version */
public native int minor();
/** Number of streaming multiprocessors on the device */
public native int multiProcessorCount();
}CUDA Basic Linear Algebra Subprograms for GPU-accelerated linear algebra.
/**
 * cuBLAS context and initialization. Handles are relatively expensive to
 * create; create once and reuse across calls. All methods return a
 * cublasStatus_t value (0 == CUBLAS_STATUS_SUCCESS).
 */
public static class cuBLASContext {
/**
 * Create cuBLAS handle
 * @param handle Output handle
 * @return cuBLAS status
 */
public static native int cublasCreate_v2(cublasHandle_t handle);
/**
 * Destroy cuBLAS handle and release its resources
 * @param handle Handle to destroy
 * @return cuBLAS status
 */
public static native int cublasDestroy_v2(cublasHandle_t handle);
/**
 * Set the CUDA stream on which subsequent cuBLAS calls using this handle execute
 * @param handle cuBLAS handle
 * @param streamId CUDA stream
 * @return cuBLAS status
 */
public static native int cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId);
}
/**
 * cuBLAS Level 3 operations (matrix-matrix).
 * NOTE: cuBLAS uses COLUMN-MAJOR storage (Fortran order). Row-major callers
 * must either transpose operands or swap A/B and m/n accordingly.
 * The alpha/beta scalars are passed by pointer; by default (host pointer
 * mode) they reside in host memory.
 */
public static class cuBLASLevel3 {
/**
 * Single precision matrix multiplication: C = α*op(A)*op(B) + β*C
 * @param handle cuBLAS handle
 * @param transa Transpose operation for A (CUBLAS_OP_N/T/C)
 * @param transb Transpose operation for B
 * @param m Number of rows in op(A) and C
 * @param n Number of columns in op(B) and C
 * @param k Number of columns in op(A) and rows in op(B)
 * @param alpha Scalar α
 * @param A Matrix A on device (column-major)
 * @param lda Leading dimension of A
 * @param B Matrix B on device (column-major)
 * @param ldb Leading dimension of B
 * @param beta Scalar β
 * @param C Matrix C on device (column-major)
 * @param ldc Leading dimension of C
 * @return cuBLAS status
 */
public static native int cublasSgemm_v2(cublasHandle_t handle, int transa, int transb,
int m, int n, int k, FloatPointer alpha, FloatPointer A, int lda,
FloatPointer B, int ldb, FloatPointer beta, FloatPointer C, int ldc);
/**
 * Double precision matrix multiplication — same semantics as cublasSgemm_v2
 */
public static native int cublasDgemm_v2(cublasHandle_t handle, int transa, int transb,
int m, int n, int k, DoublePointer alpha, DoublePointer A, int lda,
DoublePointer B, int ldb, DoublePointer beta, DoublePointer C, int ldc);
/**
 * Batched matrix multiplication — performs batchCount independent GEMMs of
 * identical dimensions in one call (pointer arrays must be device-resident)
 * @param handle cuBLAS handle
 * @param transa Transpose operation for A
 * @param transb Transpose operation for B
 * @param m Number of rows in op(A) and C
 * @param n Number of columns in op(B) and C
 * @param k Number of columns in op(A) and rows in op(B)
 * @param alpha Scalar α
 * @param Aarray Array of pointers to matrices A
 * @param lda Leading dimension of each A
 * @param Barray Array of pointers to matrices B
 * @param ldb Leading dimension of each B
 * @param beta Scalar β
 * @param Carray Array of pointers to matrices C
 * @param ldc Leading dimension of each C
 * @param batchCount Number of matrices in the batch
 * @return cuBLAS status
 */
public static native int cublasSgemmBatched(cublasHandle_t handle, int transa, int transb,
int m, int n, int k, FloatPointer alpha, PointerPointer Aarray, int lda,
PointerPointer Barray, int ldb, FloatPointer beta, PointerPointer Carray,
int ldc, int batchCount);
}CUDA Deep Neural Network library for accelerated deep learning operations.
/**
 * cuDNN context management. Like cuBLAS, handles are costly to create and
 * should be reused. All methods return a cudnnStatus_t value
 * (0 == CUDNN_STATUS_SUCCESS).
 */
public static class cuDNNContext {
/**
 * Create cuDNN handle
 * @param handle Output handle
 * @return cuDNN status
 */
public static native int cudnnCreate(cudnnHandle_t handle);
/**
 * Destroy cuDNN handle and release its resources
 * @param handle Handle to destroy
 * @return cuDNN status
 */
public static native int cudnnDestroy(cudnnHandle_t handle);
/**
 * Set the CUDA stream on which subsequent cuDNN calls using this handle execute
 * @param handle cuDNN handle
 * @param stream CUDA stream
 * @return cuDNN status
 */
public static native int cudnnSetStream(cudnnHandle_t handle, cudaStream_t stream);
}
/**
 * Tensor descriptor management. Descriptors describe the layout (dimensions,
 * strides, data type) of device tensors passed to cuDNN operations.
 */
public static class cuDNNTensor {
/**
 * Create tensor descriptor (must later be destroyed with
 * cudnnDestroyTensorDescriptor)
 * @param tensorDesc Output tensor descriptor
 * @return cuDNN status
 */
public static native int cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
/**
 * Set N-dimensional tensor descriptor. Unlike the 4D variant, the layout is
 * given via explicit per-dimension strides rather than a format enum.
 * @param tensorDesc Tensor descriptor
 * @param dataType Data type (float, half, etc.)
 * @param nbDims Number of dimensions
 * @param dimA Dimension sizes (nbDims entries)
 * @param strideA Stride sizes in elements (nbDims entries)
 * @return cuDNN status
 */
public static native int cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
int dataType, int nbDims, IntPointer dimA, IntPointer strideA);
/**
 * Destroy tensor descriptor
 * @param tensorDesc Tensor descriptor to destroy
 * @return cuDNN status
 */
public static native int cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
}
/**
 * Convolution operations. A convolution is described by a descriptor
 * (padding, strides, dilation, mode) and executed with an explicitly chosen
 * algorithm plus a caller-provided workspace buffer.
 */
public static class cuDNNConvolution {
/**
 * Create convolution descriptor
 * @param convDesc Output convolution descriptor
 * @return cuDNN status
 */
public static native int cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc);
/**
 * Set N-dimensional convolution descriptor
 * @param convDesc Convolution descriptor
 * @param arrayLength Number of spatial dimensions
 * @param padA Zero-padding for each dimension
 * @param filterStrideA Filter stride for each dimension
 * @param dilationA Dilation for each dimension
 * @param mode Convolution mode (convolution vs. cross-correlation)
 * @param computeType Computation data type
 * @return cuDNN status
 */
public static native int cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,
int arrayLength, IntPointer padA, IntPointer filterStrideA, IntPointer dilationA,
int mode, int computeType);
/**
 * Forward convolution: y = α * conv(x, w) + β * y
 * @param handle cuDNN handle
 * @param alpha Scaling factor for the convolution result
 * @param xDesc Input tensor descriptor
 * @param x Input tensor data on device
 * @param wDesc Filter tensor descriptor
 * @param w Filter data on device
 * @param convDesc Convolution descriptor
 * @param algo Convolution algorithm (determines workspace requirement)
 * @param workSpace Workspace memory on device
 * @param workSpaceSizeInBytes Workspace size
 * @param beta Scaling factor for the prior contents of y
 * @param yDesc Output tensor descriptor
 * @param y Output tensor data on device
 * @return cuDNN status
 */
public static native int cudnnConvolutionForward(cudnnHandle_t handle,
Pointer alpha, cudnnTensorDescriptor_t xDesc, Pointer x,
cudnnFilterDescriptor_t wDesc, Pointer w, cudnnConvolutionDescriptor_t convDesc,
int algo, Pointer workSpace, long workSpaceSizeInBytes, Pointer beta,
cudnnTensorDescriptor_t yDesc, Pointer y);
}
/**
 * Activation functions (element-wise nonlinearities such as ReLU, sigmoid,
 * tanh) applied via a reusable activation descriptor.
 */
public static class cuDNNActivation {
/**
 * Create activation descriptor
 * @param activationDesc Output activation descriptor
 * @return cuDNN status
 */
public static native int cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
/**
 * Set activation descriptor
 * @param activationDesc Activation descriptor
 * @param mode Activation mode (sigmoid, relu, tanh, etc.)
 * @param reluNanOpt NaN propagation mode
 * @param coef Coefficient used by some modes (e.g. clipped-ReLU ceiling, ELU alpha)
 * @return cuDNN status
 */
public static native int cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
int mode, int reluNanOpt, double coef);
/**
 * Forward activation: y = α * act(x) + β * y
 * @param handle cuDNN handle
 * @param activationDesc Activation descriptor
 * @param alpha Scaling factor for the activation result
 * @param xDesc Input tensor descriptor
 * @param x Input tensor data on device
 * @param beta Scaling factor for the prior contents of y
 * @param yDesc Output tensor descriptor
 * @param y Output tensor data on device
 * @return cuDNN status
 */
public static native int cudnnActivationForward(cudnnHandle_t handle,
cudnnActivationDescriptor_t activationDesc, Pointer alpha,
cudnnTensorDescriptor_t xDesc, Pointer x, Pointer beta,
cudnnTensorDescriptor_t yDesc, Pointer y);
}OpenCL API for cross-platform parallel computing across CPUs, GPUs, and other devices.
/**
 * OpenCL platform and device management. All methods return an OpenCL error
 * code (0 == CL_SUCCESS). The common two-call idiom applies: first query
 * with a null output buffer to get the required count/size, then query again.
 */
public static class OpenCLPlatform {
/**
 * Get platform IDs
 * @param num_entries Capacity of the platforms array (may be 0 when only counting)
 * @param platforms Output platform array (may be null when only counting)
 * @param num_platforms Actual number of platforms found
 * @return OpenCL error code
 */
public static native int clGetPlatformIDs(int num_entries, cl_platform_id platforms,
IntPointer num_platforms);
/**
 * Get platform information
 * @param platform Platform ID
 * @param param_name Information parameter (CL_PLATFORM_NAME, etc.)
 * @param param_value_size Size of output buffer in bytes
 * @param param_value Output buffer (may be null when only sizing)
 * @param param_value_size_ret Actual size of information in bytes
 * @return OpenCL error code
 */
public static native int clGetPlatformInfo(cl_platform_id platform, int param_name,
long param_value_size, Pointer param_value, SizeTPointer param_value_size_ret);
/**
 * Get device IDs for platform
 * @param platform Platform ID
 * @param device_type Device type filter (CL_DEVICE_TYPE_GPU, _CPU, _ALL, etc.)
 * @param num_entries Capacity of the devices array
 * @param devices Output device array (may be null when only counting)
 * @param num_devices Actual number of devices found
 * @return OpenCL error code
 */
public static native int clGetDeviceIDs(cl_platform_id platform, long device_type,
int num_entries, cl_device_id devices, IntPointer num_devices);
/**
 * Get device information
 * @param device Device ID
 * @param param_name Information parameter (CL_DEVICE_NAME, etc.)
 * @param param_value_size Size of output buffer in bytes
 * @param param_value Output buffer (may be null when only sizing)
 * @param param_value_size_ret Actual size of information in bytes
 * @return OpenCL error code
 */
public static native int clGetDeviceInfo(cl_device_id device, int param_name,
long param_value_size, Pointer param_value, SizeTPointer param_value_size_ret);
}
/**
 * OpenCL context and command queue management. Objects are reference-counted;
 * every clCreate* must be balanced by the matching clRelease*.
 */
public static class OpenCLContext {
/**
 * Create OpenCL context
 * @param properties Context properties (may be null for defaults)
 * @param num_devices Number of devices
 * @param devices Device array
 * @param pfn_notify Notification callback for context errors (may be null)
 * @param user_data User data passed to the callback
 * @param errcode_ret Error code output (CL_SUCCESS on success)
 * @return Context handle
 */
public static native cl_context clCreateContext(cl_context_properties properties,
int num_devices, cl_device_id devices, CreateContextCallbackFunction pfn_notify,
Pointer user_data, IntPointer errcode_ret);
/**
 * Release context (decrements reference count; destroyed at zero)
 * @param context Context to release
 * @return OpenCL error code
 */
public static native int clReleaseContext(cl_context context);
/**
 * Create command queue.
 * NOTE: deprecated since OpenCL 2.0 in favor of
 * clCreateCommandQueueWithProperties, but still widely supported.
 * @param context OpenCL context
 * @param device Target device
 * @param properties Queue property bitfield (e.g. out-of-order execution, profiling)
 * @param errcode_ret Error code output
 * @return Command queue handle
 */
public static native cl_command_queue clCreateCommandQueue(cl_context context,
cl_device_id device, long properties, IntPointer errcode_ret);
/**
 * Release command queue (decrements reference count)
 * @param command_queue Queue to release
 * @return OpenCL error code
 */
public static native int clReleaseCommandQueue(cl_command_queue command_queue);
}
/**
 * OpenCL memory management. Buffers live in a context; transfers are enqueued
 * on a command queue and may be blocking (CL_TRUE) or non-blocking (CL_FALSE).
 */
public static class OpenCLMemory {
/**
 * Create buffer object
 * @param context OpenCL context
 * @param flags Memory flags (CL_MEM_READ_ONLY, CL_MEM_COPY_HOST_PTR, etc.)
 * @param size Buffer size in bytes
 * @param host_ptr Host memory pointer (required by some flags, else null)
 * @param errcode_ret Error code output
 * @return Memory object handle
 */
public static native cl_mem clCreateBuffer(cl_context context, long flags, long size,
Pointer host_ptr, IntPointer errcode_ret);
/**
 * Release memory object (decrements reference count)
 * @param memobj Memory object to release
 * @return OpenCL error code
 */
public static native int clReleaseMemObject(cl_mem memobj);
/**
 * Enqueue buffer write (host -> device)
 * @param command_queue Command queue
 * @param buffer Target buffer
 * @param blocking_write CL_TRUE to block until the host data may be reused
 * @param offset Offset into the buffer in bytes
 * @param size Size to write in bytes
 * @param ptr Source data pointer on the host
 * @param num_events_in_wait_list Number of events to wait for
 * @param event_wait_list Events to wait for (null if none)
 * @param event Output event (null if not needed)
 * @return OpenCL error code
 */
public static native int clEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer,
int blocking_write, long offset, long size, Pointer ptr, int num_events_in_wait_list,
cl_event event_wait_list, cl_event event);
/**
 * Enqueue buffer read (device -> host)
 * @param command_queue Command queue
 * @param buffer Source buffer
 * @param blocking_read CL_TRUE to block until the host buffer holds the data
 * @param offset Offset into the buffer in bytes
 * @param size Size to read in bytes
 * @param ptr Destination data pointer on the host
 * @param num_events_in_wait_list Number of events to wait for
 * @param event_wait_list Events to wait for (null if none)
 * @param event Output event (null if not needed)
 * @return OpenCL error code
 */
public static native int clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer,
int blocking_read, long offset, long size, Pointer ptr, int num_events_in_wait_list,
cl_event event_wait_list, cl_event event);
}
/**
 * OpenCL kernel execution: compile source at runtime into a program, extract
 * kernels by name, bind arguments, and enqueue NDRange executions.
 */
public static class OpenCLKernel {
/**
 * Create program from source strings
 * @param context OpenCL context
 * @param count Number of source strings
 * @param strings Source code strings
 * @param lengths String lengths in bytes (null for null-terminated strings)
 * @param errcode_ret Error code output
 * @return Program handle
 */
public static native cl_program clCreateProgramWithSource(cl_context context, int count,
PointerPointer strings, SizeTPointer lengths, IntPointer errcode_ret);
/**
 * Build (compile and link) program for the given devices. On failure, fetch
 * the build log via clGetProgramBuildInfo to diagnose compile errors.
 * @param program Program to build
 * @param num_devices Number of devices
 * @param device_list Target devices
 * @param options Compiler options string (may be null)
 * @param pfn_notify Build-completion callback (may be null for synchronous build)
 * @param user_data User data passed to the callback
 * @return OpenCL error code
 */
public static native int clBuildProgram(cl_program program, int num_devices,
cl_device_id device_list, String options, BuildProgramCallbackFunction pfn_notify,
Pointer user_data);
/**
 * Create kernel from a built program
 * @param program Compiled program
 * @param kernel_name Kernel function name as declared with __kernel
 * @param errcode_ret Error code output
 * @return Kernel handle
 */
public static native cl_kernel clCreateKernel(cl_program program, String kernel_name,
IntPointer errcode_ret);
/**
 * Set kernel argument. Per the OpenCL spec, arg_value points to the argument
 * data — for a cl_mem argument that is a pointer to the handle itself.
 * @param kernel Kernel handle
 * @param arg_index Argument index (0-based, in declaration order)
 * @param arg_size Argument size in bytes
 * @param arg_value Argument value pointer
 * @return OpenCL error code
 */
public static native int clSetKernelArg(cl_kernel kernel, int arg_index, long arg_size,
Pointer arg_value);
/**
 * Enqueue kernel execution over an NDRange
 * @param command_queue Command queue
 * @param kernel Kernel to execute
 * @param work_dim Number of work dimensions (1-3)
 * @param global_work_offset Global work offset (null for zero offset)
 * @param global_work_size Global work size per dimension
 * @param local_work_size Work-group size per dimension (null lets the runtime choose)
 * @param num_events_in_wait_list Number of events to wait for
 * @param event_wait_list Events to wait for (null if none)
 * @param event Output event (null if not needed)
 * @return OpenCL error code
 */
public static native int clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel,
int work_dim, SizeTPointer global_work_offset, SizeTPointer global_work_size,
SizeTPointer local_work_size, int num_events_in_wait_list, cl_event event_wait_list,
cl_event event);
}import org.bytedeco.cuda.cudart.*;
import static org.bytedeco.cuda.global.cudart.*;
public class CUDAVectorAdd {
static {
Loader.load(cudart.class);
}
/** Logs a failed CUDA runtime call; 0 == cudaSuccess, so nonzero means failure. */
private static void check(int err, String op) {
if (err != 0) {
System.err.println(op + " failed with CUDA error " + err);
}
}
/**
 * Demonstrates device allocation, host/device copies, and cleanup for an
 * element-wise vector addition C = A + B.
 * NOTE: no kernel is actually launched (that requires the CUDA driver API
 * or a precompiled kernel), so on real hardware d_C is never written and the
 * verification below is expected to fail; the transfer plumbing is correct.
 */
public static void vectorAdd() {
try (PointerScope scope = new PointerScope()) {
int N = 1024;
long size = (long) N * Float.BYTES; // bytes per vector (sizeof(float) * N)
// Host arrays
float[] h_A = new float[N];
float[] h_B = new float[N];
float[] h_C = new float[N];
// Initialize host inputs with a deterministic pattern
for (int i = 0; i < N; i++) {
h_A[i] = i;
h_B[i] = i * 2;
}
// Device pointers (filled in by cudaMalloc)
Pointer d_A = new Pointer();
Pointer d_B = new Pointer();
Pointer d_C = new Pointer();
check(cudaMalloc(d_A, size), "cudaMalloc(d_A)");
check(cudaMalloc(d_B, size), "cudaMalloc(d_B)");
check(cudaMalloc(d_C, size), "cudaMalloc(d_C)");
// Native staging buffers mirroring the Java arrays
FloatPointer fp_A = new FloatPointer(h_A);
FloatPointer fp_B = new FloatPointer(h_B);
FloatPointer fp_C = new FloatPointer(h_C);
check(cudaMemcpy(d_A, fp_A, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
check(cudaMemcpy(d_B, fp_B, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_B)");
// Launch kernel (this would require a compiled CUDA kernel)
// For illustration - actual kernel launch would use CUDA driver API
// or require JCuda/JCUDA for higher-level kernel launching
// Copy result back to host
check(cudaMemcpy(fp_C, d_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_C)");
// BUG FIX: cudaMemcpy fills only the native fp_C buffer; copy it back into
// the Java array before verifying, otherwise h_C is still all zeros.
fp_C.get(h_C);
// Verify results against the CPU reference
boolean success = true;
for (int i = 0; i < N && success; i++) {
if (Math.abs(h_C[i] - (h_A[i] + h_B[i])) > 1e-5) {
success = false;
System.err.println("Verification failed at index " + i);
}
}
if (success) {
System.out.println("Vector addition completed successfully!");
}
// Free device memory
check(cudaFree(d_A), "cudaFree(d_A)");
check(cudaFree(d_B), "cudaFree(d_B)");
check(cudaFree(d_C), "cudaFree(d_C)");
}
}
/**
 * Enumerates all CUDA devices and prints their key properties.
 */
public static void deviceInfo() {
try (PointerScope scope = new PointerScope()) {
IntPointer deviceCount = new IntPointer(1);
check(cudaGetDeviceCount(deviceCount), "cudaGetDeviceCount");
System.out.println("Number of CUDA devices: " + deviceCount.get());
for (int i = 0; i < deviceCount.get(); i++) {
cudaDeviceProp prop = new cudaDeviceProp();
check(cudaGetDeviceProperties(prop, i), "cudaGetDeviceProperties");
System.out.println("\nDevice " + i + ":");
System.out.println(" Name: " + prop.name().getString());
System.out.println(" Compute Capability: " + prop.major() + "." + prop.minor());
System.out.println(" Total Global Memory: " + prop.totalGlobalMem() / (1024*1024) + " MB");
System.out.println(" Multiprocessors: " + prop.multiProcessorCount());
System.out.println(" Max Threads per Block: " + prop.maxThreadsPerBlock());
System.out.println(" Warp Size: " + prop.warpSize());
}
}
}
}import org.bytedeco.cuda.cudart.*;
import org.bytedeco.cuda.cublas.*;
import static org.bytedeco.cuda.global.cudart.*;
import static org.bytedeco.cuda.global.cublas.*;
public class cuBLASExample {
static {
Loader.load(cudart.class);
Loader.load(cublas.class);
}
/**
 * Computes C = A * B on the GPU with cublasSgemm_v2 and prints the result.
 * cuBLAS interprets all matrices as COLUMN-MAJOR, so the flat host arrays
 * below are treated as column-major 3x3 matrices, and the result is read
 * back and printed with column-major indexing.
 */
public static void matrixMultiplication() {
try (PointerScope scope = new PointerScope()) {
int M = 3, N = 3, K = 3;
// Host matrices (column-major as far as cuBLAS is concerned)
float[] h_A = {1, 2, 3, 4, 5, 6, 7, 8, 9};
float[] h_B = {9, 8, 7, 6, 5, 4, 3, 2, 1};
float[] h_C = new float[M * N];
// Device matrices
Pointer d_A = new Pointer();
Pointer d_B = new Pointer();
Pointer d_C = new Pointer();
int sizeA = M * K * Float.BYTES;
int sizeB = K * N * Float.BYTES;
int sizeC = M * N * Float.BYTES;
cudaMalloc(d_A, sizeA);
cudaMalloc(d_B, sizeB);
cudaMalloc(d_C, sizeC);
// Copy matrices to device through native staging buffers
FloatPointer fp_A = new FloatPointer(h_A);
FloatPointer fp_B = new FloatPointer(h_B);
FloatPointer fp_C = new FloatPointer(h_C);
cudaMemcpy(d_A, fp_A, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, fp_B, sizeB, cudaMemcpyHostToDevice);
// Create cuBLAS handle (reuse in real code; creation is expensive)
cublasHandle_t handle = new cublasHandle_t();
cublasCreate_v2(handle);
// Scalars for GEMM, passed by pointer (host pointer mode by default)
FloatPointer alpha = new FloatPointer(1.0f);
FloatPointer beta = new FloatPointer(0.0f);
// Perform matrix multiplication: C = α*A*B + β*C (no transposes)
cublasSgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N,
M, N, K, alpha,
new FloatPointer(d_A), M,
new FloatPointer(d_B), K,
beta, new FloatPointer(d_C), M);
// Copy result back to host (blocking, so the GEMM has completed)
cudaMemcpy(fp_C, d_C, sizeC, cudaMemcpyDeviceToHost);
// BUG FIX: the device-to-host copy fills only the native fp_C buffer;
// mirror it into the Java array before printing, else h_C is all zeros.
fp_C.get(h_C);
// Print result using column-major indexing: element (i,j) = h_C[i + j*M]
System.out.println("cuBLAS Matrix multiplication result:");
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
System.out.printf("%.1f ", h_C[i + j * M]);
}
System.out.println();
}
// Cleanup
cublasDestroy_v2(handle);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}
}
}import org.bytedeco.opencl.*;
import static org.bytedeco.opencl.global.OpenCL.*;
public class OpenCLExample {
static {
Loader.load(OpenCL.class);
}
// OpenCL kernel source code: element-wise C[i] = A[i] + B[i]
static final String kernelSource =
"__kernel void vector_add(__global const float* A, __global const float* B, " +
"__global float* C) { " +
" int i = get_global_id(0); " +
" C[i] = A[i] + B[i]; " +
"}";
/**
 * Runs the vector_add kernel on the default OpenCL device: builds the
 * program from source, binds buffer arguments, executes over N work-items,
 * reads the result back, and verifies the first few elements on the CPU.
 */
public static void vectorAdd() {
try (PointerScope scope = new PointerScope()) {
int N = 1024;
// Get platform and device
cl_platform_id platform = new cl_platform_id();
cl_device_id device = new cl_device_id();
IntPointer ret = new IntPointer(1);
clGetPlatformIDs(1, platform, null);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, device, null);
// Create context and command queue
cl_context context = clCreateContext(null, 1, device, null, null, ret);
cl_command_queue queue = clCreateCommandQueue(context, device, 0, ret);
// Host data
float[] h_A = new float[N];
float[] h_B = new float[N];
float[] h_C = new float[N];
for (int i = 0; i < N; i++) {
h_A[i] = i;
h_B[i] = i * 2;
}
// Create device buffers (4 bytes per float)
cl_mem d_A = clCreateBuffer(context, CL_MEM_READ_ONLY, N * 4, null, ret);
cl_mem d_B = clCreateBuffer(context, CL_MEM_READ_ONLY, N * 4, null, ret);
cl_mem d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * 4, null, ret);
// Copy input data to device (blocking writes)
FloatPointer fp_A = new FloatPointer(h_A);
FloatPointer fp_B = new FloatPointer(h_B);
clEnqueueWriteBuffer(queue, d_A, CL_TRUE, 0, N * 4, fp_A, 0, null, null);
clEnqueueWriteBuffer(queue, d_B, CL_TRUE, 0, N * 4, fp_B, 0, null, null);
// Create and build program from source
PointerPointer kernelSourcePtr = new PointerPointer(kernelSource);
cl_program program = clCreateProgramWithSource(context, 1, kernelSourcePtr, null, ret);
// BUG FIX: surface build failures instead of silently continuing — a
// failed build makes clCreateKernel and everything after it fail too.
int buildStatus = clBuildProgram(program, 1, device, null, null, null);
if (buildStatus != CL_SUCCESS) {
System.err.println("clBuildProgram failed with error " + buildStatus);
}
// Create kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", ret);
// Set kernel arguments.
// NOTE(review): the OpenCL spec requires arg_value to be a POINTER TO the
// cl_mem handle (&d_A in C), not the handle itself — confirm whether these
// bindings expect the cl_mem directly or a PointerPointer wrapping it.
clSetKernelArg(kernel, 0, Pointer.sizeof(cl_mem.class), d_A);
clSetKernelArg(kernel, 1, Pointer.sizeof(cl_mem.class), d_B);
clSetKernelArg(kernel, 2, Pointer.sizeof(cl_mem.class), d_C);
// Execute kernel over N work-items in one dimension
SizeTPointer globalWorkSize = new SizeTPointer(N);
clEnqueueNDRangeKernel(queue, kernel, 1, null, globalWorkSize, null, 0, null, null);
// Read result back (blocking read, so the kernel has completed)
FloatPointer fp_C = new FloatPointer(h_C);
clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, N * 4, fp_C, 0, null, null);
// BUG FIX: clEnqueueReadBuffer fills only the native fp_C buffer; copy it
// back into the Java array before verifying, else h_C is still all zeros.
fp_C.get(h_C);
// Verify the first few results against the CPU reference
boolean success = true;
for (int i = 0; i < Math.min(N, 10) && success; i++) {
if (Math.abs(h_C[i] - (h_A[i] + h_B[i])) > 1e-5) {
success = false;
}
System.out.printf("C[%d] = %.1f (expected %.1f)\n", i, h_C[i], h_A[i] + h_B[i]);
}
System.out.println(success ? "OpenCL vector addition successful!" : "Verification failed");
// Cleanup (in reverse order of creation)
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseMemObject(d_A);
clReleaseMemObject(d_B);
clReleaseMemObject(d_C);
clReleaseCommandQueue(queue);
clReleaseContext(context);
}
}
}Install with Tessl CLI
npx tessl i tessl/maven-org-bytedeco--javacpp-presets-platform