CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/maven-org-bytedeco--javacpp-presets-platform

Cross-platform Java bindings for 60+ native C/C++ libraries including OpenCV, FFmpeg, PyTorch, TensorFlow, and scientific computing libraries

Pending
Overview
Eval results
Files

gpu-computing.mddocs/

GPU Computing

High-performance GPU computing capabilities through CUDA, OpenCL, and associated libraries for parallel processing and acceleration.

Capabilities

CUDA Runtime Operations

NVIDIA CUDA runtime API for GPU device management, memory operations, and kernel execution.

/**
 * Device management functions.
 *
 * <p>Static native bindings mirroring the CUDA runtime device-management API.
 * Every method returns an integer CUDA error code that callers should check
 * before continuing.
 */
public static class CUDADevice {
    /**
     * Get number of CUDA devices
     * @param count Output device count
     * @return CUDA error code
     */
    public static native int cudaGetDeviceCount(IntPointer count);

    /**
     * Set current CUDA device
     * @param device Device index to use
     * @return CUDA error code
     */
    public static native int cudaSetDevice(int device);

    /**
     * Get current CUDA device
     * @param device Output current device index
     * @return CUDA error code
     */
    public static native int cudaGetDevice(IntPointer device);

    /**
     * Get device properties
     * @param prop Output device properties (filled in by the call)
     * @param device Device index
     * @return CUDA error code
     */
    public static native int cudaGetDeviceProperties(cudaDeviceProp prop, int device);

    /**
     * Synchronize with current device
     * (blocks the calling host thread until previously issued GPU work completes)
     * @return CUDA error code
     */
    public static native int cudaDeviceSynchronize();

    /**
     * Reset current device
     * (NOTE(review): per the CUDA docs this destroys all allocations and state
     * on the current device — confirm before calling mid-application)
     * @return CUDA error code
     */
    public static native int cudaDeviceReset();
}

/**
 * Memory management functions.
 *
 * <p>Native bindings for the CUDA runtime memory API. All methods return an
 * integer CUDA error code that callers should check.
 */
public static class CUDAMemory {
    /**
     * Allocate device memory
     * @param devPtr Output pointer to allocated memory
     * @param size Size in bytes to allocate
     * @return CUDA error code
     */
    public static native int cudaMalloc(PointerPointer devPtr, long size);

    /**
     * Free device memory allocated with cudaMalloc
     * @param devPtr Device pointer to free
     * @return CUDA error code
     */
    public static native int cudaFree(Pointer devPtr);

    /**
     * Copy memory between host and device
     * @param dst Destination pointer
     * @param src Source pointer
     * @param count Size in bytes
     * @param kind Copy direction (cudaMemcpyHostToDevice, etc.)
     * @return CUDA error code
     */
    public static native int cudaMemcpy(Pointer dst, Pointer src, long count, int kind);

    /**
     * Asynchronous memory copy, ordered into the given stream
     * (NOTE(review): per the CUDA docs the copy only truly overlaps host
     * execution when the host memory is page-locked, e.g. from
     * cudaMallocHost — confirm)
     * @param dst Destination pointer
     * @param src Source pointer
     * @param count Size in bytes
     * @param kind Copy direction
     * @param stream CUDA stream
     * @return CUDA error code
     */
    public static native int cudaMemcpyAsync(Pointer dst, Pointer src, long count,
        int kind, cudaStream_t stream);

    /**
     * Set device memory to value
     * @param devPtr Device pointer
     * @param value Value to set (interpreted as a byte)
     * @param count Number of bytes
     * @return CUDA error code
     */
    public static native int cudaMemset(Pointer devPtr, int value, long count);

    /**
     * Allocate page-locked (pinned) host memory
     * @param ptr Output pointer to allocated host memory
     * @param size Size in bytes
     * @return CUDA error code
     */
    public static native int cudaMallocHost(PointerPointer ptr, long size);

    /**
     * Free page-locked host memory allocated with cudaMallocHost
     * @param ptr Host pointer to free
     * @return CUDA error code
     */
    public static native int cudaFreeHost(Pointer ptr);
}

/**
 * Stream management for asynchronous operations.
 *
 * <p>A stream is an ordered queue of GPU work; cudaMemcpyAsync takes one of
 * these handles. All methods return an integer CUDA error code.
 */
public static class CUDAStream {
    /**
     * Create CUDA stream
     * @param pStream Output stream handle
     * @return CUDA error code
     */
    public static native int cudaStreamCreate(cudaStream_t pStream);

    /**
     * Destroy CUDA stream
     * @param stream Stream to destroy
     * @return CUDA error code
     */
    public static native int cudaStreamDestroy(cudaStream_t stream);

    /**
     * Synchronize with stream
     * (blocks the host until all work queued on the stream has completed)
     * @param stream Stream to synchronize
     * @return CUDA error code
     */
    public static native int cudaStreamSynchronize(cudaStream_t stream);

    /**
     * Query stream status without blocking
     * @param stream Stream to query
     * @return CUDA error code (cudaSuccess if complete)
     */
    public static native int cudaStreamQuery(cudaStream_t stream);
}

/**
 * CUDA device properties structure.
 *
 * <p>Read-only accessor view over the native cudaDeviceProp struct that
 * cudaGetDeviceProperties fills in.
 */
public class cudaDeviceProp extends Pointer {
    /** Device name */
    public native String name();

    /** Total global memory in bytes */
    public native long totalGlobalMem();

    /** Shared memory per block, in bytes */
    public native long sharedMemPerBlock();

    /** Number of registers per block */
    public native int regsPerBlock();

    /** Warp size (threads per warp) */
    public native int warpSize();

    /** Maximum threads per block */
    public native int maxThreadsPerBlock();

    /** Maximum block dimensions
     * (NOTE(review): presumably a 3-element array, one entry per dimension —
     * confirm against the CUDA docs) */
    public native IntPointer maxThreadsDim();

    /** Maximum grid dimensions
     * (NOTE(review): presumably a 3-element array — confirm) */
    public native IntPointer maxGridSize();

    /** Compute capability major version */
    public native int major();

    /** Compute capability minor version */
    public native int minor();

    /** Number of multiprocessors */
    public native int multiProcessorCount();
}

cuBLAS Operations

CUDA Basic Linear Algebra Subprograms for GPU-accelerated linear algebra.

/**
 * cuBLAS context and initialization.
 *
 * <p>A handle must be created before any other cuBLAS call and destroyed when
 * no longer needed. The _v2 suffixes follow the cublas_v2 API naming used by
 * the bindings.
 */
public static class cuBLASContext {
    /**
     * Create cuBLAS handle
     * @param handle Output handle
     * @return cuBLAS status
     */
    public static native int cublasCreate_v2(cublasHandle_t handle);

    /**
     * Destroy cuBLAS handle
     * @param handle Handle to destroy
     * @return cuBLAS status
     */
    public static native int cublasDestroy_v2(cublasHandle_t handle);

    /**
     * Set the CUDA stream that subsequent cuBLAS calls on this handle use
     * @param handle cuBLAS handle
     * @param streamId CUDA stream
     * @return cuBLAS status
     */
    public static native int cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId);
}

/**
 * cuBLAS Level 3 operations (matrix-matrix).
 *
 * <p>NOTE(review): cuBLAS follows the BLAS convention of column-major matrix
 * storage, so the leading dimensions (lda/ldb/ldc) are the row counts of the
 * stored matrices — confirm against the cuBLAS documentation.
 */
public static class cuBLASLevel3 {
    /**
     * Single precision matrix multiplication: C = α*A*B + β*C
     * @param handle cuBLAS handle
     * @param transa Transpose operation for A
     * @param transb Transpose operation for B
     * @param m Number of rows in A and C
     * @param n Number of columns in B and C
     * @param k Number of columns in A and rows in B
     * @param alpha Scalar α
     * @param A Matrix A on device
     * @param lda Leading dimension of A
     * @param B Matrix B on device
     * @param ldb Leading dimension of B
     * @param beta Scalar β
     * @param C Matrix C on device
     * @param ldc Leading dimension of C
     * @return cuBLAS status
     */
    public static native int cublasSgemm_v2(cublasHandle_t handle, int transa, int transb,
        int m, int n, int k, FloatPointer alpha, FloatPointer A, int lda,
        FloatPointer B, int ldb, FloatPointer beta, FloatPointer C, int ldc);

    /**
     * Double precision matrix multiplication: C = α*A*B + β*C
     * (parameters have the same meanings as in cublasSgemm_v2)
     */
    public static native int cublasDgemm_v2(cublasHandle_t handle, int transa, int transb,
        int m, int n, int k, DoublePointer alpha, DoublePointer A, int lda,
        DoublePointer B, int ldb, DoublePointer beta, DoublePointer C, int ldc);

    /**
     * Batched single precision matrix multiplication over batchCount
     * independent (A, B, C) triples
     * @param handle cuBLAS handle
     * @param transa Transpose operation for A
     * @param transb Transpose operation for B
     * @param m Number of rows in A and C
     * @param n Number of columns in B and C
     * @param k Number of columns in A and rows in B
     * @param alpha Scalar α
     * @param Aarray Array of device pointers to matrices A
     * @param lda Leading dimension of A
     * @param Barray Array of device pointers to matrices B
     * @param ldb Leading dimension of B
     * @param beta Scalar β
     * @param Carray Array of device pointers to matrices C
     * @param ldc Leading dimension of C
     * @param batchCount Number of matrices
     * @return cuBLAS status
     */
    public static native int cublasSgemmBatched(cublasHandle_t handle, int transa, int transb,
        int m, int n, int k, FloatPointer alpha, PointerPointer Aarray, int lda,
        PointerPointer Barray, int ldb, FloatPointer beta, PointerPointer Carray,
        int ldc, int batchCount);
}

cuDNN Deep Learning

CUDA Deep Neural Network library for accelerated deep learning operations.

/**
 * cuDNN context management.
 *
 * <p>A cuDNN handle must be created before any other cuDNN call and destroyed
 * when no longer needed. All methods return a cuDNN status code.
 */
public static class cuDNNContext {
    /**
     * Create cuDNN handle
     * @param handle Output handle
     * @return cuDNN status
     */
    public static native int cudnnCreate(cudnnHandle_t handle);

    /**
     * Destroy cuDNN handle
     * @param handle Handle to destroy
     * @return cuDNN status
     */
    public static native int cudnnDestroy(cudnnHandle_t handle);

    /**
     * Set the CUDA stream that subsequent cuDNN calls on this handle use
     * @param handle cuDNN handle
     * @param stream CUDA stream
     * @return cuDNN status
     */
    public static native int cudnnSetStream(cudnnHandle_t handle, cudaStream_t stream);
}

/**
 * Tensor descriptor management.
 *
 * <p>Descriptors carry the shape/layout metadata that cuDNN operations such
 * as cudnnConvolutionForward consume alongside the raw data pointers.
 */
public static class cuDNNTensor {
    /**
     * Create tensor descriptor
     * @param tensorDesc Output tensor descriptor
     * @return cuDNN status
     */
    public static native int cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);

    /**
     * Set tensor descriptor (N-dimensional variant).
     * Note: this signature takes no explicit format flag; the memory layout
     * is conveyed through the stride array.
     * @param tensorDesc Tensor descriptor
     * @param dataType Data type (float, half, etc.)
     * @param nbDims Number of dimensions
     * @param dimA Dimension sizes
     * @param strideA Stride sizes (element strides describing the layout)
     * @return cuDNN status
     */
    public static native int cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
        int dataType, int nbDims, IntPointer dimA, IntPointer strideA);

    /**
     * Destroy tensor descriptor
     * @param tensorDesc Tensor descriptor to destroy
     * @return cuDNN status
     */
    public static native int cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
}

/**
 * Convolution operations.
 *
 * <p>Usage pattern: create/configure a convolution descriptor, then run the
 * forward pass with appropriately sized workspace memory.
 */
public static class cuDNNConvolution {
    /**
     * Create convolution descriptor
     * @param convDesc Output convolution descriptor
     * @return cuDNN status
     */
    public static native int cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc);

    /**
     * Set convolution descriptor (N-dimensional variant)
     * @param convDesc Convolution descriptor
     * @param arrayLength Number of spatial dimensions
     * @param padA Padding for each dimension
     * @param filterStrideA Stride for each dimension
     * @param dilationA Dilation for each dimension
     * @param mode Convolution mode
     * @param computeType Computation data type
     * @return cuDNN status
     */
    public static native int cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,
        int arrayLength, IntPointer padA, IntPointer filterStrideA, IntPointer dilationA,
        int mode, int computeType);

    /**
     * Forward convolution: y = alpha * conv(x, w) + beta * y
     * @param handle cuDNN handle
     * @param alpha Scaling factor for input
     * @param xDesc Input tensor descriptor
     * @param x Input tensor data (device memory)
     * @param wDesc Filter tensor descriptor
     * @param w Filter data (device memory)
     * @param convDesc Convolution descriptor
     * @param algo Convolution algorithm
     * @param workSpace Workspace memory (device memory; size requirements
     *                  depend on the chosen algorithm)
     * @param workSpaceSizeInBytes Workspace size
     * @param beta Scaling factor for output
     * @param yDesc Output tensor descriptor
     * @param y Output tensor data (device memory)
     * @return cuDNN status
     */
    public static native int cudnnConvolutionForward(cudnnHandle_t handle,
        Pointer alpha, cudnnTensorDescriptor_t xDesc, Pointer x,
        cudnnFilterDescriptor_t wDesc, Pointer w, cudnnConvolutionDescriptor_t convDesc,
        int algo, Pointer workSpace, long workSpaceSizeInBytes, Pointer beta,
        cudnnTensorDescriptor_t yDesc, Pointer y);
}

/**
 * Activation functions.
 *
 * <p>Configure an activation descriptor once, then apply it to tensors via
 * the forward call.
 */
public static class cuDNNActivation {
    /**
     * Create activation descriptor
     * @param activationDesc Output activation descriptor
     * @return cuDNN status
     */
    public static native int cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t activationDesc);

    /**
     * Set activation descriptor
     * @param activationDesc Activation descriptor
     * @param mode Activation mode (sigmoid, relu, tanh, etc.)
     * @param reluNanOpt NaN propagation mode
     * @param coef Coefficient used by some activation modes
     * @return cuDNN status
     */
    public static native int cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
        int mode, int reluNanOpt, double coef);

    /**
     * Forward activation: y = alpha * act(x) + beta * y
     * @param handle cuDNN handle
     * @param activationDesc Activation descriptor
     * @param alpha Scaling factor for input
     * @param xDesc Input tensor descriptor
     * @param x Input tensor data (device memory)
     * @param beta Scaling factor for output
     * @param yDesc Output tensor descriptor
     * @param y Output tensor data (device memory)
     * @return cuDNN status
     */
    public static native int cudnnActivationForward(cudnnHandle_t handle,
        cudnnActivationDescriptor_t activationDesc, Pointer alpha,
        cudnnTensorDescriptor_t xDesc, Pointer x, Pointer beta,
        cudnnTensorDescriptor_t yDesc, Pointer y);
}

OpenCL Cross-Platform Computing

OpenCL API for cross-platform parallel computing across CPUs, GPUs, and other devices.

/**
 * OpenCL platform and device management.
 *
 * <p>The Info queries follow the standard OpenCL two-call pattern: pass a
 * null/zero-sized buffer to discover the required size via
 * param_value_size_ret, then call again with an adequate buffer.
 */
public static class OpenCLPlatform {
    /**
     * Get platform IDs
     * @param num_entries Number of platform entries the caller can accept
     * @param platforms Output platform array
     * @param num_platforms Actual number of platforms found
     * @return OpenCL error code
     */
    public static native int clGetPlatformIDs(int num_entries, cl_platform_id platforms,
        IntPointer num_platforms);

    /**
     * Get platform information
     * @param platform Platform ID
     * @param param_name Information parameter
     * @param param_value_size Size of output buffer
     * @param param_value Output buffer
     * @param param_value_size_ret Actual size of information
     * @return OpenCL error code
     */
    public static native int clGetPlatformInfo(cl_platform_id platform, int param_name,
        long param_value_size, Pointer param_value, SizeTPointer param_value_size_ret);

    /**
     * Get device IDs for platform
     * @param platform Platform ID
     * @param device_type Device type filter (GPU, CPU, ALL, etc.)
     * @param num_entries Number of device entries the caller can accept
     * @param devices Output device array
     * @param num_devices Actual number of devices found
     * @return OpenCL error code
     */
    public static native int clGetDeviceIDs(cl_platform_id platform, long device_type,
        int num_entries, cl_device_id devices, IntPointer num_devices);

    /**
     * Get device information
     * @param device Device ID
     * @param param_name Information parameter
     * @param param_value_size Size of output buffer
     * @param param_value Output buffer
     * @param param_value_size_ret Actual size of information
     * @return OpenCL error code
     */
    public static native int clGetDeviceInfo(cl_device_id device, int param_name,
        long param_value_size, Pointer param_value, SizeTPointer param_value_size_ret);
}

/**
 * OpenCL context and command queue management.
 *
 * <p>Create/Release pairs follow OpenCL's reference-counting model: every
 * successfully created object must eventually be released.
 */
public static class OpenCLContext {
    /**
     * Create OpenCL context
     * @param properties Context properties (may be null for defaults)
     * @param num_devices Number of devices
     * @param devices Device array
     * @param pfn_notify Notification callback (may be null)
     * @param user_data User data for callback
     * @param errcode_ret Error code output
     * @return Context handle
     */
    public static native cl_context clCreateContext(cl_context_properties properties,
        int num_devices, cl_device_id devices, CreateContextCallbackFunction pfn_notify,
        Pointer user_data, IntPointer errcode_ret);

    /**
     * Release context (decrements its reference count)
     * @param context Context to release
     * @return OpenCL error code
     */
    public static native int clReleaseContext(cl_context context);

    /**
     * Create command queue
     * (NOTE(review): deprecated in OpenCL 2.0 in favor of
     * clCreateCommandQueueWithProperties — confirm target OpenCL version)
     * @param context OpenCL context
     * @param device Target device
     * @param properties Queue properties
     * @param errcode_ret Error code output
     * @return Command queue handle
     */
    public static native cl_command_queue clCreateCommandQueue(cl_context context,
        cl_device_id device, long properties, IntPointer errcode_ret);

    /**
     * Release command queue (decrements its reference count)
     * @param command_queue Queue to release
     * @return OpenCL error code
     */
    public static native int clReleaseCommandQueue(cl_command_queue command_queue);
}

/**
 * OpenCL memory management.
 *
 * <p>Buffers are created against a context and read/written through a command
 * queue; the blocking flags take CL_TRUE/CL_FALSE (see the usage example).
 */
public static class OpenCLMemory {
    /**
     * Create buffer object
     * @param context OpenCL context
     * @param flags Memory flags (read/write permissions, etc.)
     * @param size Buffer size in bytes
     * @param host_ptr Host memory pointer (optional, may be null)
     * @param errcode_ret Error code output
     * @return Memory object handle
     */
    public static native cl_mem clCreateBuffer(cl_context context, long flags, long size,
        Pointer host_ptr, IntPointer errcode_ret);

    /**
     * Release memory object (decrements its reference count)
     * @param memobj Memory object to release
     * @return OpenCL error code
     */
    public static native int clReleaseMemObject(cl_mem memobj);

    /**
     * Enqueue buffer write operation (host to device)
     * @param command_queue Command queue
     * @param buffer Target buffer
     * @param blocking_write Blocking operation flag (CL_TRUE/CL_FALSE)
     * @param offset Offset in buffer
     * @param size Size to write
     * @param ptr Source data pointer
     * @param num_events_in_wait_list Number of events to wait for
     * @param event_wait_list Events to wait for
     * @param event Output event
     * @return OpenCL error code
     */
    public static native int clEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer,
        int blocking_write, long offset, long size, Pointer ptr, int num_events_in_wait_list,
        cl_event event_wait_list, cl_event event);

    /**
     * Enqueue buffer read operation (device to host)
     * @param command_queue Command queue
     * @param buffer Source buffer
     * @param blocking_read Blocking operation flag (CL_TRUE/CL_FALSE)
     * @param offset Offset in buffer
     * @param size Size to read
     * @param ptr Destination data pointer
     * @param num_events_in_wait_list Number of events to wait for
     * @param event_wait_list Events to wait for
     * @param event Output event
     * @return OpenCL error code
     */
    public static native int clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer,
        int blocking_read, long offset, long size, Pointer ptr, int num_events_in_wait_list,
        cl_event event_wait_list, cl_event event);
}

/**
 * OpenCL kernel execution.
 *
 * <p>Typical flow: create a program from source, build it, create a kernel,
 * set its arguments, then enqueue an NDRange execution.
 */
public static class OpenCLKernel {
    /**
     * Create program from source
     * @param context OpenCL context
     * @param count Number of source strings
     * @param strings Source code strings
     * @param lengths String lengths (null for null-terminated)
     * @param errcode_ret Error code output
     * @return Program handle
     */
    public static native cl_program clCreateProgramWithSource(cl_context context, int count,
        PointerPointer strings, SizeTPointer lengths, IntPointer errcode_ret);

    /**
     * Build (compile and link) the program for the given devices
     * @param program Program to build
     * @param num_devices Number of devices
     * @param device_list Target devices
     * @param options Build options (may be null)
     * @param pfn_notify Build callback (may be null)
     * @param user_data User data for callback
     * @return OpenCL error code
     */
    public static native int clBuildProgram(cl_program program, int num_devices,
        cl_device_id device_list, String options, BuildProgramCallbackFunction pfn_notify,
        Pointer user_data);

    /**
     * Create kernel from program
     * @param program Compiled program
     * @param kernel_name Kernel function name as it appears in the source
     * @param errcode_ret Error code output
     * @return Kernel handle
     */
    public static native cl_kernel clCreateKernel(cl_program program, String kernel_name,
        IntPointer errcode_ret);

    /**
     * Set kernel argument
     * @param kernel Kernel handle
     * @param arg_index Argument index (0-based)
     * @param arg_size Argument size in bytes
     * @param arg_value Argument value pointer
     * @return OpenCL error code
     */
    public static native int clSetKernelArg(cl_kernel kernel, int arg_index, long arg_size,
        Pointer arg_value);

    /**
     * Enqueue kernel execution over an N-dimensional index space
     * @param command_queue Command queue
     * @param kernel Kernel to execute
     * @param work_dim Number of work dimensions
     * @param global_work_offset Global work offset (may be null)
     * @param global_work_size Global work size
     * @param local_work_size Local work size (NOTE(review): presumably a null
     *        value lets the implementation choose, as the example does — confirm)
     * @param num_events_in_wait_list Number of events to wait for
     * @param event_wait_list Events to wait for
     * @param event Output event
     * @return OpenCL error code
     */
    public static native int clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel,
        int work_dim, SizeTPointer global_work_offset, SizeTPointer global_work_size,
        SizeTPointer local_work_size, int num_events_in_wait_list, cl_event event_wait_list,
        cl_event event);
}

Usage Examples

CUDA Vector Addition

import org.bytedeco.cuda.cudart.*;
import static org.bytedeco.cuda.global.cudart.*;

public class CUDAVectorAdd {
    static {
        // Load the CUDA runtime bindings once per JVM before any native call.
        Loader.load(cudart.class);
    }

    /**
     * Throws if a CUDA runtime call returned a nonzero status code.
     * @param status Return value of a CUDA runtime call (0 indicates success)
     * @param op Human-readable name of the call, used in the error message
     */
    private static void check(int status, String op) {
        if (status != 0) {
            throw new RuntimeException(op + " failed with CUDA error code " + status);
        }
    }

    /**
     * Demonstrates device allocation, host/device transfers and result
     * verification for an N-element vector addition.
     *
     * <p>NOTE(review): no kernel is actually launched (see inline comment), so
     * d_C holds unwritten device memory and the verification step is
     * illustrative only until a real kernel launch is added.
     */
    public static void vectorAdd() {
        try (PointerScope scope = new PointerScope()) { // releases native allocations on exit
            int N = 1024;
            int size = N * Float.BYTES; // bytes in an N-element float vector

            // Host arrays
            float[] h_A = new float[N];
            float[] h_B = new float[N];
            float[] h_C = new float[N];

            // Initialize host arrays
            for (int i = 0; i < N; i++) {
                h_A[i] = i;
                h_B[i] = i * 2;
            }

            // Device pointers (filled in by cudaMalloc)
            Pointer d_A = new Pointer();
            Pointer d_B = new Pointer();
            Pointer d_C = new Pointer();

            // Allocate device memory, failing fast on any error instead of
            // silently ignoring the return codes
            check(cudaMalloc(d_A, size), "cudaMalloc d_A");
            check(cudaMalloc(d_B, size), "cudaMalloc d_B");
            check(cudaMalloc(d_C, size), "cudaMalloc d_C");

            // Wrap the host arrays in native buffers for the transfers
            FloatPointer fp_A = new FloatPointer(h_A);
            FloatPointer fp_B = new FloatPointer(h_B);
            FloatPointer fp_C = new FloatPointer(h_C);

            check(cudaMemcpy(d_A, fp_A, size, cudaMemcpyHostToDevice), "cudaMemcpy A");
            check(cudaMemcpy(d_B, fp_B, size, cudaMemcpyHostToDevice), "cudaMemcpy B");

            // Launch kernel (this would require a compiled CUDA kernel)
            // For illustration - actual kernel launch would use CUDA driver API
            // or require JCuda/JCUDA for higher-level kernel launching

            // Copy result back into the native buffer...
            check(cudaMemcpy(fp_C, d_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy C");
            // ...then into the Java array. cudaMemcpy fills the native
            // FloatPointer, not the Java array it was originally copied from,
            // so without this step h_C would stay all zeros and verification
            // would always fail.
            fp_C.get(h_C);

            // Verify results
            boolean success = true;
            for (int i = 0; i < N && success; i++) {
                if (Math.abs(h_C[i] - (h_A[i] + h_B[i])) > 1e-5) {
                    success = false;
                    System.err.println("Verification failed at index " + i);
                }
            }

            if (success) {
                System.out.println("Vector addition completed successfully!");
            }

            // Free device memory
            check(cudaFree(d_A), "cudaFree d_A");
            check(cudaFree(d_B), "cudaFree d_B");
            check(cudaFree(d_C), "cudaFree d_C");
        }
    }

    /**
     * Enumerates all CUDA devices and prints their key properties.
     */
    public static void deviceInfo() {
        try (PointerScope scope = new PointerScope()) {
            IntPointer deviceCount = new IntPointer(1);
            check(cudaGetDeviceCount(deviceCount), "cudaGetDeviceCount");

            System.out.println("Number of CUDA devices: " + deviceCount.get());

            for (int i = 0; i < deviceCount.get(); i++) {
                cudaDeviceProp prop = new cudaDeviceProp();
                check(cudaGetDeviceProperties(prop, i), "cudaGetDeviceProperties");

                System.out.println("\nDevice " + i + ":");
                // name() is declared above as returning String, so no
                // getString() call is needed (the original called
                // prop.name().getString(), which does not match the declaration).
                System.out.println("  Name: " + prop.name());
                System.out.println("  Compute Capability: " + prop.major() + "." + prop.minor());
                System.out.println("  Total Global Memory: " + prop.totalGlobalMem() / (1024*1024) + " MB");
                System.out.println("  Multiprocessors: " + prop.multiProcessorCount());
                System.out.println("  Max Threads per Block: " + prop.maxThreadsPerBlock());
                System.out.println("  Warp Size: " + prop.warpSize());
            }
        }
    }
}

cuBLAS Matrix Multiplication

import org.bytedeco.cuda.cudart.*;
import org.bytedeco.cuda.cublas.*;
import static org.bytedeco.cuda.global.cudart.*;
import static org.bytedeco.cuda.global.cublas.*;

public class cuBLASExample {
    static {
        // Load the CUDA runtime and cuBLAS bindings once per JVM.
        Loader.load(cudart.class);
        Loader.load(cublas.class);
    }

    /**
     * Multiplies two 3x3 matrices on the GPU with cublasSgemm_v2 and prints C.
     *
     * <p>cuBLAS follows the BLAS convention of column-major matrix storage,
     * so the flat host arrays below are interpreted column by column and the
     * result is read back the same way.
     */
    public static void matrixMultiplication() {
        try (PointerScope scope = new PointerScope()) {
            int M = 3, N = 3, K = 3;

            // Host matrices (column-major: the first three values of h_A are
            // column 0 of A)
            float[] h_A = {1, 2, 3, 4, 5, 6, 7, 8, 9};
            float[] h_B = {9, 8, 7, 6, 5, 4, 3, 2, 1};
            float[] h_C = new float[M * N];

            // Device matrices (filled in by cudaMalloc)
            Pointer d_A = new Pointer();
            Pointer d_B = new Pointer();
            Pointer d_C = new Pointer();

            int sizeA = M * K * Float.BYTES;
            int sizeB = K * N * Float.BYTES;
            int sizeC = M * N * Float.BYTES;

            cudaMalloc(d_A, sizeA);
            cudaMalloc(d_B, sizeB);
            cudaMalloc(d_C, sizeC);

            // Copy matrices to device
            FloatPointer fp_A = new FloatPointer(h_A);
            FloatPointer fp_B = new FloatPointer(h_B);
            FloatPointer fp_C = new FloatPointer(h_C);

            cudaMemcpy(d_A, fp_A, sizeA, cudaMemcpyHostToDevice);
            cudaMemcpy(d_B, fp_B, sizeB, cudaMemcpyHostToDevice);

            // Create cuBLAS handle
            cublasHandle_t handle = new cublasHandle_t();
            cublasCreate_v2(handle);

            // Scalars for GEMM
            FloatPointer alpha = new FloatPointer(1.0f);
            FloatPointer beta = new FloatPointer(0.0f);

            // Perform matrix multiplication: C = α*A*B + β*C
            cublasSgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                M, N, K, alpha,
                new FloatPointer(d_A), M,
                new FloatPointer(d_B), K,
                beta, new FloatPointer(d_C), M);

            // Copy result back into the native buffer, then into the Java
            // array: cudaMemcpy fills the FloatPointer, not the array it was
            // built from, so h_C would otherwise remain all zeros.
            cudaMemcpy(fp_C, d_C, sizeC, cudaMemcpyDeviceToHost);
            fp_C.get(h_C);

            // Print result row by row. C is column-major, so element (i, j)
            // lives at index i + j * M. (The original indexed h_C[i * N + j],
            // which would have printed the transpose.)
            System.out.println("cuBLAS Matrix multiplication result:");
            for (int i = 0; i < M; i++) {
                for (int j = 0; j < N; j++) {
                    System.out.printf("%.1f ", h_C[i + j * M]);
                }
                System.out.println();
            }

            // Cleanup
            cublasDestroy_v2(handle);
            cudaFree(d_A);
            cudaFree(d_B);
            cudaFree(d_C);
        }
    }
}

OpenCL Vector Addition

import org.bytedeco.opencl.*;
import static org.bytedeco.opencl.global.OpenCL.*;

public class OpenCLExample {
    static {
        Loader.load(OpenCL.class);
    }
    
    // OpenCL kernel source code
    static final String kernelSource = 
        "__kernel void vector_add(__global const float* A, __global const float* B, " +
        "__global float* C) { " +
        "    int i = get_global_id(0); " +
        "    C[i] = A[i] + B[i]; " +
        "}";
    
    public static void vectorAdd() {
        try (PointerScope scope = new PointerScope()) {
            int N = 1024;
            
            // Get platform and device
            cl_platform_id platform = new cl_platform_id();
            cl_device_id device = new cl_device_id();
            IntPointer ret = new IntPointer(1);
            
            clGetPlatformIDs(1, platform, null);
            clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, device, null);
            
            // Create context and command queue
            cl_context context = clCreateContext(null, 1, device, null, null, ret);
            cl_command_queue queue = clCreateCommandQueue(context, device, 0, ret);
            
            // Host data
            float[] h_A = new float[N];
            float[] h_B = new float[N];
            float[] h_C = new float[N];
            
            for (int i = 0; i < N; i++) {
                h_A[i] = i;
                h_B[i] = i * 2;
            }
            
            // Create device buffers
            cl_mem d_A = clCreateBuffer(context, CL_MEM_READ_ONLY, N * 4, null, ret);
            cl_mem d_B = clCreateBuffer(context, CL_MEM_READ_ONLY, N * 4, null, ret);
            cl_mem d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * 4, null, ret);
            
            // Copy data to device
            FloatPointer fp_A = new FloatPointer(h_A);
            FloatPointer fp_B = new FloatPointer(h_B);
            
            clEnqueueWriteBuffer(queue, d_A, CL_TRUE, 0, N * 4, fp_A, 0, null, null);
            clEnqueueWriteBuffer(queue, d_B, CL_TRUE, 0, N * 4, fp_B, 0, null, null);
            
            // Create and build program
            PointerPointer kernelSourcePtr = new PointerPointer(kernelSource);
            cl_program program = clCreateProgramWithSource(context, 1, kernelSourcePtr, null, ret);
            clBuildProgram(program, 1, device, null, null, null);
            
            // Create kernel
            cl_kernel kernel = clCreateKernel(program, "vector_add", ret);
            
            // Set kernel arguments
            clSetKernelArg(kernel, 0, Pointer.sizeof(cl_mem.class), d_A);
            clSetKernelArg(kernel, 1, Pointer.sizeof(cl_mem.class), d_B);
            clSetKernelArg(kernel, 2, Pointer.sizeof(cl_mem.class), d_C);
            
            // Execute kernel
            SizeTPointer globalWorkSize = new SizeTPointer(N);
            clEnqueueNDRangeKernel(queue, kernel, 1, null, globalWorkSize, null, 0, null, null);
            
            // Read result
            FloatPointer fp_C = new FloatPointer(h_C);
            clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, N * 4, fp_C, 0, null, null);
            
            // Verify results
            boolean success = true;
            for (int i = 0; i < Math.min(N, 10) && success; i++) {
                if (Math.abs(h_C[i] - (h_A[i] + h_B[i])) > 1e-5) {
                    success = false;
                }
                System.out.printf("C[%d] = %.1f (expected %.1f)\n", i, h_C[i], h_A[i] + h_B[i]);
            }
            
            System.out.println(success ? "OpenCL vector addition successful!" : "Verification failed");
            
            // Cleanup (in reverse order of creation)
            clReleaseKernel(kernel);
            clReleaseProgram(program);
            clReleaseMemObject(d_A);
            clReleaseMemObject(d_B);
            clReleaseMemObject(d_C);
            clReleaseCommandQueue(queue);
            clReleaseContext(context);
        }
    }
}

Install with Tessl CLI

npx tessl i tessl/maven-org-bytedeco--javacpp-presets-platform

docs

computer-vision.md

gpu-computing.md

index.md

machine-learning.md

multimedia.md

scientific-computing.md

text-processing.md

tile.json