or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

computer-vision.md · gpu-computing.md · index.md · machine-learning.md · multimedia.md · scientific-computing.md · text-processing.md

docs/gpu-computing.md

0

# GPU Computing

1

2

High-performance GPU computing capabilities through CUDA, OpenCL, and associated libraries for parallel processing and acceleration.

3

4

## Capabilities

5

6

### CUDA Runtime Operations

7

8

NVIDIA CUDA runtime API for GPU device management, memory operations, and kernel execution.

9

10

```java { .api }

11

/**

12

* Device management functions

13

*/

14

public static class CUDADevice {

15

/**

16

* Get number of CUDA devices

17

* @param count Output device count

18

* @return CUDA error code

19

*/

20

public static native int cudaGetDeviceCount(IntPointer count);

21

22

/**

23

* Set current CUDA device

24

* @param device Device index to use

25

* @return CUDA error code

26

*/

27

public static native int cudaSetDevice(int device);

28

29

/**

30

* Get current CUDA device

31

* @param device Output current device index

32

* @return CUDA error code

33

*/

34

public static native int cudaGetDevice(IntPointer device);

35

36

/**

37

* Get device properties

38

* @param prop Output device properties

39

* @param device Device index

40

* @return CUDA error code

41

*/

42

public static native int cudaGetDeviceProperties(cudaDeviceProp prop, int device);

43

44

/**

45

* Synchronize with current device

46

* @return CUDA error code

47

*/

48

public static native int cudaDeviceSynchronize();

49

50

/**

51

* Reset current device

52

* @return CUDA error code

53

*/

54

public static native int cudaDeviceReset();

55

}

56

57

/**

58

* Memory management functions

59

*/

60

public static class CUDAMemory {

61

/**

62

* Allocate device memory

63

* @param devPtr Output pointer to allocated memory

64

* @param size Size in bytes to allocate

65

* @return CUDA error code

66

*/

67

public static native int cudaMalloc(PointerPointer devPtr, long size);

68

69

/**

70

* Free device memory

71

* @param devPtr Device pointer to free

72

* @return CUDA error code

73

*/

74

public static native int cudaFree(Pointer devPtr);

75

76

/**

77

* Copy memory between host and device

78

* @param dst Destination pointer

79

* @param src Source pointer

80

* @param count Size in bytes

81

* @param kind Copy direction (cudaMemcpyHostToDevice, etc.)

82

* @return CUDA error code

83

*/

84

public static native int cudaMemcpy(Pointer dst, Pointer src, long count, int kind);

85

86

/**

87

* Asynchronous memory copy

88

* @param dst Destination pointer

89

* @param src Source pointer

90

* @param count Size in bytes

91

* @param kind Copy direction

92

* @param stream CUDA stream

93

* @return CUDA error code

94

*/

95

public static native int cudaMemcpyAsync(Pointer dst, Pointer src, long count,

96

int kind, cudaStream_t stream);

97

98

/**

99

* Set device memory to value

100

* @param devPtr Device pointer

101

* @param value Value to set (byte)

102

* @param count Number of bytes

103

* @return CUDA error code

104

*/

105

public static native int cudaMemset(Pointer devPtr, int value, long count);

106

107

/**

108

* Allocate page-locked host memory

109

* @param ptr Output pointer to allocated host memory

110

* @param size Size in bytes

111

* @return CUDA error code

112

*/

113

public static native int cudaMallocHost(PointerPointer ptr, long size);

114

115

/**

116

* Free page-locked host memory

117

* @param ptr Host pointer to free

118

* @return CUDA error code

119

*/

120

public static native int cudaFreeHost(Pointer ptr);

121

}

122

123

/**

124

* Stream management for asynchronous operations

125

*/

126

public static class CUDAStream {

127

/**

128

* Create CUDA stream

129

* @param pStream Output stream handle

130

* @return CUDA error code

131

*/

132

public static native int cudaStreamCreate(cudaStream_t pStream);

133

134

/**

135

* Destroy CUDA stream

136

* @param stream Stream to destroy

137

* @return CUDA error code

138

*/

139

public static native int cudaStreamDestroy(cudaStream_t stream);

140

141

/**

142

* Synchronize with stream

143

* @param stream Stream to synchronize

144

* @return CUDA error code

145

*/

146

public static native int cudaStreamSynchronize(cudaStream_t stream);

147

148

/**

149

* Query stream status

150

* @param stream Stream to query

151

* @return CUDA error code (cudaSuccess if complete)

152

*/

153

public static native int cudaStreamQuery(cudaStream_t stream);

154

}

155

156

/**

157

* CUDA device properties structure

158

*/

159

public class cudaDeviceProp extends Pointer {

160

/** Device name */

161

public native String name();

162

163

/** Total global memory in bytes */

164

public native long totalGlobalMem();

165

166

/** Shared memory per block */

167

public native long sharedMemPerBlock();

168

169

/** Number of registers per block */

170

public native int regsPerBlock();

171

172

/** Warp size */

173

public native int warpSize();

174

175

/** Maximum threads per block */

176

public native int maxThreadsPerBlock();

177

178

/** Maximum block dimensions */

179

public native IntPointer maxThreadsDim();

180

181

/** Maximum grid dimensions */

182

public native IntPointer maxGridSize();

183

184

/** Compute capability major version */

185

public native int major();

186

187

/** Compute capability minor version */

188

public native int minor();

189

190

/** Number of multiprocessors */

191

public native int multiProcessorCount();

192

}

193

```

194

195

### cuBLAS Operations

196

197

CUDA Basic Linear Algebra Subprograms for GPU-accelerated linear algebra.

198

199

```java { .api }

200

/**

201

* cuBLAS context and initialization

202

*/

203

public static class cuBLASContext {

204

/**

205

* Create cuBLAS handle

206

* @param handle Output handle

207

* @return cuBLAS status

208

*/

209

public static native int cublasCreate_v2(cublasHandle_t handle);

210

211

/**

212

* Destroy cuBLAS handle

213

* @param handle Handle to destroy

214

* @return cuBLAS status

215

*/

216

public static native int cublasDestroy_v2(cublasHandle_t handle);

217

218

/**

219

* Set cuBLAS stream

220

* @param handle cuBLAS handle

221

* @param streamId CUDA stream

222

* @return cuBLAS status

223

*/

224

public static native int cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId);

225

}

226

227

/**

228

* cuBLAS Level 3 operations (matrix-matrix)

229

*/

230

public static class cuBLASLevel3 {

231

/**

232

* Single precision matrix multiplication: C = α*A*B + β*C

233

* @param handle cuBLAS handle

234

* @param transa Transpose operation for A

235

* @param transb Transpose operation for B

236

* @param m Number of rows in A and C

237

* @param n Number of columns in B and C

238

* @param k Number of columns in A and rows in B

239

* @param alpha Scalar α

240

* @param A Matrix A on device

241

* @param lda Leading dimension of A

242

* @param B Matrix B on device

243

* @param ldb Leading dimension of B

244

* @param beta Scalar β

245

* @param C Matrix C on device

246

* @param ldc Leading dimension of C

247

* @return cuBLAS status

248

*/

249

public static native int cublasSgemm_v2(cublasHandle_t handle, int transa, int transb,

250

int m, int n, int k, FloatPointer alpha, FloatPointer A, int lda,

251

FloatPointer B, int ldb, FloatPointer beta, FloatPointer C, int ldc);

252

253

/**

254

* Double precision matrix multiplication

255

*/

256

public static native int cublasDgemm_v2(cublasHandle_t handle, int transa, int transb,

257

int m, int n, int k, DoublePointer alpha, DoublePointer A, int lda,

258

DoublePointer B, int ldb, DoublePointer beta, DoublePointer C, int ldc);

259

260

/**

261

* Batched matrix multiplication

262

* @param handle cuBLAS handle

263

* @param transa Transpose operation for A

264

* @param transb Transpose operation for B

265

* @param m Number of rows in A and C

266

* @param n Number of columns in B and C

267

* @param k Number of columns in A and rows in B

268

* @param alpha Scalar α

269

* @param Aarray Array of pointers to matrices A

270

* @param lda Leading dimension of A

271

* @param Barray Array of pointers to matrices B

272

* @param ldb Leading dimension of B

273

* @param beta Scalar β

274

* @param Carray Array of pointers to matrices C

275

* @param ldc Leading dimension of C

276

* @param batchCount Number of matrices

277

* @return cuBLAS status

278

*/

279

public static native int cublasSgemmBatched(cublasHandle_t handle, int transa, int transb,

280

int m, int n, int k, FloatPointer alpha, PointerPointer Aarray, int lda,

281

PointerPointer Barray, int ldb, FloatPointer beta, PointerPointer Carray,

282

int ldc, int batchCount);

283

}

284

```

285

286

### cuDNN Deep Learning

287

288

CUDA Deep Neural Network library for accelerated deep learning operations.

289

290

```java { .api }

291

/**

292

* cuDNN context management

293

*/

294

public static class cuDNNContext {

295

/**

296

* Create cuDNN handle

297

* @param handle Output handle

298

* @return cuDNN status

299

*/

300

public static native int cudnnCreate(cudnnHandle_t handle);

301

302

/**

303

* Destroy cuDNN handle

304

* @param handle Handle to destroy

305

* @return cuDNN status

306

*/

307

public static native int cudnnDestroy(cudnnHandle_t handle);

308

309

/**

310

* Set cuDNN stream

311

* @param handle cuDNN handle

312

* @param stream CUDA stream

313

* @return cuDNN status

314

*/

315

public static native int cudnnSetStream(cudnnHandle_t handle, cudaStream_t stream);

316

}

317

318

/**

319

* Tensor descriptor management

320

*/

321

public static class cuDNNTensor {

322

/**

323

* Create tensor descriptor

324

* @param tensorDesc Output tensor descriptor

325

* @return cuDNN status

326

*/

327

public static native int cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);

328

329

/**

330

* Set tensor descriptor

331

* @param tensorDesc Tensor descriptor

332

* @param format Data format (NCHW, NHWC, etc.)

333

* @param dataType Data type (float, half, etc.)

334

* @param nbDims Number of dimensions

335

* @param dimA Dimension sizes

336

* @param strideA Stride sizes

337

* @return cuDNN status

338

*/

339

public static native int cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,

340

int dataType, int nbDims, IntPointer dimA, IntPointer strideA);

341

342

/**

343

* Destroy tensor descriptor

344

* @param tensorDesc Tensor descriptor to destroy

345

* @return cuDNN status

346

*/

347

public static native int cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);

348

}

349

350

/**

351

* Convolution operations

352

*/

353

public static class cuDNNConvolution {

354

/**

355

* Create convolution descriptor

356

* @param convDesc Output convolution descriptor

357

* @return cuDNN status

358

*/

359

public static native int cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc);

360

361

/**

362

* Set convolution descriptor

363

* @param convDesc Convolution descriptor

364

* @param arrayLength Number of dimensions

365

* @param padA Padding for each dimension

366

* @param filterStrideA Stride for each dimension

367

* @param dilationA Dilation for each dimension

368

* @param mode Convolution mode

369

* @param computeType Computation data type

370

* @return cuDNN status

371

*/

372

public static native int cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,

373

int arrayLength, IntPointer padA, IntPointer filterStrideA, IntPointer dilationA,

374

int mode, int computeType);

375

376

/**

377

* Forward convolution

378

* @param handle cuDNN handle

379

* @param alpha Scaling factor for input

380

* @param xDesc Input tensor descriptor

381

* @param x Input tensor data

382

* @param wDesc Filter tensor descriptor

383

* @param w Filter data

384

* @param convDesc Convolution descriptor

385

* @param algo Convolution algorithm

386

* @param workSpace Workspace memory

387

* @param workSpaceSizeInBytes Workspace size

388

* @param beta Scaling factor for output

389

* @param yDesc Output tensor descriptor

390

* @param y Output tensor data

391

* @return cuDNN status

392

*/

393

public static native int cudnnConvolutionForward(cudnnHandle_t handle,

394

Pointer alpha, cudnnTensorDescriptor_t xDesc, Pointer x,

395

cudnnFilterDescriptor_t wDesc, Pointer w, cudnnConvolutionDescriptor_t convDesc,

396

int algo, Pointer workSpace, long workSpaceSizeInBytes, Pointer beta,

397

cudnnTensorDescriptor_t yDesc, Pointer y);

398

}

399

400

/**

401

* Activation functions

402

*/

403

public static class cuDNNActivation {

404

/**

405

* Create activation descriptor

406

* @param activationDesc Output activation descriptor

407

* @return cuDNN status

408

*/

409

public static native int cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t activationDesc);

410

411

/**

412

* Set activation descriptor

413

* @param activationDesc Activation descriptor

414

* @param mode Activation mode (sigmoid, relu, tanh, etc.)

415

* @param reluNanOpt NaN propagation mode

416

* @param coef Coefficient for some activation modes

417

* @return cuDNN status

418

*/

419

public static native int cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,

420

int mode, int reluNanOpt, double coef);

421

422

/**

423

* Forward activation

424

* @param handle cuDNN handle

425

* @param activationDesc Activation descriptor

426

* @param alpha Scaling factor for input

427

* @param xDesc Input tensor descriptor

428

* @param x Input tensor data

429

* @param beta Scaling factor for output

430

* @param yDesc Output tensor descriptor

431

* @param y Output tensor data

432

* @return cuDNN status

433

*/

434

public static native int cudnnActivationForward(cudnnHandle_t handle,

435

cudnnActivationDescriptor_t activationDesc, Pointer alpha,

436

cudnnTensorDescriptor_t xDesc, Pointer x, Pointer beta,

437

cudnnTensorDescriptor_t yDesc, Pointer y);

438

}

439

```

440

441

### OpenCL Cross-Platform Computing

442

443

OpenCL API for cross-platform parallel computing across CPUs, GPUs, and other devices.

444

445

```java { .api }

446

/**

447

* OpenCL platform and device management

448

*/

449

public static class OpenCLPlatform {

450

/**

451

* Get platform IDs

452

* @param num_entries Number of platform entries

453

* @param platforms Output platform array

454

* @param num_platforms Actual number of platforms found

455

* @return OpenCL error code

456

*/

457

public static native int clGetPlatformIDs(int num_entries, cl_platform_id platforms,

458

IntPointer num_platforms);

459

460

/**

461

* Get platform information

462

* @param platform Platform ID

463

* @param param_name Information parameter

464

* @param param_value_size Size of output buffer

465

* @param param_value Output buffer

466

* @param param_value_size_ret Actual size of information

467

* @return OpenCL error code

468

*/

469

public static native int clGetPlatformInfo(cl_platform_id platform, int param_name,

470

long param_value_size, Pointer param_value, SizeTPointer param_value_size_ret);

471

472

/**

473

* Get device IDs for platform

474

* @param platform Platform ID

475

* @param device_type Device type filter (GPU, CPU, ALL, etc.)

476

* @param num_entries Number of device entries

477

* @param devices Output device array

478

* @param num_devices Actual number of devices found

479

* @return OpenCL error code

480

*/

481

public static native int clGetDeviceIDs(cl_platform_id platform, long device_type,

482

int num_entries, cl_device_id devices, IntPointer num_devices);

483

484

/**

485

* Get device information

486

* @param device Device ID

487

* @param param_name Information parameter

488

* @param param_value_size Size of output buffer

489

* @param param_value Output buffer

490

* @param param_value_size_ret Actual size of information

491

* @return OpenCL error code

492

*/

493

public static native int clGetDeviceInfo(cl_device_id device, int param_name,

494

long param_value_size, Pointer param_value, SizeTPointer param_value_size_ret);

495

}

496

497

/**

498

* OpenCL context and command queue management

499

*/

500

public static class OpenCLContext {

501

/**

502

* Create OpenCL context

503

* @param properties Context properties

504

* @param num_devices Number of devices

505

* @param devices Device array

506

* @param pfn_notify Notification callback

507

* @param user_data User data for callback

508

* @param errcode_ret Error code output

509

* @return Context handle

510

*/

511

public static native cl_context clCreateContext(cl_context_properties properties,

512

int num_devices, cl_device_id devices, CreateContextCallbackFunction pfn_notify,

513

Pointer user_data, IntPointer errcode_ret);

514

515

/**

516

* Release context

517

* @param context Context to release

518

* @return OpenCL error code

519

*/

520

public static native int clReleaseContext(cl_context context);

521

522

/**

523

* Create command queue

524

* @param context OpenCL context

525

* @param device Target device

526

* @param properties Queue properties

527

* @param errcode_ret Error code output

528

* @return Command queue handle

529

*/

530

public static native cl_command_queue clCreateCommandQueue(cl_context context,

531

cl_device_id device, long properties, IntPointer errcode_ret);

532

533

/**

534

* Release command queue

535

* @param command_queue Queue to release

536

* @return OpenCL error code

537

*/

538

public static native int clReleaseCommandQueue(cl_command_queue command_queue);

539

}

540

541

/**

542

* OpenCL memory management

543

*/

544

public static class OpenCLMemory {

545

/**

546

* Create buffer object

547

* @param context OpenCL context

548

* @param flags Memory flags (read/write permissions, etc.)

549

* @param size Buffer size in bytes

550

* @param host_ptr Host memory pointer (optional)

551

* @param errcode_ret Error code output

552

* @return Memory object handle

553

*/

554

public static native cl_mem clCreateBuffer(cl_context context, long flags, long size,

555

Pointer host_ptr, IntPointer errcode_ret);

556

557

/**

558

* Release memory object

559

* @param memobj Memory object to release

560

* @return OpenCL error code

561

*/

562

public static native int clReleaseMemObject(cl_mem memobj);

563

564

/**

565

* Enqueue buffer write operation

566

* @param command_queue Command queue

567

* @param buffer Target buffer

568

* @param blocking_write Blocking operation flag

569

* @param offset Offset in buffer

570

* @param size Size to write

571

* @param ptr Source data pointer

572

* @param num_events_in_wait_list Number of events to wait for

573

* @param event_wait_list Events to wait for

574

* @param event Output event

575

* @return OpenCL error code

576

*/

577

public static native int clEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer,

578

int blocking_write, long offset, long size, Pointer ptr, int num_events_in_wait_list,

579

cl_event event_wait_list, cl_event event);

580

581

/**

582

* Enqueue buffer read operation

583

* @param command_queue Command queue

584

* @param buffer Source buffer

585

* @param blocking_read Blocking operation flag

586

* @param offset Offset in buffer

587

* @param size Size to read

588

* @param ptr Destination data pointer

589

* @param num_events_in_wait_list Number of events to wait for

590

* @param event_wait_list Events to wait for

591

* @param event Output event

592

* @return OpenCL error code

593

*/

594

public static native int clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer,

595

int blocking_read, long offset, long size, Pointer ptr, int num_events_in_wait_list,

596

cl_event event_wait_list, cl_event event);

597

}

598

599

/**

600

* OpenCL kernel execution

601

*/

602

public static class OpenCLKernel {

603

/**

604

* Create program from source

605

* @param context OpenCL context

606

* @param count Number of source strings

607

* @param strings Source code strings

608

* @param lengths String lengths (null for null-terminated)

609

* @param errcode_ret Error code output

610

* @return Program handle

611

*/

612

public static native cl_program clCreateProgramWithSource(cl_context context, int count,

613

PointerPointer strings, SizeTPointer lengths, IntPointer errcode_ret);

614

615

/**

616

* Build program

617

* @param program Program to build

618

* @param num_devices Number of devices

619

* @param device_list Target devices

620

* @param options Build options

621

* @param pfn_notify Build callback

622

* @param user_data User data for callback

623

* @return OpenCL error code

624

*/

625

public static native int clBuildProgram(cl_program program, int num_devices,

626

cl_device_id device_list, String options, BuildProgramCallbackFunction pfn_notify,

627

Pointer user_data);

628

629

/**

630

* Create kernel from program

631

* @param program Compiled program

632

* @param kernel_name Kernel function name

633

* @param errcode_ret Error code output

634

* @return Kernel handle

635

*/

636

public static native cl_kernel clCreateKernel(cl_program program, String kernel_name,

637

IntPointer errcode_ret);

638

639

/**

640

* Set kernel argument

641

* @param kernel Kernel handle

642

* @param arg_index Argument index

643

* @param arg_size Argument size

644

* @param arg_value Argument value pointer

645

* @return OpenCL error code

646

*/

647

public static native int clSetKernelArg(cl_kernel kernel, int arg_index, long arg_size,

648

Pointer arg_value);

649

650

/**

651

* Enqueue kernel execution

652

* @param command_queue Command queue

653

* @param kernel Kernel to execute

654

* @param work_dim Number of work dimensions

655

* @param global_work_offset Global work offset

656

* @param global_work_size Global work size

657

* @param local_work_size Local work size

658

* @param num_events_in_wait_list Number of events to wait for

659

* @param event_wait_list Events to wait for

660

* @param event Output event

661

* @return OpenCL error code

662

*/

663

public static native int clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel,

664

int work_dim, SizeTPointer global_work_offset, SizeTPointer global_work_size,

665

SizeTPointer local_work_size, int num_events_in_wait_list, cl_event event_wait_list,

666

cl_event event);

667

}

668

```

669

670

## Usage Examples

671

672

### CUDA Vector Addition

673

674

```java

675

import org.bytedeco.cuda.cudart.*;

676

import static org.bytedeco.cuda.global.cudart.*;

677

678

public class CUDAVectorAdd {

679

static {

680

Loader.load(cudart.class);

681

}

682

683

public static void vectorAdd() {

684

try (PointerScope scope = new PointerScope()) {

685

int N = 1024;

686

int size = N * 4; // sizeof(float) * N

687

688

// Host arrays

689

float[] h_A = new float[N];

690

float[] h_B = new float[N];

691

float[] h_C = new float[N];

692

693

// Initialize host arrays

694

for (int i = 0; i < N; i++) {

695

h_A[i] = i;

696

h_B[i] = i * 2;

697

}

698

699

// Device pointers

700

Pointer d_A = new Pointer();

701

Pointer d_B = new Pointer();

702

Pointer d_C = new Pointer();

703

704

// Allocate device memory

705

cudaMalloc(d_A, size);

706

cudaMalloc(d_B, size);

707

cudaMalloc(d_C, size);

708

709

// Copy data to device

710

FloatPointer fp_A = new FloatPointer(h_A);

711

FloatPointer fp_B = new FloatPointer(h_B);

712

FloatPointer fp_C = new FloatPointer(h_C);

713

714

cudaMemcpy(d_A, fp_A, size, cudaMemcpyHostToDevice);

715

cudaMemcpy(d_B, fp_B, size, cudaMemcpyHostToDevice);

716

717

// Launch kernel (this would require a compiled CUDA kernel)

718

// For illustration - actual kernel launch would use CUDA driver API

719

// or require JCuda/JCUDA for higher-level kernel launching

720

721

// Copy result back to host

722

cudaMemcpy(fp_C, d_C, size, cudaMemcpyDeviceToHost);

723

724

// Verify results

725

boolean success = true;

726

for (int i = 0; i < N && success; i++) {

727

if (Math.abs(h_C[i] - (h_A[i] + h_B[i])) > 1e-5) {

728

success = false;

729

System.err.println("Verification failed at index " + i);

730

}

731

}

732

733

if (success) {

734

System.out.println("Vector addition completed successfully!");

735

}

736

737

// Free device memory

738

cudaFree(d_A);

739

cudaFree(d_B);

740

cudaFree(d_C);

741

}

742

}

743

744

public static void deviceInfo() {

745

try (PointerScope scope = new PointerScope()) {

746

IntPointer deviceCount = new IntPointer(1);

747

cudaGetDeviceCount(deviceCount);

748

749

System.out.println("Number of CUDA devices: " + deviceCount.get());

750

751

for (int i = 0; i < deviceCount.get(); i++) {

752

cudaDeviceProp prop = new cudaDeviceProp();

753

cudaGetDeviceProperties(prop, i);

754

755

System.out.println("\nDevice " + i + ":");

756

System.out.println(" Name: " + prop.name().getString());

757

System.out.println(" Compute Capability: " + prop.major() + "." + prop.minor());

758

System.out.println(" Total Global Memory: " + prop.totalGlobalMem() / (1024*1024) + " MB");

759

System.out.println(" Multiprocessors: " + prop.multiProcessorCount());

760

System.out.println(" Max Threads per Block: " + prop.maxThreadsPerBlock());

761

System.out.println(" Warp Size: " + prop.warpSize());

762

}

763

}

764

}

765

}

766

```

767

768

### cuBLAS Matrix Multiplication

769

770

```java

771

import org.bytedeco.cuda.cudart.*;

772

import org.bytedeco.cuda.cublas.*;

773

import static org.bytedeco.cuda.global.cudart.*;

774

import static org.bytedeco.cuda.global.cublas.*;

775

776

public class cuBLASExample {

777

static {

778

Loader.load(cudart.class);

779

Loader.load(cublas.class);

780

}

781

782

public static void matrixMultiplication() {

783

try (PointerScope scope = new PointerScope()) {

784

int M = 3, N = 3, K = 3;

785

786

// Host matrices

787

float[] h_A = {1, 2, 3, 4, 5, 6, 7, 8, 9};

788

float[] h_B = {9, 8, 7, 6, 5, 4, 3, 2, 1};

789

float[] h_C = new float[M * N];

790

791

// Device matrices

792

Pointer d_A = new Pointer();

793

Pointer d_B = new Pointer();

794

Pointer d_C = new Pointer();

795

796

int sizeA = M * K * 4; // sizeof(float)

797

int sizeB = K * N * 4;

798

int sizeC = M * N * 4;

799

800

cudaMalloc(d_A, sizeA);

801

cudaMalloc(d_B, sizeB);

802

cudaMalloc(d_C, sizeC);

803

804

// Copy matrices to device

805

FloatPointer fp_A = new FloatPointer(h_A);

806

FloatPointer fp_B = new FloatPointer(h_B);

807

FloatPointer fp_C = new FloatPointer(h_C);

808

809

cudaMemcpy(d_A, fp_A, sizeA, cudaMemcpyHostToDevice);

810

cudaMemcpy(d_B, fp_B, sizeB, cudaMemcpyHostToDevice);

811

812

// Create cuBLAS handle

813

cublasHandle_t handle = new cublasHandle_t();

814

cublasCreate_v2(handle);

815

816

// Scalars for GEMM

817

FloatPointer alpha = new FloatPointer(1.0f);

818

FloatPointer beta = new FloatPointer(0.0f);

819

820

// Perform matrix multiplication: C = α*A*B + β*C

821

cublasSgemm_v2(handle, CUBLAS_OP_N, CUBLAS_OP_N,

822

M, N, K, alpha,

823

new FloatPointer(d_A), M,

824

new FloatPointer(d_B), K,

825

beta, new FloatPointer(d_C), M);

826

827

// Copy result back to host

828

cudaMemcpy(fp_C, d_C, sizeC, cudaMemcpyDeviceToHost);

829

830

// Print result

831

System.out.println("cuBLAS Matrix multiplication result:");

832

for (int i = 0; i < M; i++) {

833

for (int j = 0; j < N; j++) {

834

System.out.printf("%.1f ", h_C[i * N + j]);

835

}

836

System.out.println();

837

}

838

839

// Cleanup

840

cublasDestroy_v2(handle);

841

cudaFree(d_A);

842

cudaFree(d_B);

843

cudaFree(d_C);

844

}

845

}

846

}

847

```

848

849

### OpenCL Vector Addition

850

851

```java

852

import org.bytedeco.opencl.*;

853

import static org.bytedeco.opencl.global.OpenCL.*;

854

855

public class OpenCLExample {

856

static {

857

Loader.load(OpenCL.class);

858

}

859

860

// OpenCL kernel source code

861

static final String kernelSource =

862

"__kernel void vector_add(__global const float* A, __global const float* B, " +

863

"__global float* C) { " +

864

" int i = get_global_id(0); " +

865

" C[i] = A[i] + B[i]; " +

866

"}";

867

868

public static void vectorAdd() {

869

try (PointerScope scope = new PointerScope()) {

870

int N = 1024;

871

872

// Get platform and device

873

cl_platform_id platform = new cl_platform_id();

874

cl_device_id device = new cl_device_id();

875

IntPointer ret = new IntPointer(1);

876

877

clGetPlatformIDs(1, platform, null);

878

clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, device, null);

879

880

// Create context and command queue

881

cl_context context = clCreateContext(null, 1, device, null, null, ret);

882

cl_command_queue queue = clCreateCommandQueue(context, device, 0, ret);

883

884

// Host data

885

float[] h_A = new float[N];

886

float[] h_B = new float[N];

887

float[] h_C = new float[N];

888

889

for (int i = 0; i < N; i++) {

890

h_A[i] = i;

891

h_B[i] = i * 2;

892

}

893

894

// Create device buffers

895

cl_mem d_A = clCreateBuffer(context, CL_MEM_READ_ONLY, N * 4, null, ret);

896

cl_mem d_B = clCreateBuffer(context, CL_MEM_READ_ONLY, N * 4, null, ret);

897

cl_mem d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * 4, null, ret);

898

899

// Copy data to device

900

FloatPointer fp_A = new FloatPointer(h_A);

901

FloatPointer fp_B = new FloatPointer(h_B);

902

903

clEnqueueWriteBuffer(queue, d_A, CL_TRUE, 0, N * 4, fp_A, 0, null, null);

904

clEnqueueWriteBuffer(queue, d_B, CL_TRUE, 0, N * 4, fp_B, 0, null, null);

905

906

// Create and build program

907

PointerPointer kernelSourcePtr = new PointerPointer(kernelSource);

908

cl_program program = clCreateProgramWithSource(context, 1, kernelSourcePtr, null, ret);

909

clBuildProgram(program, 1, device, null, null, null);

910

911

// Create kernel

912

cl_kernel kernel = clCreateKernel(program, "vector_add", ret);

913

914

// Set kernel arguments

915

clSetKernelArg(kernel, 0, Pointer.sizeof(cl_mem.class), d_A);

916

clSetKernelArg(kernel, 1, Pointer.sizeof(cl_mem.class), d_B);

917

clSetKernelArg(kernel, 2, Pointer.sizeof(cl_mem.class), d_C);

918

919

// Execute kernel

920

SizeTPointer globalWorkSize = new SizeTPointer(N);

921

clEnqueueNDRangeKernel(queue, kernel, 1, null, globalWorkSize, null, 0, null, null);

922

923

// Read result

924

FloatPointer fp_C = new FloatPointer(h_C);

925

clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, N * 4, fp_C, 0, null, null);

926

927

// Verify results

928

boolean success = true;

929

for (int i = 0; i < Math.min(N, 10) && success; i++) {

930

if (Math.abs(h_C[i] - (h_A[i] + h_B[i])) > 1e-5) {

931

success = false;

932

}

933

System.out.printf("C[%d] = %.1f (expected %.1f)\n", i, h_C[i], h_A[i] + h_B[i]);

934

}

935

936

System.out.println(success ? "OpenCL vector addition successful!" : "Verification failed");

937

938

// Cleanup (in reverse order of creation)

939

clReleaseKernel(kernel);

940

clReleaseProgram(program);

941

clReleaseMemObject(d_A);

942

clReleaseMemObject(d_B);

943

clReleaseMemObject(d_C);

944

clReleaseCommandQueue(queue);

945

clReleaseContext(context);

946

}

947

}

948

}

949

```