
# GPU Direct Storage

The cuFile GPU Direct Storage API provides high-performance direct GPU I/O that bypasses the CPU and system memory. This module enables direct data transfers between storage devices and GPU memory, significantly reducing I/O latency and CPU overhead for large-scale data processing workloads.

## Capabilities

### Driver and System Management

Initialize and manage the cuFile driver for GPU Direct Storage operations.

```python { .api }
def driver_open() -> None:
    """
    Open the cuFile driver for GPU Direct Storage.

    Note:
        Must be called before any other cuFile operations

    Raises:
        cuFileError: If driver initialization fails
    """

def driver_close() -> None:
    """
    Close the cuFile driver and release system resources.

    Note:
        Should be called when GPU Direct Storage is no longer needed
    """

def get_version() -> int:
    """
    Get the cuFile library version.

    Returns:
        int: Version number in packed format
    """
```
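
A minimal driver lifecycle sketch using only the calls above; the version is printed as the raw packed integer since the packing scheme is not specified here.

```python
from cuda.bindings import cufile

cufile.driver_open()  # must precede any other cuFile call
try:
    print(f"cuFile library version (packed): {cufile.get_version()}")
finally:
    cufile.driver_close()  # release driver resources when done
```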

### File Handle Management

Register and manage file handles for GPU Direct Storage operations.

```python { .api }
def handle_register(descr: int) -> int:
    """
    Register a file descriptor for GPU Direct Storage.

    Args:
        descr (int): File descriptor (from open() syscall)

    Returns:
        int: cuFile handle for GPU operations

    Note:
        File must be opened with appropriate flags for direct I/O

    Raises:
        cuFileError: If registration fails
    """

def handle_deregister(fh: int) -> None:
    """
    Deregister a cuFile handle and release associated resources.

    Args:
        fh (int): cuFile handle to deregister

    Note:
        Handle becomes invalid after deregistration
    """
```
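
A minimal register/deregister sketch; the file name is a placeholder, and the file is opened with `O_DIRECT` as the note above requires.

```python
import os
from cuda.bindings import cufile

cufile.driver_open()
try:
    fd = os.open("data.bin", os.O_RDONLY | os.O_DIRECT)  # direct I/O flags required
    fh = cufile.handle_register(fd)

    # ... perform cuFile reads/writes against fh ...

    cufile.handle_deregister(fh)  # fh is invalid after this point
    os.close(fd)
finally:
    cufile.driver_close()
```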

### Buffer Management

Register GPU memory buffers for direct I/O operations.

```python { .api }
def buf_register(devPtr_base: int, size: int, flags: int) -> None:
    """
    Register a GPU memory buffer for cuFile operations.

    Args:
        devPtr_base (int): Base address of GPU memory buffer
        size (int): Buffer size in bytes
        flags (int): Registration flags

    Note:
        Buffer must remain valid for duration of registration

    Raises:
        cuFileError: If buffer registration fails
    """

def buf_deregister(devPtr_base: int) -> None:
    """
    Deregister a GPU memory buffer.

    Args:
        devPtr_base (int): Base address of previously registered buffer
    """
```
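
A minimal buffer registration sketch, following the `runtime.cudaMalloc`/`cudaFree` convention used by the usage examples later in this document; register once and reuse the buffer across many transfers.

```python
from cuda.bindings import cufile, runtime

buffer_size = 16 * 1024 * 1024  # 16 MB
gpu_buffer = runtime.cudaMalloc(buffer_size)

# Registration pins the buffer for direct transfers; it must stay valid until deregistered
cufile.buf_register(gpu_buffer, buffer_size, cufile.CU_FILE_BUF_REGISTER_FLAGS_NONE)

# ... cufile.read()/cufile.write() into gpu_buffer ...

cufile.buf_deregister(gpu_buffer)
runtime.cudaFree(gpu_buffer)
```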

### Synchronous I/O Operations

Perform synchronous read and write operations between storage and GPU memory.

```python { .api }
def read(
    fh: int,
    buf_ptr_base: int,
    size: int,
    file_offset: int,
    buf_ptr_offset: int
) -> None:
    """
    Synchronously read data from file to GPU memory.

    Args:
        fh (int): cuFile handle
        buf_ptr_base (int): GPU buffer base address
        size (int): Number of bytes to read
        file_offset (int): Offset in file to read from
        buf_ptr_offset (int): Offset in GPU buffer to write to

    Note:
        Blocks until read operation completes

    Raises:
        cuFileError: If read operation fails
    """

def write(
    fh: int,
    buf_ptr_base: int,
    size: int,
    file_offset: int,
    buf_ptr_offset: int
) -> None:
    """
    Synchronously write data from GPU memory to file.

    Args:
        fh (int): cuFile handle
        buf_ptr_base (int): GPU buffer base address
        size (int): Number of bytes to write
        file_offset (int): Offset in file to write to
        buf_ptr_offset (int): Offset in GPU buffer to read from

    Note:
        Blocks until write operation completes

    Raises:
        cuFileError: If write operation fails
    """

def pread(
    fh: int,
    buf_ptr_base: int,
    size: int,
    file_offset: int,
    buf_ptr_offset: int
) -> int:
    """
    Synchronously read with explicit file positioning.

    Args:
        fh (int): cuFile handle
        buf_ptr_base (int): GPU buffer base address
        size (int): Number of bytes to read
        file_offset (int): File position to read from
        buf_ptr_offset (int): Buffer offset to write to

    Returns:
        int: Number of bytes actually read
    """

def pwrite(
    fh: int,
    buf_ptr_base: int,
    size: int,
    file_offset: int,
    buf_ptr_offset: int
) -> int:
    """
    Synchronously write with explicit file positioning.

    Args:
        fh (int): cuFile handle
        buf_ptr_base (int): GPU buffer base address
        size (int): Number of bytes to write
        file_offset (int): File position to write to
        buf_ptr_offset (int): Buffer offset to read from

    Returns:
        int: Number of bytes actually written
    """
```
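
Unlike `read`/`write`, `pread`/`pwrite` return the byte count, which makes short-transfer handling explicit. A minimal sketch, assuming `fh` is a registered cuFile handle and `buf` a registered GPU buffer of at least `size` bytes; the helper name is illustrative.

```python
from cuda.bindings import cufile

def read_exact(fh: int, buf: int, size: int, file_offset: int) -> None:
    """Read exactly `size` bytes, retrying on short reads."""
    done = 0
    while done < size:
        # pread reports how many bytes were actually transferred
        n = cufile.pread(fh, buf, size - done, file_offset + done, done)
        if n <= 0:
            raise RuntimeError(f"pread stopped after {done} of {size} bytes")
        done += n
```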

### Asynchronous I/O Operations

Perform asynchronous I/O operations for maximum throughput and concurrency.

```python { .api }
def read_async(
    fh: int,
    buf_ptr_base: int,
    size: int,
    file_offset: int,
    buf_ptr_offset: int,
    bytes_read_ptr: int,
    stream: int
) -> None:
    """
    Asynchronously read data from file to GPU memory.

    Args:
        fh (int): cuFile handle
        buf_ptr_base (int): GPU buffer base address
        size (int): Number of bytes to read
        file_offset (int): Offset in file to read from
        buf_ptr_offset (int): Offset in GPU buffer to write to
        bytes_read_ptr (int): Pointer to receive actual bytes read
        stream (int): CUDA stream for asynchronous execution

    Note:
        Returns immediately; use stream synchronization to wait
    """

def write_async(
    fh: int,
    buf_ptr_base: int,
    size: int,
    file_offset: int,
    buf_ptr_offset: int,
    bytes_written_ptr: int,
    stream: int
) -> None:
    """
    Asynchronously write data from GPU memory to file.

    Args:
        fh (int): cuFile handle
        buf_ptr_base (int): GPU buffer base address
        size (int): Number of bytes to write
        file_offset (int): Offset in file to write to
        buf_ptr_offset (int): Offset in GPU buffer to read from
        bytes_written_ptr (int): Pointer to receive actual bytes written
        stream (int): CUDA stream for asynchronous execution

    Note:
        Returns immediately; use stream synchronization to wait
    """
```

### Batch I/O Operations

Perform multiple I/O operations efficiently using batch APIs.

```python { .api }
def readv(
    fh: int,
    iov: list,
    iovcnt: int,
    file_offset: int,
    bytes_read_ptr: int
) -> None:
    """
    Vector read operation - read into multiple buffers.

    Args:
        fh (int): cuFile handle
        iov (list): List of I/O vector structures
        iovcnt (int): Number of I/O vectors
        file_offset (int): Starting file offset
        bytes_read_ptr (int): Pointer to receive total bytes read

    Note:
        Enables efficient reading into scattered GPU memory regions
    """

def writev(
    fh: int,
    iov: list,
    iovcnt: int,
    file_offset: int,
    bytes_written_ptr: int
) -> None:
    """
    Vector write operation - write from multiple buffers.

    Args:
        fh (int): cuFile handle
        iov (list): List of I/O vector structures
        iovcnt (int): Number of I/O vectors
        file_offset (int): Starting file offset
        bytes_written_ptr (int): Pointer to receive total bytes written

    Note:
        Enables efficient writing from scattered GPU memory regions
    """
```

### Properties and Configuration

Query and configure cuFile properties and behavior.

```python { .api }
def get_file_properties(fh: int) -> dict:
    """
    Get properties of a registered file handle.

    Args:
        fh (int): cuFile handle

    Returns:
        dict: File properties including direct I/O capabilities
    """

def set_file_properties(fh: int, props: dict) -> None:
    """
    Set properties for a file handle.

    Args:
        fh (int): cuFile handle
        props (dict): Properties to set
    """
```
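
A brief sketch of querying and updating handle properties. The keys of the returned dict are not specified here, so this only round-trips whatever `get_file_properties` reports; treat the property names as unspecified.

```python
from cuda.bindings import cufile

def show_file_properties(fh: int) -> dict:
    # Query the current properties of a registered handle
    props = cufile.get_file_properties(fh)
    print(f"cuFile handle properties: {props}")
    return props

def update_file_properties(fh: int, updates: dict) -> None:
    # Merge caller-supplied settings into the current property dict
    props = cufile.get_file_properties(fh)
    props.update(updates)
    cufile.set_file_properties(fh, props)
```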

## Types

### Status and Error Codes

```python { .api }
class Status:
    """cuFile operation status codes"""
    CU_FILE_SUCCESS: int  # Operation successful
    CU_FILE_INVALID_VALUE: int  # Invalid parameter value
    CU_FILE_INVALID_HANDLE: int  # Invalid file handle
    CU_FILE_CUDA_MEMORY_TYPE_NOT_SUPPORTED: int  # Memory type not supported
    CU_FILE_IO_NOT_SUPPORTED: int  # I/O operation not supported
    CU_FILE_PERMISSION_DENIED: int  # Permission denied
    CU_FILE_INVALID_FILE_OPEN_FLAG: int  # Invalid file open flags
    CU_FILE_MEMORY_ALREADY_REGISTERED: int  # Memory already registered
    CU_FILE_MEMORY_NOT_REGISTERED: int  # Memory not registered
    CU_FILE_PLATFORM_NOT_SUPPORTED: int  # Platform not supported
    CU_FILE_FILE_SYSTEM_NOT_SUPPORTED: int  # File system not supported
```

### Operation Error Codes

```python { .api }
class OpError:
    """cuFile detailed operation error codes"""
    CU_FILE_OP_SUCCESS: int  # Operation successful
    CU_FILE_OP_FAILED: int  # Operation failed
    CU_FILE_OP_INVALID_ARG: int  # Invalid argument
    CU_FILE_OP_IO_FAILED: int  # I/O operation failed
    CU_FILE_OP_MEMORY_INVALID: int  # Memory access error
    CU_FILE_OP_PARTIAL_COMPLETION: int  # Partial operation completion
```

### Feature Flags

```python { .api }
class FeatureFlags:
    """cuFile feature availability flags"""
    CU_FILE_FEATURE_GDS_SUPPORTED: int  # GPU Direct Storage supported
    CU_FILE_FEATURE_BATCH_IO_SUPPORTED: int  # Batch I/O supported
    CU_FILE_FEATURE_ASYNC_IO_SUPPORTED: int  # Async I/O supported
    CU_FILE_FEATURE_VECTOR_IO_SUPPORTED: int  # Vector I/O supported
```

### File Handle Types

```python { .api }
class FileHandleType:
    """cuFile handle type enumeration"""
    CU_FILE_HANDLE_TYPE_OPAQUE_FD: int  # Opaque file descriptor
    CU_FILE_HANDLE_TYPE_OPAQUE_WIN32: int  # Windows handle
```

### Buffer Registration Flags

```python { .api }
# Buffer registration flag constants
CU_FILE_BUF_REGISTER_FLAGS_NONE: int  # No special flags
CU_FILE_BUF_REGISTER_FLAGS_READ_ONLY: int  # Buffer for read operations only
CU_FILE_BUF_REGISTER_FLAGS_WRITE_ONLY: int  # Buffer for write operations only
```

### Exception Classes

```python { .api }
class cuFileError(Exception):
    """cuFile operation exception"""
    def __init__(self, status: Status, message: str): ...
```
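
An error-handling sketch; wrapping registration in a `try`/`except` is an illustration, and whether the exception exposes the `status` and `message` passed to its constructor as attributes is not specified, so only `str(err)` is used.

```python
from cuda.bindings import cufile

def try_register(fd: int):
    try:
        return cufile.handle_register(fd)
    except cufile.cuFileError as err:
        # The constructor takes (status, message); rely on the string form
        # rather than assuming specific attribute names.
        print(f"cuFile registration failed: {err}")
        return None
```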

### I/O Vector Structure

```python { .api }
class IOVec:
    """I/O vector structure for batch operations"""
    ptr: int  # GPU memory pointer
    size: int  # Transfer size in bytes
    file_offset: int  # File offset for this vector
    buf_offset: int  # Buffer offset for this vector
```

## Usage Examples

### Basic File I/O

```python
from cuda.bindings import cufile, runtime
import os

# Initialize cuFile driver
cufile.driver_open()

try:
    # Open file for direct I/O
    fd = os.open("large_dataset.dat", os.O_RDONLY | os.O_DIRECT)
    cufile_handle = cufile.handle_register(fd)

    # Allocate GPU memory
    buffer_size = 1024 * 1024 * 64  # 64 MB
    gpu_buffer = runtime.cudaMalloc(buffer_size)

    # Register GPU buffer
    cufile.buf_register(gpu_buffer, buffer_size,
                        cufile.CU_FILE_BUF_REGISTER_FLAGS_NONE)

    # Read data directly to GPU
    cufile.read(cufile_handle, gpu_buffer, buffer_size, 0, 0)

    print(f"Read {buffer_size} bytes directly to GPU memory")

    # Process data on GPU...

    # Cleanup
    cufile.buf_deregister(gpu_buffer)
    runtime.cudaFree(gpu_buffer)
    cufile.handle_deregister(cufile_handle)
    os.close(fd)

finally:
    cufile.driver_close()
```

### Asynchronous I/O with Streams

```python
from cuda.bindings import cufile, runtime
import os

def async_gpu_io_pipeline():
    """Demonstrate asynchronous GPU I/O with CUDA streams."""

    cufile.driver_open()

    # Create CUDA streams for overlapping operations
    compute_stream = runtime.cudaStreamCreate()
    io_stream = runtime.cudaStreamCreate()

    try:
        # Open input and output files
        input_fd = os.open("input.dat", os.O_RDONLY | os.O_DIRECT)
        output_fd = os.open("output.dat", os.O_WRONLY | os.O_CREAT | os.O_DIRECT, 0o644)

        input_handle = cufile.handle_register(input_fd)
        output_handle = cufile.handle_register(output_fd)

        # Allocate double-buffered GPU memory
        chunk_size = 1024 * 1024 * 32  # 32 MB chunks
        buffer1 = runtime.cudaMalloc(chunk_size)
        buffer2 = runtime.cudaMalloc(chunk_size)

        # Register buffers
        cufile.buf_register(buffer1, chunk_size, cufile.CU_FILE_BUF_REGISTER_FLAGS_NONE)
        cufile.buf_register(buffer2, chunk_size, cufile.CU_FILE_BUF_REGISTER_FLAGS_NONE)

        file_offset = 0
        current_buffer = buffer1
        next_buffer = buffer2

        # Allocate space for async result tracking
        bytes_read_ptr = runtime.cudaMalloc(8)  # sizeof(size_t)
        bytes_written_ptr = runtime.cudaMalloc(8)

        # Prime the pipeline: synchronously read the first chunk into the current buffer
        cufile.read(input_handle, current_buffer, chunk_size, file_offset, 0)

        while True:
            # Start async read of the following chunk into the next buffer
            cufile.read_async(
                input_handle, next_buffer, chunk_size,
                file_offset + chunk_size, 0, bytes_read_ptr, io_stream
            )

            # Process current buffer on compute stream
            # ... kernel launch on current_buffer using compute_stream ...

            # Write processed data asynchronously
            cufile.write_async(
                output_handle, current_buffer, chunk_size,
                file_offset, 0, bytes_written_ptr, io_stream
            )

            # Synchronize I/O stream
            runtime.cudaStreamSynchronize(io_stream)

            # Check bytes read
            bytes_read = runtime.cudaMemcpy(
                bytes_read_ptr, None, 8,
                runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost
            )

            if bytes_read < chunk_size:
                break  # End of file

            # Swap buffers
            current_buffer, next_buffer = next_buffer, current_buffer
            file_offset += chunk_size

        print(f"Processed {file_offset} bytes with async I/O")

    finally:
        # Cleanup
        runtime.cudaFree(bytes_read_ptr)
        runtime.cudaFree(bytes_written_ptr)
        cufile.buf_deregister(buffer1)
        cufile.buf_deregister(buffer2)
        runtime.cudaFree(buffer1)
        runtime.cudaFree(buffer2)
        cufile.handle_deregister(input_handle)
        cufile.handle_deregister(output_handle)
        os.close(input_fd)
        os.close(output_fd)
        runtime.cudaStreamDestroy(compute_stream)
        runtime.cudaStreamDestroy(io_stream)
        cufile.driver_close()

# Run the pipeline
async_gpu_io_pipeline()
```

### Vector I/O for Scattered Data

```python
from cuda.bindings import cufile, runtime
import os

def scattered_io_example():
    """Demonstrate vector I/O for scattered data access."""

    cufile.driver_open()

    try:
        # Open sparse data file
        fd = os.open("sparse_matrix.dat", os.O_RDONLY | os.O_DIRECT)
        cufile_handle = cufile.handle_register(fd)

        # Allocate multiple GPU buffers for different matrix blocks
        block_size = 1024 * 1024  # 1 MB per block
        num_blocks = 4
        gpu_buffers = []

        for _ in range(num_blocks):
            buffer = runtime.cudaMalloc(block_size)
            cufile.buf_register(buffer, block_size,
                                cufile.CU_FILE_BUF_REGISTER_FLAGS_READ_ONLY)
            gpu_buffers.append(buffer)

        # Define I/O vectors for scattered reads
        iov_list = []
        file_offsets = [0, 1024*1024*10, 1024*1024*50, 1024*1024*100]  # Sparse offsets

        for buffer, offset in zip(gpu_buffers, file_offsets):
            iov = cufile.IOVec()
            iov.ptr = buffer
            iov.size = block_size
            iov.file_offset = offset
            iov.buf_offset = 0
            iov_list.append(iov)

        # Perform vector read
        bytes_read_ptr = runtime.cudaMalloc(8)
        cufile.readv(cufile_handle, iov_list, len(iov_list), 0, bytes_read_ptr)

        # Get total bytes read
        total_bytes = runtime.cudaMemcpy(
            bytes_read_ptr, None, 8,
            runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost
        )

        print(f"Vector read {total_bytes} bytes from {len(iov_list)} scattered locations")

        # Process each block on GPU...

        # Cleanup
        runtime.cudaFree(bytes_read_ptr)
        for buffer in gpu_buffers:
            cufile.buf_deregister(buffer)
            runtime.cudaFree(buffer)
        cufile.handle_deregister(cufile_handle)
        os.close(fd)

    finally:
        cufile.driver_close()

# Run scattered I/O example
scattered_io_example()
```

### Performance Monitoring and Tuning

```python
from cuda.bindings import cufile, runtime
import os

class GPUIOProfiler:
    """Profile GPU Direct Storage performance."""

    def __init__(self):
        self.stats = {
            'total_bytes': 0,
            'total_time': 0,
            'operations': 0
        }

    def profile_read(self, file_path, buffer_size, num_iterations=10):
        """Profile read performance."""

        cufile.driver_open()

        try:
            # Setup
            fd = os.open(file_path, os.O_RDONLY | os.O_DIRECT)
            cufile_handle = cufile.handle_register(fd)

            gpu_buffer = runtime.cudaMalloc(buffer_size)
            cufile.buf_register(gpu_buffer, buffer_size,
                                cufile.CU_FILE_BUF_REGISTER_FLAGS_READ_ONLY)

            # Create events for timing
            start_event = runtime.cudaEventCreate()
            end_event = runtime.cudaEventCreate()

            total_time = 0

            for i in range(num_iterations):
                # Record start time
                runtime.cudaEventRecord(start_event)

                # Perform read
                cufile.read(cufile_handle, gpu_buffer, buffer_size,
                            i * buffer_size, 0)

                # Record end time
                runtime.cudaEventRecord(end_event)
                runtime.cudaEventSynchronize(end_event)

                # Calculate elapsed time
                elapsed_ms = runtime.cudaEventElapsedTime(start_event, end_event)
                total_time += elapsed_ms

                self.stats['total_bytes'] += buffer_size
                self.stats['operations'] += 1

            self.stats['total_time'] += total_time / 1000  # Convert to seconds

            # Calculate metrics
            avg_time_ms = total_time / num_iterations
            throughput_gbps = (buffer_size * num_iterations / (1024**3)) / (total_time / 1000)

            print("GPU Direct Storage Read Performance:")
            print(f"  Buffer Size: {buffer_size // (1024*1024)} MB")
            print(f"  Iterations: {num_iterations}")
            print(f"  Average Time: {avg_time_ms:.3f} ms")
            print(f"  Throughput: {throughput_gbps:.2f} GB/s")

            # Cleanup
            runtime.cudaEventDestroy(start_event)
            runtime.cudaEventDestroy(end_event)
            cufile.buf_deregister(gpu_buffer)
            runtime.cudaFree(gpu_buffer)
            cufile.handle_deregister(cufile_handle)
            os.close(fd)

        finally:
            cufile.driver_close()

    def get_summary(self):
        """Get overall performance summary."""
        if self.stats['operations'] > 0:
            avg_throughput = (self.stats['total_bytes'] / (1024**3)) / self.stats['total_time']
            return {
                'total_data_gb': self.stats['total_bytes'] / (1024**3),
                'total_time_s': self.stats['total_time'],
                'operations': self.stats['operations'],
                'avg_throughput_gbps': avg_throughput
            }
        return self.stats

# Example usage
profiler = GPUIOProfiler()

# Profile different buffer sizes
buffer_sizes = [1024*1024, 16*1024*1024, 64*1024*1024]  # 1 MB, 16 MB, 64 MB

for size in buffer_sizes:
    try:
        profiler.profile_read("test_data.dat", size, num_iterations=5)
    except Exception as e:
        print(f"Profiling failed for {size} bytes: {e}")

# Print summary
summary = profiler.get_summary()
if summary:
    print("\nOverall Summary:")
    print(f"  Total Data: {summary['total_data_gb']:.2f} GB")
    print(f"  Total Time: {summary['total_time_s']:.2f} s")
    print(f"  Operations: {summary['operations']}")
    print(f"  Average Throughput: {summary['avg_throughput_gbps']:.2f} GB/s")
```