# GPU Direct Storage

cuFile GPU Direct Storage API for high-performance direct GPU I/O operations bypassing CPU and system memory. This module enables direct data transfers between storage devices and GPU memory, significantly reducing I/O latency and CPU overhead for large-scale data processing workloads.

## Capabilities

### Driver and System Management

Initialize and manage the cuFile driver for GPU Direct Storage operations.

```python { .api }
def driver_open() -> None:
    """
    Open the cuFile driver for GPU Direct Storage.

    Note:
        Must be called before any other cuFile operations

    Raises:
        cuFileError: If driver initialization fails
    """

def driver_close() -> None:
    """
    Close the cuFile driver and release system resources.

    Note:
        Should be called when GPU Direct Storage is no longer needed
    """

def get_version() -> int:
    """
    Get the cuFile library version.

    Returns:
        int: Version number in packed format
    """
```

### File Handle Management

Register and manage file handles for GPU Direct Storage operations.

```python { .api }
def handle_register(descr: int) -> int:
    """
    Register a file descriptor for GPU Direct Storage.

    Args:
        descr (int): File descriptor (from open() syscall)

    Returns:
        int: cuFile handle for GPU operations

    Note:
        File must be opened with appropriate flags for direct I/O

    Raises:
        cuFileError: If registration fails
    """

def handle_deregister(fh: int) -> None:
    """
    Deregister a cuFile handle and release associated resources.

    Args:
        fh (int): cuFile handle to deregister

    Note:
        Handle becomes invalid after deregistration
    """
```

### Buffer Management

Register GPU memory buffers for direct I/O operations.

```python { .api }
def buf_register(devPtr_base: int, size: int, flags: int) -> None:
    """
    Register a GPU memory buffer for cuFile operations.

    Args:
        devPtr_base (int): Base address of GPU memory buffer
        size (int): Buffer size in bytes
        flags (int): Registration flags

    Note:
        Buffer must remain valid for duration of registration

    Raises:
        cuFileError: If buffer registration fails
    """

def buf_deregister(devPtr_base: int) -> None:
    """
    Deregister a GPU memory buffer.

    Args:
        devPtr_base (int): Base address of previously registered buffer
    """
```

### Synchronous I/O Operations

Perform synchronous read and write operations between storage and GPU memory.

```python { .api }
def read(
    fh: int,
    buf_ptr_base: int,
    size: int,
    file_offset: int,
    buf_ptr_offset: int
) -> None:
    """
    Synchronously read data from file to GPU memory.

    Args:
        fh (int): cuFile handle
        buf_ptr_base (int): GPU buffer base address
        size (int): Number of bytes to read
        file_offset (int): Offset in file to read from
        buf_ptr_offset (int): Offset in GPU buffer to write to

    Note:
        Blocks until read operation completes

    Raises:
        cuFileError: If read operation fails
    """

def write(
    fh: int,
    buf_ptr_base: int,
    size: int,
    file_offset: int,
    buf_ptr_offset: int
) -> None:
    """
    Synchronously write data from GPU memory to file.

    Args:
        fh (int): cuFile handle
        buf_ptr_base (int): GPU buffer base address
        size (int): Number of bytes to write
        file_offset (int): Offset in file to write to
        buf_ptr_offset (int): Offset in GPU buffer to read from

    Note:
        Blocks until write operation completes

    Raises:
        cuFileError: If write operation fails
    """

def pread(
    fh: int,
    buf_ptr_base: int,
    size: int,
    file_offset: int,
    buf_ptr_offset: int
) -> int:
    """
    Synchronously read with explicit file positioning.

    Args:
        fh (int): cuFile handle
        buf_ptr_base (int): GPU buffer base address
        size (int): Number of bytes to read
        file_offset (int): File position to read from
        buf_ptr_offset (int): Buffer offset to write to

    Returns:
        int: Number of bytes actually read
    """

def pwrite(
    fh: int,
    buf_ptr_base: int,
    size: int,
    file_offset: int,
    buf_ptr_offset: int
) -> int:
    """
    Synchronously write with explicit file positioning.

    Args:
        fh (int): cuFile handle
        buf_ptr_base (int): GPU buffer base address
        size (int): Number of bytes to write
        file_offset (int): File position to write to
        buf_ptr_offset (int): Buffer offset to read from

    Returns:
        int: Number of bytes actually written
    """
```

### Asynchronous I/O Operations

Perform asynchronous I/O operations for maximum throughput and concurrency.

```python { .api }
def read_async(
    fh: int,
    buf_ptr_base: int,
    size: int,
    file_offset: int,
    buf_ptr_offset: int,
    bytes_read_ptr: int,
    stream: int
) -> None:
    """
    Asynchronously read data from file to GPU memory.

    Args:
        fh (int): cuFile handle
        buf_ptr_base (int): GPU buffer base address
        size (int): Number of bytes to read
        file_offset (int): Offset in file to read from
        buf_ptr_offset (int): Offset in GPU buffer to write to
        bytes_read_ptr (int): Pointer to receive actual bytes read
        stream (int): CUDA stream for asynchronous execution

    Note:
        Returns immediately; use stream synchronization to wait
    """

def write_async(
    fh: int,
    buf_ptr_base: int,
    size: int,
    file_offset: int,
    buf_ptr_offset: int,
    bytes_written_ptr: int,
    stream: int
) -> None:
    """
    Asynchronously write data from GPU memory to file.

    Args:
        fh (int): cuFile handle
        buf_ptr_base (int): GPU buffer base address
        size (int): Number of bytes to write
        file_offset (int): Offset in file to write to
        buf_ptr_offset (int): Offset in GPU buffer to read from
        bytes_written_ptr (int): Pointer to receive actual bytes written
        stream (int): CUDA stream for asynchronous execution

    Note:
        Returns immediately; use stream synchronization to wait
    """
```

### Batch I/O Operations

Perform multiple I/O operations efficiently using batch APIs.

```python { .api }
def readv(
    fh: int,
    iov: list,
    iovcnt: int,
    file_offset: int,
    bytes_read_ptr: int
) -> None:
    """
    Vector read operation - read into multiple buffers.

    Args:
        fh (int): cuFile handle
        iov (list): List of I/O vector structures
        iovcnt (int): Number of I/O vectors
        file_offset (int): Starting file offset
        bytes_read_ptr (int): Pointer to receive total bytes read

    Note:
        Enables efficient reading into scattered GPU memory regions
    """

def writev(
    fh: int,
    iov: list,
    iovcnt: int,
    file_offset: int,
    bytes_written_ptr: int
) -> None:
    """
    Vector write operation - write from multiple buffers.

    Args:
        fh (int): cuFile handle
        iov (list): List of I/O vector structures
        iovcnt (int): Number of I/O vectors
        file_offset (int): Starting file offset
        bytes_written_ptr (int): Pointer to receive total bytes written

    Note:
        Enables efficient writing from scattered GPU memory regions
    """
```

### Properties and Configuration

Query and configure cuFile properties and behavior.

```python { .api }
def get_file_properties(fh: int) -> dict:
    """
    Get properties of a registered file handle.

    Args:
        fh (int): cuFile handle

    Returns:
        dict: File properties including direct I/O capabilities
    """

def set_file_properties(fh: int, props: dict) -> None:
    """
    Set properties for a file handle.

    Args:
        fh (int): cuFile handle
        props (dict): Properties to set
    """
```

## Types

### Status and Error Codes

```python { .api }
class Status:
    """cuFile operation status codes"""
    CU_FILE_SUCCESS: int  # Operation successful
    CU_FILE_INVALID_VALUE: int  # Invalid parameter value
    CU_FILE_INVALID_HANDLE: int  # Invalid file handle
    CU_FILE_CUDA_MEMORY_TYPE_NOT_SUPPORTED: int  # Memory type not supported
    CU_FILE_IO_NOT_SUPPORTED: int  # I/O operation not supported
    CU_FILE_PERMISSION_DENIED: int  # Permission denied
    CU_FILE_INVALID_FILE_OPEN_FLAG: int  # Invalid file open flags
    CU_FILE_MEMORY_ALREADY_REGISTERED: int  # Memory already registered
    CU_FILE_MEMORY_NOT_REGISTERED: int  # Memory not registered
    CU_FILE_PLATFORM_NOT_SUPPORTED: int  # Platform not supported
    CU_FILE_FILE_SYSTEM_NOT_SUPPORTED: int  # File system not supported
```

### Operation Error Codes

```python { .api }
class OpError:
    """cuFile detailed operation error codes"""
    CU_FILE_OP_SUCCESS: int  # Operation successful
    CU_FILE_OP_FAILED: int  # Operation failed
    CU_FILE_OP_INVALID_ARG: int  # Invalid argument
    CU_FILE_OP_IO_FAILED: int  # I/O operation failed
    CU_FILE_OP_MEMORY_INVALID: int  # Memory access error
    CU_FILE_OP_PARTIAL_COMPLETION: int  # Partial operation completion
```

### Feature Flags

```python { .api }
class FeatureFlags:
    """cuFile feature availability flags"""
    CU_FILE_FEATURE_GDS_SUPPORTED: int  # GPU Direct Storage supported
    CU_FILE_FEATURE_BATCH_IO_SUPPORTED: int  # Batch I/O supported
    CU_FILE_FEATURE_ASYNC_IO_SUPPORTED: int  # Async I/O supported
    CU_FILE_FEATURE_VECTOR_IO_SUPPORTED: int  # Vector I/O supported
```

### File Handle Types

```python { .api }
class FileHandleType:
    """cuFile handle type enumeration"""
    CU_FILE_HANDLE_TYPE_OPAQUE_FD: int  # Opaque file descriptor
    CU_FILE_HANDLE_TYPE_OPAQUE_WIN32: int  # Windows handle
```

### Buffer Registration Flags

```python { .api }
# Buffer registration flag constants
CU_FILE_BUF_REGISTER_FLAGS_NONE: int  # No special flags
CU_FILE_BUF_REGISTER_FLAGS_READ_ONLY: int  # Buffer for read operations only
CU_FILE_BUF_REGISTER_FLAGS_WRITE_ONLY: int  # Buffer for write operations only
```

### Exception Classes

```python { .api }
class cuFileError(Exception):
    """cuFile operation exception"""
    def __init__(self, status: Status, message: str): ...
```

### I/O Vector Structure

```python { .api }
class IOVec:
    """I/O vector structure for batch operations"""
    ptr: int  # GPU memory pointer
    size: int  # Transfer size in bytes
    file_offset: int  # File offset for this vector
    buf_offset: int  # Buffer offset for this vector
```

## Usage Examples

### Basic File I/O

```python
from cuda.bindings import cufile, runtime
import os

# Initialize cuFile driver
cufile.driver_open()

try:
    # Open file for direct I/O
    fd = os.open("large_dataset.dat", os.O_RDONLY | os.O_DIRECT)
    cufile_handle = cufile.handle_register(fd)

    # Allocate GPU memory
    buffer_size = 1024 * 1024 * 64  # 64MB
    gpu_buffer = runtime.cudaMalloc(buffer_size)

    # Register GPU buffer
    cufile.buf_register(gpu_buffer, buffer_size,
                        cufile.CU_FILE_BUF_REGISTER_FLAGS_NONE)

    # Read data directly to GPU
    cufile.read(cufile_handle, gpu_buffer, buffer_size, 0, 0)

    print(f"Read {buffer_size} bytes directly to GPU memory")

    # Process data on GPU...

    # Cleanup
    cufile.buf_deregister(gpu_buffer)
    runtime.cudaFree(gpu_buffer)
    cufile.handle_deregister(cufile_handle)
    os.close(fd)

finally:
    cufile.driver_close()
```

### Asynchronous I/O with Streams

```python
from cuda.bindings import cufile, runtime
import os

def async_gpu_io_pipeline():
    """Demonstrate asynchronous GPU I/O with CUDA streams."""

    cufile.driver_open()

    # Create CUDA streams for overlapping operations
    compute_stream = runtime.cudaStreamCreate()
    io_stream = runtime.cudaStreamCreate()

    try:
        # Open input and output files
        input_fd = os.open("input.dat", os.O_RDONLY | os.O_DIRECT)
        output_fd = os.open("output.dat", os.O_WRONLY | os.O_CREAT | os.O_DIRECT, 0o644)

        input_handle = cufile.handle_register(input_fd)
        output_handle = cufile.handle_register(output_fd)

        # Allocate double-buffered GPU memory
        chunk_size = 1024 * 1024 * 32  # 32MB chunks
        buffer1 = runtime.cudaMalloc(chunk_size)
        buffer2 = runtime.cudaMalloc(chunk_size)

        # Register buffers
        cufile.buf_register(buffer1, chunk_size, cufile.CU_FILE_BUF_REGISTER_FLAGS_NONE)
        cufile.buf_register(buffer2, chunk_size, cufile.CU_FILE_BUF_REGISTER_FLAGS_NONE)

        file_offset = 0
        current_buffer = buffer1
        next_buffer = buffer2

        # Allocate space for async result tracking
        bytes_read_ptr = runtime.cudaMalloc(8)  # sizeof(size_t)
        bytes_written_ptr = runtime.cudaMalloc(8)

        while True:
            # Start async read into next buffer
            cufile.read_async(
                input_handle, next_buffer, chunk_size,
                file_offset + chunk_size, 0, bytes_read_ptr, io_stream
            )

            # Process current buffer on compute stream
            # ... kernel launch on current_buffer using compute_stream ...

            # Write processed data asynchronously
            cufile.write_async(
                output_handle, current_buffer, chunk_size,
                file_offset, 0, bytes_written_ptr, io_stream
            )

            # Synchronize I/O stream
            runtime.cudaStreamSynchronize(io_stream)

            # Check bytes read
            bytes_read = runtime.cudaMemcpy(
                bytes_read_ptr, None, 8,
                runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost
            )

            if bytes_read < chunk_size:
                break  # End of file

            # Swap buffers
            current_buffer, next_buffer = next_buffer, current_buffer
            file_offset += chunk_size

        print(f"Processed {file_offset} bytes with async I/O")

    finally:
        # Cleanup
        runtime.cudaFree(bytes_read_ptr)
        runtime.cudaFree(bytes_written_ptr)
        cufile.buf_deregister(buffer1)
        cufile.buf_deregister(buffer2)
        runtime.cudaFree(buffer1)
        runtime.cudaFree(buffer2)
        cufile.handle_deregister(input_handle)
        cufile.handle_deregister(output_handle)
        os.close(input_fd)
        os.close(output_fd)
        runtime.cudaStreamDestroy(compute_stream)
        runtime.cudaStreamDestroy(io_stream)
        cufile.driver_close()

# Run the pipeline
async_gpu_io_pipeline()
```

### Vector I/O for Scattered Data

```python
from cuda.bindings import cufile, runtime
import os

def scattered_io_example():
    """Demonstrate vector I/O for scattered data access."""

    cufile.driver_open()

    try:
        # Open sparse data file
        fd = os.open("sparse_matrix.dat", os.O_RDONLY | os.O_DIRECT)
        cufile_handle = cufile.handle_register(fd)

        # Allocate multiple GPU buffers for different matrix blocks
        block_size = 1024 * 1024  # 1MB per block
        num_blocks = 4
        gpu_buffers = []

        for i in range(num_blocks):
            buffer = runtime.cudaMalloc(block_size)
            cufile.buf_register(buffer, block_size,
                                cufile.CU_FILE_BUF_REGISTER_FLAGS_READ_ONLY)
            gpu_buffers.append(buffer)

        # Define I/O vectors for scattered reads
        iov_list = []
        file_offsets = [0, 1024*1024*10, 1024*1024*50, 1024*1024*100]  # Sparse offsets

        for i, (buffer, offset) in enumerate(zip(gpu_buffers, file_offsets)):
            iov = cufile.IOVec()
            iov.ptr = buffer
            iov.size = block_size
            iov.file_offset = offset
            iov.buf_offset = 0
            iov_list.append(iov)

        # Perform vector read
        bytes_read_ptr = runtime.cudaMalloc(8)
        cufile.readv(cufile_handle, iov_list, len(iov_list), 0, bytes_read_ptr)

        # Get total bytes read
        total_bytes = runtime.cudaMemcpy(
            bytes_read_ptr, None, 8,
            runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost
        )

        print(f"Vector read {total_bytes} bytes from {len(iov_list)} scattered locations")

        # Process each block on GPU...

        # Cleanup
        runtime.cudaFree(bytes_read_ptr)
        for buffer in gpu_buffers:
            cufile.buf_deregister(buffer)
            runtime.cudaFree(buffer)
        cufile.handle_deregister(cufile_handle)
        os.close(fd)

    finally:
        cufile.driver_close()

# Run scattered I/O example
scattered_io_example()
```

### Performance Monitoring and Tuning

```python
from cuda.bindings import cufile, runtime
import os
import time

class GPUIOProfiler:
    """Profile GPU Direct Storage performance."""

    def __init__(self):
        self.stats = {
            'total_bytes': 0,
            'total_time': 0,
            'operations': 0
        }

    def profile_read(self, file_path, buffer_size, num_iterations=10):
        """Profile read performance."""

        cufile.driver_open()

        try:
            # Setup
            fd = os.open(file_path, os.O_RDONLY | os.O_DIRECT)
            cufile_handle = cufile.handle_register(fd)

            gpu_buffer = runtime.cudaMalloc(buffer_size)
            cufile.buf_register(gpu_buffer, buffer_size,
                                cufile.CU_FILE_BUF_REGISTER_FLAGS_READ_ONLY)

            # Create events for timing
            start_event = runtime.cudaEventCreate()
            end_event = runtime.cudaEventCreate()

            total_time = 0

            for i in range(num_iterations):
                # Record start time
                runtime.cudaEventRecord(start_event)

                # Perform read
                cufile.read(cufile_handle, gpu_buffer, buffer_size,
                            i * buffer_size, 0)

                # Record end time
                runtime.cudaEventRecord(end_event)
                runtime.cudaEventSynchronize(end_event)

                # Calculate elapsed time
                elapsed_ms = runtime.cudaEventElapsedTime(start_event, end_event)
                total_time += elapsed_ms

                self.stats['total_bytes'] += buffer_size
                self.stats['operations'] += 1

            self.stats['total_time'] += total_time / 1000  # Convert to seconds

            # Calculate metrics
            avg_time_ms = total_time / num_iterations
            throughput_gbps = (buffer_size * num_iterations / (1024**3)) / (total_time / 1000)

            print(f"GPU Direct Storage Read Performance:")
            print(f"  Buffer Size: {buffer_size // (1024*1024)} MB")
            print(f"  Iterations: {num_iterations}")
            print(f"  Average Time: {avg_time_ms:.3f} ms")
            print(f"  Throughput: {throughput_gbps:.2f} GB/s")

            # Cleanup
            runtime.cudaEventDestroy(start_event)
            runtime.cudaEventDestroy(end_event)
            cufile.buf_deregister(gpu_buffer)
            runtime.cudaFree(gpu_buffer)
            cufile.handle_deregister(cufile_handle)
            os.close(fd)

        finally:
            cufile.driver_close()

    def get_summary(self):
        """Get overall performance summary."""
        if self.stats['operations'] > 0:
            avg_throughput = (self.stats['total_bytes'] / (1024**3)) / self.stats['total_time']
            return {
                'total_data_gb': self.stats['total_bytes'] / (1024**3),
                'total_time_s': self.stats['total_time'],
                'operations': self.stats['operations'],
                'avg_throughput_gbps': avg_throughput
            }
        return self.stats

# Example usage
profiler = GPUIOProfiler()

# Profile different buffer sizes
buffer_sizes = [1024*1024, 16*1024*1024, 64*1024*1024]  # 1MB, 16MB, 64MB

for size in buffer_sizes:
    try:
        profiler.profile_read("test_data.dat", size, num_iterations=5)
    except Exception as e:
        print(f"Profiling failed for {size} bytes: {e}")

# Print summary
summary = profiler.get_summary()
if summary:
    print(f"\nOverall Summary:")
    print(f"  Total Data: {summary['total_data_gb']:.2f} GB")
    print(f"  Total Time: {summary['total_time_s']:.2f} s")
    print(f"  Operations: {summary['operations']}")
    print(f"  Average Throughput: {summary['avg_throughput_gbps']:.2f} GB/s")
```