CuPy: NumPy & SciPy for GPU - CUDA 11.x optimized distribution providing GPU-accelerated computing with Python
—
CuPy provides comprehensive input/output operations for reading, writing, and formatting array data across various file formats and data sources. These operations enable efficient data exchange between GPU arrays and external storage systems, supporting both binary and text formats with optimized performance for large datasets.
Core file input and output operations for saving and loading CuPy arrays in various formats.
def save(file, arr):
"""
Save an array to a binary file in NumPy .npy format.
Parameters:
file: str or file - File name or file object to save to
arr: array_like - Array to save
"""
def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True, encoding='ASCII'):
"""
Load arrays or pickled objects from .npy, .npz or pickled files.
Parameters:
file: str or file - File name or file object to load from
mmap_mode: None or str, optional - Memory mapping mode
allow_pickle: bool, optional - Allow loading pickled object arrays (NOTE: NumPy and CuPy default this to False for security — loading pickles from untrusted files can execute arbitrary code; the True default shown here should be verified against the installed version)
fix_imports: bool, optional - Fix Python 2/3 import compatibility
encoding: str, optional - Encoding used for reading Python 2 strings
Returns:
ndarray or dict: Loaded array data
"""
def savez(file, *args, **kwds):
"""
Save several arrays into a single file in uncompressed .npz format.
Parameters:
file: str or file - Output file name or file object
*args: array_like - Arrays to save (saved with automatic names arr_0, arr_1, etc.)
**kwds: array_like - Arrays to save with specified names
"""
def savez_compressed(file, *args, **kwds):
"""
Save several arrays into a single file in compressed .npz format.
Parameters:
file: str or file - Output file name or file object
*args: array_like - Arrays to save with automatic names
**kwds: array_like - Arrays to save with specified names
"""
def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False, ndmin=0, encoding='bytes', max_rows=None):
"""
Load data from a text file.
Parameters:
fname: str or file - File name or file object to read from
dtype: data-type, optional - Data type of the resulting array
comments: str or sequence, optional - Characters used to indicate comments
delimiter: str, optional - String used to separate values
converters: dict, optional - Dictionary mapping column number to conversion function
skiprows: int, optional - Skip the first skiprows lines including comments
usecols: int or sequence, optional - Which columns to read
unpack: bool, optional - If True, return data in separate arrays
ndmin: int, optional - Minimum number of dimensions for returned array
encoding: str, optional - Encoding used to decode the input file
max_rows: int, optional - Read max_rows lines of content after skiprows
Returns:
ndarray: Data read from the text file
"""
def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', footer='', comments='# ', encoding=None):
"""
Save an array to a text file.
Parameters:
fname: str or file - File name or file object to write to
X: 1-D or 2-D array_like - Data to be saved
fmt: str or sequence of str, optional - Format specification
delimiter: str, optional - String separating columns
newline: str, optional - String separating lines
header: str, optional - Header text at the beginning of the file
footer: str, optional - Footer text at the end of the file
comments: str, optional - String prefix for header and footer comments
encoding: str, optional - Encoding used for writing text files
"""
def fromfile(file, dtype=float, count=-1, sep='', offset=0):
"""
Construct an array from data in a text or binary file.
Parameters:
file: str or file - Open file object or filename
dtype: data-type, optional - Data type of the returned array
count: int, optional - Number of items to read (-1 means all data)
sep: str, optional - Separator between items for text files
offset: int, optional - Offset in bytes from the file's current position
Returns:
ndarray: Array constructed from file data
"""
def tofile(arr, fid, sep="", format="%s"):
"""
Write array to a file as text or binary (default).
Parameters:
arr: ndarray - Array to write to file
fid: str or file - Output file name or open file object
sep: str, optional - Separator between array items for text output
format: str, optional - Format string for text output
"""Operations for converting arrays to and from string and buffer representations.
def fromstring(string, dtype=float, count=-1, sep=''):
"""
Create an array from string data.
Parameters:
string: str - String containing array data
dtype: data-type, optional - Data type of the returned array
count: int, optional - Number of items to read from string
sep: str, optional - String separator between items
Returns:
ndarray: Array created from string data
"""
def tostring(arr, order='C'):
"""
Return array data as a string containing the raw bytes.
Parameters:
arr: ndarray - Input array
order: {'C', 'F', 'A'}, optional - Order of data bytes
Returns:
bytes: Raw bytes of array data
"""
def frombuffer(buffer, dtype=float, count=-1, offset=0):
"""
Interpret a buffer as a 1-dimensional array.
Parameters:
buffer: buffer_like - Object exposing buffer interface
dtype: data-type, optional - Data type of returned array
count: int, optional - Number of items to read from buffer
offset: int, optional - Start reading buffer from this offset
Returns:
ndarray: 1-D array from buffer data
"""
def tobytes(arr, order='C'):
"""
Return array data as bytes.
Parameters:
arr: ndarray - Input array
order: {'C', 'F', 'A'}, optional - Order of data bytes
Returns:
bytes: Array data as bytes
"""Functions for creating formatted string representations of arrays for display and debugging.
def array_str(a, max_line_width=None, precision=None, suppress_small=None):
"""
Return a string representation of an array.
Parameters:
a: ndarray - Input array
max_line_width: int, optional - Maximum characters per line
precision: int, optional - Floating point precision
suppress_small: bool, optional - Suppress small floating point values
Returns:
str: String representation of the array
"""
def array_repr(arr, max_line_width=None, precision=None, suppress_small=None):
"""
Return string representation of an array that can recreate the array.
Parameters:
arr: ndarray - Input array
max_line_width: int, optional - Maximum characters per line
precision: int, optional - Floating point precision
suppress_small: bool, optional - Suppress small floating point values
Returns:
str: String representation with constructor format
"""
def array2string(a, max_line_width=None, precision=None, suppress_small=None, separator=' ', prefix="", style=repr, formatter=None, threshold=None, edgeitems=None, sign=None, floatmode=None, suffix="", legacy=None):
"""
Return a string representation of an array with full control over formatting.
Parameters:
a: ndarray - Input array
max_line_width: int, optional - Maximum characters per line
precision: int, optional - Floating point precision
suppress_small: bool, optional - Suppress small floating point values
separator: str, optional - Separator between array elements
prefix: str, optional - Prefix string for each line except first
style: function, optional - Function to format scalar values
formatter: dict, optional - Dictionary of formatting functions by type
threshold: int, optional - Total array elements triggering summarization
edgeitems: int, optional - Number of items in summary at beginning/end
sign: str, optional - Controls sign printing ('-', '+', ' ')
floatmode: str, optional - Controls floating-point precision display
suffix: str, optional - Suffix string for each line except last
legacy: bool, optional - Use legacy printing mode
Returns:
str: Formatted string representation
"""
def format_float_positional(x, precision=None, unique=True, fractional=True, trim='k', sign=False, pad_left=None, pad_right=None):
"""
Format a floating-point scalar as a decimal string in positional notation.
Parameters:
x: float - Value to format
precision: int, optional - Maximum number of digits to print
unique: bool, optional - Use unique formatting that preserves value
fractional: bool, optional - Use fractional formatting
trim: str, optional - Trimming method ('k', '0', '.')
sign: bool, optional - Force sign display
pad_left: int, optional - Pad to this many characters on left
pad_right: int, optional - Pad to this many characters on right
Returns:
str: Formatted float string
"""
def format_float_scientific(x, precision=None, unique=True, trim='k', sign=False, pad_left=None, exp_digits=None):
"""
Format a floating-point scalar as a decimal string in scientific notation.
Parameters:
x: float - Value to format
precision: int, optional - Maximum number of digits to print
unique: bool, optional - Use unique formatting that preserves value
trim: str, optional - Trimming method ('k', '0', '.')
sign: bool, optional - Force sign display
pad_left: int, optional - Pad to this many characters on left
exp_digits: int, optional - Number of digits in exponent
Returns:
str: Formatted float string in scientific notation
"""Functions for configuring array printing and display options.
def set_printoptions(precision=None, threshold=None, edgeitems=None, linewidth=None, suppress=None, nanstr=None, infstr=None, formatter=None, sign=None, floatmode=None, legacy=None):
"""
Set printing options for arrays.
Parameters:
precision: int, optional - Number of digits for floating point output
threshold: int, optional - Total array elements triggering summarization
edgeitems: int, optional - Number of items in summary at beginning/end
linewidth: int, optional - Number of characters per line for array output
suppress: bool, optional - Suppress small floating point values
nanstr: str, optional - String representation of NaN values
infstr: str, optional - String representation of infinity values
formatter: dict, optional - Custom formatting functions by data type
sign: str, optional - Controls printing of sign for positive values
floatmode: str, optional - Controls floating-point precision display
legacy: bool, optional - Use NumPy 1.13 legacy printing mode
"""
def get_printoptions():
"""
Get current printing options for arrays.
Returns:
dict: Current print option settings
"""
def printoptions(**kwargs):
"""
Context manager for temporarily setting print options.
Parameters:
**kwargs: Print options to temporarily set
Returns:
context manager: Context for temporary print options
"""Specialized I/O operations for complex data structures and formats.
def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skip_header=0, skip_footer=0, converters=None, missing_values=None, filling_values=None, usecols=None, names=None, excludelist=None, deletechars=None, defaultfmt="f%i", autostrip=False, replace_space='_', case_sensitive=True, unpack=None, invalid_raise=True, max_rows=None, encoding='bytes'):
"""
Load data from a text file with enhanced handling of missing values.
Parameters:
fname: str or file - File to read data from
dtype: dtype, optional - Data type of the resulting array
comments: str, optional - Characters indicating start of comment
delimiter: str, optional - String used to separate values
skip_header: int, optional - Number of lines to skip at beginning
skip_footer: int, optional - Number of lines to skip at end
converters: dict, optional - Dictionary mapping column to converter function
missing_values: variable, optional - Set of strings corresponding to missing data
filling_values: variable, optional - Values to use for missing data
usecols: sequence, optional - Which columns to read
names: sequence, optional - Names for the columns
excludelist: sequence, optional - Names to exclude from field names
deletechars: str, optional - Characters to remove from field names
defaultfmt: str, optional - Format string for field names
autostrip: bool, optional - Strip whitespaces from values
replace_space: char, optional - Character to replace spaces in field names
case_sensitive: bool, optional - Whether field names are case sensitive
unpack: bool, optional - Return data in separate variables
invalid_raise: bool, optional - Raise exception for inconsistent columns
max_rows: int, optional - Maximum number of rows to read
encoding: str, optional - Encoding for input file
Returns:
ndarray: Array constructed from text file
"""
class DataSource:
"""
Generic data source for reading from files, URLs, and compressed archives.
Provides a unified interface for accessing data from various sources
including local files, remote URLs, and compressed formats.
"""
def __init__(self, destpath='.'):
"""
Parameters:
destpath: str, optional - Destination path for downloaded files
"""
def open(self, path, mode='r', encoding=None, newline=None):
"""
Open and return file-like object for path.
Parameters:
path: str - Path to file or URL
mode: str, optional - File open mode
encoding: str, optional - Text encoding
newline: str, optional - Newline handling
Returns:
file-like object: Opened file or stream
"""
def abspath(self, path):
"""
Return absolute path of file in the DataSource directory.
Parameters:
path: str - File path
Returns:
str: Absolute path
"""
def exists(self, path):
"""
Test if path exists.
Parameters:
path: str - Path to test
Returns:
bool: True if path exists
"""import cupy as cp
import numpy as np
# Create sample data
data = cp.random.rand(1000, 1000, dtype=cp.float32)
labels = cp.arange(1000)
# Save single array in binary format
cp.save('data_array.npy', data)
# Load single array
loaded_data = cp.load('data_array.npy')
print("Data loaded successfully:", cp.allclose(data, loaded_data))
# Save multiple arrays in compressed archive
cp.savez_compressed('dataset.npz',
features=data,
labels=labels,
metadata=cp.array([1000, 1000, 32]))
# Load multiple arrays from archive
archive = cp.load('dataset.npz')
print("Archive contents:", list(archive.keys()))
print("Features shape:", archive['features'].shape)
print("Labels shape:", archive['labels'].shape)
print("Metadata:", archive['metadata'])
# Cleanup
archive.close()

# Create and save data as text
matrix = cp.random.rand(10, 5)
# Save with custom formatting
cp.savetxt('matrix_data.txt', matrix,
fmt='%.6f', # 6 decimal places
delimiter=',', # Comma-separated
header='Generated random matrix data',
comments='# ')
# Load text data
loaded_matrix = cp.loadtxt('matrix_data.txt',
delimiter=',',
comments='#',
skiprows=1) # Skip header
print("Text data loaded, shape:", loaded_matrix.shape)
# Advanced text loading with column selection
data_with_labels = cp.column_stack([matrix, cp.arange(10)])
cp.savetxt('labeled_data.txt', data_with_labels,
fmt=['%.6f'] * 5 + ['%d'], # Different formats per column
delimiter='\t',
header='col1\tcol2\tcol3\tcol4\tcol5\tlabel')
# Load specific columns
features_only = cp.loadtxt('labeled_data.txt',
delimiter='\t',
usecols=range(5), # First 5 columns only
skiprows=1)
labels_only = cp.loadtxt('labeled_data.txt',
delimiter='\t',
usecols=[5], # Last column only
skiprows=1,
dtype=int)
print("Features shape:", features_only.shape)
print("Labels shape:", labels_only.shape)# Handle missing values with genfromtxt
sample_data = """# Sample dataset with missing values
1.0,2.0,3.0,A
4.0,,6.0,B
7.0,8.0,,C
,11.0,12.0,D
13.0,14.0,15.0,
"""
# Write sample file
with open('missing_data.csv', 'w') as f:
f.write(sample_data)
# Load with missing value handling
data = cp.genfromtxt('missing_data.csv',
delimiter=',',
dtype=None,
names=['col1', 'col2', 'col3', 'category'],
filling_values={'col1': 0.0, 'col2': -1.0, 'col3': 999.0},
encoding='utf-8')
print("Loaded structured data:")
print("Numeric columns:", data['col1'], data['col2'], data['col3'])
# Complex CSV processing
complex_csv = """# Weather data
Date,Temperature,Humidity,Pressure,Conditions
2023-01-01,15.5,65,1013.2,Sunny
2023-01-02,12.0,70,1015.1,Cloudy
2023-01-03,8.5,85,1008.7,Rainy
2023-01-04,18.0,55,1020.3,Clear
"""
with open('weather.csv', 'w') as f:
f.write(complex_csv)
# Load with converters for date processing
import datetime
def date_converter(date_string):
    """Convert an ISO 'YYYY-MM-DD' date to a proleptic Gregorian ordinal (int).

    genfromtxt hands converters *bytes* under the default encoding='bytes' but
    *str* when an explicit encoding (e.g. 'utf-8') is given. The original
    called .decode() unconditionally and crashed on str input — accept both.
    """
    if isinstance(date_string, bytes):
        date_string = date_string.decode()
    return datetime.datetime.strptime(date_string, '%Y-%m-%d').toordinal()
weather_data = cp.genfromtxt('weather.csv',
delimiter=',',
skip_header=2,
usecols=[1, 2, 3], # Skip date and conditions
names=['temp', 'humidity', 'pressure'])
print("Weather data shape:", weather_data.shape)
print("Temperature range:", cp.min(weather_data['temp']), "to", cp.max(weather_data['temp']))# Working with raw binary data
large_array = cp.random.rand(1000000).astype(cp.float32)
# Save as raw binary
with open('binary_data.bin', 'wb') as f:
f.write(large_array.tobytes())
# Load from binary file
loaded_binary = cp.fromfile('binary_data.bin', dtype=cp.float32)
print("Binary data loaded successfully:", cp.allclose(large_array, loaded_binary))
# Working with structured data
dt = cp.dtype([('x', cp.float32), ('y', cp.float32), ('id', cp.int32)])
structured_data = cp.zeros(1000, dtype=dt)
structured_data['x'] = cp.random.rand(1000)
structured_data['y'] = cp.random.rand(1000)
structured_data['id'] = cp.arange(1000)
# Save structured data
cp.save('structured_data.npy', structured_data)
loaded_structured = cp.load('structured_data.npy')
print("Structured data types:", loaded_structured.dtype)
print("Sample structured data:", loaded_structured[:3])
# Buffer operations
buffer_data = cp.arange(100, dtype=cp.int32)
byte_buffer = buffer_data.tobytes()
print("Buffer size:", len(byte_buffer), "bytes")
# Reconstruct from buffer
reconstructed = cp.frombuffer(byte_buffer, dtype=cp.int32)
print("Buffer reconstruction successful:", cp.array_equal(buffer_data, reconstructed))# String representations
array_2d = cp.random.rand(5, 5)
# Different string formats
print("Array string representation:")
print(cp.array_str(array_2d, precision=3, suppress_small=True))
print("\nArray repr (recreatable):")
print(cp.array_repr(array_2d, precision=3))
# Custom formatting
print("\nCustom formatted output:")
formatted = cp.array2string(array_2d,
precision=2,
separator=', ',
prefix=' ',
max_line_width=60)
print(formatted)
# String data conversion
string_data = "1.0 2.0 3.0 4.0 5.0"
array_from_string = cp.fromstring(string_data, sep=' ')
print("Array from string:", array_from_string)
# Comma-separated values
csv_string = "1.5,2.5,3.5,4.5"
csv_array = cp.fromstring(csv_string, sep=',')
print("Array from CSV string:", csv_array)# Configure print options
original_options = cp.get_printoptions()
print("Original print options:", original_options)
# Set custom print options
cp.set_printoptions(precision=3,
suppress=True,
threshold=50,
edgeitems=2,
linewidth=80)
large_array = cp.random.rand(100, 100) * 1000
print("Large array with custom formatting:")
print(large_array)
# Temporary print options using context manager
with cp.printoptions(precision=8, suppress=False):
small_array = cp.array([1e-10, 1e-5, 1.0, 1e5, 1e10])
print("High precision output:")
print(small_array)
print("Back to custom formatting:")
print(small_array)
# Float formatting examples
values = [cp.pi, cp.e, 1.23456789e-8, 1.23456789e8]
for val in values:
positional = cp.format_float_positional(val, precision=4)
scientific = cp.format_float_scientific(val, precision=4)
print(f"Value: {val}")
print(f" Positional: {positional}")
print(f" Scientific: {scientific}")
# Restore original options
cp.set_printoptions(**original_options)

# Using DataSource for flexible file access
datasource = cp.DataSource('data_cache')
# Example with local file
if datasource.exists('sample.txt'):
with datasource.open('sample.txt', 'r') as f:
content = f.read()
print("File content:", content[:100])
# Working with compressed files (conceptual example)
compressed_data = """
# This would typically be loaded from a .gz, .bz2, or .xz file
# DataSource automatically handles decompression
"""
# Advanced file format detection and handling
def load_flexible_format(filename):
    """Load array data from .npy, .npz, .txt, or .csv files automatically.

    Parameters:
        filename: str - Path whose extension selects the loading strategy
    Returns:
        ndarray for .npy/.txt/.csv and single-array .npz archives;
        dict of name -> array for multi-array .npz archives.
    Raises:
        ValueError: if the file extension is not recognized.
    """
    if filename.endswith('.npy'):
        return cp.load(filename)
    if filename.endswith('.npz'):
        archive = cp.load(filename)
        keys = list(archive.keys())
        # Unwrap single-entry archives for convenience; otherwise expose all.
        if len(keys) == 1:
            return archive[keys[0]]
        return dict(archive)
    if filename.endswith(('.txt', '.csv')):
        # Sniff the delimiter from the first line: comma, then tab,
        # else fall back to whitespace (delimiter=None).
        with open(filename, 'r') as f:
            first_line = f.readline()
        if ',' in first_line:
            delimiter = ','
        elif '\t' in first_line:
            delimiter = '\t'
        else:
            delimiter = None
        return cp.loadtxt(filename, delimiter=delimiter)
    # BUG FIX: the original f-string had no placeholder ("(unknown)"),
    # so the offending filename was never reported.
    raise ValueError(f"Unknown file format: {filename}")
# Performance-optimized I/O
def efficient_large_file_processing(filename, chunk_size=10000):
    """Process a large text file in fixed-size row chunks to bound memory use.

    Parameters:
        filename: str - Text file readable by cp.loadtxt
        chunk_size: int - Maximum number of rows loaded per chunk
    Returns:
        ndarray: Per-row means of all rows, concatenated across chunks.
    """
    results = []
    # BUG FIX: count rows inside a context manager — the original used
    # `sum(1 for line in open(filename))`, leaking the file handle.
    with open(filename) as f:
        total_lines = sum(1 for _ in f)
    chunks = (total_lines + chunk_size - 1) // chunk_size
    for i in range(chunks):
        skip_rows = i * chunk_size
        max_rows = min(chunk_size, total_lines - skip_rows)
        # ndmin=2 keeps a single-row final chunk 2-D so axis=1 stays valid
        # (loadtxt would otherwise return a 1-D array for one row).
        chunk_data = cp.loadtxt(filename,
                                skiprows=skip_rows,
                                max_rows=max_rows,
                                ndmin=2)
        # Example processing: per-row mean of each chunk.
        results.append(cp.mean(chunk_data, axis=1))
    return cp.concatenate(results)

# Memory mapping for large files
def process_large_dataset_efficiently(filename):
    """Process large datasets without loading entirely into GPU memory."""
    # Load metadata first: read the header and one sample row to inspect shape.
    with open(filename, 'r') as f:
        header = f.readline()
        sample_line = f.readline()
    # Determine dimensions and data type
    # NOTE(review): header and n_cols are computed but never used below — confirm
    # whether they were meant to drive dtype/column selection.
    n_cols = len(sample_line.split(','))
    # Process in batches
    batch_size = 50000
    batch_results = []
    skip_rows = 1 # Skip header
    while True:
        try:
            batch = cp.loadtxt(filename,
                delimiter=',',
                skiprows=skip_rows,
                max_rows=batch_size)
            if batch.size == 0:
                break
            # Process batch on GPU
            # NOTE(review): a final batch with exactly one row comes back 1-D from
            # loadtxt, so axis=1 would raise here — that error is then swallowed
            # by the except below. Consider ndmin=2 if this path matters.
            batch_result = cp.sum(batch ** 2, axis=1)
            batch_results.append(batch_result)
            skip_rows += batch_size
        except Exception as e:
            # NOTE(review): the broad except is used as the loop terminator
            # (loadtxt raising past EOF ends the loop), but it also hides real
            # errors such as malformed rows — only the message distinguishes them.
            print(f"Finished processing: {e}")
            break
    # Empty input (no successful batches) yields an empty array rather than raising.
    return cp.concatenate(batch_results) if batch_results else cp.array([])
# Streaming data processing
class StreamingDataProcessor:
    """Consume batches from a generator, reduce them on the GPU, and append
    the results incrementally to a text file."""

    def __init__(self, output_file):
        # Destination path for the incrementally written results.
        self.output_file = output_file
        # Running count of rows written across all batches.
        self.processed_count = 0

    def process_stream(self, data_generator):
        """Process streaming data and save incrementally."""
        with open(self.output_file, 'w') as out:
            out.write("# Processed streaming data\n")
            for batch in data_generator:
                # Move the batch to the GPU, compute per-row L2 norms,
                # then copy the result back to the host for text output.
                device_batch = cp.asarray(batch)
                row_norms = cp.sqrt(cp.sum(device_batch ** 2, axis=1))
                host_result = cp.asnumpy(row_norms)
                # Append this batch's results as one column of floats.
                cp.savetxt(out, host_result.reshape(-1, 1), fmt='%.6f')
                self.processed_count += len(host_result)
                print(f"Processed {self.processed_count} items")
# Usage with generator
def data_generator(total_items=100000, batch_size=1000):
    """Yield synthetic (rows, 10) float batches until total_items rows are produced."""
    import numpy as np
    produced = 0
    while produced < total_items:
        # Final batch may be smaller than batch_size.
        rows = min(batch_size, total_items - produced)
        yield np.random.rand(rows, 10)
        produced += rows
# Process streaming data
processor = StreamingDataProcessor('streaming_output.txt')
processor.process_stream(data_generator())

Input/output operations in CuPy provide comprehensive data exchange capabilities between GPU arrays and external storage systems, supporting various file formats, text processing, binary data handling, and memory-efficient processing patterns for large datasets while maintaining high performance and compatibility with NumPy I/O interfaces.
Install with Tessl CLI
npx tessl i tessl/pypi-cupy-cuda11x