Dask cuDF - A GPU Backend for Dask DataFrame providing GPU-accelerated parallel and larger-than-memory DataFrame computing
—
Read and write data in various formats with GPU acceleration and automatic cuDF backend integration. All I/O operations use the cuDF backend when dataframe.backend is set to "cudf", providing significant performance improvements for compatible data formats.
Read CSV files with GPU acceleration using cuDF's high-performance CSV parser with automatic type inference and memory-efficient streaming.
def read_csv(*args, **kwargs):
"""
Read CSV file(s) using cuDF backend.
Uses dask.dataframe.read_csv with cudf backend configured.
Supports all standard CSV reading options plus cuDF-specific optimizations.
Parameters:
- path: str or list - File path(s) to read
- **kwargs: Additional arguments passed to cudf.read_csv via Dask
Common Parameters:
- sep: str, default ',' - Field delimiter
- header: int, 'infer', or None, default 'infer' - Row to use as column names
- names: list, optional - Column names to use
- dtype: dict, optional - Data types for columns
- usecols: list, optional - Columns to read
- skiprows: int, optional - Rows to skip at start
- nrows: int, optional - Number of rows to read
- na_values: list, optional - Values to treat as NaN
Returns:
DataFrame - Dask-cuDF DataFrame with CSV data
Notes:
- Automatically uses cuDF backend when dataframe.backend="cudf"
- Supports remote filesystems via fsspec
- Optimized for large files with automatic partitioning
"""Read JSON files and JSON Lines format with GPU acceleration and efficient nested data handling.
Read JSON files and JSON Lines format with GPU acceleration and efficient nested data handling.
def read_json(*args, **kwargs):
"""
Read JSON file(s) using cuDF backend.
Uses dask.dataframe.read_json with cudf backend configured.
Supports both standard JSON and JSON Lines formats.
Parameters:
- path: str or list - File path(s) to read
- **kwargs: Additional arguments passed to cudf.read_json via Dask
Common Parameters:
- orient: str, default 'records' - JSON orientation
- lines: bool, default False - Read as JSON Lines format
- dtype: dict, optional - Data types for columns
- compression: str, optional - Compression type ('gzip', 'bz2', etc.)
Returns:
DataFrame - Dask-cuDF DataFrame with JSON data
Notes:
- JSON Lines format recommended for large datasets
- Supports nested JSON structures with automatic flattening
"""Read Parquet files with GPU acceleration using cuDF's optimized Parquet reader with column pruning and predicate pushdown.
Read Parquet files with GPU acceleration using cuDF's optimized Parquet reader with column pruning and predicate pushdown.
def read_parquet(*args, **kwargs):
"""
Read Parquet file(s) using cuDF backend.
Uses dask.dataframe.read_parquet with cudf backend configured.
Provides optimized reading with column selection and filtering.
Parameters:
- path: str or list - File path(s) or directory to read
- **kwargs: Additional arguments passed via Dask
Common Parameters:
- columns: list, optional - Columns to read (column pruning)
- filters: list, optional - Row filters for predicate pushdown
- engine: str, default 'cudf' - Parquet engine to use
- index: str or False, optional - Column to use as index
- storage_options: dict, optional - Filesystem options
Returns:
DataFrame - Dask-cuDF DataFrame with Parquet data
Notes:
- Automatically partitions based on Parquet file structure
- Supports nested column types and complex schemas
- Optimized for large datasets with efficient memory usage
"""Read ORC (Optimized Row Columnar) files with GPU acceleration for high-performance columnar data access.
Read ORC (Optimized Row Columnar) files with GPU acceleration for high-performance columnar data access.
def read_orc(*args, **kwargs):
"""
Read ORC file(s) using cuDF backend.
Uses dask.dataframe.read_orc with cudf backend configured.
Optimized for ORC's columnar format with GPU acceleration.
Parameters:
- path: str or list - File path(s) to read
- **kwargs: Additional arguments passed to cudf.read_orc via Dask
Common Parameters:
- columns: list, optional - Columns to read
- stripes: list, optional - Stripe indices to read
- skiprows: int, optional - Rows to skip
- num_rows: int, optional - Number of rows to read
Returns:
DataFrame - Dask-cuDF DataFrame with ORC data
Notes:
- Leverages ORC's built-in compression and encoding
- Supports complex nested data types
- Optimized stripe-level reading for large files
"""
def read_text(path, chunksize="256 MiB", **kwargs):
"""
Read text files using cuDF backend.
Available in both expression and legacy modes: expression mode dispatches to the
DataFrame.read_text method, while legacy mode uses a direct implementation.
Parameters:
- path: str or list - File path(s) to read
- chunksize: str or int, default "256 MiB" - Size of each partition
- **kwargs: Additional arguments passed to cudf.read_text
Common Parameters:
- delimiter: str - Text delimiter for parsing
- byte_range: tuple, optional - (offset, size) for reading specific byte range
Returns:
DataFrame - Dask-cuDF DataFrame with parsed text data
Notes:
- Conditional availability based on DASK_DATAFRAME__QUERY_PLANNING setting
- Supports large text files with automatic chunking
- Uses cuDF's optimized text parsing capabilities
"""Legacy I/O functions available through the dask_cudf.io module (deprecated in favor of top-level functions).
Legacy I/O functions available through the dask_cudf.io module (deprecated in favor of top-level functions).
# Deprecated - use dask_cudf.read_csv instead
dask_cudf.io.read_csv(*args, **kwargs)
# Deprecated - use dask_cudf.read_json instead
dask_cudf.io.read_json(*args, **kwargs)
# Deprecated - use dask_cudf.read_parquet instead
dask_cudf.io.read_parquet(*args, **kwargs)
# Deprecated - use dask_cudf.read_orc instead
dask_cudf.io.read_orc(*args, **kwargs)
# Deprecated - use DataFrame.to_parquet method instead
dask_cudf.io.to_parquet(df, path, **kwargs)
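For writing, the DataFrame methods are the supported path. A minimal Parquet write sketch; the input and output paths are hypothetical, and write_index is a standard Dask to_parquet keyword:
import dask_cudf
df = dask_cudf.read_parquet('input_parquet/')  # hypothetical input
# Preferred over the deprecated dask_cudf.io.to_parquet function
df.to_parquet('output_parquet/', write_index=False)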
def to_orc(df, path, **kwargs):
"""
Write DataFrame to ORC format.
DEPRECATED: This function is deprecated and will be removed.
Use DataFrame.to_orc method instead.
Parameters:
- df: DataFrame - DataFrame to write
- path: str - Output path
- **kwargs: Additional arguments
Raises:
NotImplementedError - Function is no longer supported
Notes:
- Legacy implementation available via dask_cudf._legacy.io.to_orc
- Recommended migration: df.to_orc(path, **kwargs)
"""import dask_cudf
import dask_cudf

# Read single CSV file
df = dask_cudf.read_csv('data.csv')
# Read multiple CSV files with pattern
df = dask_cudf.read_csv('data/*.csv')
# Read with specific options
df = dask_cudf.read_csv(
'data.csv',
dtype={'id': 'int64', 'value': 'float64'},
usecols=['id', 'value', 'category'],
skiprows=1
)
result = df.compute()

# Read Parquet with column selection and filtering
df = dask_cudf.read_parquet(
'data.parquet',
columns=['id', 'value', 'timestamp'],
filters=[('timestamp', '>=', '2023-01-01')]
)
# Process filtered data
summary = df.groupby('id')['value'].mean()
result = summary.compute()

# Read from S3 with storage options
df = dask_cudf.read_parquet(
's3://bucket/data/',
storage_options={
'key': 'access_key',
'secret': 'secret_key'
}
)
# Read JSON Lines from remote location
df = dask_cudf.read_json(
's3://bucket/jsonl_data/*.jsonl',
lines=True,
storage_options={'anon': True}
)

import dask
import dask.dataframe as dd
# Configure cuDF backend globally
dask.config.set({"dataframe.backend": "cudf"})
# Now standard Dask functions use cuDF backend
df = dd.read_csv('data.csv') # Automatically uses cuDF
result = df.groupby('category').sum().compute()  # GPU-accelerated

# Read large text files with automatic chunking
df = dask_cudf.read_text(
'large_text_file.txt',
delimiter='\n',
chunksize='128 MiB'
)
# Read with specific byte range
df_range = dask_cudf.read_text(
'data.txt',
delimiter='|',
byte_range=(1000, 5000)  # Read 5000 bytes starting at offset 1000
)
result = df.compute()

Install with Tessl CLI
npx tessl i tessl/pypi-dask-cudf