GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data
—
cuDF provides high-performance GPU I/O for popular data formats with automatic memory management and optimized readers/writers. All I/O operations leverage GPU memory directly, minimizing CPU-GPU data transfers.
# Core I/O functions
from cudf import read_csv, read_parquet, read_json
from cudf.io import read_orc, read_avro, read_feather, read_hdf, read_text
from cudf.io.csv import to_csv
from cudf.io.orc import to_orc
# Parquet utilities
from cudf.io.parquet import (
read_parquet_metadata, merge_parquet_filemetadata,
ParquetDatasetWriter, write_to_dataset
)
# ORC utilities
from cudf.io.orc import read_orc_metadata
# Interoperability
from cudf.io.dlpack import from_dlpack

High-performance CSV reading with extensive parsing options.
def read_csv(
filepath_or_buffer,
sep=',',
delimiter=None,
header='infer',
names=None,
index_col=None,
usecols=None,
dtype=None,
skiprows=None,
skipfooter=0,
nrows=None,
na_values=None,
keep_default_na=True,
na_filter=True,
skip_blank_lines=True,
parse_dates=False,
date_parser=None,
dayfirst=False,
compression='infer',
thousands=None,
decimal='.',
lineterminator=None,
quotechar='"',
quoting=0,
doublequote=True,
escapechar=None,
comment=None,
encoding='utf-8',
storage_options=None,
**kwargs
) -> DataFrame:
"""
Read CSV file directly into GPU memory with optimized parsing
Provides GPU-accelerated CSV parsing with extensive configuration options.
Automatically detects and handles various CSV formats and encodings.
Parameters:
filepath_or_buffer: str, PathLike, or file-like object
File path, URL, or buffer containing CSV data
sep: str, default ','
Field delimiter character
delimiter: str, optional
Alternative name for sep parameter
header: int, list of int, or 'infer', default 'infer'
Row number(s) to use as column names
names: list, optional
List of column names to use instead of header
index_col: int, str, or list, optional
Column(s) to use as row labels
usecols: list or callable, optional
Subset of columns to read
dtype: dict or str, optional
Data type specification for columns
skiprows: int, list, or callable, optional
Rows to skip at beginning of file
skipfooter: int, default 0
Number of rows to skip at end of file
nrows: int, optional
Maximum number of rows to read
na_values: scalar, str, list, or dict, optional
Additional strings to recognize as NA/NaN
keep_default_na: bool, default True
Whether to include default NaN values
na_filter: bool, default True
Whether to check for missing values
skip_blank_lines: bool, default True
Whether to skip blank lines
parse_dates: bool, list, or dict, default False
Columns to parse as dates
compression: str or dict, default 'infer'
Type of compression ('gzip', 'bz2', 'xz', 'zip', None)
encoding: str, default 'utf-8'
Character encoding to use
storage_options: dict, optional
Options for cloud storage access
**kwargs: additional keyword arguments
Other CSV parsing options
Returns:
DataFrame: GPU DataFrame containing parsed CSV data
Examples:
# Basic CSV reading
df = cudf.read_csv('data.csv')
# With custom options
df = cudf.read_csv(
'data.csv',
sep=';',
header=0,
dtype={'col1': 'int64', 'col2': 'float32'},
parse_dates=['date_column']
)
# From URL with compression
df = cudf.read_csv(
'https://example.com/data.csv.gz',
compression='gzip'
)
"""def to_csv(
path_or_buf=None,
sep=',',
na_rep='',
float_format=None,
columns=None,
header=True,
index=True,
index_label=None,
mode='w',
encoding=None,
compression='infer',
quoting=None,
quotechar='"',
line_terminator=None,
chunksize=None,
date_format=None,
doublequote=True,
escapechar=None,
decimal='.',
**kwargs
):
"""
Write GPU DataFrame to CSV format
High-performance CSV writing with customizable formatting options.
Writes directly from GPU memory with minimal data transfers.
Parameters:
path_or_buf: str, path object, or file-like object
File path or object to write to
sep: str, default ','
Field delimiter character
na_rep: str, default ''
String representation of NaN values
float_format: str, optional
Format string for floating point numbers
columns: sequence, optional
Columns to write
header: bool or list of str, default True
Write column names as header
index: bool, default True
Write row names (index)
mode: str, default 'w'
File mode ('w' for write, 'a' for append)
compression: str or dict, default 'infer'
Compression type ('gzip', 'bz2', 'xz', 'zstd', etc.)
**kwargs: additional keyword arguments
Other CSV writing options
Examples:
# Basic CSV writing
df.to_csv('output.csv')
# Custom formatting
df.to_csv('output.csv', sep=';', index=False, float_format='%.2f')
# Compressed output
df.to_csv('output.csv.gz', compression='gzip')
"""Optimized Apache Parquet support with metadata handling and dataset operations.
def read_parquet(
path,
engine='cudf',
columns=None,
filters=None,
row_groups=None,
use_pandas_metadata=True,
storage_options=None,
bytes_per_thread=None,
**kwargs
) -> DataFrame:
"""
Read Apache Parquet file(s) directly into GPU memory
High-performance Parquet reader with predicate pushdown, column pruning,
and automatic schema detection. Supports single files, directories, and
cloud storage locations.
Parameters:
path: str, PathLike, or list
File path, directory, or list of files to read
engine: str, default 'cudf'
Parquet engine to use ('cudf' for GPU acceleration)
columns: list, optional
Specific columns to read (column pruning)
filters: list of tuples, optional
Row filter predicates for predicate pushdown.
NOTE(review): presumably pyarrow-style DNF predicates (list of tuples,
or list of lists of tuples for OR groups) — confirm against cuDF docs.
row_groups: list, optional
Specific row groups to read
use_pandas_metadata: bool, default True
Whether to use pandas metadata for schema information
storage_options: dict, optional
Options for cloud storage (S3, GCS, Azure)
bytes_per_thread: int, optional
Bytes to read per thread for parallel I/O
**kwargs: additional arguments
Engine-specific options
Returns:
DataFrame: GPU DataFrame with Parquet data
Examples:
# Basic Parquet reading
df = cudf.read_parquet('data.parquet')
# Column pruning and filtering
df = cudf.read_parquet(
'data.parquet',
columns=['col1', 'col2', 'col3'],
filters=[('col1', '>', 100), ('col2', '==', 'value')]
)
# Multiple files
df = cudf.read_parquet(['file1.parquet', 'file2.parquet'])
# From cloud storage
df = cudf.read_parquet(
's3://bucket/path/data.parquet',
storage_options={'key': 'access_key', 'secret': 'secret_key'}
)
"""
def read_parquet_metadata(path, **kwargs) -> object:
"""
Read metadata from Parquet file without loading data
Extracts schema information, row group statistics, and file metadata
for query planning and data exploration without full data loading.
Parameters:
path: str or PathLike
Path to Parquet file
**kwargs: additional arguments
Storage and engine options
Returns:
object: Parquet metadata object with schema and statistics
NOTE(review): recent cuDF versions return a tuple
(num_rows, num_row_groups, column_names, ...) rather than an object
with attributes — verify the attribute-style example below against
the installed cuDF version.
Examples:
# Read metadata only
metadata = cudf.io.parquet.read_parquet_metadata('data.parquet')
print(f"Rows: {metadata.num_rows}")
print(f"Columns: {len(metadata.schema)}")
"""
def merge_parquet_filemetadata(metadata_list) -> object:
"""
Merge multiple Parquet file metadata objects
Combines metadata from multiple Parquet files for unified schema
and statistics. Useful for dataset-level operations.
Parameters:
metadata_list: list
List of Parquet metadata objects to merge
Returns:
object: Merged Parquet metadata object
Examples:
# Merge metadata from multiple files
meta1 = cudf.io.parquet.read_parquet_metadata('file1.parquet')
meta2 = cudf.io.parquet.read_parquet_metadata('file2.parquet')
merged = cudf.io.parquet.merge_parquet_filemetadata([meta1, meta2])
"""class ParquetDatasetWriter:
"""
Writer for partitioned Parquet datasets
Manages writing DataFrames to partitioned Parquet datasets with
automatic directory structure creation and metadata management.
Parameters:
path: str or PathLike
Root directory for the dataset
partition_cols: list, optional
Columns to use for dataset partitioning
**kwargs: additional arguments
Writer configuration options
Methods:
write_table(table, **kwargs): Write table to dataset
close(): Finalize dataset and write metadata
Examples:
# Create partitioned dataset writer
writer = cudf.io.parquet.ParquetDatasetWriter(
'/path/to/dataset',
partition_cols=['year', 'month']
)
# Write data in chunks
for chunk in data_chunks:
writer.write_table(chunk)
writer.close()
"""
def write_to_dataset(
df,
root_path,
partition_cols=None,
preserve_index=False,
storage_options=None,
**kwargs
) -> None:
"""
Write DataFrame to partitioned Parquet dataset
Creates partitioned Parquet dataset with automatic directory structure
based on partition columns. Supports cloud storage destinations.
Parameters:
df: DataFrame
cuDF DataFrame to write
root_path: str or PathLike
Root directory for dataset
partition_cols: list, optional
Columns to use for partitioning
preserve_index: bool, default False
Whether to write index as column
storage_options: dict, optional
Cloud storage configuration
**kwargs: additional arguments
Writer options (compression, etc.)
Examples:
# Write partitioned dataset
cudf.io.parquet.write_to_dataset(
df,
'/path/to/dataset',
partition_cols=['year', 'category'],
compression='snappy'
)
"""Flexible JSON reading with support for various JSON formats.
def read_json(
path_or_buf,
orient='records',
typ='frame',
dtype=None,
lines=False,
compression='infer',
storage_options=None,
**kwargs
) -> DataFrame:
"""
Read JSON data directly into GPU memory
Supports various JSON formats including line-delimited JSON (JSONL),
nested JSON structures, and automatic schema inference.
Parameters:
path_or_buf: str, PathLike, or file-like object
JSON data source (file, URL, or buffer)
orient: str, default 'records'
JSON structure format ('records', 'index', 'values', 'split')
typ: str, default 'frame'
Type of object to return ('frame' for DataFrame)
dtype: dict or str, optional
Data type specification for columns
lines: bool, default False
Whether to read line-delimited JSON
compression: str, default 'infer'
Compression type ('gzip', 'bz2', 'xz', None)
storage_options: dict, optional
Cloud storage configuration
**kwargs: additional arguments
JSON parsing options
Returns:
DataFrame: GPU DataFrame containing JSON data
Examples:
# Read JSON file
df = cudf.read_json('data.json')
# Line-delimited JSON
df = cudf.read_json('data.jsonl', lines=True)
# With compression
df = cudf.read_json('data.json.gz', compression='gzip')
# From URL
df = cudf.read_json('https://api.example.com/data.json')
"""Apache ORC format support with metadata utilities.
def read_orc(
path,
columns=None,
filters=None,
stripes=None,
skiprows=None,
num_rows=None,
use_index=True,
storage_options=None,
**kwargs
) -> DataFrame:
"""
Read Apache ORC file directly into GPU memory
High-performance ORC reader with predicate pushdown and column pruning.
Supports compressed ORC files and cloud storage.
Parameters:
path: str or PathLike
Path to ORC file
columns: list, optional
Specific columns to read
filters: list of tuples, optional
Row filter predicates.
NOTE(review): presumably the same DNF tuple convention as
read_parquet — confirm against cuDF docs.
stripes: list, optional
Specific ORC stripes to read
skiprows: int, optional
Number of rows to skip
num_rows: int, optional
Maximum rows to read
use_index: bool, default True
Whether to use ORC file index
storage_options: dict, optional
Cloud storage options
**kwargs: additional arguments
Reader configuration
Returns:
DataFrame: GPU DataFrame with ORC data
Examples:
# Basic ORC reading
df = cudf.read_orc('data.orc')
# With column pruning and filtering
df = cudf.read_orc(
'data.orc',
columns=['col1', 'col2'],
filters=[('col1', '>', 0)]
)
"""
def read_orc_metadata(path, **kwargs) -> object:
"""
Read metadata from ORC file without loading data
Extracts schema, stripe information, and statistics for
query planning and data exploration.
Parameters:
path: str or PathLike
Path to ORC file
**kwargs: additional arguments
Reader options
Returns:
object: ORC metadata with schema and statistics
Examples:
# Read ORC metadata
metadata = cudf.io.orc.read_orc_metadata('data.orc')
print(f"Stripes: {len(metadata.stripes)}")
"""def to_orc(
path,
compression='snappy',
enable_statistics=True,
stripe_size_bytes=None,
stripe_size_rows=None,
row_index_stride=None,
**kwargs
):
"""
Write GPU DataFrame to Apache ORC format
High-performance ORC writing with compression and statistical metadata.
Writes directly from GPU memory with configurable stripe organization.
Parameters:
path: str or PathLike
Output path for ORC file
compression: str, default 'snappy'
Compression algorithm ('snappy', 'zlib', 'lz4', 'zstd', None)
enable_statistics: bool, default True
Whether to compute column statistics
stripe_size_bytes: int, optional
Target stripe size in bytes
stripe_size_rows: int, optional
Target stripe size in rows
row_index_stride: int, optional
Row group index stride
**kwargs: additional keyword arguments
Other ORC writing options
Examples:
# Basic ORC writing
df.to_orc('output.orc')
# With compression
df.to_orc('output.orc', compression='zlib')
# Custom stripe configuration
df.to_orc('output.orc', stripe_size_rows=50000)
"""Apache Avro format support for schema evolution and serialization.
def read_avro(
filepath_or_buffer,
columns=None,
skiprows=None,
num_rows=None,
storage_options=None,
**kwargs
) -> DataFrame:
"""
Read Apache Avro file directly into GPU memory
Reads Avro files with automatic schema detection and type conversion.
Supports compressed Avro files and nested data structures.
Parameters:
filepath_or_buffer: str, PathLike, or file-like object
Avro data source
columns: list, optional
Specific columns to read
skiprows: int, optional
Number of rows to skip at beginning
num_rows: int, optional
Maximum number of rows to read
storage_options: dict, optional
Cloud storage configuration
**kwargs: additional arguments
Avro reader options
Returns:
DataFrame: GPU DataFrame with Avro data
Examples:
# Read Avro file
df = cudf.read_avro('data.avro')
# With column selection
df = cudf.read_avro('data.avro', columns=['col1', 'col2'])
"""Apache Arrow Feather format for fast serialization.
def read_feather(
path,
columns=None,
use_threads=True,
storage_options=None,
**kwargs
) -> DataFrame:
"""
Read Apache Feather format file into GPU memory
Fast binary format based on Apache Arrow for efficient DataFrame
serialization with preserved data types and metadata.
Parameters:
path: str or PathLike
Path to Feather file
columns: list, optional
Subset of columns to read
use_threads: bool, default True
Whether to use threading for parallel I/O
storage_options: dict, optional
Cloud storage options
**kwargs: additional arguments
Reader configuration
Returns:
DataFrame: GPU DataFrame with Feather data
Examples:
# Read Feather file
df = cudf.read_feather('data.feather')
# Column selection
df = cudf.read_feather('data.feather', columns=['A', 'B'])
"""HDF5 format support for scientific and numerical data.
def read_hdf(
path_or_buf,
key=None,
mode='r',
columns=None,
start=None,
stop=None,
**kwargs
) -> DataFrame:
"""
Read HDF5 file into GPU memory
Reads HDF5 datasets with support for hierarchical data organization
and partial reading of large datasets.
Parameters:
path_or_buf: str, PathLike, or file-like object
HDF5 file source
key: str, optional
HDF5 group/dataset key to read
mode: str, default 'r'
File access mode
columns: list, optional
Subset of columns to read
start: int, optional
Starting row position
stop: int, optional
Ending row position
**kwargs: additional arguments
HDF5 reader options
Returns:
DataFrame: GPU DataFrame with HDF5 data
Examples:
# Read HDF5 dataset
df = cudf.read_hdf('data.h5', key='dataset1')
# Partial reading
df = cudf.read_hdf('data.h5', key='dataset1', start=1000, stop=2000)
"""Raw text file reading for unstructured data processing.
def read_text(
filepath_or_buffer,
delimiter=None,
dtype='str',
lineterminator='\n',
skiprows=0,
skipfooter=0,
nrows=None,
na_values=None,
keep_default_na=True,
na_filter=True,
storage_options=None,
**kwargs
) -> DataFrame:
"""
Read raw text file line by line into GPU memory
Reads unstructured text data with each line as a DataFrame row.
Useful for log files, natural language processing, and custom parsing.
Parameters:
filepath_or_buffer: str, PathLike, or file-like object
Text file source
delimiter: str, optional
Line delimiter (default: newline)
dtype: str, default 'str'
Data type for text data
lineterminator: str, default '\n'
Line termination character
skiprows: int, default 0
Number of rows to skip at beginning
skipfooter: int, default 0
Number of rows to skip at end
nrows: int, optional
Maximum number of lines to read
na_values: list, optional
Values to treat as missing
keep_default_na: bool, default True
Whether to include default NA values
na_filter: bool, default True
Whether to check for missing values
storage_options: dict, optional
Cloud storage configuration
**kwargs: additional arguments
Text reader options
Returns:
DataFrame: GPU DataFrame with one column containing text lines
Examples:
# Read text file
df = cudf.read_text('logfile.txt')
# With line limits
df = cudf.read_text('data.txt', nrows=1000)
"""def from_dlpack(dlpack_tensor) -> Union[DataFrame, Series]:
"""
Create cuDF object from DLPack tensor
Enables zero-copy data sharing between cuDF and other GPU libraries
that support the DLPack standard (PyTorch, CuPy, JAX, etc.).
Parameters:
dlpack_tensor: DLPack tensor object
GPU tensor in DLPack format
Returns:
Union[DataFrame, Series]: cuDF object sharing memory with tensor
Examples:
# From PyTorch tensor
import torch
tensor = torch.cuda.FloatTensor([1, 2, 3, 4])
series = cudf.io.dlpack.from_dlpack(tensor.__dlpack__())
# From CuPy array
import cupy
array = cupy.array([1.0, 2.0, 3.0])
series = cudf.io.dlpack.from_dlpack(array.toDlpack())
"""All cuDF DataFrames include write methods for various formats:
# CSV writing
df.to_csv('output.csv', index=False)
# Parquet writing
df.to_parquet('output.parquet', compression='snappy')
# JSON writing
df.to_json('output.json', orient='records', lines=True)
# ORC writing
df.to_orc('output.orc', compression='zlib')
# Feather writing
df.to_feather('output.feather')
# HDF5 writing
df.to_hdf('output.h5', key='dataset', mode='w')

All I/O functions support cloud storage through storage_options:
# Amazon S3
s3_options = {
'key': 'access_key_id',
'secret': 'secret_access_key',
'token': 'session_token' # optional
}
df = cudf.read_parquet('s3://bucket/path/data.parquet',
storage_options=s3_options)
# Google Cloud Storage
gcs_options = {
'token': 'path/to/service_account.json'
}
df = cudf.read_csv('gs://bucket/data.csv', storage_options=gcs_options)
# Azure Blob Storage
azure_options = {
'account_name': 'storage_account',
'account_key': 'account_key'
}
df = cudf.read_json('abfs://container/data.json',
storage_options=azure_options)

Install with Tessl CLI
npx tessl i tessl/pypi-cudf-cu12