GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data
npx @tessl/cli install tessl/pypi-cudf-cu12@25.8.0

cuDF is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating data. cuDF provides a pandas-like API that will be familiar to data engineers and data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming.
Package name: cudf-cu12
Import name: cudf
Installation: pip install cudf-cu12 or conda install cudf

# Main data structures
import cudf
from cudf import DataFrame, Series, Index
# I/O operations
from cudf import read_csv, read_parquet, read_json
from cudf.io import read_orc, read_avro, read_feather
# Data manipulation
from cudf import concat, merge, pivot_table
from cudf import cut, factorize, unique
# Type checking
from cudf.api.types import is_numeric_dtype, is_categorical_dtype
from cudf.api.types import dtype
# Configuration
from cudf.options import get_option, set_option
# Dataset generation
from cudf.datasets import timeseries, randomdata
# Version information
import cudf
print(cudf.__version__) # Package version

# Create DataFrame from dictionary
df = cudf.DataFrame({
'x': [1, 2, 3, 4, 5],
'y': [1.0, 2.5, 3.2, 4.1, 5.8],
'z': ['red', 'green', 'blue', 'red', 'green']
})
# GPU-accelerated operations
result = df.groupby('z').agg({'x': 'sum', 'y': 'mean'})
# I/O operations leverage GPU memory
df_from_file = cudf.read_parquet('data.parquet')
df_from_file.to_csv('output.csv')
# Seamless pandas compatibility
df_pandas = df.to_pandas() # Move to CPU
df_cudf = cudf.from_pandas(df_pandas) # Move to GPU

cuDF leverages the RAPIDS ecosystem to provide GPU-accelerated data processing:
cuDF provides GPU-accelerated versions of pandas' core data structures with enhanced capabilities.
class DataFrame:
"""GPU-accelerated DataFrame with pandas-like API"""
class Series:
"""One-dimensional GPU array with axis labels"""
class Index:
"""Immutable sequence used for axis labels and selection"""
class RangeIndex(Index):
"""Memory-efficient index for integer ranges"""
class CategoricalIndex(Index):
"""Index for categorical data with GPU acceleration"""

Key Features: GPU memory efficiency, nested data types (lists, structs), decimal precision support.
→ Learn more about Core Data Structures
High-performance GPU I/O for popular data formats with automatic memory management.
def read_parquet(filepath_or_buffer, columns=None, **kwargs) -> DataFrame:
"""
Read Apache Parquet file directly into GPU memory
Parameters:
filepath_or_buffer: File path, URL, or buffer-like object
columns: List[str], optional column subset to read
**kwargs: Additional parquet reading options
Returns:
DataFrame: GPU-accelerated DataFrame
"""
def read_csv(filepath_or_buffer, **kwargs) -> DataFrame:
"""
Read CSV file with GPU acceleration
Parameters:
filepath_or_buffer: File path or buffer
**kwargs: CSV parsing options (delimiter, header, etc.)
Returns:
DataFrame: GPU DataFrame with parsed CSV data
"""

Supported Formats: Parquet, ORC, CSV, JSON, Avro, Feather, HDF5, raw text files.
→ Learn more about I/O Operations
GPU-accelerated operations for reshaping, joining, and transforming data.
def concat(objs, axis=0, ignore_index=False, **kwargs) -> Union[DataFrame, Series]:
"""
Concatenate cuDF objects along a particular axis
Parameters:
objs: Sequence of DataFrame/Series objects
axis: int, axis to concatenate along (0='index', 1='columns')
ignore_index: bool, reset index if True
Returns:
Union[DataFrame, Series]: Concatenated result
"""
def merge(left, right, how='inner', on=None, **kwargs) -> DataFrame:
"""
Merge DataFrame objects with database-style join operations
Parameters:
left: DataFrame, left object to merge
right: DataFrame, right object to merge
how: str, type of merge ('inner', 'outer', 'left', 'right')
on: label or list, column names to join on
Returns:
DataFrame: Merged DataFrame
"""

Operations: Concatenation, merging, pivoting, melting, groupby, aggregation, sorting.
→ Learn more about Data Manipulation
Comprehensive type checking system for GPU data types including nested types.
def is_numeric_dtype(arr_or_dtype) -> bool:
"""
Check whether the provided array or dtype is numeric
Parameters:
arr_or_dtype: Array-like or data type to check
Returns:
bool: True if numeric dtype
"""
def is_categorical_dtype(arr_or_dtype) -> bool:
"""
Check whether the array or dtype is categorical
Parameters:
arr_or_dtype: Array-like or data type to check
Returns:
bool: True if categorical dtype
"""

Type Support: Standard dtypes, categorical, decimal, list, struct, interval, datetime types.
→ Learn more about Type Checking
Drop-in acceleration for existing pandas code with cudf.pandas.
def install() -> None:
"""
Enable cuDF pandas accelerator mode
Automatically accelerates pandas operations with GPU when beneficial,
falls back to CPU pandas for unsupported operations.
"""
class Profiler:
"""
Performance profiler for pandas acceleration opportunities
Analyzes pandas code execution to identify GPU acceleration potential
"""

Features: Automatic fallback, transparent acceleration, performance profiling, IPython magic commands.
→ Learn more about Pandas Compatibility
GPU-aware testing framework with specialized assertions for cuDF objects.
def assert_frame_equal(left, right, check_dtype=True, **kwargs) -> None:
"""
Assert DataFrame equality with GPU-aware comparison
Parameters:
left: DataFrame, expected result
right: DataFrame, actual result
check_dtype: bool, whether to check dtype compatibility
**kwargs: Additional comparison options
"""

Capabilities: DataFrame/Series/Index comparison, GPU memory validation, performance assertions.
→ Learn more about Testing Utilities
Global configuration system for controlling GPU memory usage and behavior.
def get_option(key: str) -> Any:
"""
Get the value of a configuration option
Parameters:
key: str, configuration option key
Returns:
Any: Current option value
"""
def set_option(key: str, value: Any) -> None:
"""
Set a configuration option value
Parameters:
key: str, configuration option key
value: Any, new option value
"""

Options: Memory management, display formatting, computation behavior, I/O settings.
Specialized error types for GPU-specific issues and mixed-type operations.
class UnsupportedCUDAError(Exception):
"""Raised when CUDA functionality is not supported"""
class MixedTypeError(Exception):
"""Raised when mixing incompatible GPU and CPU types"""

Utilities for generating test data and benchmarking datasets directly in GPU memory.
def timeseries(
start='2000-01-01',
end='2000-01-31',
freq='1s',
dtypes=None,
nulls_frequency=0,
seed=None
) -> DataFrame:
"""
Generate random timeseries data for testing and benchmarking
Parameters:
start: str or datetime-like, start date
end: str or datetime-like, end date
freq: str, date frequency string (e.g., '1s', '1H', '1D')
dtypes: dict, mapping of column names to types
nulls_frequency: float, proportion of nulls to include (0-1)
seed: int, random state seed for reproducibility
Returns:
DataFrame: GPU DataFrame with random timeseries data
"""
def randomdata(nrows=10, dtypes=None, seed=None) -> DataFrame:
"""
Generate random data for testing and benchmarking
Parameters:
nrows: int, number of rows to generate
dtypes: dict, mapping of column names to types
seed: int, random state seed for reproducibility
Returns:
DataFrame: GPU DataFrame with random data
"""

Access package version and build information programmatically.
import cudf
# Package version string
__version__ = cudf.__version__ # e.g., "25.8.0"
# Git commit hash (if available)
__git_commit__ = cudf.__git_commit__ # e.g., "6cea3743b6"