GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data
—
cuDF provides GPU-accelerated operations for reshaping, joining, aggregating, and transforming data. All operations leverage GPU parallelism for optimal performance on large datasets.
# Core manipulation functions
from cudf import concat, merge, pivot, pivot_table, melt, crosstab
from cudf import unstack, get_dummies
# Algorithm functions
from cudf import factorize, unique, cut
# Time/date operations
from cudf import date_range, to_datetime, interval_range, DateOffset
from cudf import to_numeric
# Groupby operations
from cudf import Grouper, NamedAgg

# Combine cuDF objects along axes with flexible alignment and indexing options.
def concat(
    objs,
    axis=0,
    join='outer',
    ignore_index=False,
    keys=None,
    levels=None,
    names=None,
    verify_integrity=False,
    sort=False,
    copy=True
) -> Union[DataFrame, Series]:
    """
    Concatenate cuDF objects along a particular axis with GPU acceleration.

    Efficiently combines multiple DataFrames or Series along rows or columns
    with flexible joining and indexing options. GPU-optimized for large datasets.

    Parameters:
        objs: sequence of DataFrame, Series, or dict
            Objects to concatenate (list, tuple, or dict of objects)
        axis: int or str, default 0
            Axis to concatenate along (0/'index' for rows, 1/'columns' for columns)
        join: str, default 'outer'
            How to handle indexes on other axis ('inner' or 'outer')
        ignore_index: bool, default False
            If True, reset index to default integer index
        keys: sequence, optional
            Construct hierarchical index using keys as outermost level
        levels: list of sequences, optional
            Specific levels to use for MultiIndex construction
        names: list, optional
            Names for levels in resulting hierarchical index
        verify_integrity: bool, default False
            Check whether new concatenated axis contains duplicates
        sort: bool, default False
            Sort non-concatenation axis if not already aligned
        copy: bool, default True
            Always copy data; set False to avoid copies where possible
            (original wording was garbled — mirrors the `copy` parameter of
            ``merge``; confirm against the cuDF reference)

    Returns:
        Union[DataFrame, Series]: Concatenated result of same type as input objects

    Examples:
        >>> df1 = cudf.DataFrame({'A': [1, 2], 'B': [3, 4]})
        >>> df2 = cudf.DataFrame({'A': [5, 6], 'B': [7, 8]})
        >>> result = cudf.concat([df1, df2])  # 4 rows, 2 columns
        >>> df3 = cudf.DataFrame({'C': [9, 10], 'D': [11, 12]})
        >>> result = cudf.concat([df1, df3], axis=1)  # 2 rows, 4 columns
        >>> result = cudf.concat([df1, df2], keys=['first', 'second'])
        >>> result = cudf.concat([df1, df2], ignore_index=True)
    """

# Database-style join operations with various merge strategies and optimizations.
def merge(
    left,
    right,
    how='inner',
    on=None,
    left_on=None,
    right_on=None,
    left_index=False,
    right_index=False,
    sort=False,
    suffixes=('_x', '_y'),
    copy=True,
    indicator=False,
    validate=None,
    method='hash'
) -> DataFrame:
    """
    Merge DataFrame objects with database-style join operations.

    High-performance GPU joins with automatic optimization and support
    for various join algorithms. Handles large datasets efficiently.

    Parameters:
        left: DataFrame
            Left DataFrame to merge
        right: DataFrame
            Right DataFrame to merge
        how: str, default 'inner'
            Type of merge ('left', 'right', 'outer', 'inner', 'cross')
        on: label or list, optional
            Column or index level names to join on (must exist in both objects)
        left_on: label or list, optional
            Column or index level names to join on in left DataFrame
        right_on: label or list, optional
            Column or index level names to join on in right DataFrame
        left_index: bool, default False
            Use left DataFrame's index as join key
        right_index: bool, default False
            Use right DataFrame's index as join key
        sort: bool, default False
            Sort join keys lexicographically in result
        suffixes: tuple of str, default ('_x', '_y')
            Suffixes to apply to overlapping column names
        copy: bool, default True
            Always copy data; set False to avoid copies when possible
        indicator: bool or str, default False
            Add column indicating source of each row
        validate: str, optional
            Check uniqueness of merge keys ('one_to_one', 'one_to_many', etc.)
        method: str, default 'hash'
            Join algorithm ('hash', 'sort')

    Returns:
        DataFrame: Merged DataFrame combining left and right

    Examples:
        >>> left = cudf.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
        >>> right = cudf.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})
        >>> result = cudf.merge(left, right, on='key')  # Returns A, B rows
        >>> result = cudf.merge(left, right, left_on='key', right_on='key', how='left')
        >>> result = cudf.merge(df1, df2, on=['key1', 'key2'], how='outer')
        >>> result = cudf.merge(left, right, left_index=True, right_index=True, how='inner')
    """

# Transform data layout between wide and long formats with pivoting and melting.
def pivot(
    data,
    index=None,
    columns=None,
    values=None
) -> DataFrame:
    """
    Reshape a DataFrame from long to wide format.

    Produces a new DataFrame whose column labels come from the unique values
    of ``columns``, with cells populated from ``values``. Runs on the GPU for
    fast reshaping of large inputs.

    Parameters:
        data: DataFrame
            Long-format input to reshape
        index: str, list, or array, optional
            Column(s) that become the new DataFrame's index
        columns: str, list, or array
            Column(s) whose values become the new column labels
        values: str, list, or array, optional
            Column(s) used to fill the reshaped DataFrame's cells

    Returns:
        DataFrame: Wide-format (pivoted) DataFrame

    Examples:
        >>> df = cudf.DataFrame({
        ...     'date': ['2023-01', '2023-01', '2023-02', '2023-02'],
        ...     'variable': ['A', 'B', 'A', 'B'],
        ...     'value': [1, 2, 3, 4]
        ... })
        >>> result = cudf.pivot(df, index='date', columns='variable', values='value')
        >>> result = cudf.pivot(df, columns='variable', values='value')
    """
def pivot_table(
    data,
    values=None,
    index=None,
    columns=None,
    aggfunc='mean',
    fill_value=None,
    margins=False,
    dropna=True,
    margins_name='All',
    sort=True
) -> DataFrame:
    """
    Build a spreadsheet-style pivot table with aggregation.

    A generalized pivot: groups ``data`` by the given index/column keys and
    applies one or more aggregation functions to the grouped values. Missing
    cells can be filled, and margin subtotals can be appended.

    Parameters:
        data: DataFrame
            Input DataFrame to summarize
        values: str, list, or array, optional
            Column(s) to aggregate
        index: str, list, or array, optional
            Keys used for the pivot table's rows
        columns: str, list, or array, optional
            Keys used for the pivot table's columns
        aggfunc: function, list, dict, default 'mean'
            Aggregation function(s) to apply ('mean', 'sum', 'count', etc.)
        fill_value: scalar, optional
            Replacement for missing cells in the result
        margins: bool, default False
            Append row/column subtotals
        dropna: bool, default True
            Drop columns whose values are all NaN
        margins_name: str, default 'All'
            Label for the margins row/column
        sort: bool, default True
            Sort the result by its index/columns

    Returns:
        DataFrame: Pivot table of aggregated values

    Examples:
        >>> df = cudf.DataFrame({
        ...     'A': ['foo', 'foo', 'bar', 'bar'],
        ...     'B': ['one', 'two', 'one', 'two'],
        ...     'C': [1, 2, 3, 4],
        ...     'D': [10, 20, 30, 40]
        ... })
        >>> table = cudf.pivot_table(df, values='C', index='A', columns='B', aggfunc='sum')
        >>> table = cudf.pivot_table(df, values='C', index='A', columns='B',
        ...                          aggfunc=['sum', 'mean', 'count'])
        >>> table = cudf.pivot_table(df, values='C', index='A', columns='B', margins=True)
    """
def melt(
    frame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name='value',
    col_level=None,
    ignore_index=True
) -> DataFrame:
    """
    Unpivot DataFrame from wide to long format (reverse of pivot).

    Transforms columns into rows by "melting" the DataFrame. Useful for
    converting wide-format data to long format for analysis.

    Parameters:
        frame: DataFrame
            DataFrame to melt
        id_vars: list of str, optional
            Column(s) to use as identifier variables
        value_vars: list of str, optional
            Column(s) to unpivot (default: all columns not in id_vars)
        var_name: str, optional
            Name for variable column (default: 'variable')
        value_name: str, default 'value'
            Name for value column
        col_level: int or str, optional
            Level to melt for MultiIndex columns
        ignore_index: bool, default True
            Reset index in result

    Returns:
        DataFrame: Melted DataFrame in long format

    Examples:
        >>> df = cudf.DataFrame({
        ...     'id': ['A', 'B'],
        ...     'var1': [1, 3],
        ...     'var2': [2, 4]
        ... })
        >>> result = cudf.melt(df, id_vars=['id'])  # Long format
        >>> result = cudf.melt(
        ...     df,
        ...     id_vars=['id'],
        ...     value_vars=['var1', 'var2'],
        ...     var_name='variable',
        ...     value_name='measurement'
        ... )
    """

# Statistical cross-tabulation and categorical variable encoding.
def crosstab(
    index,
    columns,
    values=None,
    rownames=None,
    colnames=None,
    aggfunc=None,
    margins=False,
    margins_name='All',
    dropna=True,
    normalize=False
) -> DataFrame:
    """
    Cross-tabulate two or more factors.

    Builds a frequency table relating categorical variables; when ``values``
    and ``aggfunc`` are supplied, aggregates those values instead of counting.
    GPU-accelerated for large categorical datasets.

    Parameters:
        index: array-like, Series, or list of arrays/Series
            Values grouped along the rows
        columns: array-like, Series, or list of arrays/Series
            Values grouped along the columns
        values: array-like, optional
            Values to aggregate (frequency counts when omitted)
        rownames: sequence, optional
            Names for the row index levels
        colnames: sequence, optional
            Names for the column index levels
        aggfunc: function, optional
            Aggregation applied when ``values`` is given
        margins: bool, default False
            Append row/column subtotals
        margins_name: str, default 'All'
            Label for the margins row/column
        dropna: bool, default True
            Exclude combinations containing missing values
        normalize: bool or str, default False
            Divide by the sum ('all', 'index', 'columns')

    Returns:
        DataFrame: Cross-tabulation table

    Examples:
        >>> a = cudf.Series(['foo', 'foo', 'bar', 'bar'])
        >>> b = cudf.Series(['one', 'two', 'one', 'two'])
        >>> result = cudf.crosstab(a, b)
        >>> values = cudf.Series([1, 2, 3, 4])
        >>> result = cudf.crosstab(a, b, values=values, aggfunc='sum')
        >>> result = cudf.crosstab(a, b, normalize=True)
    """
def get_dummies(
    data,
    prefix=None,
    prefix_sep='_',
    dummy_na=False,
    columns=None,
    sparse=False,
    drop_first=False,
    dtype=None
) -> DataFrame:
    """
    One-hot encode categorical variables into indicator columns.

    Emits one binary column per category, a common preprocessing step for
    machine-learning feature encoding.

    Parameters:
        data: array-like, Series, or DataFrame
            Input whose categories are encoded
        prefix: str, list of str, or dict, optional
            Prefix applied to the generated column names
        prefix_sep: str, default '_'
            Separator placed between prefix and category name
        dummy_na: bool, default False
            Emit an extra column flagging missing values
        columns: list-like, optional
            Columns to encode (all categorical columns when omitted)
        sparse: bool, default False
            Return sparse output (not supported; kept for pandas compatibility)
        drop_first: bool, default False
            Omit the first category to avoid multicollinearity
        dtype: numpy.dtype, optional
            Data type of the indicator columns

    Returns:
        DataFrame: Input with categories expanded into dummy columns

    Examples:
        >>> s = cudf.Series(['a', 'b', 'c', 'a'])
        >>> result = cudf.get_dummies(s)  # Creates 3 binary columns
        >>> df = cudf.DataFrame({'col': ['red', 'blue', 'red', 'green']})
        >>> result = cudf.get_dummies(df, prefix='color')
        >>> result = cudf.get_dummies(df, drop_first=True)
    """
def unstack(
    level=-1,
    fill_value=None
) -> DataFrame:
    """
    Pivot index level to columns (MultiIndex method).

    Transforms an index level into columns, effectively pivoting the data.
    Used with MultiIndex DataFrames to reshape hierarchical data.

    NOTE(review): documented and exemplified as a DataFrame method
    (``df.unstack(...)``) but declared here with no ``self``/``df``
    parameter — the data argument appears to be missing from this stub;
    confirm against the cuDF API reference.

    Parameters:
        level: int, str, or list, default -1
            Level(s) of index to unstack
        fill_value: scalar, optional
            Value to use for missing combinations

    Returns:
        DataFrame: DataFrame with unstacked index level as columns

    Examples:
        >>> arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
        >>> index = cudf.MultiIndex.from_arrays(arrays, names=['letter', 'number'])
        >>> df = cudf.DataFrame({'value': [10, 20, 30, 40]}, index=index)
        >>> result = df.unstack()  # number level becomes columns
        >>> result = df.unstack(level='letter')
    """

# Fundamental algorithms for data analysis and preprocessing.
def factorize(
    values,
    sort=False,
    na_sentinel=-1,
    use_na_sentinel=True
) -> tuple[cupy.ndarray, Index]:
    """
    Encode values as integer codes paired with their unique values.

    Maps each element of the input to an integer label and returns the
    label array together with the distinct values, enabling categorical
    encodings and memory-efficient representations.

    Parameters:
        values: array-like
            Sequence to factorize (Series, Index, or array-like)
        sort: bool, default False
            Sort the unique values (and renumber codes accordingly)
        na_sentinel: int, default -1
            Code assigned to missing entries
        use_na_sentinel: bool, default True
            Whether missing entries receive the sentinel code

    Returns:
        tuple: (codes, uniques)
            codes: cupy.ndarray of integer labels
            uniques: Index of distinct values

    Examples:
        >>> values = cudf.Series(['red', 'blue', 'red', 'green'])
        >>> codes, uniques = cudf.factorize(values)
        >>> # codes: [0, 1, 0, 2], uniques: ['red', 'blue', 'green']
        >>> codes, uniques = cudf.factorize(values, sort=True)
        >>> values_na = cudf.Series(['a', None, 'b', 'a'])
        >>> codes, uniques = cudf.factorize(values_na)
    """
def unique(values) -> Union[cupy.ndarray, Index]:
    """
    Return the distinct values of an array-like object.

    GPU-accelerated deduplication that preserves the input's data type and
    handles missing values appropriately.

    Parameters:
        values: array-like
            Input array, Series, or Index

    Returns:
        Union[cupy.ndarray, Index]: Distinct values, in the same type as the input

    Examples:
        >>> s = cudf.Series([1, 2, 2, 3, 1, 4])
        >>> unique_vals = cudf.unique(s)  # [1, 2, 3, 4]
        >>> arr = ['a', 'b', 'a', 'c', 'b']
        >>> unique_vals = cudf.unique(arr)  # ['a', 'b', 'c']
        >>> dates = cudf.to_datetime(cudf.Series(['2023-01-01', '2023-01-02', '2023-01-01']))
        >>> unique_dates = cudf.unique(dates)  # dtype preserved
    """
def cut(
    x,
    bins,
    right=True,
    labels=None,
    retbins=False,
    precision=3,
    include_lowest=False,
    duplicates='raise'
) -> Union[Series, tuple]:
    """
    Bin continuous values into discrete intervals.

    Segments and sorts data values into bins. Useful for creating categorical
    variables from continuous data and histogram-like operations.

    Parameters:
        x: array-like
            Input array to be binned (1-dimensional)
        bins: int, sequence, or IntervalIndex
            Criteria for binning (number of bins or bin edges)
        right: bool, default True
            Whether intervals include right edge
        labels: array-like or False, optional
            Labels for returned bins (length must match number of bins)
        retbins: bool, default False
            Whether to return bins array
        precision: int, default 3
            Precision for bin edge display
        include_lowest: bool, default False
            Whether first interval should be left-inclusive
        duplicates: str, default 'raise'
            Treatment of duplicate bin edges ('raise' or 'drop')

    Returns:
        Union[Series, tuple]: Categorical Series with bin assignments
            If retbins=True, returns (binned_series, bin_edges)

    Examples:
        >>> values = cudf.Series([1, 7, 5, 4, 6, 3])
        >>> result = cudf.cut(values, bins=3)  # 3 equal-width bins
        >>> result = cudf.cut(values, bins=[0, 3, 6, 9])
        >>> result = cudf.cut(values, bins=3, labels=['low', 'medium', 'high'])
        >>> result, bin_edges = cudf.cut(values, bins=4, retbins=True)
    """

# Comprehensive date/time functionality for temporal data analysis.
def date_range(
    start=None,
    end=None,
    periods=None,
    freq=None,
    tz=None,
    normalize=False,
    name=None,
    closed=None
) -> DatetimeIndex:
    """
    Produce a regularly spaced sequence of dates on the GPU.

    Builds a fixed-frequency DatetimeIndex between ``start`` and ``end``
    (or spanning ``periods`` steps), with support for frequency strings
    and timezone localization.

    Parameters:
        start: str or datetime-like, optional
            First date of the range
        end: str or datetime-like, optional
            Last date of the range
        periods: int, optional
            Number of timestamps to generate
        freq: str or DateOffset, optional
            Step size ('D', 'H', 'min', 'S', 'MS', etc.)
        tz: str or tzinfo, optional
            Timezone for a localized DatetimeIndex
        normalize: bool, default False
            Snap start/end to midnight before generating
        name: str, optional
            Name assigned to the resulting DatetimeIndex
        closed: str, optional
            Which bound is included ('left', 'right', or None for both)

    Returns:
        DatetimeIndex: Fixed-frequency DatetimeIndex

    Examples:
        >>> dates = cudf.date_range('2023-01-01', '2023-01-10', freq='D')
        >>> dates = cudf.date_range('2023-01-01', periods=10, freq='D')
        >>> dates = cudf.date_range('2023-01-01', periods=24, freq='H')
        >>> dates = cudf.date_range('2023-01-01', periods=5, freq='D', tz='UTC')
        >>> dates = cudf.date_range('2023-01-01', periods=10, freq='B')  # business days
    """
def to_datetime(
    arg,
    errors='raise',
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
    exact=True,
    unit=None,
    infer_datetime_format=False,
    origin='unix',
    cache=True
) -> Union[datetime, Series, DatetimeIndex]:
    """
    Parse the argument into datetime values on the GPU.

    Flexible conversion of strings, numbers, and sequences to datetimes,
    with optional explicit formats and configurable error handling; tuned
    for large-scale conversions.

    Parameters:
        arg: int, float, str, datetime, list, tuple, array, Series, DataFrame
            Value(s) to convert
        errors: str, default 'raise'
            How parse failures are treated ('raise', 'coerce', 'ignore')
        dayfirst: bool, default False
            Treat the first field as the day when ambiguous
        yearfirst: bool, default False
            Treat the first field as the year when ambiguous
        utc: bool, optional
            Produce a UTC DatetimeIndex when True
        format: str, optional
            Strftime pattern used for parsing
        exact: bool, default True
            Require the format to match the whole string
        unit: str, optional
            Unit of numeric inputs ('D', 's', 'ms', 'us', 'ns')
        infer_datetime_format: bool, default False
            Try to detect the format automatically
        origin: scalar, default 'unix'
            Reference point for numeric conversions
        cache: bool, default True
            Reuse results for repeated conversion patterns

    Returns:
        Union[datetime, Series, DatetimeIndex]: Parsed datetime object(s)

    Examples:
        >>> dates = cudf.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03'])
        >>> dates = cudf.to_datetime(['01/01/2023', '01/02/2023'], format='%m/%d/%Y')
        >>> timestamps = [1609459200, 1609545600, 1609632000]  # Unix timestamps
        >>> dates = cudf.to_datetime(timestamps, unit='s')
        >>> mixed = ['2023-01-01', 'invalid', '2023-01-03']
        >>> dates = cudf.to_datetime(mixed, errors='coerce')  # Invalid -> NaT
    """
def interval_range(
    start=None,
    end=None,
    periods=None,
    freq=None,
    name=None,
    closed='right'
) -> IntervalIndex:
    """
    Build a fixed-frequency sequence of intervals.

    Produces an IntervalIndex of evenly sized intervals between ``start``
    and ``end``, for both numeric and datetime-like bounds.

    Parameters:
        start: numeric or datetime-like, optional
            Lower bound of the first interval
        end: numeric or datetime-like, optional
            Upper bound of the last interval
        periods: int, optional
            Number of intervals to produce
        freq: numeric, str, or DateOffset, optional
            Width of each interval
        name: str, optional
            Name assigned to the resulting IntervalIndex
        closed: str, default 'right'
            Closed side of each interval ('left', 'right', 'both', 'neither')

    Returns:
        IntervalIndex: Fixed-frequency IntervalIndex

    Examples:
        >>> intervals = cudf.interval_range(start=0, end=10, periods=5)
        >>> intervals = cudf.interval_range(start='2023-01-01', end='2023-01-10', freq='2D')
        >>> intervals = cudf.interval_range(start=0, periods=4, freq=2.5)
    """
class DateOffset:
    """
    Offset object for date arithmetic and frequency specification.

    Represents a calendar shift that can be added to datetime objects and
    used wherever a frequency is accepted (e.g. ``date_range``), giving a
    uniform interface for date manipulation.

    Parameters:
        n: int, default 1
            Number of periods the offset spans

    Examples:
        >>> offset = cudf.DateOffset(days=1)
        >>> date = cudf.to_datetime('2023-01-01')
        >>> new_date = date + offset
        >>> dates = cudf.date_range('2023-01-01', periods=5, freq=offset)
    """
def to_numeric(
    arg,
    errors='raise',
    downcast=None
) -> Union[Series, scalar]:
    """
    Convert argument to numeric type with GPU acceleration.

    Attempts to convert object to numeric type with flexible error handling
    and optional downcasting for memory efficiency.

    Parameters:
        arg: scalar, list, tuple, array, Series
            Object to convert to numeric type
        errors: str, default 'raise'
            Error handling ('raise', 'coerce', 'ignore')
        downcast: str, optional
            Downcast to smallest possible numeric type
            ('integer', 'signed', 'unsigned', 'float')

    Returns:
        Union[Series, scalar]: Converted numeric object

    Examples:
        >>> strings = cudf.Series(['1', '2', '3.5', '4'])
        >>> numeric = cudf.to_numeric(strings)
        >>> mixed = cudf.Series(['1', '2', 'invalid', '4'])
        >>> numeric = cudf.to_numeric(mixed, errors='coerce')  # Invalid -> NaN
        >>> large_ints = cudf.Series([1, 2, 3, 4])  # Default int64
        >>> small_ints = cudf.to_numeric(large_ints, downcast='integer')
    """

# Flexible grouping utilities for split-apply-combine operations.
class Grouper:
    """
    Specification object for advanced groupby operations.

    Gives fine-grained control over grouping: resampling-style time-based
    groups via ``freq``, MultiIndex level selection via ``level``, and
    ordering of group keys.

    Parameters:
        key: str, optional
            Column name to group by (None when grouping a Series)
        level: int, str, or list, optional
            MultiIndex level name or number to group on
        freq: str or DateOffset, optional
            Frequency for time-based grouping
        axis: int, default 0
            Axis along which to group
        sort: bool, default True
            Sort the group keys

    Examples:
        >>> df = cudf.DataFrame({
        ...     'date': cudf.date_range('2023-01-01', periods=10, freq='D'),
        ...     'value': range(10)
        ... })
        >>> monthly = df.groupby(cudf.Grouper(key='date', freq='M')).sum()
        >>> grouper = cudf.Grouper(level='category')
        >>> result = df.groupby(grouper).mean()
    """
class NamedAgg:
    """
    Named aggregation specification for groupby operations.

    Provides clear naming for aggregation results when using multiple
    aggregation functions on the same column.

    Parameters:
        column: str
            Column name to aggregate
        aggfunc: str or callable
            Aggregation function name or function

    Examples:
        >>> df = cudf.DataFrame({
        ...     'group': ['A', 'B', 'A', 'B'],
        ...     'value': [1, 2, 3, 4]
        ... })
        >>> result = df.groupby('group').agg(
        ...     mean_value=cudf.NamedAgg('value', 'mean'),
        ...     sum_value=cudf.NamedAgg('value', 'sum'),
        ...     count_value=cudf.NamedAgg('value', 'count')
        ... )
    """

# Install with Tessl CLI
npx tessl i tessl/pypi-cudf-cu12