GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data
—
cuDF provides comprehensive testing utilities for GPU-aware testing of DataFrames, Series, and Index objects. These functions provide detailed comparison capabilities that handle GPU memory, floating-point precision, and cuDF-specific data types.
# Core testing functions
from cudf.testing import (
assert_eq, assert_neq,
assert_frame_equal, assert_series_equal, assert_index_equal
)
# For use in test suites
import cudf.testing as cudf_testing

# Universal equality testing function that handles all cuDF object types.
def assert_eq(
    left,
    right,
    check_dtype=True,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    rtol=1e-05,
    atol=1e-08,
    **kwargs
) -> None:
    """
    Generic equality assertion for cuDF objects with GPU-aware comparison.

    Comprehensive equality testing that automatically detects object type
    and applies appropriate comparison logic. Handles DataFrames, Series,
    Index objects, and scalar values with GPU memory considerations.

    Parameters:
        left: cuDF object, pandas object, or scalar
            Expected result object
        right: cuDF object, pandas object, or scalar
            Actual result object
        check_dtype: bool, default True
            Whether to check dtype compatibility exactly
        check_exact: bool, default False
            Whether to check exact equality (no floating-point tolerance)
        check_datetimelike_compat: bool, default False
            Whether to compare datetime-like objects across types
        check_categorical: bool, default True
            Whether to check categorical data consistency
        check_category_order: bool, default True
            Whether categorical category order must match
        rtol: float, default 1e-05
            Relative tolerance for floating-point comparisons
        atol: float, default 1e-08
            Absolute tolerance for floating-point comparisons
        **kwargs: additional arguments
            Type-specific comparison options

    Raises:
        AssertionError: If objects are not equal according to specified criteria

    Examples:
        # DataFrame comparison
        expected = cudf.DataFrame({'A': [1, 2, 3], 'B': [4.0, 5.0, 6.0]})
        actual = cudf.DataFrame({'A': [1, 2, 3], 'B': [4.0, 5.0, 6.0]})
        cudf.testing.assert_eq(expected, actual)

        # Series comparison with tolerance
        expected = cudf.Series([1.1, 2.2, 3.3])
        actual = cudf.Series([1.100001, 2.200001, 3.300001])
        cudf.testing.assert_eq(expected, actual, rtol=1e-4)

        # Mixed cuDF/pandas comparison
        cudf_series = cudf.Series([1, 2, 3])
        pandas_series = cudf_series.to_pandas()
        cudf.testing.assert_eq(cudf_series, pandas_series)

        # Scalar comparison
        cudf.testing.assert_eq(5, 5)
        cudf.testing.assert_eq(3.14159, 3.14160, rtol=1e-4)

        # Categorical comparison
        cat1 = cudf.Series(['a', 'b', 'c'], dtype='category')
        cat2 = cudf.Series(['a', 'b', 'c'], dtype='category')
        cudf.testing.assert_eq(cat1, cat2, check_categorical=True)
    """
def assert_neq(
    left,
    right,
    **kwargs
) -> None:
    """
    Assert that two objects are not equal.

    Inverse of assert_eq - ensures objects are different according to
    the same comparison criteria used by assert_eq.

    Parameters:
        left: cuDF object, pandas object, or scalar
            First object to compare
        right: cuDF object, pandas object, or scalar
            Second object to compare
        **kwargs: additional arguments
            Passed to underlying comparison functions

    Raises:
        AssertionError: If objects are equal according to comparison criteria

    Examples:
        # Different DataFrames
        df1 = cudf.DataFrame({'A': [1, 2, 3]})
        df2 = cudf.DataFrame({'A': [4, 5, 6]})
        cudf.testing.assert_neq(df1, df2)

        # Different dtypes
        series1 = cudf.Series([1, 2, 3], dtype='int32')
        series2 = cudf.Series([1, 2, 3], dtype='int64')
        cudf.testing.assert_neq(series1, series2, check_dtype=True)

        # Different values
        cudf.testing.assert_neq(5, 6)
        cudf.testing.assert_neq([1, 2, 3], [1, 2, 4])
    """


# Detailed DataFrame comparison with comprehensive options for handling edge cases.
def assert_frame_equal(
    left,
    right,
    check_dtype=True,
    check_index_type=True,
    check_column_type=True,
    check_frame_type=True,
    check_names=True,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    check_like=False,
    rtol=1e-05,
    atol=1e-08,
    **kwargs
) -> None:
    """
    Assert DataFrame equality with comprehensive GPU-aware comparison.

    Detailed DataFrame comparison that checks data values, dtypes, indexes,
    column names, and metadata. Optimized for GPU DataFrames with support
    for floating-point tolerance and categorical data.

    Parameters:
        left: DataFrame
            Expected DataFrame result
        right: DataFrame
            Actual DataFrame result
        check_dtype: bool, default True
            Whether to check that dtypes match exactly
        check_index_type: bool, default True
            Whether to check index type compatibility
        check_column_type: bool, default True
            Whether to check column type compatibility
        check_frame_type: bool, default True
            Whether to check that both objects are DataFrames
        check_names: bool, default True
            Whether to check index and column names match
        check_exact: bool, default False
            Whether to use exact equality (no floating-point tolerance)
        check_datetimelike_compat: bool, default False
            Whether to allow comparison of different datetime-like types
        check_categorical: bool, default True
            Whether to check categorical data consistency
        check_category_order: bool, default True
            Whether categorical category order must match exactly
        check_like: bool, default False
            Whether to ignore order of index and columns
        rtol: float, default 1e-05
            Relative tolerance for floating-point comparison
        atol: float, default 1e-08
            Absolute tolerance for floating-point comparison
        **kwargs: additional arguments
            Additional comparison options

    Raises:
        AssertionError: If DataFrames are not equal with detailed diff message

    Examples:
        # Basic DataFrame comparison
        expected = cudf.DataFrame({
            'A': [1, 2, 3],
            'B': [4.0, 5.0, 6.0],
            'C': ['x', 'y', 'z']
        })
        actual = cudf.DataFrame({
            'A': [1, 2, 3],
            'B': [4.0, 5.0, 6.0],
            'C': ['x', 'y', 'z']
        })
        cudf.testing.assert_frame_equal(expected, actual)

        # With custom index
        expected.index = ['row1', 'row2', 'row3']
        actual.index = ['row1', 'row2', 'row3']
        cudf.testing.assert_frame_equal(expected, actual, check_names=True)

        # Floating-point tolerance
        expected = cudf.DataFrame({'vals': [1.1, 2.2, 3.3]})
        actual = cudf.DataFrame({'vals': [1.100001, 2.200001, 3.300001]})
        cudf.testing.assert_frame_equal(expected, actual, rtol=1e-4)

        # Ignore column/index order
        expected = cudf.DataFrame({'A': [1, 2], 'B': [3, 4]})
        actual = cudf.DataFrame({'B': [3, 4], 'A': [1, 2]})
        cudf.testing.assert_frame_equal(expected, actual, check_like=True)

        # Mixed cuDF/pandas comparison: with check_frame_type=True the two
        # operands must be the same class, so convert the pandas frame first
        # (or use cudf.testing.assert_eq to compare across libraries directly)
        cudf_df = cudf.DataFrame({'x': [1, 2, 3]})
        pandas_df = cudf_df.to_pandas()
        cudf.testing.assert_frame_equal(cudf_df, cudf.from_pandas(pandas_df))

        # Categorical data
        cat_df1 = cudf.DataFrame({
            'cat_col': cudf.Series(['a', 'b', 'c'], dtype='category')
        })
        cat_df2 = cudf.DataFrame({
            'cat_col': cudf.Series(['a', 'b', 'c'], dtype='category')
        })
        cudf.testing.assert_frame_equal(cat_df1, cat_df2, check_categorical=True)
    """


# Detailed Series comparison with support for all cuDF data types.
def assert_series_equal(
    left,
    right,
    check_dtype=True,
    check_index_type=True,
    check_series_type=True,
    check_names=True,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    rtol=1e-05,
    atol=1e-08,
    **kwargs
) -> None:
    """
    Assert Series equality with GPU-aware detailed comparison.

    Comprehensive Series comparison that validates data values, dtype,
    index, name, and metadata. Handles cuDF-specific data types including
    nested types (lists, structs) and extension types (decimals).

    Parameters:
        left: Series
            Expected Series result
        right: Series
            Actual Series result
        check_dtype: bool, default True
            Whether to check dtype compatibility exactly
        check_index_type: bool, default True
            Whether to check index type compatibility
        check_series_type: bool, default True
            Whether to check that both objects are Series
        check_names: bool, default True
            Whether to check Series and index names match
        check_exact: bool, default False
            Whether to use exact equality (no floating-point tolerance)
        check_datetimelike_compat: bool, default False
            Whether to allow comparison of different datetime-like types
        check_categorical: bool, default True
            Whether to check categorical data consistency
        check_category_order: bool, default True
            Whether categorical category order must match
        rtol: float, default 1e-05
            Relative tolerance for floating-point comparison
        atol: float, default 1e-08
            Absolute tolerance for floating-point comparison
        **kwargs: additional arguments
            Additional comparison options

    Raises:
        AssertionError: If Series are not equal with detailed diff message

    Examples:
        # Basic Series comparison
        expected = cudf.Series([1, 2, 3, 4, 5])
        actual = cudf.Series([1, 2, 3, 4, 5])
        cudf.testing.assert_series_equal(expected, actual)

        # With custom index and name
        expected = cudf.Series([10, 20, 30],
                               index=['a', 'b', 'c'],
                               name='values')
        actual = cudf.Series([10, 20, 30],
                             index=['a', 'b', 'c'],
                             name='values')
        cudf.testing.assert_series_equal(expected, actual, check_names=True)

        # Floating-point data with tolerance
        expected = cudf.Series([1.1, 2.2, 3.3])
        actual = cudf.Series([1.100001, 2.200001, 3.300001])
        cudf.testing.assert_series_equal(expected, actual, rtol=1e-4)

        # String data
        expected = cudf.Series(['hello', 'world', 'cudf'])
        actual = cudf.Series(['hello', 'world', 'cudf'])
        cudf.testing.assert_series_equal(expected, actual)

        # Categorical data
        expected = cudf.Series(['red', 'blue', 'red'], dtype='category')
        actual = cudf.Series(['red', 'blue', 'red'], dtype='category')
        cudf.testing.assert_series_equal(expected, actual, check_categorical=True)

        # Datetime data
        dates = ['2023-01-01', '2023-01-02', '2023-01-03']
        expected = cudf.to_datetime(cudf.Series(dates))
        actual = cudf.to_datetime(cudf.Series(dates))
        cudf.testing.assert_series_equal(expected, actual)

        # List data (nested type)
        expected = cudf.Series([[1, 2], [3, 4, 5], [6]])
        actual = cudf.Series([[1, 2], [3, 4, 5], [6]])
        cudf.testing.assert_series_equal(expected, actual)

        # Decimal data
        decimal_dtype = cudf.Decimal64Dtype(10, 2)
        expected = cudf.Series([1.23, 4.56], dtype=decimal_dtype)
        actual = cudf.Series([1.23, 4.56], dtype=decimal_dtype)
        cudf.testing.assert_series_equal(expected, actual, check_exact=True)
    """


# Comprehensive Index comparison for all cuDF Index types.
def assert_index_equal(
    left,
    right,
    exact='equiv',
    check_names=True,
    check_exact=False,
    check_categorical=True,
    check_order=True,
    rtol=1e-05,
    atol=1e-08,
    **kwargs
) -> None:
    """
    Assert Index equality with support for all cuDF Index types.

    Detailed comparison of Index objects including RangeIndex, DatetimeIndex,
    CategoricalIndex, MultiIndex, and other specialized Index types.

    Parameters:
        left: Index
            Expected Index result
        right: Index
            Actual Index result
        exact: str or bool, default 'equiv'
            Level of exactness ('equiv' for equivalent, True for exact, False for basic)
        check_names: bool, default True
            Whether to check Index name compatibility
        check_exact: bool, default False
            Whether to use exact equality (no floating-point tolerance)
        check_categorical: bool, default True
            Whether to check categorical index data consistency
        check_order: bool, default True
            Whether to check that order of elements matches
        rtol: float, default 1e-05
            Relative tolerance for floating-point comparison
        atol: float, default 1e-08
            Absolute tolerance for floating-point comparison
        **kwargs: additional arguments
            Index-type specific comparison options

    Raises:
        AssertionError: If indexes are not equal with detailed diff message

    Examples:
        # Basic Index comparison
        expected = cudf.Index([1, 2, 3, 4, 5])
        actual = cudf.Index([1, 2, 3, 4, 5])
        cudf.testing.assert_index_equal(expected, actual)

        # Named Index
        expected = cudf.Index([10, 20, 30], name='values')
        actual = cudf.Index([10, 20, 30], name='values')
        cudf.testing.assert_index_equal(expected, actual, check_names=True)

        # RangeIndex comparison
        expected = cudf.RangeIndex(10)  # 0-9
        actual = cudf.RangeIndex(start=0, stop=10, step=1)
        cudf.testing.assert_index_equal(expected, actual)

        # DatetimeIndex comparison
        dates = ['2023-01-01', '2023-01-02', '2023-01-03']
        expected = cudf.DatetimeIndex(dates)
        actual = cudf.DatetimeIndex(dates)
        cudf.testing.assert_index_equal(expected, actual)

        # CategoricalIndex comparison (both sides share the same categories)
        categories = ['red', 'blue', 'green']
        expected = cudf.CategoricalIndex(['red', 'blue', 'red'],
                                         categories=categories)
        actual = cudf.CategoricalIndex(['red', 'blue', 'red'],
                                       categories=categories)
        cudf.testing.assert_index_equal(expected, actual, check_categorical=True)

        # MultiIndex comparison
        arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
        expected = cudf.MultiIndex.from_arrays(arrays, names=['letter', 'number'])
        actual = cudf.MultiIndex.from_arrays(arrays, names=['letter', 'number'])
        cudf.testing.assert_index_equal(expected, actual, check_names=True)

        # IntervalIndex comparison
        expected = cudf.interval_range(0, 10, periods=5)
        actual = cudf.interval_range(0, 10, periods=5)
        cudf.testing.assert_index_equal(expected, actual)

        # Float Index with tolerance
        expected = cudf.Index([1.1, 2.2, 3.3])
        actual = cudf.Index([1.100001, 2.200001, 3.300001])
        cudf.testing.assert_index_equal(expected, actual, rtol=1e-4)
    """


import pytest
import cudf
import cudf.testing
class TestDataFrameOperations:
    """Example test class using cuDF testing utilities."""

    @pytest.mark.parametrize("data", [
        {'A': [1, 2, 3], 'B': [4, 5, 6]},
        {'x': [1.1, 2.2], 'y': [3.3, 4.4]},
        {'str_col': ['a', 'b', 'c']},
    ])
    def test_dataframe_creation(self, data):
        """Test DataFrame creation with various data types.

        Builds two DataFrames from the same ``data`` mapping (integer,
        float and string columns are covered by the parametrization) and
        asserts they compare equal.
        """
        df = cudf.DataFrame(data)
        expected = cudf.DataFrame(data)
        cudf.testing.assert_frame_equal(df, expected)

    @pytest.mark.parametrize("dtype", ['int32', 'int64', 'float32', 'float64'])
    def test_series_dtypes(self, dtype):
        """Test Series with different numeric dtypes.

        Builds two Series from the same values with the parametrized
        ``dtype`` and asserts equality including the dtype itself.
        """
        data = [1, 2, 3, 4, 5]
        series = cudf.Series(data, dtype=dtype)
        expected = cudf.Series(data, dtype=dtype)
        cudf.testing.assert_series_equal(series, expected, check_dtype=True)


import cudf
import cudf.testing
def test_large_dataframe_operations():
    """Test operations on large DataFrames that require GPU memory management.

    Groups a one-million-row frame by a unique key and checks that summing
    the per-group totals of ``B`` reproduces the plain column sum.
    """
    # Create large DataFrame
    n_rows = 1_000_000
    df = cudf.DataFrame({
        'A': range(n_rows),
        'B': range(n_rows, 2 * n_rows),
        'C': [f'str_{i}' for i in range(n_rows)],
    })

    # Aggregate only the numeric column under test: summing the string
    # column 'C' is not a meaningful reduction and may be rejected outright.
    grouped = df.groupby('A')[['B']].sum()
    expected_b_sum = df['B'].sum()  # All B values summed

    # Use testing utilities to verify
    assert len(grouped) <= n_rows  # Sanity check
    cudf.testing.assert_eq(grouped['B'].sum(), expected_b_sum)
def test_memory_efficient_operations():
    """Check that column selection yields data equal to the source column.

    The intent is to guard against unnecessary GPU memory copies, but this
    test only verifies value equality between the original column and the
    selected one; it does not inspect the underlying buffers.
    """
    original_df = cudf.DataFrame({'x': range(100000)})

    # Operation that should not copy data
    view_df = original_df[['x']]  # Column selection

    # Verify data is shared (same underlying GPU memory)
    # Note: Actual memory sharing verification would require
    # more sophisticated GPU memory inspection
    cudf.testing.assert_series_equal(original_df['x'], view_df['x'])


import pytest
import cudf
import cudf.testing
def test_assertion_errors():
    """Test that assertion functions properly raise errors for different data.

    Covers two failure modes (differing values, differing dtypes) and the
    matching success case when dtype checking is switched off.
    """
    df1 = cudf.DataFrame({'A': [1, 2, 3]})
    df2 = cudf.DataFrame({'A': [4, 5, 6]})

    # This should raise AssertionError
    with pytest.raises(AssertionError):
        cudf.testing.assert_frame_equal(df1, df2)

    # Test dtype mismatch
    series1 = cudf.Series([1, 2, 3], dtype='int32')
    series2 = cudf.Series([1, 2, 3], dtype='int64')

    with pytest.raises(AssertionError):
        cudf.testing.assert_series_equal(series1, series2, check_dtype=True)

    # But should pass without dtype checking
    cudf.testing.assert_series_equal(series1, series2, check_dtype=False)
def test_tolerance_behavior():
    """Test floating-point tolerance behavior.

    Values differing by 1e-7 must pass with ``rtol=1e-6``; values differing
    by 0.1 must fail under the same tolerance.
    """
    # Within tolerance - should pass
    series1 = cudf.Series([1.0, 2.0, 3.0])
    series2 = cudf.Series([1.0000001, 2.0000001, 3.0000001])
    cudf.testing.assert_series_equal(series1, series2, rtol=1e-6)

    # Outside tolerance - should fail
    series3 = cudf.Series([1.1, 2.1, 3.1])
    with pytest.raises(AssertionError):
        cudf.testing.assert_series_equal(series1, series3, rtol=1e-6)


import cudf
import pandas as pd
import cudf.testing
def test_cudf_pandas_compatibility():
    """Test that cuDF and pandas produce equivalent results.

    Runs the same groupby-sum in both libraries and compares the outputs.
    """
    # Create equivalent data in both libraries
    data = {'A': [1, 2, 3, 4, 5], 'B': [10, 20, 30, 40, 50]}
    cudf_df = cudf.DataFrame(data)
    pandas_df = pd.DataFrame(data)

    # Perform same operation on both
    cudf_result = cudf_df.groupby('A').sum()
    pandas_result = pandas_df.groupby('A').sum()

    # Compare results. assert_eq is the cross-library entry point;
    # assert_frame_equal's frame-type check expects both operands to be
    # the same class, so it is not suitable for a cuDF-vs-pandas comparison.
    cudf.testing.assert_eq(cudf_result, pandas_result)
def test_round_trip_conversion():
    """Test cuDF -> pandas -> cuDF conversion preserves data.

    Covers integer, float and string columns in a single frame.
    """
    original = cudf.DataFrame({
        'ints': [1, 2, 3],
        'floats': [1.1, 2.2, 3.3],
        'strings': ['a', 'b', 'c'],
    })

    # Convert to pandas and back
    pandas_version = original.to_pandas()
    round_trip = cudf.from_pandas(pandas_version)

    # Should be identical
    cudf.testing.assert_frame_equal(original, round_trip)


# rtol/atol based on expected precision
# check_dtype=True for type-sensitive tests

# Install with Tessl CLI
npx tessl i tessl/pypi-cudf-cu12