tessl/pypi-cudf-cu12

GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data

—

Pending

Overview

Eval results

Files

Testing Utilities

Name: tessl/pypi-cudf-cu12
Author: tessl

cuDF ships testing utilities for GPU-aware comparison of DataFrame, Series, and Index objects. They compare GPU-resident data with configurable floating-point tolerance and understand cuDF-specific data types.

Import Statements

# Core testing functions
from cudf.testing import (
    assert_eq, assert_neq,
    assert_frame_equal, assert_series_equal, assert_index_equal
)

# For use in test suites
import cudf.testing as cudf_testing

Generic Equality Assertions

Universal equality testing function that handles all cuDF object types.

def assert_eq(
    left,
    right,
    check_dtype=True,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    rtol=1e-05,
    atol=1e-08,
    **kwargs
) -> None:
    """
    Assert that two cuDF, pandas, or scalar objects are equal.

    The kind of object being compared (DataFrame, Series, Index, or
    scalar) is detected automatically and the matching GPU-aware
    comparison is applied.

    Parameters
    ----------
    left : cuDF object, pandas object, or scalar
        The expected result.
    right : cuDF object, pandas object, or scalar
        The actual result.
    check_dtype : bool, default True
        If True, dtypes must match exactly.
    check_exact : bool, default False
        If True, compare values exactly, with no floating-point
        tolerance.
    check_datetimelike_compat : bool, default False
        If True, datetime-like objects may be compared across types.
    check_categorical : bool, default True
        If True, categorical data is checked for consistency.
    check_category_order : bool, default True
        If True, the order of the categories must match as well.
    rtol : float, default 1e-05
        Relative tolerance used for floating-point comparisons.
    atol : float, default 1e-08
        Absolute tolerance used for floating-point comparisons.
    **kwargs
        Extra, type-specific comparison options.

    Raises
    ------
    AssertionError
        When the objects are not equal under the selected criteria.

    Examples
    --------
    Comparing DataFrames:

    >>> expected = cudf.DataFrame({'A': [1, 2, 3], 'B': [4.0, 5.0, 6.0]})
    >>> actual = cudf.DataFrame({'A': [1, 2, 3], 'B': [4.0, 5.0, 6.0]})
    >>> cudf.testing.assert_eq(expected, actual)

    Comparing Series with a tolerance:

    >>> expected = cudf.Series([1.1, 2.2, 3.3])
    >>> actual = cudf.Series([1.100001, 2.200001, 3.300001])
    >>> cudf.testing.assert_eq(expected, actual, rtol=1e-4)

    Comparing a cuDF object with its pandas counterpart:

    >>> cudf_series = cudf.Series([1, 2, 3])
    >>> pandas_series = cudf_series.to_pandas()
    >>> cudf.testing.assert_eq(cudf_series, pandas_series)

    Comparing scalars:

    >>> cudf.testing.assert_eq(5, 5)
    >>> cudf.testing.assert_eq(3.14159, 3.14160, rtol=1e-4)

    Comparing categorical data:

    >>> cat1 = cudf.Series(['a', 'b', 'c'], dtype='category')
    >>> cat2 = cudf.Series(['a', 'b', 'c'], dtype='category')
    >>> cudf.testing.assert_eq(cat1, cat2, check_categorical=True)
    """

def assert_neq(
    left,
    right,
    **kwargs
) -> None:
    """
    Assert that two objects are NOT equal.

    This is the inverse of ``assert_eq``: the objects are compared using
    the same criteria, and the assertion passes only when they differ.

    Parameters
    ----------
    left : cuDF object, pandas object, or scalar
        First object to compare.
    right : cuDF object, pandas object, or scalar
        Second object to compare.
    **kwargs
        Forwarded to the underlying comparison functions.

    Raises
    ------
    AssertionError
        When the objects are equal under the comparison criteria.

    Examples
    --------
    DataFrames holding different values:

    >>> df1 = cudf.DataFrame({'A': [1, 2, 3]})
    >>> df2 = cudf.DataFrame({'A': [4, 5, 6]})
    >>> cudf.testing.assert_neq(df1, df2)

    Series that differ only by dtype:

    >>> series1 = cudf.Series([1, 2, 3], dtype='int32')
    >>> series2 = cudf.Series([1, 2, 3], dtype='int64')
    >>> cudf.testing.assert_neq(series1, series2, check_dtype=True)

    Plain Python values:

    >>> cudf.testing.assert_neq(5, 6)
    >>> cudf.testing.assert_neq([1, 2, 3], [1, 2, 4])
    """

DataFrame Equality Assertions

Detailed DataFrame comparison with comprehensive options for handling edge cases.

def assert_frame_equal(
    left,
    right,
    check_dtype=True,
    check_index_type=True,
    check_column_type=True,
    check_frame_type=True,
    check_names=True,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    check_like=False,
    rtol=1e-05,
    atol=1e-08,
    **kwargs
) -> None:
    """
    Assert that two DataFrames are equal.

    The comparison covers data values, dtypes, indexes, column names and
    metadata, with optional floating-point tolerance and categorical
    checks.

    Parameters
    ----------
    left : DataFrame
        The expected DataFrame.
    right : DataFrame
        The actual DataFrame.
    check_dtype : bool, default True
        If True, dtypes must match exactly.
    check_index_type : bool, default True
        If True, the index types must be compatible.
    check_column_type : bool, default True
        If True, the column types must be compatible.
    check_frame_type : bool, default True
        If True, both objects must be DataFrames.
    check_names : bool, default True
        If True, index and column names must match.
    check_exact : bool, default False
        If True, compare values exactly, with no floating-point
        tolerance.
    check_datetimelike_compat : bool, default False
        If True, different datetime-like types may be compared.
    check_categorical : bool, default True
        If True, categorical data is checked for consistency.
    check_category_order : bool, default True
        If True, the order of the categories must match exactly.
    check_like : bool, default False
        If True, the order of the index and columns is ignored.
    rtol : float, default 1e-05
        Relative tolerance used for floating-point comparison.
    atol : float, default 1e-08
        Absolute tolerance used for floating-point comparison.
    **kwargs
        Additional comparison options.

    Raises
    ------
    AssertionError
        When the DataFrames differ; the message contains a detailed diff.

    Examples
    --------
    Basic comparison:

    >>> expected = cudf.DataFrame({
    ...     'A': [1, 2, 3],
    ...     'B': [4.0, 5.0, 6.0],
    ...     'C': ['x', 'y', 'z'],
    ... })
    >>> actual = cudf.DataFrame({
    ...     'A': [1, 2, 3],
    ...     'B': [4.0, 5.0, 6.0],
    ...     'C': ['x', 'y', 'z'],
    ... })
    >>> cudf.testing.assert_frame_equal(expected, actual)

    With a custom index:

    >>> expected.index = ['row1', 'row2', 'row3']
    >>> actual.index = ['row1', 'row2', 'row3']
    >>> cudf.testing.assert_frame_equal(expected, actual, check_names=True)

    With a floating-point tolerance:

    >>> expected = cudf.DataFrame({'vals': [1.1, 2.2, 3.3]})
    >>> actual = cudf.DataFrame({'vals': [1.100001, 2.200001, 3.300001]})
    >>> cudf.testing.assert_frame_equal(expected, actual, rtol=1e-4)

    Ignoring column/index order:

    >>> expected = cudf.DataFrame({'A': [1, 2], 'B': [3, 4]})
    >>> actual = cudf.DataFrame({'B': [3, 4], 'A': [1, 2]})
    >>> cudf.testing.assert_frame_equal(expected, actual, check_like=True)

    Mixed cuDF/pandas comparison:

    >>> # NOTE(review): ``left``/``right`` are documented above as
    >>> # DataFrame; confirm a pandas DataFrame is accepted here. If not,
    >>> # ``cudf.testing.assert_eq`` is the helper documented as taking
    >>> # pandas objects.
    >>> cudf_df = cudf.DataFrame({'x': [1, 2, 3]})
    >>> pandas_df = cudf_df.to_pandas()
    >>> cudf.testing.assert_frame_equal(cudf_df, pandas_df)

    Categorical data:

    >>> cat_df1 = cudf.DataFrame({
    ...     'cat_col': cudf.Series(['a', 'b', 'c'], dtype='category')
    ... })
    >>> cat_df2 = cudf.DataFrame({
    ...     'cat_col': cudf.Series(['a', 'b', 'c'], dtype='category')
    ... })
    >>> cudf.testing.assert_frame_equal(cat_df1, cat_df2, check_categorical=True)
    """

Series Equality Assertions

Detailed Series comparison with support for all cuDF data types.

def assert_series_equal(
    left,
    right,
    check_dtype=True,
    check_index_type=True,
    check_series_type=True,
    check_names=True,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    rtol=1e-05,
    atol=1e-08,
    **kwargs
) -> None:
    """
    Assert that two Series are equal.

    The comparison validates data values, dtype, index, name and
    metadata. cuDF-specific dtypes are covered, including nested types
    (lists, structs) and extension types (decimals).

    Parameters
    ----------
    left : Series
        The expected Series.
    right : Series
        The actual Series.
    check_dtype : bool, default True
        If True, dtypes must match exactly.
    check_index_type : bool, default True
        If True, the index types must be compatible.
    check_series_type : bool, default True
        If True, both objects must be Series.
    check_names : bool, default True
        If True, the Series and index names must match.
    check_exact : bool, default False
        If True, compare values exactly, with no floating-point
        tolerance.
    check_datetimelike_compat : bool, default False
        If True, different datetime-like types may be compared.
    check_categorical : bool, default True
        If True, categorical data is checked for consistency.
    check_category_order : bool, default True
        If True, the order of the categories must match.
    rtol : float, default 1e-05
        Relative tolerance used for floating-point comparison.
    atol : float, default 1e-08
        Absolute tolerance used for floating-point comparison.
    **kwargs
        Additional comparison options.

    Raises
    ------
    AssertionError
        When the Series differ; the message contains a detailed diff.

    Examples
    --------
    Basic comparison:

    >>> expected = cudf.Series([1, 2, 3, 4, 5])
    >>> actual = cudf.Series([1, 2, 3, 4, 5])
    >>> cudf.testing.assert_series_equal(expected, actual)

    With a custom index and name:

    >>> expected = cudf.Series([10, 20, 30],
    ...                        index=['a', 'b', 'c'],
    ...                        name='values')
    >>> actual = cudf.Series([10, 20, 30],
    ...                      index=['a', 'b', 'c'],
    ...                      name='values')
    >>> cudf.testing.assert_series_equal(expected, actual, check_names=True)

    Floating-point data with a tolerance:

    >>> expected = cudf.Series([1.1, 2.2, 3.3])
    >>> actual = cudf.Series([1.100001, 2.200001, 3.300001])
    >>> cudf.testing.assert_series_equal(expected, actual, rtol=1e-4)

    String data:

    >>> expected = cudf.Series(['hello', 'world', 'cudf'])
    >>> actual = cudf.Series(['hello', 'world', 'cudf'])
    >>> cudf.testing.assert_series_equal(expected, actual)

    Categorical data:

    >>> expected = cudf.Series(['red', 'blue', 'red'], dtype='category')
    >>> actual = cudf.Series(['red', 'blue', 'red'], dtype='category')
    >>> cudf.testing.assert_series_equal(expected, actual, check_categorical=True)

    Datetime data:

    >>> dates = ['2023-01-01', '2023-01-02', '2023-01-03']
    >>> expected = cudf.to_datetime(cudf.Series(dates))
    >>> actual = cudf.to_datetime(cudf.Series(dates))
    >>> cudf.testing.assert_series_equal(expected, actual)

    List data (nested type):

    >>> expected = cudf.Series([[1, 2], [3, 4, 5], [6]])
    >>> actual = cudf.Series([[1, 2], [3, 4, 5], [6]])
    >>> cudf.testing.assert_series_equal(expected, actual)

    Decimal data:

    >>> decimal_dtype = cudf.Decimal64Dtype(10, 2)
    >>> expected = cudf.Series([1.23, 4.56], dtype=decimal_dtype)
    >>> actual = cudf.Series([1.23, 4.56], dtype=decimal_dtype)
    >>> cudf.testing.assert_series_equal(expected, actual, check_exact=True)
    """

Index Equality Assertions

Comprehensive Index comparison for all cuDF Index types.

def assert_index_equal(
    left,
    right,
    exact='equiv',
    check_names=True,
    check_exact=False,
    check_categorical=True,
    check_order=True,
    rtol=1e-05,
    atol=1e-08,
    **kwargs
) -> None:
    """
    Assert that two Index objects are equal.

    Works across the cuDF Index family: RangeIndex, DatetimeIndex,
    CategoricalIndex, MultiIndex and other specialized Index types.

    Parameters
    ----------
    left : Index
        The expected Index.
    right : Index
        The actual Index.
    exact : str or bool, default 'equiv'
        Level of exactness: 'equiv' for equivalent, True for exact,
        False for basic.
    check_names : bool, default True
        If True, the Index names must be compatible.
    check_exact : bool, default False
        If True, compare values exactly, with no floating-point
        tolerance.
    check_categorical : bool, default True
        If True, categorical index data is checked for consistency.
    check_order : bool, default True
        If True, the elements must appear in the same order.
    rtol : float, default 1e-05
        Relative tolerance used for floating-point comparison.
    atol : float, default 1e-08
        Absolute tolerance used for floating-point comparison.
    **kwargs
        Comparison options specific to the Index type.

    Raises
    ------
    AssertionError
        When the indexes differ; the message contains a detailed diff.

    Examples
    --------
    Basic comparison:

    >>> expected = cudf.Index([1, 2, 3, 4, 5])
    >>> actual = cudf.Index([1, 2, 3, 4, 5])
    >>> cudf.testing.assert_index_equal(expected, actual)

    Named Index:

    >>> expected = cudf.Index([10, 20, 30], name='values')
    >>> actual = cudf.Index([10, 20, 30], name='values')
    >>> cudf.testing.assert_index_equal(expected, actual, check_names=True)

    RangeIndex (both cover 0-9):

    >>> expected = cudf.RangeIndex(10)
    >>> actual = cudf.RangeIndex(start=0, stop=10, step=1)
    >>> cudf.testing.assert_index_equal(expected, actual)

    DatetimeIndex:

    >>> dates = ['2023-01-01', '2023-01-02', '2023-01-03']
    >>> expected = cudf.DatetimeIndex(dates)
    >>> actual = cudf.DatetimeIndex(dates)
    >>> cudf.testing.assert_index_equal(expected, actual)

    CategoricalIndex (both sides share the same declared categories):

    >>> categories = ['red', 'blue', 'green']
    >>> expected = cudf.CategoricalIndex(['red', 'blue', 'red'],
    ...                                  categories=categories)
    >>> actual = cudf.CategoricalIndex(['red', 'blue', 'red'],
    ...                                categories=categories)
    >>> cudf.testing.assert_index_equal(expected, actual, check_categorical=True)

    MultiIndex:

    >>> arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
    >>> expected = cudf.MultiIndex.from_arrays(arrays, names=['letter', 'number'])
    >>> actual = cudf.MultiIndex.from_arrays(arrays, names=['letter', 'number'])
    >>> cudf.testing.assert_index_equal(expected, actual, check_names=True)

    IntervalIndex:

    >>> expected = cudf.interval_range(0, 10, periods=5)
    >>> actual = cudf.interval_range(0, 10, periods=5)
    >>> cudf.testing.assert_index_equal(expected, actual)

    Float Index with a tolerance:

    >>> expected = cudf.Index([1.1, 2.2, 3.3])
    >>> actual = cudf.Index([1.100001, 2.200001, 3.300001])
    >>> cudf.testing.assert_index_equal(expected, actual, rtol=1e-4)
    """

Advanced Testing Patterns

Parameterized Testing

import pytest
import cudf
import cudf.testing

class TestDataFrameOperations:
    """Sample pytest suite built on the cuDF testing helpers."""

    @pytest.mark.parametrize(
        "data",
        [
            {'A': [1, 2, 3], 'B': [4, 5, 6]},
            {'x': [1.1, 2.2], 'y': [3.3, 4.4]},
            {'str_col': ['a', 'b', 'c']},
        ],
    )
    def test_dataframe_creation(self, data):
        """Two DataFrames built from the same mapping compare equal.

        Parametrized over integer, float and string column data.
        """
        built, reference = cudf.DataFrame(data), cudf.DataFrame(data)
        cudf.testing.assert_frame_equal(built, reference)

    @pytest.mark.parametrize(
        "dtype",
        ['int32', 'int64', 'float32', 'float64'],
    )
    def test_series_dtypes(self, dtype):
        """Two Series built with the same numeric dtype compare equal,
        dtype check included."""
        values = [1, 2, 3, 4, 5]
        built = cudf.Series(values, dtype=dtype)
        reference = cudf.Series(values, dtype=dtype)
        cudf.testing.assert_series_equal(built, reference, check_dtype=True)

GPU Memory Testing

import cudf
import cudf.testing

def test_large_dataframe_operations():
    """Check a groupby-sum over a large DataFrame against a direct sum.

    Builds a one-million-row frame with two integer columns ('A', 'B')
    and one string column ('C'), groups by 'A', sums 'B' per group, and
    verifies that the per-group sums add up to the plain sum of 'B'.

    Raises:
        AssertionError: If the grouped result has more rows than the
            input or the totals disagree.
    """

    # Create large DataFrame
    n_rows = 1_000_000
    df = cudf.DataFrame({
        'A': range(n_rows),
        'B': range(n_rows, 2 * n_rows),
        'C': [f'str_{i}' for i in range(n_rows)]
    })

    # Aggregate only the numeric column under test. A blanket
    # ``groupby('A').sum()`` would also try to sum the string column
    # 'C', which is not what this test verifies.
    grouped = df.groupby('A')[['B']].sum()
    expected_b_sum = df['B'].sum()  # All B values summed

    # Grouping can only merge rows, never create new ones.
    assert len(grouped) <= n_rows  # Sanity check
    cudf.testing.assert_eq(grouped['B'].sum(), expected_b_sum)

def test_memory_efficient_operations():
    """Column selection yields a column equal to the one it came from.

    Only value equality is asserted here. Whether the selection shares
    the underlying GPU buffer with the source frame is NOT verified;
    that would need dedicated GPU memory inspection.
    """
    source_frame = cudf.DataFrame({'x': range(100000)})

    # Select the single column as a one-column DataFrame.
    selected = source_frame[['x']]

    cudf.testing.assert_series_equal(source_frame['x'], selected['x'])

Error Condition Testing

import pytest
import cudf
import cudf.testing

def test_assertion_errors():
    """The assertion helpers raise AssertionError on mismatching data.

    Covers differing DataFrame values and differing Series dtypes, and
    confirms the dtype mismatch is accepted once dtype checking is off.
    """
    frame_a = cudf.DataFrame({'A': [1, 2, 3]})
    frame_b = cudf.DataFrame({'A': [4, 5, 6]})

    # Different values must be reported.
    with pytest.raises(AssertionError):
        cudf.testing.assert_frame_equal(frame_a, frame_b)

    # Same values, different integer widths.
    narrow, wide = (
        cudf.Series([1, 2, 3], dtype=kind) for kind in ('int32', 'int64')
    )

    # Strict dtype checking rejects the pair ...
    with pytest.raises(AssertionError):
        cudf.testing.assert_series_equal(narrow, wide, check_dtype=True)

    # ... while relaxed dtype checking accepts it.
    cudf.testing.assert_series_equal(narrow, wide, check_dtype=False)

def test_tolerance_behavior():
    """Relative tolerance accepts tiny deviations and rejects large ones."""
    tolerance = 1e-6

    baseline = cudf.Series([1.0, 2.0, 3.0])
    nearly_equal = cudf.Series([1.0000001, 2.0000001, 3.0000001])
    clearly_off = cudf.Series([1.1, 2.1, 3.1])

    # Deviations of 1e-7 are inside the tolerance, so this must pass.
    cudf.testing.assert_series_equal(baseline, nearly_equal, rtol=tolerance)

    # Deviations of 0.1 are far outside the tolerance, so this must fail.
    with pytest.raises(AssertionError):
        cudf.testing.assert_series_equal(baseline, clearly_off, rtol=tolerance)

Cross-Platform Testing

import cudf
import pandas as pd
import cudf.testing

def test_cudf_pandas_compatibility():
    """Check that cuDF and pandas produce equivalent groupby-sum results.

    The same data is loaded into a cuDF DataFrame and a pandas
    DataFrame, the same aggregation is run on each, and the two results
    are compared.

    Raises:
        AssertionError: If the cuDF result differs from the pandas one.
    """

    # Create equivalent data in both libraries
    data = {'A': [1, 2, 3, 4, 5], 'B': [10, 20, 30, 40, 50]}
    cudf_df = cudf.DataFrame(data)
    pandas_df = pd.DataFrame(data)

    # Perform same operation on both
    cudf_result = cudf_df.groupby('A').sum()
    pandas_result = pandas_df.groupby('A').sum()

    # Use assert_eq for the cross-library comparison: it is the helper
    # documented as accepting both cuDF and pandas objects, whereas
    # assert_frame_equal documents both arguments as DataFrame.
    cudf.testing.assert_eq(cudf_result, pandas_result)
    
def test_round_trip_conversion():
    """A cuDF -> pandas -> cuDF round trip leaves the DataFrame unchanged.

    Exercised with integer, float and string columns.
    """
    columns = {
        'ints': [1, 2, 3],
        'floats': [1.1, 2.2, 3.3],
        'strings': ['a', 'b', 'c'],
    }
    original = cudf.DataFrame(columns)

    # Move the data to pandas, then bring it straight back to cuDF.
    restored = cudf.from_pandas(original.to_pandas())

    # The restored frame must match the starting one.
    cudf.testing.assert_frame_equal(original, restored)

Performance Considerations

GPU Testing Efficiency

- Minimize Data Transfer: Keep test data on the GPU when possible rather than copying it to and from host memory
- Batch Assertions: Combine related checks in a single test function so the test data is built only once
- Memory Management: Choose data sizes that fit comfortably in GPU memory so tests behave reproducibly
- Parallel Testing: Keep tests independent of one another so they can run in parallel

Best Practices

- Use Appropriate Tolerances: Set rtol/atol according to the precision you expect from the computation
- Check Dtypes When Relevant: Use check_dtype=True for type-sensitive tests
- Test Edge Cases: Include empty DataFrames, NaN values, and boundary conditions
- Cross-Library Compatibility: Compare cuDF results against their pandas equivalents
- Memory Cleanup: Release large test objects once a test is done so they can be garbage collected