Dask cuDF - A GPU Backend for Dask DataFrame providing GPU-accelerated parallel and larger-than-memory DataFrame computing
—
Specialized accessor methods for complex cuDF data types including list and struct columns, providing GPU-accelerated operations on nested data structures that are unique to cuDF's columnar format.
Operations for list-type columns that contain arrays or sequences as individual cell values, enabling efficient manipulation of nested array data on the GPU.
# API reference stub: every method body below consists of its docstring only.
class ListMethods:
    """
    Accessor methods for Series containing list-type data.

    Accessed via the ``Series.list`` property on list-dtype Series.
    Provides GPU-accelerated operations on list columns.
    """

    def __init__(self, d_series):
        """
        Initialize list accessor.

        Parameters:
        - d_series: Series - Dask-cuDF Series with list dtype
        """

    def len(self):
        """
        Compute the length of each list element in the Series.

        Returns the number of elements in each list, with null values
        for null lists.

        Returns:
        Series - Integer Series with list lengths

        Example:
        >>> s = cudf.Series([[1, 2, 3], None, [4, 5]])
        >>> ds = dask_cudf.from_cudf(s, 2)
        >>> ds.list.len().compute()
        0       3
        1    <NA>
        2       2
        dtype: int32
        """

    def contains(self, search_key):
        """
        Check if each list contains the specified scalar value.

        Creates a boolean Series indicating whether the search key
        is present in each list element.

        Parameters:
        - search_key: scalar - Value to search for in each list

        Returns:
        Series - Boolean Series indicating containment

        Example:
        >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]])
        >>> ds = dask_cudf.from_cudf(s, 2)
        >>> ds.list.contains(4).compute()
        0    False
        1     True
        2     True
        dtype: bool
        """

    def get(self, index):
        """
        Extract the element at the specified index from each list.

        Supports negative indexing for accessing elements from the end.
        Returns null for out-of-bounds indices.

        Parameters:
        - index: int - Index position to extract (supports negative indexing)

        Returns:
        Series - Series with extracted elements

        Example:
        >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]])
        >>> ds = dask_cudf.from_cudf(s, 2)
        >>> ds.list.get(-1).compute()  # Last element
        0    3
        1    5
        2    6
        dtype: int64
        """

    @property
    def leaves(self):
        """
        Extract all leaf elements from nested lists as a flat Series.

        For nested list structures, returns the innermost elements
        as a flat Series with one row per leaf value. In the example
        below, three input rows yield six output rows.

        Returns:
        Series - Flattened Series of leaf values

        Example:
        >>> s = cudf.Series([[[1, None], [3, 4]], None, [[5, 6]]])
        >>> ds = dask_cudf.from_cudf(s, 2)
        >>> ds.list.leaves.compute()
        0       1
        1    <NA>
        2       3
        3       4
        4       5
        5       6
        dtype: int64
        """

    def take(self, lists_indices):
        """
        Collect list elements based on index arrays.

        For each row, extracts the elements at the positions specified
        by the corresponding index list.

        Parameters:
        - lists_indices: list of lists - Index positions for each row

        Returns:
        Series - Series with collected elements as lists

        Example:
        >>> s = cudf.Series([[1, 2, 3], None, [4, 5]])
        >>> ds = dask_cudf.from_cudf(s, 2)
        >>> ds.list.take([[0, 1], [], []]).compute()
        0    [1, 2]
        1      None
        2        []
        dtype: list
        """

    def unique(self):
        """
        Get unique elements within each list.

        Returns the unique elements of each list, removing duplicates.
        Order of unique elements is not guaranteed.

        Returns:
        Series - Series with unique elements as lists

        Example:
        >>> s = cudf.Series([[1, 1, 2, None, None], None, [4, 4], []])
        >>> ds = dask_cudf.from_cudf(s, 2)
        >>> ds.list.unique().compute()  # Order not guaranteed
        0    [1.0, 2.0, nan]
        1               None
        2              [4.0]
        3                 []
        dtype: list
        """

    def sort_values(
        self,
        ascending=True,
        inplace=False,
        kind="quicksort",
        na_position="last",
        ignore_index=False,
    ):
        """
        Sort elements within each list.

        Sorts the contents of each list according to the specified criteria.

        Parameters:
        - ascending: bool, default True - Sort order
        - inplace: bool, default False - Modify in place (not supported)
        - kind: str, default "quicksort" - Sort algorithm (not supported)
        - na_position: str, default "last" - Null placement ('first' or 'last')
        - ignore_index: bool, default False - Reset result index

        Returns:
        Series - Series with sorted lists

        Notes:
        - inplace and kind parameters not supported in cuDF

        Example:
        >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]])
        >>> ds = dask_cudf.from_cudf(s, 2)
        >>> ds.list.sort_values().compute()
        0    [2.0, 4.0, 9.0, nan]
        1         [2.0, 8.0, 8.0]
        2              [1.0, 2.0]
        dtype: list
        """


# Operations for struct-type columns containing record-like data with named fields, enabling efficient manipulation of structured data on the GPU.
# API reference stub: every method body below consists of its docstring only.
class StructMethods:
    """
    Accessor methods for Series containing struct-type data.

    Accessed via the ``Series.struct`` property on struct-dtype Series.
    Provides GPU-accelerated operations on structured data.
    """

    def __init__(self, d_series):
        """
        Initialize struct accessor.

        Parameters:
        - d_series: Series - Dask-cuDF Series with struct dtype
        """

    def field(self, key):
        """
        Extract a specific field from a struct column.

        Extracts the specified field by name or index position,
        returning a new Series with the field values.

        Parameters:
        - key: str or int - Field name or index position

        Returns:
        Series - Series containing the extracted field values

        Examples:
        >>> s = cudf.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
        >>> ds = dask_cudf.from_cudf(s, 2)
        >>> ds.struct.field('a').compute()
        0    1
        1    3
        dtype: int64
        >>> ds.struct.field(0).compute()  # First field
        0    1
        1    3
        dtype: int64
        """

    def explode(self):
        """
        Explode a struct column into separate DataFrame columns.

        Creates a DataFrame with one column per struct field,
        expanding the struct into a tabular format.

        Returns:
        DataFrame - DataFrame with struct fields as columns

        Example:
        >>> s = cudf.Series([
        ...     {'a': 42, 'b': 'str1', 'c': [-1]},
        ...     {'a': 0, 'b': 'str2', 'c': [400, 500]},
        ...     {'a': 7, 'b': '', 'c': []}
        ... ])
        >>> ds = dask_cudf.from_cudf(s, 2)
        >>> ds.struct.explode().compute()
            a     b           c
        0  42  str1        [-1]
        1   0  str2  [400, 500]
        2   7                []
        """


import cudf
import dask_cudf
# Build a four-row frame whose 'values' column stores one list per row,
# then split it across two Dask partitions.
df = cudf.DataFrame(
    {
        'id': [1, 2, 3, 4],
        'values': [[1, 2, 3], [4, 5], [], [6, 7, 8, 9]],
    }
)
ddf = dask_cudf.from_cudf(df, npartitions=2)

# All list operations below act on this single column.
values_col = ddf['values']

# Number of elements held by each list
lengths = values_col.list.len()
print("List lengths:")
print(lengths.compute())

# Flag the rows whose list includes the value 5
contains_5 = values_col.list.contains(5)
print("\nContains 5:")
print(contains_5.compute())

# Element at position 0 of every list
first_elements = values_col.list.get(0)
print("\nFirst elements:")
print(first_elements.compute())

# Order the contents of each list from largest to smallest
sorted_lists = values_col.list.sort_values(ascending=False)
print("\nSorted lists (descending):")
print(sorted_lists.compute())

# Create nested list data
# Three rows, each holding a list of lists (two levels of nesting)
nested_rows = [
    [[1, 2], [3, 4, 5]],
    [[6], [7, 8]],
    [[], [9, 10, 11]],
]
nested_df = cudf.DataFrame({'nested_lists': nested_rows})
ddf_nested = dask_cudf.from_cudf(nested_df, npartitions=1)

# Flatten the nesting down to the innermost values
leaves = ddf_nested['nested_lists'].list.leaves
print("Leaf values:")
print(leaves.compute())

# Pick elements row by row: one list of positions per input row
indices = [[0], [1, 0], [1]]
taken = ddf_nested['nested_lists'].list.take(indices)
print("\nTaken elements:")
print(taken.compute())

# Create DataFrame with struct column
# Three person records; each dict becomes one struct value
people = [
    {'name': 'Alice', 'age': 25, 'city': 'NY'},
    {'name': 'Bob', 'age': 30, 'city': 'LA'},
    {'name': 'Charlie', 'age': 35, 'city': 'Chicago'},
]
struct_data = cudf.Series(people)
df_struct = cudf.DataFrame({'person': struct_data})
ddf_struct = dask_cudf.from_cudf(df_struct, npartitions=2)

# Pull individual fields out of the struct column by name
person_col = ddf_struct['person']
names = person_col.struct.field('name')
ages = person_col.struct.field('age')
print("Names:")
print(names.compute())
print("\nAges:")
print(ages.compute())

# Expand the struct so that every field becomes its own column
exploded = person_col.struct.explode()
print("\nExploded struct:")
print(exploded.compute())

# Complex pipeline with mixed data types
# Frame mixing a plain string column, a list column and a struct column
complex_df = cudf.DataFrame({
    'group': ['A', 'B', 'A', 'B'],
    'measurements': [
        [1.1, 2.2, 3.3],
        [4.4, 5.5],
        [6.6, 7.7, 8.8, 9.9],
        [10.0],
    ],
    'metadata': [
        {'sensor': 'temp', 'unit': 'C'},
        {'sensor': 'humidity', 'unit': '%'},
        {'sensor': 'pressure', 'unit': 'hPa'},
        {'sensor': 'wind', 'unit': 'm/s'},
    ],
})
ddf_complex = dask_cudf.from_cudf(complex_df, npartitions=2)

# Extract sensor types
sensors = ddf_complex['metadata'].struct.field('sensor')

# Number of measurements in each row. This is a count, not an average,
# so it is named accordingly and reused by the filter further down.
measurement_counts = ddf_complex['measurements'].list.len()

# Largest per-row measurement count within each group.
# NOTE(review): no `meta=` is passed to groupby.apply here; Dask's
# documented behaviour is to infer it -- confirm the inferred output
# type is acceptable or pass `meta=` explicitly.
max_measurements = ddf_complex.groupby('group').apply(
    lambda x: x['measurements'].list.len().max()
)

print("Sensor types:")
print(sensors.compute())
print("\nMeasurement counts by group:")
print(max_measurements.compute())

# Filter based on list length and struct content, reusing the lazy
# series defined above instead of rebuilding the same expressions.
filtered = ddf_complex[(measurement_counts > 2) & (sensors != 'wind')]
print("\nFiltered data:")
print(filtered.compute())

Install with Tessl CLI:
npx tessl i tessl/pypi-dask-cudf