Tessl Tile for pypi/dask-cudf@24.12.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

core-operations.md data-io.md data-type-accessors.md groupby-operations.md index.md

data-type-accessors.md docs/

0
# Data Type Accessors
1

2
Specialized accessor methods for cuDF's nested data types: list and struct columns. They provide GPU-accelerated operations on nested data structures that are specific to cuDF's columnar format.
3

4
## Capabilities
5

6
### List Column Accessors
7

8
Operations for list-type columns that contain arrays or sequences as individual cell values, enabling efficient manipulation of nested array data on the GPU.
9

10
```python { .api }
11
class ListMethods:
12
    """
13
    Accessor methods for Series containing list-type data.
14
    
15
    Accessed via Series.list property on list-dtype Series.
16
    Provides GPU-accelerated operations on list columns.
17
    """
18
    
19
    def __init__(self, d_series):
20
        """
21
        Initialize list accessor.
22
        
23
        Parameters:
24
        - d_series: Series - Dask-cuDF Series with list dtype
25
        """
26
    
27
    def len(self):
28
        """
29
        Compute the length of each list element in the Series.
30
        
31
        Returns the number of elements in each list, with null values
32
        for null lists.
33
        
34
        Returns:
35
        Series - Integer Series with list lengths
36
        
37
        Example:
38
        >>> s = cudf.Series([[1, 2, 3], None, [4, 5]])
39
        >>> ds = dask_cudf.from_cudf(s, 2)
40
        >>> ds.list.len().compute()
41
        0       3
42
        1    <NA>
43
        2       2
44
        dtype: int32
45
        """
46
    
47
    def contains(self, search_key):
48
        """
49
        Check if each list contains the specified scalar value.
50
        
51
        Creates boolean Series indicating whether the search key
52
        is present in each list element.
53
        
54
        Parameters:
55
        - search_key: scalar - Value to search for in each list
56
        
57
        Returns:
58
        Series - Boolean Series indicating containment
59
        
60
        Example:
61
        >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]])
62
        >>> ds = dask_cudf.from_cudf(s, 2)
63
        >>> ds.list.contains(4).compute()
64
        0    False
65
        1     True
66
        2     True
67
        dtype: bool
68
        """
69
    
70
    def get(self, index):
71
        """
72
        Extract element at specified index from each list.
73
        
74
        Supports negative indexing for accessing elements from the end.
75
        Returns null for out-of-bounds indices.
76
        
77
        Parameters:
78
        - index: int - Index position to extract (supports negative indexing)
79
        
80
        Returns:
81
        Series - Series with extracted elements
82
        
83
        Example:
84
        >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]])
85
        >>> ds = dask_cudf.from_cudf(s, 2)
86
        >>> ds.list.get(-1).compute()  # Last element
87
        0    3
88
        1    5
89
        2    6
90
        dtype: int64
91
        """
92
    
93
    @property
94
    def leaves(self):
95
        """
96
        Extract all leaf elements from nested lists as flat Series.
97
        
98
        For nested list structures, returns the innermost elements
99
        as a flat Series with one leaf value per output row (the result can be longer than the input).
100
        
101
        Returns:
102
        Series - Flattened Series of leaf values
103
        
104
        Example:
105
        >>> s = cudf.Series([[[1, None], [3, 4]], None, [[5, 6]]])
106
        >>> ds = dask_cudf.from_cudf(s, 2)
107
        >>> ds.list.leaves.compute()
108
        0       1
109
        1    <NA>
110
        2       3
111
        3       4
112
        4       5
113
        5       6
114
        dtype: int64
115
        """
116
    
117
    def take(self, lists_indices):
118
        """
119
        Collect list elements based on index arrays.
120
        
121
        For each row, extracts elements at positions specified
122
        by the corresponding index list.
123
        
124
        Parameters:
125
        - lists_indices: list of lists - Index positions for each row
126
        
127
        Returns:
128
        Series - Series with collected elements as lists
129
        
130
        Example:
131
        >>> s = cudf.Series([[1, 2, 3], None, [4, 5]])
132
        >>> ds = dask_cudf.from_cudf(s, 2)
133
        >>> ds.list.take([[0, 1], [], []]).compute()
134
        0    [1, 2]
135
        1      None
136
        2        []
137
        dtype: list
138
        """
139
    
140
    def unique(self):
141
        """
142
        Get unique elements within each list.
143
        
144
        Returns unique elements for each list, removing duplicates.
145
        Order of unique elements is not guaranteed.
146
        
147
        Returns:
148
        Series - Series with unique elements as lists
149
        
150
        Example:
151
        >>> s = cudf.Series([[1, 1, 2, None, None], None, [4, 4], []])
152
        >>> ds = dask_cudf.from_cudf(s, 2)
153
        >>> ds.list.unique().compute()  # Order not guaranteed
154
        0              [1.0, 2.0, nan]
155
        1                         None
156
        2                        [4.0]
157
        3                           []
158
        dtype: list
159
        """
160
    
161
    def sort_values(self, ascending=True, inplace=False, kind="quicksort", 
162
                   na_position="last", ignore_index=False):
163
        """
164
        Sort elements within each list.
165
        
166
        Sorts the contents of each list according to specified criteria.
167
        
168
        Parameters:
169
        - ascending: bool, default True - Sort order
170
        - inplace: bool, default False - Modify in place (not supported)
171
        - kind: str, default "quicksort" - Sort algorithm (not supported)
172
        - na_position: str, default "last" - Null placement ('first' or 'last')
173
        - ignore_index: bool, default False - Reset result index
174
        
175
        Returns:
176
        Series - Series with sorted lists
177
        
178
        Notes:
179
        - inplace and kind parameters not supported in cuDF
180
        
181
        Example:
182
        >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]])
183
        >>> ds = dask_cudf.from_cudf(s, 2)
184
        >>> ds.list.sort_values().compute()
185
        0    [2.0, 4.0, 9.0, nan]
186
        1         [2.0, 8.0, 8.0]
187
        2              [1.0, 2.0]
188
        dtype: list
189
        """
190
```
191

192
### Struct Column Accessors
193

194
Operations for struct-type columns containing record-like data with named fields, enabling efficient manipulation of structured data on the GPU.
195

196
```python { .api }
197
class StructMethods:
198
    """
199
    Accessor methods for Series containing struct-type data.
200
    
201
    Accessed via Series.struct property on struct-dtype Series.
202
    Provides GPU-accelerated operations on structured data.
203
    """
204
    
205
    def __init__(self, d_series):
206
        """
207
        Initialize struct accessor.
208
        
209
        Parameters:
210
        - d_series: Series - Dask-cuDF Series with struct dtype
211
        """
212
    
213
    def field(self, key):
214
        """
215
        Extract a specific field from struct column.
216
        
217
        Extracts the specified field by name or index position,
218
        returning a new Series with the field values.
219
        
220
        Parameters:
221
        - key: str or int - Field name or index position
222
        
223
        Returns:
224
        Series - Series containing the extracted field values
225
        
226
        Examples:
227
        >>> s = cudf.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
228
        >>> ds = dask_cudf.from_cudf(s, 2)
229
        >>> ds.struct.field('a').compute()
230
        0    1
231
        1    3
232
        dtype: int64
233
        
234
        >>> ds.struct.field(0).compute()  # First field
235
        0    1
236
        1    3
237
        dtype: int64
238
        """
239
    
240
    def explode(self):
241
        """
242
        Explode struct column into separate DataFrame columns.
243
        
244
        Creates a DataFrame with one column per struct field,
245
        expanding the struct into a tabular format.
246
        
247
        Returns:
248
        DataFrame - DataFrame with struct fields as columns
249
        
250
        Example:
251
        >>> s = cudf.Series([
252
        ...     {'a': 42, 'b': 'str1', 'c': [-1]},
253
        ...     {'a': 0,  'b': 'str2', 'c': [400, 500]},
254
        ...     {'a': 7,  'b': '',     'c': []}
255
        ... ])
256
        >>> ds = dask_cudf.from_cudf(s, 2)
257
        >>> ds.struct.explode().compute()
258
            a     b           c
259
        0  42  str1        [-1]
260
        1   0  str2  [400, 500]
261
        2   7                []
262
        """
263
```
264

265
## Usage Examples
266

267
### Working with List Columns
268

269
```python
270
import cudf
271
import dask_cudf
272

273
# Create DataFrame with list column
274
df = cudf.DataFrame({
275
    'id': [1, 2, 3, 4],
276
    'values': [[1, 2, 3], [4, 5], [], [6, 7, 8, 9]]
277
})
278

279
ddf = dask_cudf.from_cudf(df, npartitions=2)
280

281
# Get list lengths
282
lengths = ddf['values'].list.len()
283
print("List lengths:")
284
print(lengths.compute())
285

286
# Check if lists contain specific value
287
contains_5 = ddf['values'].list.contains(5)
288
print("\nContains 5:")
289
print(contains_5.compute())
290

291
# Get first element of each list
292
first_elements = ddf['values'].list.get(0)
293
print("\nFirst elements:")
294
print(first_elements.compute())
295

296
# Sort values within each list
297
sorted_lists = ddf['values'].list.sort_values(ascending=False)
298
print("\nSorted lists (descending):")
299
print(sorted_lists.compute())
300
```
301

302
### Working with Nested List Data
303

304
```python
305
# Create nested list data
306
nested_df = cudf.DataFrame({
307
    'nested_lists': [
308
        [[1, 2], [3, 4, 5]],
309
        [[6], [7, 8]],
310
        [[], [9, 10, 11]]
311
    ]
312
})
313

314
ddf_nested = dask_cudf.from_cudf(nested_df, npartitions=1)
315

316
# Extract all leaf values
317
leaves = ddf_nested['nested_lists'].list.leaves
318
print("Leaf values:")
319
print(leaves.compute())
320

321
# Custom indexing with take
322
indices = [[0], [1, 0], [1]]  # Take different elements from each row
323
taken = ddf_nested['nested_lists'].list.take(indices)
324
print("\nTaken elements:")
325
print(taken.compute())
326
```
327

328
### Working with Struct Columns
329

330
```python
331
# Create DataFrame with struct column
332
struct_data = cudf.Series([
333
    {'name': 'Alice', 'age': 25, 'city': 'NY'},
334
    {'name': 'Bob', 'age': 30, 'city': 'LA'},
335
    {'name': 'Charlie', 'age': 35, 'city': 'Chicago'}
336
])
337

338
df_struct = cudf.DataFrame({'person': struct_data})
339
ddf_struct = dask_cudf.from_cudf(df_struct, npartitions=2)
340

341
# Extract specific fields
342
names = ddf_struct['person'].struct.field('name')
343
ages = ddf_struct['person'].struct.field('age')
344

345
print("Names:")
346
print(names.compute())
347
print("\nAges:")
348
print(ages.compute())
349

350
# Explode struct into DataFrame
351
exploded = ddf_struct['person'].struct.explode()
352
print("\nExploded struct:")
353
print(exploded.compute())
354
```
355

356
### Complex Data Processing Pipeline
357

358
```python
359
# Complex pipeline with mixed data types
360
complex_df = cudf.DataFrame({
361
    'group': ['A', 'B', 'A', 'B'],
362
    'measurements': [
363
        [1.1, 2.2, 3.3],
364
        [4.4, 5.5],
365
        [6.6, 7.7, 8.8, 9.9],
366
        [10.0]
367
    ],
368
    'metadata': [
369
        {'sensor': 'temp', 'unit': 'C'},
370
        {'sensor': 'humidity', 'unit': '%'},
371
        {'sensor': 'pressure', 'unit': 'hPa'},
372
        {'sensor': 'wind', 'unit': 'm/s'}
373
    ]
374
})
375

376
ddf_complex = dask_cudf.from_cudf(complex_df, npartitions=2)
377

378
# Extract sensor types
379
sensors = ddf_complex['metadata'].struct.field('sensor')
380

381
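# Measurement statistics: per-row list lengths (`avg_measurements` holds counts, not averages), then the largest length within each group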
382
avg_measurements = ddf_complex['measurements'].list.len()
383
max_measurements = ddf_complex.groupby('group').apply(
384
    lambda x: x['measurements'].list.len().max()
385
)
386

387
print("Sensor types:")
388
print(sensors.compute())
389
print("\nMeasurement counts by group:")
390
print(max_measurements.compute())
391

392
# Filter based on list length and struct content
393
filtered = ddf_complex[
394
    (ddf_complex['measurements'].list.len() > 2) &
395
    (ddf_complex['metadata'].struct.field('sensor') != 'wind')
396
]
397

398
print("\nFiltered data:")
399
print(filtered.compute())
400
```

Version

Tile

Files

data-type-accessors.md docs/

data-type-accessors.md docs/