or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-operations.md data-io.md data-type-accessors.md groupby-operations.md index.md

docs/data-type-accessors.md

0

# Data Type Accessors

1

2

Specialized accessor methods for complex cuDF data types including list and struct columns, providing GPU-accelerated operations on nested data structures that are unique to cuDF's columnar format.

3

4

## Capabilities

5

6

### List Column Accessors

7

8

Operations for list-type columns that contain arrays or sequences as individual cell values, enabling efficient manipulation of nested array data on the GPU.

9

10

```python { .api }

11

class ListMethods:

12

"""

13

Accessor methods for Series containing list-type data.

14

15

Accessed via the Series.list property on a list-dtype Series.

16

Provides GPU-accelerated operations on list columns.

17

"""

18

19

def __init__(self, d_series):

20

"""

21

Initialize list accessor.

22

23

Parameters:

24

- d_series: Series - Dask-cuDF Series with list dtype

25

"""

26

27

def len(self):

28

"""

29

Compute the length of each list element in the Series.

30

31

Returns the number of elements in each list, with null values

32

for null lists.

33

34

Returns:

35

Series - Integer Series with list lengths

36

37

Example:

38

>>> s = cudf.Series([[1, 2, 3], None, [4, 5]])

39

>>> ds = dask_cudf.from_cudf(s, 2)

40

>>> ds.list.len().compute()

41

0 3

42

1 <NA>

43

2 2

44

dtype: int32

45

"""

46

47

def contains(self, search_key):

48

"""

49

Check if each list contains the specified scalar value.

50

51

Creates boolean Series indicating whether the search key

52

is present in each list element.

53

54

Parameters:

55

- search_key: scalar - Value to search for in each list

56

57

Returns:

58

Series - Boolean Series indicating containment

59

60

Example:

61

>>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]])

62

>>> ds = dask_cudf.from_cudf(s, 2)

63

>>> ds.list.contains(4).compute()

64

0 False

65

1 True

66

2 True

67

dtype: bool

68

"""

69

70

def get(self, index):

71

"""

72

Extract element at specified index from each list.

73

74

Supports negative indexing for accessing elements from the end.

75

Returns null for out-of-bounds indices.

76

77

Parameters:

78

- index: int - Index position to extract (supports negative indexing)

79

80

Returns:

81

Series - Series with extracted elements

82

83

Example:

84

>>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]])

85

>>> ds = dask_cudf.from_cudf(s, 2)

86

>>> ds.list.get(-1).compute() # Last element

87

0 3

88

1 5

89

2 6

90

dtype: int64

91

"""

92

93

@property

94

def leaves(self):

95

"""

96

Extract all leaf elements from nested lists as a flat Series.

97

98

For nested list structures, returns the innermost elements

99

as a flat Series with one value per row.

100

101

Returns:

102

Series - Flattened Series of leaf values

103

104

Example:

105

>>> s = cudf.Series([[[1, None], [3, 4]], None, [[5, 6]]])

106

>>> ds = dask_cudf.from_cudf(s, 2)

107

>>> ds.list.leaves.compute()

108

0 1

109

1 <NA>

110

2 3

111

3 4

112

4 5

113

5 6

114

dtype: int64

115

"""

116

117

def take(self, lists_indices):

118

"""

119

Collect list elements based on index arrays.

120

121

For each row, extracts elements at positions specified

122

by the corresponding index list.

123

124

Parameters:

125

- lists_indices: list of lists - Index positions for each row

126

127

Returns:

128

Series - Series with collected elements as lists

129

130

Example:

131

>>> s = cudf.Series([[1, 2, 3], None, [4, 5]])

132

>>> ds = dask_cudf.from_cudf(s, 2)

133

>>> ds.list.take([[0, 1], [], []]).compute()

134

0 [1, 2]

135

1 None

136

2 []

137

dtype: list

138

"""

139

140

def unique(self):

141

"""

142

Get unique elements within each list.

143

144

Returns unique elements for each list, removing duplicates.

145

Order of unique elements is not guaranteed.

146

147

Returns:

148

Series - Series with unique elements as lists

149

150

Example:

151

>>> s = cudf.Series([[1, 1, 2, None, None], None, [4, 4], []])

152

>>> ds = dask_cudf.from_cudf(s, 2)

153

>>> ds.list.unique().compute() # Order not guaranteed

154

0 [1.0, 2.0, nan]

155

1 None

156

2 [4.0]

157

3 []

158

dtype: list

159

"""

160

161

def sort_values(self, ascending=True, inplace=False, kind="quicksort",

162

na_position="last", ignore_index=False):

163

"""

164

Sort elements within each list.

165

166

Sorts the contents of each list according to specified criteria.

167

168

Parameters:

169

- ascending: bool, default True - Sort order

170

- inplace: bool, default False - Modify in place (not supported)

171

- kind: str, default "quicksort" - Sort algorithm (not supported)

172

- na_position: str, default "last" - Null placement ('first' or 'last')

173

- ignore_index: bool, default False - Reset result index

174

175

Returns:

176

Series - Series with sorted lists

177

178

Notes:

179

- The inplace and kind parameters are not supported in cuDF

180

181

Example:

182

>>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]])

183

>>> ds = dask_cudf.from_cudf(s, 2)

184

>>> ds.list.sort_values().compute()

185

0 [2.0, 4.0, 9.0, nan]

186

1 [2.0, 8.0, 8.0]

187

2 [1.0, 2.0]

188

dtype: list

189

"""

190

```

191

192

### Struct Column Accessors

193

194

Operations for struct-type columns containing record-like data with named fields, enabling efficient manipulation of structured data on the GPU.

195

196

```python { .api }

197

class StructMethods:

198

"""

199

Accessor methods for Series containing struct-type data.

200

201

Accessed via the Series.struct property on a struct-dtype Series.

202

Provides GPU-accelerated operations on structured data.

203

"""

204

205

def __init__(self, d_series):

206

"""

207

Initialize struct accessor.

208

209

Parameters:

210

- d_series: Series - Dask-cuDF Series with struct dtype

211

"""

212

213

def field(self, key):

214

"""

215

Extract a specific field from struct column.

216

217

Extracts the specified field by name or index position,

218

returning a new Series with the field values.

219

220

Parameters:

221

- key: str or int - Field name or index position

222

223

Returns:

224

Series - Series containing the extracted field values

225

226

Examples:

227

>>> s = cudf.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])

228

>>> ds = dask_cudf.from_cudf(s, 2)

229

>>> ds.struct.field('a').compute()

230

0 1

231

1 3

232

dtype: int64

233

234

>>> ds.struct.field(0).compute() # First field

235

0 1

236

1 3

237

dtype: int64

238

"""

239

240

def explode(self):

241

"""

242

Explode struct column into separate DataFrame columns.

243

244

Creates a DataFrame with one column per struct field,

245

expanding the struct into a tabular format.

246

247

Returns:

248

DataFrame - DataFrame with struct fields as columns

249

250

Example:

251

>>> s = cudf.Series([

252

... {'a': 42, 'b': 'str1', 'c': [-1]},

253

... {'a': 0, 'b': 'str2', 'c': [400, 500]},

254

... {'a': 7, 'b': '', 'c': []}

255

... ])

256

>>> ds = dask_cudf.from_cudf(s, 2)

257

>>> ds.struct.explode().compute()

258

a b c

259

0 42 str1 [-1]

260

1 0 str2 [400, 500]

261

2 7 []

262

"""

263

```

264

265

## Usage Examples

266

267

### Working with List Columns

268

269

```python

270

import cudf

271

import dask_cudf

272

273

# Create DataFrame with list column

274

df = cudf.DataFrame({

275

'id': [1, 2, 3, 4],

276

'values': [[1, 2, 3], [4, 5], [], [6, 7, 8, 9]]

277

})

278

279

ddf = dask_cudf.from_cudf(df, npartitions=2)

280

281

# Get list lengths

282

lengths = ddf['values'].list.len()

283

print("List lengths:")

284

print(lengths.compute())

285

286

# Check if lists contain specific value

287

contains_5 = ddf['values'].list.contains(5)

288

print("\nContains 5:")

289

print(contains_5.compute())

290

291

# Get first element of each list

292

first_elements = ddf['values'].list.get(0)

293

print("\nFirst elements:")

294

print(first_elements.compute())

295

296

# Sort values within each list

297

sorted_lists = ddf['values'].list.sort_values(ascending=False)

298

print("\nSorted lists (descending):")

299

print(sorted_lists.compute())

300

```

301

302

### Working with Nested List Data

303

304

```python

305

# Create nested list data

306

nested_df = cudf.DataFrame({

307

'nested_lists': [

308

[[1, 2], [3, 4, 5]],

309

[[6], [7, 8]],

310

[[], [9, 10, 11]]

311

]

312

})

313

314

ddf_nested = dask_cudf.from_cudf(nested_df, npartitions=1)

315

316

# Extract all leaf values

317

leaves = ddf_nested['nested_lists'].list.leaves

318

print("Leaf values:")

319

print(leaves.compute())

320

321

# Custom indexing with take

322

indices = [[0], [1, 0], [1]] # Take different elements from each row

323

taken = ddf_nested['nested_lists'].list.take(indices)

324

print("\nTaken elements:")

325

print(taken.compute())

326

```

327

328

### Working with Struct Columns

329

330

```python

331

# Create DataFrame with struct column

332

struct_data = cudf.Series([

333

{'name': 'Alice', 'age': 25, 'city': 'NY'},

334

{'name': 'Bob', 'age': 30, 'city': 'LA'},

335

{'name': 'Charlie', 'age': 35, 'city': 'Chicago'}

336

])

337

338

df_struct = cudf.DataFrame({'person': struct_data})

339

ddf_struct = dask_cudf.from_cudf(df_struct, npartitions=2)

340

341

# Extract specific fields

342

names = ddf_struct['person'].struct.field('name')

343

ages = ddf_struct['person'].struct.field('age')

344

345

print("Names:")

346

print(names.compute())

347

print("\nAges:")

348

print(ages.compute())

349

350

# Explode struct into DataFrame

351

exploded = ddf_struct['person'].struct.explode()

352

print("\nExploded struct:")

353

print(exploded.compute())

354

```

355

356

### Complex Data Processing Pipeline

357

358

```python

359

# Complex pipeline with mixed data types

360

complex_df = cudf.DataFrame({

361

'group': ['A', 'B', 'A', 'B'],

362

'measurements': [

363

[1.1, 2.2, 3.3],

364

[4.4, 5.5],

365

[6.6, 7.7, 8.8, 9.9],

366

[10.0]

367

],

368

'metadata': [

369

{'sensor': 'temp', 'unit': 'C'},

370

{'sensor': 'humidity', 'unit': '%'},

371

{'sensor': 'pressure', 'unit': 'hPa'},

372

{'sensor': 'wind', 'unit': 'm/s'}

373

]

374

})

375

376

ddf_complex = dask_cudf.from_cudf(complex_df, npartitions=2)

377

378

# Extract sensor types

379

sensors = ddf_complex['metadata'].struct.field('sensor')

380

381

# Count the measurements in each row, then find the largest count per group

382

avg_measurements = ddf_complex['measurements'].list.len()

383

max_measurements = ddf_complex.groupby('group').apply(

384

lambda x: x['measurements'].list.len().max()

385

)

386

387

print("Sensor types:")

388

print(sensors.compute())

389

print("\nMeasurement counts by group:")

390

print(max_measurements.compute())

391

392

# Filter based on list length and struct content

393

filtered = ddf_complex[

394

(ddf_complex['measurements'].list.len() > 2) &

395

(ddf_complex['metadata'].struct.field('sensor') != 'wind')

396

]

397

398

print("\nFiltered data:")

399

print(filtered.compute())

400

```