or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-operations.md · data-io.md · data-type-accessors.md · groupby-operations.md · index.md

docs/core-operations.md

0

# Core DataFrame Operations

1

2

Core functionality for creating, manipulating, and converting GPU-accelerated DataFrames with seamless Dask DataFrame API compatibility and distributed computing support.

3

4

## Capabilities

5

6

### DataFrame Creation and Conversion

7

8

Create Dask-cuDF collections from cuDF objects with automatic partitioning and memory-efficient data distribution across GPU workers.

9

10

```python { .api }

11

def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):

12

"""

13

Create a Dask-cuDF collection from a cuDF object.

14

15

Parameters:

16

- data: cudf.DataFrame, cudf.Series, or cudf.Index - Source cuDF object

17

- npartitions: int, optional - Number of partitions to create

18

- chunksize: int, optional - Approximate size of each partition in rows

19

- sort: bool, default True - Whether to sort by index

20

- name: str, optional - Name for the collection

21

22

Returns:

23

DataFrame, Series, or Index - Corresponding Dask-cuDF collection

24

25

Raises:

26

- NotImplementedError: If data has MultiIndex (not supported)

27

28

Notes:

29

- Either npartitions or chunksize should be specified

30

- Uses Dask's from_pandas internally with cuDF backend

31

"""

32

```

33

34

### Collection Classes

35

36

Primary collection types providing distributed GPU-accelerated data structures with Dask DataFrame compatibility.

37

38

```python { .api }

39

class DataFrame:

40

"""

41

A distributed Dask DataFrame backed by cuDF DataFrames.

42

43

Provides GPU-accelerated operations with pandas-like API.

44

Most Dask DataFrame operations are supported with GPU acceleration.

45

"""

46

47

def __init__(self, dsk, name, meta, divisions):

48

"""

49

Initialize DataFrame.

50

51

Parameters:

52

- dsk: dict - Task graph

53

- name: str - Collection name

54

- meta: cudf.DataFrame - Metadata object

55

- divisions: tuple - Index divisions

56

"""

57

58

class Series:

59

"""

60

A distributed Dask Series backed by cuDF Series.

61

62

Provides GPU-accelerated operations for single-column data.

63

"""

64

65

def __init__(self, dsk, name, meta, divisions):

66

"""

67

Initialize Series.

68

69

Parameters:

70

- dsk: dict - Task graph

71

- name: str - Collection name

72

- meta: cudf.Series - Metadata object

73

- divisions: tuple - Index divisions

74

"""

75

76

class Index:

77

"""

78

A distributed Dask Index backed by cuDF Index.

79

80

Provides GPU-accelerated index operations.

81

"""

82

83

def __init__(self, dsk, name, meta, divisions):

84

"""

85

Initialize Index.

86

87

Parameters:

88

- dsk: dict - Task graph

89

- name: str - Collection name

90

- meta: cudf.Index - Metadata object

91

- divisions: tuple - Index divisions

92

"""

93

```

94

95

### Concatenation Operations

96

97

Combine multiple DataFrames along specified axes with GPU acceleration and optimized memory management.

98

99

```python { .api }

100

def concat(dfs, axis=0, join='outer', ignore_index=False, **kwargs):

101

"""

102

Concatenate Dask-cuDF objects along axis.

103

104

This is an alias to dask.dataframe.concat that works with cuDF backend.

105

106

Parameters:

107

- dfs: sequence - Objects to concatenate

108

- axis: int, default 0 - Axis to concatenate along

109

- join: str, default 'outer' - How to handle indexes

110

- ignore_index: bool, default False - Reset index in result

111

- **kwargs: Additional arguments passed to cudf.concat

112

113

Returns:

114

DataFrame or Series - Concatenated result

115

"""

116

117

def from_delayed(dasks, meta=None, divisions=None, prefix='from-delayed', verify_meta=True):

118

"""

119

Create DataFrame from list of delayed objects.

120

121

This function is a thin wrapper around dask.dataframe.from_delayed,

122

creating Dask-cuDF collections from Dask delayed tasks that return cuDF objects.

123

124

Parameters:

125

- dasks: list of Delayed objects - Tasks that return cuDF DataFrames/Series

126

- meta: DataFrame or Series, optional - Metadata defining structure and dtypes

127

- divisions: sequence, optional - Index divisions for result

128

- prefix: str, default 'from-delayed' - Task name prefix in graph

129

- verify_meta: bool, default True - Verify metadata consistency

130

131

Returns:

132

DataFrame or Series - Dask-cuDF collection from delayed tasks

133

134

Notes:

135

- Automatically uses cuDF backend for delayed task results

136

- Tasks should return cuDF DataFrame or Series objects

137

- Meta parameter should match the structure of delayed task outputs

138

"""

139

```

140

141

## Usage Examples

142

143

### Basic DataFrame Creation

144

145

```python

146

import cudf

147

import dask_cudf

148

149

# Create cuDF DataFrame

150

cudf_df = cudf.DataFrame({

151

'a': [1, 2, 3, 4, 5, 6],

152

'b': [10, 20, 30, 40, 50, 60],

153

'c': ['x', 'y', 'z', 'x', 'y', 'z']

154

})

155

156

# Convert to Dask-cuDF with 2 partitions

157

ddf = dask_cudf.from_cudf(cudf_df, npartitions=2)

158

159

# Access partition information

160

print(f"Partitions: {ddf.npartitions}")

161

print(f"Columns: {list(ddf.columns)}")

162

```

163

164

### Working with Series

165

166

```python

167

# Create cuDF Series

168

cudf_series = cudf.Series([1, 4, 9, 16, 25, 36], name='squares')

169

170

# Convert to Dask-cuDF Series

171

dseries = dask_cudf.from_cudf(cudf_series, npartitions=3)

172

173

# Perform operations

174

result = dseries.sum().compute()

175

print(f"Sum of squares: {result}")

176

```

177

178

### Concatenating DataFrames

179

180

```python

181

# Create multiple DataFrames

182

df1 = cudf.DataFrame({'x': [1, 2], 'y': [3, 4]})

183

df2 = cudf.DataFrame({'x': [5, 6], 'y': [7, 8]})

184

185

# Convert to Dask-cuDF

186

ddf1 = dask_cudf.from_cudf(df1, npartitions=1)

187

ddf2 = dask_cudf.from_cudf(df2, npartitions=1)

188

189

# Concatenate

190

combined = dask_cudf.concat([ddf1, ddf2])

191

result = combined.compute()
```

192

193

### Creating DataFrame from Delayed Tasks

194

195

```python

196

import dask

197

import cudf

198

import dask_cudf

199

200

# Create delayed tasks that return cuDF DataFrames

201

@dask.delayed

202

def load_partition(i):

203

# Simulate loading data partition

204

return cudf.DataFrame({

205

'id': range(i*100, (i+1)*100),

206

'value': range(i*100, (i+1)*100)

207

})

208

209

# Create list of delayed tasks

210

delayed_tasks = [load_partition(i) for i in range(5)]

211

212

# Convert to Dask-cuDF DataFrame

213

ddf = dask_cudf.from_delayed(

214

delayed_tasks,

215

meta=cudf.DataFrame({'id': [], 'value': []}, dtype='int64')

216

)

217

218

print(f"Created DataFrame with {ddf.npartitions} partitions")

219

print(ddf.compute().head())

220

```