or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md · datasets.md · index.md · machine-learning.md · numpy-integration.md · pandas-integration.md · remote-computing.md · runtime-management.md

datasets.md — docs/

0

# Datasets

1

2

Large-scale dataset handling with support for Hugging Face datasets and efficient data loading patterns. The Xorbits datasets module provides scalable data loading and processing capabilities for machine learning workflows.

3

4

## Capabilities

5

6

### Dataset Class

7

8

Core dataset class for handling large datasets with distributed computing support.

9

10

```python { .api }

11

class Dataset:

12

"""

13

Dataset class for handling large datasets with distributed computing.

14

15

Provides efficient loading, processing, and manipulation of datasets

16

that exceed single-machine memory through distributed processing.

17

"""

18

```

19

20

### Hugging Face Integration

21

22

Direct integration with Hugging Face datasets for loading popular machine learning datasets.

23

24

```python { .api }

25

def from_huggingface(dataset_name: str, **kwargs):

26

"""

27

Load datasets from Hugging Face Hub with distributed support.

28

29

Parameters:

30

- dataset_name: str, name of the dataset on Hugging Face Hub

31

- **kwargs: Additional parameters for dataset loading including:

32

- split: str, dataset split to load ('train', 'test', 'validation')

33

- streaming: bool, whether to stream the dataset

34

- cache_dir: str, directory to cache downloaded files

35

- revision: str, specific revision/version of the dataset

36

- use_auth_token: bool or str, authentication token for private datasets

37

- trust_remote_code: bool, whether to trust remote code execution

38

39

Returns:

40

- Dataset object with distributed computing capabilities

41

"""

42

```

43

44

**Usage Examples:**

45

46

### Basic Dataset Usage

47

48

```python

49

import xorbits

50

from xorbits.datasets import Dataset

51

52

xorbits.init()

53

54

# Create dataset from local data

55

dataset = Dataset.from_csv('large_dataset.csv')

56

57

# Basic dataset operations

58

filtered_dataset = dataset.filter(lambda x: x['value'] > 100)

59

mapped_dataset = dataset.map(lambda x: {'processed': x['value'] * 2})

60

61

# Dataset info

62

print(f"Dataset size: {len(dataset)}")

63

print(f"Dataset columns: {dataset.column_names}")

64

65

# Execute operations

66

result = xorbits.run(mapped_dataset.to_pandas())

67

68

xorbits.shutdown()

69

```

70

71

### Hugging Face Integration Examples

72

73

```python

74

import xorbits

75

from xorbits.datasets import from_huggingface

76

77

xorbits.init()

78

79

# Load popular datasets from Hugging Face

80

# Text classification dataset

81

imdb_dataset = from_huggingface("imdb", split="train")

82

83

# Natural language inference dataset

84

glue_dataset = from_huggingface("glue", name="mnli", split="train")

85

86

# Image classification dataset

87

cifar10_dataset = from_huggingface("cifar10", split="train")

88

89

# Question answering dataset

90

squad_dataset = from_huggingface("squad", split="train")

91

92

# Load with specific parameters

93

custom_dataset = from_huggingface(

94

"my_dataset",

95

split="train",

96

cache_dir="/tmp/datasets",

97

streaming=False,

98

trust_remote_code=True

99

)

100

101

# Process datasets with distributed computing

102

processed_imdb = imdb_dataset.map(

103

lambda example: {

104

'text_length': len(example['text']),

105

'label': example['label']

106

}

107

)

108

109

# Filter large datasets efficiently

110

long_texts = processed_imdb.filter(lambda x: x['text_length'] > 1000)

111

112

# Execute computations

113

results = xorbits.run(long_texts.to_pandas())

114

115

xorbits.shutdown()

116

```

117

118

### Dataset Preprocessing Pipeline

119

120

```python

121

import xorbits

122

from xorbits.datasets import from_huggingface

123

import xorbits.pandas as pd

124

125

xorbits.init()

126

127

# Load dataset

128

dataset = from_huggingface("imdb", split="train")

129

130

# Define preprocessing functions

131

def tokenize_text(example):

132

# Tokenization logic here

133

tokens = example['text'].split()

134

return {

135

'tokens': tokens,

136

'token_count': len(tokens),

137

'label': example['label']

138

}

139

140

def filter_by_length(example):

141

return 10 <= example['token_count'] <= 500

142

143

# Build preprocessing pipeline

144

processed_dataset = (dataset

145

.map(tokenize_text)

146

.filter(filter_by_length)

147

)

148

149

# Convert to pandas for further processing

150

df = processed_dataset.to_pandas()

151

152

# Additional pandas operations

153

analysis = df.groupby('label').agg({

154

'token_count': ['mean', 'std', 'min', 'max']

155

})

156

157

# Execute pipeline

158

results = xorbits.run(analysis)

159

160

xorbits.shutdown()

161

```

162

163

### Working with Multiple Datasets

164

165

```python

166

import xorbits

167

from xorbits.datasets import from_huggingface

168

from xorbits.datasets import Dataset

169

170

xorbits.init()

171

172

# Load multiple datasets

173

train_dataset = from_huggingface("imdb", split="train")

174

test_dataset = from_huggingface("imdb", split="test")

175

176

# Load local dataset

177

local_dataset = Dataset.from_json('local_reviews.json')

178

179

# Combine datasets

180

combined_dataset = Dataset.concatenate([

181

train_dataset,

182

test_dataset,

183

local_dataset

184

])

185

186

# Process combined dataset

187

def standardize_format(example):

188

return {

189

'text': example.get('text', example.get('review', '')),

190

'sentiment': example.get('label', example.get('sentiment', 0)),

191

'source': example.get('source', 'unknown')

192

}

193

194

standardized_dataset = combined_dataset.map(standardize_format)

195

196

# Analyze dataset composition

197

source_counts = standardized_dataset.to_pandas().groupby('source').size()

198

199

# Execute computation

200

results = xorbits.run(source_counts)

201

202

xorbits.shutdown()

203

```

204

205

### Large-Scale Data Processing

206

207

```python

208

import xorbits

209

from xorbits.datasets import from_huggingface

210

211

xorbits.init()

212

213

# Load large dataset with streaming for memory efficiency

214

large_dataset = from_huggingface(

215

"c4",

216

name="en",

217

split="train",

218

streaming=True # Stream for very large datasets

219

)

220

221

# Process in batches for efficiency

222

def batch_process(batch):

223

# Process batch of examples

224

processed_batch = []

225

for example in batch:

226

processed_example = {

227

'text_length': len(example['text']),

228

'url_domain': example['url'].split('//')[1].split('/')[0] if '//' in example['url'] else 'unknown',

229

'timestamp': example['timestamp']

230

}

231

processed_batch.append(processed_example)

232

return processed_batch

233

234

# Apply batch processing

235

processed_dataset = large_dataset.map(

236

batch_process,

237

batched=True,

238

batch_size=1000

239

)

240

241

# Sample for analysis

242

sample_data = processed_dataset.take(10000)

243

244

# Convert sample to pandas for analysis

245

sample_df = sample_data.to_pandas()

246

domain_analysis = sample_df.groupby('url_domain').size().sort_values(ascending=False)

247

248

# Execute computation

249

results = xorbits.run(domain_analysis)

250

251

xorbits.shutdown()

252

```