# Datasets

Large-scale dataset handling with support for Hugging Face datasets and efficient data loading patterns. The Xorbits datasets module provides scalable loading and processing of datasets for machine learning workflows.

## Capabilities

### Dataset Class

Core dataset class for handling large datasets with distributed computing support.

```python { .api }
class Dataset:
    """
    Dataset class for handling large datasets with distributed computing.

    Provides efficient loading, processing, and manipulation of datasets
    that exceed single-machine memory through distributed processing.
    """
```
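
The usage examples below also rely on several `Dataset` constructors and methods. The following sketch lists them with signatures inferred from those examples rather than from a verified API reference; exact names and parameters may differ between Xorbits releases.

```python
class Dataset:
    # Constructors used in the examples below (inferred, not verified)
    @classmethod
    def from_csv(cls, path, **kwargs): ...
    @classmethod
    def from_json(cls, path, **kwargs): ...
    @classmethod
    def concatenate(cls, datasets): ...

    # Element-wise and batched transformations
    def map(self, fn, batched=False, batch_size=None): ...
    def filter(self, fn): ...

    # Inspection and materialization
    @property
    def column_names(self): ...
    def __len__(self): ...
    def take(self, n): ...
    def to_pandas(self): ...
```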

### Hugging Face Integration

Direct integration with Hugging Face datasets for loading popular machine learning datasets.

```python { .api }
def from_huggingface(dataset_name: str, **kwargs):
    """
    Load a dataset from the Hugging Face Hub with distributed support.

    Parameters:
    - dataset_name: str, name of the dataset on the Hugging Face Hub
    - **kwargs: additional parameters for dataset loading, including:
        - name: str, configuration name for datasets with multiple configurations
        - split: str, dataset split to load ('train', 'test', 'validation')
        - streaming: bool, whether to stream the dataset
        - cache_dir: str, directory to cache downloaded files
        - revision: str, specific revision/version of the dataset
        - use_auth_token: bool or str, authentication token for private datasets
        - trust_remote_code: bool, whether to trust remote code execution

    Returns:
    - Dataset object with distributed computing capabilities
    """
```
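
For datasets that define multiple configurations (for example the GLUE tasks), the configuration must be passed as a keyword argument, since the documented signature takes only the dataset name positionally. A minimal sketch, assuming extra keyword arguments such as `name` are forwarded to the underlying Hugging Face loader:

```python
from xorbits.datasets import from_huggingface

# "glue" hosts many configurations; select one via the (assumed) `name` keyword
mnli = from_huggingface("glue", name="mnli", split="train")
```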

**Usage Examples:**

### Basic Dataset Usage

```python
import xorbits
from xorbits.datasets import Dataset

xorbits.init()

# Create dataset from local data
dataset = Dataset.from_csv('large_dataset.csv')

# Basic dataset operations
filtered_dataset = dataset.filter(lambda x: x['value'] > 100)
mapped_dataset = dataset.map(lambda x: {'processed': x['value'] * 2})

# Dataset info
print(f"Dataset size: {len(dataset)}")
print(f"Dataset columns: {dataset.column_names}")

# Execute operations
result = xorbits.run(mapped_dataset.to_pandas())

xorbits.shutdown()
```
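
Note that Xorbits generally defers execution: the `filter` and `map` calls above record the operations, and the distributed work runs when a result is materialized, for example via `xorbits.run` or when converting to pandas.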

### Hugging Face Integration Examples

```python
import xorbits
from xorbits.datasets import from_huggingface

xorbits.init()

# Load popular datasets from Hugging Face
# Text classification dataset
imdb_dataset = from_huggingface("imdb", split="train")

# Natural language inference dataset
glue_dataset = from_huggingface("glue", name="mnli", split="train")

# Image classification dataset
cifar10_dataset = from_huggingface("cifar10", split="train")

# Question answering dataset
squad_dataset = from_huggingface("squad", split="train")

# Load with specific parameters
custom_dataset = from_huggingface(
    "my_dataset",
    split="train",
    cache_dir="/tmp/datasets",
    streaming=False,
    trust_remote_code=True
)

# Process datasets with distributed computing
processed_imdb = imdb_dataset.map(
    lambda example: {
        'text_length': len(example['text']),
        'label': example['label']
    }
)

# Filter large datasets efficiently
long_texts = processed_imdb.filter(lambda x: x['text_length'] > 1000)

# Execute computations
results = xorbits.run(long_texts.to_pandas())

xorbits.shutdown()
```

### Dataset Preprocessing Pipeline

```python
import xorbits
from xorbits.datasets import from_huggingface

xorbits.init()

# Load dataset
dataset = from_huggingface("imdb", split="train")

# Define preprocessing functions
def tokenize_text(example):
    # Simple whitespace tokenization
    tokens = example['text'].split()
    return {
        'tokens': tokens,
        'token_count': len(tokens),
        'label': example['label']
    }

def filter_by_length(example):
    return 10 <= example['token_count'] <= 500

# Build preprocessing pipeline
processed_dataset = (
    dataset
    .map(tokenize_text)
    .filter(filter_by_length)
)

# Convert to pandas for further processing
df = processed_dataset.to_pandas()

# Additional pandas operations
analysis = df.groupby('label').agg({
    'token_count': ['mean', 'std', 'min', 'max']
})

# Execute pipeline
results = xorbits.run(analysis)

xorbits.shutdown()
```

### Working with Multiple Datasets

```python
import xorbits
from xorbits.datasets import Dataset, from_huggingface

xorbits.init()

# Load multiple datasets
train_dataset = from_huggingface("imdb", split="train")
test_dataset = from_huggingface("imdb", split="test")

# Load local dataset
local_dataset = Dataset.from_json('local_reviews.json')

# Combine datasets
combined_dataset = Dataset.concatenate([
    train_dataset,
    test_dataset,
    local_dataset
])

# Process combined dataset
def standardize_format(example):
    return {
        'text': example.get('text', example.get('review', '')),
        'sentiment': example.get('label', example.get('sentiment', 0)),
        'source': example.get('source', 'unknown')
    }

standardized_dataset = combined_dataset.map(standardize_format)

# Analyze dataset composition
source_counts = standardized_dataset.to_pandas().groupby('source').size()

# Execute computation
results = xorbits.run(source_counts)

xorbits.shutdown()
```

### Large-Scale Data Processing

```python
import xorbits
from xorbits.datasets import from_huggingface

xorbits.init()

# Load large dataset with streaming for memory efficiency
large_dataset = from_huggingface(
    "c4",
    name="en",
    split="train",
    streaming=True  # Stream for very large datasets
)

# Process in batches for efficiency
def batch_process(batch):
    # Process a batch of examples
    processed_batch = []
    for example in batch:
        url = example['url']
        processed_example = {
            'text_length': len(example['text']),
            'url_domain': url.split('//')[1].split('/')[0] if '//' in url else 'unknown',
            'timestamp': example['timestamp']
        }
        processed_batch.append(processed_example)
    return processed_batch

# Apply batch processing
processed_dataset = large_dataset.map(
    batch_process,
    batched=True,
    batch_size=1000
)

# Sample for analysis
sample_data = processed_dataset.take(10000)

# Convert sample to pandas for analysis
sample_df = sample_data.to_pandas()
domain_analysis = sample_df.groupby('url_domain').size().sort_values(ascending=False)

# Execute computation
results = xorbits.run(domain_analysis)

xorbits.shutdown()
```