0
# Datasets
1
2
ModelScope provides a unified interface for working with datasets from the ModelScope Hub and from local data sources. The `MsDataset` class loads these datasets and exposes methods for transforming, filtering, splitting, and converting them.
3
4
## Capabilities
5
6
### MsDataset Class
7
8
Main dataset interface for loading and manipulating datasets.
9
10
```python { .api }
11
class MsDataset:
12
"""
13
Main dataset interface for ModelScope datasets.
14
"""
15
16
@staticmethod
17
def load(
18
dataset_name: Union[str, list],
19
namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
20
target: Optional[str] = None,
21
version: Optional[str] = DEFAULT_DATASET_REVISION,
22
hub: Optional[Hubs] = Hubs.modelscope,
23
subset_name: Optional[str] = None,
24
split: Optional[str] = None,
25
data_dir: Optional[str] = None,
26
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
27
download_mode: Optional[DownloadMode] = DownloadMode.REUSE_DATASET_IF_EXISTS,
28
cache_dir: Optional[str] = MS_DATASETS_CACHE,
29
features: Optional[Features] = None,
30
use_streaming: Optional[bool] = False,
31
stream_batch_size: Optional[int] = 1,
32
custom_cfg: Optional[Config] = Config(),
33
token: Optional[str] = None,
34
dataset_info_only: Optional[bool] = False,
35
trust_remote_code: Optional[bool] = False,
36
**config_kwargs,
37
) -> Union[dict, 'MsDataset', NativeIterableDataset]:
38
"""
39
Load dataset from ModelScope Hub or local source.
40
41
Parameters:
42
- dataset_name: Dataset identifier(s) on ModelScope Hub or local path(s)
43
- namespace: Dataset namespace (default: DEFAULT_DATASET_NAMESPACE)
44
- target: Name of the dataset column to output (optional)
45
- version: Dataset version/revision (default: DEFAULT_DATASET_REVISION)
46
- hub: Hub source (default: Hubs.modelscope)
47
- subset_name: Subset/configuration name within the dataset
48
- split: Dataset split ('train', 'test', 'validation')
49
- data_dir: Directory containing local dataset files
50
- data_files: Specific data files to load
51
- download_mode: Download behavior (default: REUSE_DATASET_IF_EXISTS)
52
- cache_dir: Directory for caching downloaded datasets (default: MS_DATASETS_CACHE)
53
- features: Dataset features schema
54
- use_streaming: Whether to use streaming mode
55
- stream_batch_size: Batch size for streaming (default: 1)
56
- custom_cfg: Custom configuration object
57
- token: Authentication token
58
- dataset_info_only: Whether to return only the dataset config and info (as a dict) instead of loading the data
59
- trust_remote_code: Whether to allow executing dataset loading code that comes from the remote repository
60
- **config_kwargs: Additional configuration parameters
61
62
Returns:
63
MsDataset instance, dict, or NativeIterableDataset
64
"""
65
66
def __init__(
67
self,
68
ds_instance: Union[Dataset, IterableDataset, ExternalDataset, NativeIterableDataset],
69
target: Optional[str] = None
70
):
71
"""
72
Initialize dataset with data.
73
74
Parameters:
75
- ds_instance: Dataset instance (Dataset, IterableDataset, ExternalDataset, or NativeIterableDataset)
76
- target: Name of the dataset column to output; must be a column of ds_instance (optional)
77
"""
78
79
@classmethod
80
def to_ms_dataset(
81
cls,
82
ds_instance: Union[Dataset, DatasetDict, ExternalDataset, NativeIterableDataset, IterableDataset, IterableDatasetDict],
83
target: str = None
84
) -> Union[dict, 'MsDataset']:
85
"""
86
Convert dataset instance to MsDataset format.
87
88
Parameters:
89
- ds_instance: Dataset instance to convert
90
- target: Name of the dataset column to output (optional)
91
92
Returns:
93
MsDataset instance or dict of MsDataset instances
94
"""
95
96
def __len__(self) -> int:
97
"""
98
Get dataset length.
99
100
Returns:
101
Number of samples in the dataset
102
"""
103
104
def __getitem__(self, index):
105
"""
106
Get dataset item by index.
107
108
Parameters:
109
- index: Sample index or slice
110
111
Returns:
112
Dataset sample or samples
113
"""
114
115
def to_hf_dataset(self):
116
"""
117
Convert to HuggingFace Dataset format.
118
119
Returns:
120
HuggingFace Dataset instance
121
"""
122
123
def map(
124
self,
125
function,
126
batched: bool = False,
127
batch_size: int = 1000,
128
**kwargs
129
):
130
"""
131
Apply function to all dataset samples.
132
133
Parameters:
134
- function: Function to apply to each sample
135
- batched: Whether to process samples in batches
136
- batch_size: Size of batches for processing
137
- **kwargs: Additional mapping parameters
138
139
Returns:
140
New MsDataset with transformed data
141
"""
142
143
def filter(
144
self,
145
function,
146
batched: bool = False,
147
**kwargs
148
):
149
"""
150
Filter dataset samples based on condition.
151
152
Parameters:
153
- function: Function that returns True for samples to keep
154
- batched: Whether to process samples in batches
155
- **kwargs: Additional filtering parameters
156
157
Returns:
158
New MsDataset with filtered data
159
"""
160
161
def select(self, indices):
162
"""
163
Select subset of dataset by indices.
164
165
Parameters:
166
- indices: List of indices to select
167
168
Returns:
169
New MsDataset with selected samples
170
"""
171
172
def split(
173
self,
174
test_size: float = 0.2,
175
shuffle: bool = True,
176
seed: int = None
177
):
178
"""
179
Split dataset into train and test sets.
180
181
Parameters:
182
- test_size: Fraction of data for test set
183
- shuffle: Whether to shuffle before splitting
184
- seed: Random seed for reproducibility
185
186
Returns:
187
Dictionary with 'train' and 'test' MsDataset instances
188
"""
189
190
def shuffle(self, seed: int = None):
191
"""
192
Shuffle dataset samples.
193
194
Parameters:
195
- seed: Random seed for reproducibility
196
197
Returns:
198
New shuffled MsDataset
199
"""
200
201
def take(self, num_samples: int):
202
"""
203
Take first N samples from dataset.
204
205
Parameters:
206
- num_samples: Number of samples to take
207
208
Returns:
209
New MsDataset with first N samples
210
"""
211
212
def skip(self, num_samples: int):
213
"""
214
Skip first N samples from dataset.
215
216
Parameters:
217
- num_samples: Number of samples to skip
218
219
Returns:
220
New MsDataset with remaining samples
221
"""
222
223
def batch(self, batch_size: int):
224
"""
225
Create batched version of dataset.
226
227
Parameters:
228
- batch_size: Size of each batch
229
230
Returns:
231
New MsDataset that yields batches
232
"""
233
234
def save_to_disk(self, dataset_path: str):
235
"""
236
Save dataset to local disk.
237
238
Parameters:
239
- dataset_path: Path to save dataset
240
"""
241
242
@classmethod
243
def load_from_disk(cls, dataset_path: str):
244
"""
245
Load dataset from local disk.
246
247
Parameters:
248
- dataset_path: Path to saved dataset
249
250
Returns:
251
MsDataset instance
252
"""
253
```
254
255
## Usage Examples
256
257
### Loading Datasets from ModelScope Hub
258
259
```python
260
from modelscope import MsDataset
261
262
# Load complete dataset
263
dataset = MsDataset.load('clue', subset_name='afqmc')
264
print(f"Dataset size: {len(dataset)}")
265
266
# Load specific split
267
train_dataset = MsDataset.load('clue', subset_name='afqmc', split='train')
268
test_dataset = MsDataset.load('clue', subset_name='afqmc', split='test')
269
270
print(f"Train size: {len(train_dataset)}")
271
print(f"Test size: {len(test_dataset)}")
272
273
# Inspect sample
274
sample = train_dataset[0]
275
print(f"Sample: {sample}")
276
```
277
278
### Loading Local Datasets
279
280
```python
281
from modelscope import MsDataset
282
283
# Load from local directory
284
local_dataset = MsDataset.load(
285
'path/to/local/dataset',
286
data_dir='./data',
287
cache_dir='./cache'
288
)
289
290
# Load from local files
291
import json
292
293
# Load JSON file
294
with open('data.json', 'r') as f:
295
data = json.load(f)
296
297
dataset = MsDataset(data)
298
```
299
300
### Dataset Transformation and Processing
301
302
```python
303
from modelscope import MsDataset
304
305
# Load dataset
306
dataset = MsDataset.load('clue', subset_name='afqmc', split='train')
307
308
# Transform data with map
309
def preprocess_text(example):
310
example['text'] = example['sentence1'] + ' [SEP] ' + example['sentence2']
311
return example
312
313
processed_dataset = dataset.map(preprocess_text)
314
315
# Filter samples
316
def filter_long_texts(example):
317
return len(example['text']) < 512
318
319
filtered_dataset = processed_dataset.filter(filter_long_texts)
320
321
print(f"Original size: {len(dataset)}")
322
print(f"After filtering: {len(filtered_dataset)}")
323
```
324
325
### Batch Processing
326
327
```python
328
from modelscope import MsDataset
329
330
dataset = MsDataset.load('dataset_name')
331
332
# Process in batches
333
def batch_preprocess(batch):
334
# Process multiple samples at once
335
batch['processed_text'] = [text.lower() for text in batch['text']]
336
return batch
337
338
batch_processed = dataset.map(
339
batch_preprocess,
340
batched=True,
341
batch_size=1000
342
)
343
344
# Create batched dataset for training
345
batched_dataset = dataset.batch(batch_size=32)
346
347
# Iterate through batches
348
for batch in batched_dataset:
349
print(f"Batch size: {len(batch)}")
350
break # Just show first batch
351
```
352
353
### Dataset Splitting and Sampling
354
355
```python
356
from modelscope import MsDataset
357
358
# Load dataset
359
full_dataset = MsDataset.load('dataset_name')
360
361
# Split into train/test
362
splits = full_dataset.split(test_size=0.2, shuffle=True, seed=42)
363
train_data = splits['train']
364
test_data = splits['test']
365
366
print(f"Train size: {len(train_data)}")
367
print(f"Test size: {len(test_data)}")
368
369
# Take subset for quick testing
370
small_dataset = full_dataset.take(1000)
371
print(f"Small dataset size: {len(small_dataset)}")
372
373
# Skip samples
374
remaining_dataset = full_dataset.skip(1000)
375
print(f"Remaining size: {len(remaining_dataset)}")
376
377
# Shuffle dataset
378
shuffled_dataset = full_dataset.shuffle(seed=42)
379
```
380
381
### Dataset Selection and Indexing
382
383
```python
384
from modelscope import MsDataset
385
386
dataset = MsDataset.load('dataset_name')
387
388
# Select specific indices
389
indices = [0, 5, 10, 15, 20]
390
subset = dataset.select(indices)
391
print(f"Selected subset size: {len(subset)}")
392
393
# Slice dataset
394
first_100 = dataset[:100]
395
last_50 = dataset[-50:]
396
every_10th = dataset[::10]
397
398
print(f"First 100: {len(first_100)}")
399
print(f"Last 50: {len(last_50)}")
400
print(f"Every 10th: {len(every_10th)}")
401
```
402
403
### Converting to HuggingFace Format
404
405
```python
406
from modelscope import MsDataset
407
408
# Load ModelScope dataset
409
ms_dataset = MsDataset.load('clue', subset_name='afqmc')
410
411
# Convert to HuggingFace format
412
hf_dataset = ms_dataset.to_hf_dataset()
413
414
print(f"HF Dataset type: {type(hf_dataset)}")
415
print(f"HF Dataset features: {hf_dataset.features}")
416
417
# Use with HuggingFace ecosystem
418
from transformers import AutoTokenizer
419
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
420
421
def tokenize_function(examples):
422
return tokenizer(examples['sentence1'], examples['sentence2'],
423
truncation=True, padding='max_length', max_length=128)
424
425
tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)
426
```
427
428
### Saving and Loading Datasets
429
430
```python
431
from modelscope import MsDataset
432
433
# Load and process dataset
434
dataset = MsDataset.load('dataset_name')
435
processed_dataset = dataset.map(lambda x: {'processed': x['text'].lower()})
436
437
# Save processed dataset
438
processed_dataset.save_to_disk('./processed_dataset')
439
440
# Load saved dataset later
441
loaded_dataset = MsDataset.load_from_disk('./processed_dataset')
442
print(f"Loaded dataset size: {len(loaded_dataset)}")
443
```
444
445
### Complex Data Processing Pipeline
446
447
```python
448
from modelscope import MsDataset
449
450
# Load raw dataset
451
dataset = MsDataset.load('text_classification_data')
452
453
# Define processing pipeline
454
def clean_text(example):
455
import re
456
# Remove special characters
457
example['text'] = re.sub(r'[^\w\s]', '', example['text'])
458
# Convert to lowercase
459
example['text'] = example['text'].lower()
460
return example
461
462
def add_length_feature(example):
463
example['text_length'] = len(example['text'])
464
return example
465
466
def filter_by_length(example):
467
return 10 <= example['text_length'] <= 500
468
469
# Apply processing pipeline
470
processed_dataset = (dataset
471
.map(clean_text)
472
.map(add_length_feature)
473
.filter(filter_by_length)
474
.shuffle(seed=42))
475
476
print(f"Original size: {len(dataset)}")
477
print(f"After processing: {len(processed_dataset)}")
478
479
# Create train/validation splits
480
splits = processed_dataset.split(test_size=0.2, seed=42)
481
train_dataset = splits['train']
482
val_dataset = splits['test']
483
484
# Create batched iterators for training
485
train_batches = train_dataset.batch(32)
486
val_batches = val_dataset.batch(32)
487
```
488
489
### Custom Dataset Class
490
491
```python
492
from modelscope import MsDataset
493
494
class CustomTextDataset(MsDataset):
495
def __init__(self, texts, labels, tokenizer=None):
496
self.texts = texts
497
self.labels = labels
498
self.tokenizer = tokenizer
499
super().__init__(list(zip(texts, labels)))
500
501
def __getitem__(self, index):
502
text, label = self.texts[index], self.labels[index]
503
504
if self.tokenizer:
505
encoded = self.tokenizer(text, truncation=True, padding='max_length')
506
return {
507
'input_ids': encoded['input_ids'],
508
'attention_mask': encoded['attention_mask'],
509
'labels': label
510
}
511
512
return {'text': text, 'label': label}
513
514
def __len__(self):
515
return len(self.texts)
516
517
# Use custom dataset
518
texts = ["Text 1", "Text 2", "Text 3"]
519
labels = [0, 1, 0]
520
521
custom_dataset = CustomTextDataset(texts, labels)
522
print(f"Custom dataset size: {len(custom_dataset)}")
523
print(f"Sample: {custom_dataset[0]}")
524
```