or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

datasets.md export.md hub.md index.md metrics.md models.md pipelines.md preprocessors.md training.md utilities.md

docs/datasets.md

0

# Datasets

1

2

ModelScope's dataset handling provides unified interfaces for working with datasets from the ModelScope ecosystem and local data sources. The MsDataset class offers powerful data manipulation and transformation capabilities.

3

4

## Capabilities

5

6

### MsDataset Class

7

8

Main dataset interface for loading and manipulating datasets.

9

10

```python { .api }

11

class MsDataset:

12

"""

13

Main dataset interface for ModelScope datasets.

14

"""

15

16

@staticmethod

17

def load(

18

dataset_name: Union[str, list],

19

namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,

20

target: Optional[str] = None,

21

version: Optional[str] = DEFAULT_DATASET_REVISION,

22

hub: Optional[Hubs] = Hubs.modelscope,

23

subset_name: Optional[str] = None,

24

split: Optional[str] = None,

25

data_dir: Optional[str] = None,

26

data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,

27

download_mode: Optional[DownloadMode] = DownloadMode.REUSE_DATASET_IF_EXISTS,

28

cache_dir: Optional[str] = MS_DATASETS_CACHE,

29

features: Optional[Features] = None,

30

use_streaming: Optional[bool] = False,

31

stream_batch_size: Optional[int] = 1,

32

custom_cfg: Optional[Config] = Config(),

33

token: Optional[str] = None,

34

dataset_info_only: Optional[bool] = False,

35

trust_remote_code: Optional[bool] = False,

36

**config_kwargs,

37

) -> Union[dict, 'MsDataset', NativeIterableDataset]:

38

"""

39

Load dataset from ModelScope Hub or local source.

40

41

Parameters:

42

- dataset_name: Dataset identifier(s) on ModelScope Hub or local path(s)

43

- namespace: Dataset namespace (default: DEFAULT_DATASET_NAMESPACE)

44

- target: Name of the dataset column to output (optional)

45

- version: Dataset version/revision (default: DEFAULT_DATASET_REVISION)

46

- hub: Hub source (default: Hubs.modelscope)

47

- subset_name: Subset/configuration name within the dataset

48

- split: Dataset split ('train', 'test', 'validation')

49

- data_dir: Directory containing local dataset files

50

- data_files: Specific data files to load

51

- download_mode: Download behavior (default: REUSE_DATASET_IF_EXISTS)

52

- cache_dir: Directory for caching downloaded datasets (default: MS_DATASETS_CACHE)

53

- features: Dataset features schema

54

- use_streaming: Whether to use streaming mode

55

- stream_batch_size: Batch size for streaming (default: 1)

56

- custom_cfg: Custom configuration object

57

- token: Authentication token

58

- dataset_info_only: Whether to load only dataset info

59

- trust_remote_code: Whether to trust remote code execution

60

- **config_kwargs: Additional configuration parameters

61

62

Returns:

63

MsDataset instance, dict, or NativeIterableDataset

64

"""

65

66

def __init__(

67

self,

68

ds_instance: Union[Dataset, IterableDataset, ExternalDataset, NativeIterableDataset],

69

target: Optional[str] = None

70

):

71

"""

72

Initialize dataset with data.

73

74

Parameters:

75

- ds_instance: Dataset instance (Dataset, IterableDataset, ExternalDataset, or NativeIterableDataset)

76

- target: Name of the dataset column to output (optional); must be a column of ds_instance

77

"""

78

79

@classmethod

80

def to_ms_dataset(

81

cls,

82

ds_instance: Union[Dataset, DatasetDict, ExternalDataset, NativeIterableDataset, IterableDataset, IterableDatasetDict],

83

target: str = None

84

) -> Union[dict, 'MsDataset']:

85

"""

86

Convert dataset instance to MsDataset format.

87

88

Parameters:

89

- ds_instance: Dataset instance to convert

90

- target: Name of the dataset column to output (optional)

91

92

Returns:

93

MsDataset instance or dict of MsDataset instances

94

"""

95

96

def __len__(self) -> int:

97

"""

98

Get dataset length.

99

100

Returns:

101

Number of samples in the dataset

102

"""

103

104

def __getitem__(self, index):

105

"""

106

Get dataset item by index.

107

108

Parameters:

109

- index: Sample index or slice

110

111

Returns:

112

Dataset sample or samples

113

"""

114

115

def to_hf_dataset(self):

116

"""

117

Convert to HuggingFace Dataset format.

118

119

Returns:

120

HuggingFace Dataset instance

121

"""

122

123

def map(

124

self,

125

function,

126

batched: bool = False,

127

batch_size: int = 1000,

128

**kwargs

129

):

130

"""

131

Apply function to all dataset samples.

132

133

Parameters:

134

- function: Function to apply to each sample

135

- batched: Whether to process samples in batches

136

- batch_size: Size of batches for processing

137

- **kwargs: Additional mapping parameters

138

139

Returns:

140

New MsDataset with transformed data

141

"""

142

143

def filter(

144

self,

145

function,

146

batched: bool = False,

147

**kwargs

148

):

149

"""

150

Filter dataset samples based on condition.

151

152

Parameters:

153

- function: Function that returns True for samples to keep

154

- batched: Whether to process samples in batches

155

- **kwargs: Additional filtering parameters

156

157

Returns:

158

New MsDataset with filtered data

159

"""

160

161

def select(self, indices):

162

"""

163

Select subset of dataset by indices.

164

165

Parameters:

166

- indices: List of indices to select

167

168

Returns:

169

New MsDataset with selected samples

170

"""

171

172

def split(

173

self,

174

test_size: float = 0.2,

175

shuffle: bool = True,

176

seed: int = None

177

):

178

"""

179

Split dataset into train and test sets.

180

181

Parameters:

182

- test_size: Fraction of data for test set

183

- shuffle: Whether to shuffle before splitting

184

- seed: Random seed for reproducibility

185

186

Returns:

187

Dictionary with 'train' and 'test' MsDataset instances

188

"""

189

190

def shuffle(self, seed: int = None):

191

"""

192

Shuffle dataset samples.

193

194

Parameters:

195

- seed: Random seed for reproducibility

196

197

Returns:

198

New shuffled MsDataset

199

"""

200

201

def take(self, num_samples: int):

202

"""

203

Take first N samples from dataset.

204

205

Parameters:

206

- num_samples: Number of samples to take

207

208

Returns:

209

New MsDataset with first N samples

210

"""

211

212

def skip(self, num_samples: int):

213

"""

214

Skip first N samples from dataset.

215

216

Parameters:

217

- num_samples: Number of samples to skip

218

219

Returns:

220

New MsDataset with remaining samples

221

"""

222

223

def batch(self, batch_size: int):

224

"""

225

Create batched version of dataset.

226

227

Parameters:

228

- batch_size: Size of each batch

229

230

Returns:

231

New MsDataset that yields batches

232

"""

233

234

def save_to_disk(self, dataset_path: str):

235

"""

236

Save dataset to local disk.

237

238

Parameters:

239

- dataset_path: Path to save dataset

240

"""

241

242

@classmethod

243

def load_from_disk(cls, dataset_path: str):

244

"""

245

Load dataset from local disk.

246

247

Parameters:

248

- dataset_path: Path to saved dataset

249

250

Returns:

251

MsDataset instance

252

"""

253

```

254

255

## Usage Examples

256

257

### Loading Datasets from ModelScope Hub

258

259

```python

260

from modelscope import MsDataset

261

262

# Load complete dataset

263

dataset = MsDataset.load('clue', subset_name='afqmc')

264

print(f"Dataset size: {len(dataset)}")

265

266

# Load specific split

267

train_dataset = MsDataset.load('clue', subset_name='afqmc', split='train')

268

test_dataset = MsDataset.load('clue', subset_name='afqmc', split='test')

269

270

print(f"Train size: {len(train_dataset)}")

271

print(f"Test size: {len(test_dataset)}")

272

273

# Inspect sample

274

sample = train_dataset[0]

275

print(f"Sample: {sample}")

276

```

277

278

### Loading Local Datasets

279

280

```python

281

from modelscope import MsDataset

282

283

# Load from local directory

284

local_dataset = MsDataset.load(

285

'path/to/local/dataset',

286

data_dir='./data',

287

cache_dir='./cache'

288

)

289

290

# Load from local files

291

import json

292

293

# Load JSON file

294

with open('data.json', 'r') as f:

295

data = json.load(f)

296

297

dataset = MsDataset(data)

298

```

299

300

### Dataset Transformation and Processing

301

302

```python

303

from modelscope import MsDataset

304

305

# Load dataset

306

dataset = MsDataset.load('clue', subset_name='afqmc', split='train')

307

308

# Transform data with map

309

def preprocess_text(example):

310

example['text'] = example['sentence1'] + ' [SEP] ' + example['sentence2']

311

return example

312

313

processed_dataset = dataset.map(preprocess_text)

314

315

# Filter samples

316

def filter_long_texts(example):

317

return len(example['text']) < 512

318

319

filtered_dataset = processed_dataset.filter(filter_long_texts)

320

321

print(f"Original size: {len(dataset)}")

322

print(f"After filtering: {len(filtered_dataset)}")

323

```

324

325

### Batch Processing

326

327

```python

328

from modelscope import MsDataset

329

330

dataset = MsDataset.load('dataset_name')

331

332

# Process in batches

333

def batch_preprocess(batch):

334

# Process multiple samples at once

335

batch['processed_text'] = [text.lower() for text in batch['text']]

336

return batch

337

338

batch_processed = dataset.map(

339

batch_preprocess,

340

batched=True,

341

batch_size=1000

342

)

343

344

# Create batched dataset for training

345

batched_dataset = dataset.batch(batch_size=32)

346

347

# Iterate through batches

348

for batch in batched_dataset:

349

print(f"Batch size: {len(batch)}")

350

break # Just show first batch

351

```

352

353

### Dataset Splitting and Sampling

354

355

```python

356

from modelscope import MsDataset

357

358

# Load dataset

359

full_dataset = MsDataset.load('dataset_name')

360

361

# Split into train/test

362

splits = full_dataset.split(test_size=0.2, shuffle=True, seed=42)

363

train_data = splits['train']

364

test_data = splits['test']

365

366

print(f"Train size: {len(train_data)}")

367

print(f"Test size: {len(test_data)}")

368

369

# Take subset for quick testing

370

small_dataset = full_dataset.take(1000)

371

print(f"Small dataset size: {len(small_dataset)}")

372

373

# Skip samples

374

remaining_dataset = full_dataset.skip(1000)

375

print(f"Remaining size: {len(remaining_dataset)}")

376

377

# Shuffle dataset

378

shuffled_dataset = full_dataset.shuffle(seed=42)

379

```

380

381

### Dataset Selection and Indexing

382

383

```python

384

from modelscope import MsDataset

385

386

dataset = MsDataset.load('dataset_name')

387

388

# Select specific indices

389

indices = [0, 5, 10, 15, 20]

390

subset = dataset.select(indices)

391

print(f"Selected subset size: {len(subset)}")

392

393

# Slice dataset

394

first_100 = dataset[:100]

395

last_50 = dataset[-50:]

396

every_10th = dataset[::10]

397

398

print(f"First 100: {len(first_100)}")

399

print(f"Last 50: {len(last_50)}")

400

print(f"Every 10th: {len(every_10th)}")

401

```

402

403

### Converting to HuggingFace Format

404

405

```python

406

from modelscope import MsDataset

407

408

# Load ModelScope dataset

409

ms_dataset = MsDataset.load('clue', subset_name='afqmc')

410

411

# Convert to HuggingFace format

412

hf_dataset = ms_dataset.to_hf_dataset()

413

414

print(f"HF Dataset type: {type(hf_dataset)}")

415

print(f"HF Dataset features: {hf_dataset.features}")

416

417

# Use with HuggingFace ecosystem

418

from transformers import AutoTokenizer

419

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

420

421

def tokenize_function(examples):

422

return tokenizer(examples['sentence1'], examples['sentence2'],

423

truncation=True, padding='max_length', max_length=128)

424

425

tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)

426

```

427

428

### Saving and Loading Datasets

429

430

```python

431

from modelscope import MsDataset

432

433

# Load and process dataset

434

dataset = MsDataset.load('dataset_name')

435

processed_dataset = dataset.map(lambda x: {'processed': x['text'].lower()})

436

437

# Save processed dataset

438

processed_dataset.save_to_disk('./processed_dataset')

439

440

# Load saved dataset later

441

loaded_dataset = MsDataset.load_from_disk('./processed_dataset')

442

print(f"Loaded dataset size: {len(loaded_dataset)}")

443

```

444

445

### Complex Data Processing Pipeline

446

447

```python

448

from modelscope import MsDataset

449

450

# Load raw dataset

451

dataset = MsDataset.load('text_classification_data')

452

453

# Define processing pipeline

454

def clean_text(example):

455

import re

456

# Remove special characters

457

example['text'] = re.sub(r'[^\w\s]', '', example['text'])

458

# Convert to lowercase

459

example['text'] = example['text'].lower()

460

return example

461

462

def add_length_feature(example):

463

example['text_length'] = len(example['text'])

464

return example

465

466

def filter_by_length(example):

467

return 10 <= example['text_length'] <= 500

468

469

# Apply processing pipeline

470

processed_dataset = (dataset

471

.map(clean_text)

472

.map(add_length_feature)

473

.filter(filter_by_length)

474

.shuffle(seed=42))

475

476

print(f"Original size: {len(dataset)}")

477

print(f"After processing: {len(processed_dataset)}")

478

479

# Create train/validation splits

480

splits = processed_dataset.split(test_size=0.2, seed=42)

481

train_dataset = splits['train']

482

val_dataset = splits['test']

483

484

# Create batched iterators for training

485

train_batches = train_dataset.batch(32)

486

val_batches = val_dataset.batch(32)

487

```

488

489

### Custom Dataset Class

490

491

```python

492

from modelscope import MsDataset

493

494

class CustomTextDataset(MsDataset):

495

def __init__(self, texts, labels, tokenizer=None):

496

self.texts = texts

497

self.labels = labels

498

self.tokenizer = tokenizer

499

super().__init__(list(zip(texts, labels)))

500

501

def __getitem__(self, index):

502

text, label = self.texts[index], self.labels[index]

503

504

if self.tokenizer:

505

encoded = self.tokenizer(text, truncation=True, padding='max_length')

506

return {

507

'input_ids': encoded['input_ids'],

508

'attention_mask': encoded['attention_mask'],

509

'labels': label

510

}

511

512

return {'text': text, 'label': label}

513

514

def __len__(self):

515

return len(self.texts)

516

517

# Use custom dataset

518

texts = ["Text 1", "Text 2", "Text 3"]

519

labels = [0, 1, 0]

520

521

custom_dataset = CustomTextDataset(texts, labels)

522

print(f"Custom dataset size: {len(custom_dataset)}")

523

print(f"Sample: {custom_dataset[0]}")

524

```