Tessl Tile for pypi/modelscope@1.29.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

datasets.md export.md hub.md index.md metrics.md models.md pipelines.md preprocessors.md training.md utilities.md

docs/datasets.md

0
# Datasets
1

2
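ModelScope's dataset handling provides a unified interface for working with datasets from the ModelScope ecosystem and from local data sources. The `MsDataset` class loads datasets (`load`), transforms them (`map`, `filter`, `select`, `shuffle`, `split`, `batch`), and converts them to and from HuggingFace format (`to_hf_dataset`, `to_ms_dataset`).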
3

4
## Capabilities
5

6
### MsDataset Class
7

8
Main dataset interface for loading and manipulating datasets.
9

10
```python { .api }
11
class MsDataset:
12
    """
13
    Main dataset interface for ModelScope datasets.
14
    """
15
    
16
    @staticmethod
17
    def load(
18
        dataset_name: Union[str, list],
19
        namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
20
        target: Optional[str] = None,
21
        version: Optional[str] = DEFAULT_DATASET_REVISION,
22
        hub: Optional[Hubs] = Hubs.modelscope,
23
        subset_name: Optional[str] = None,
24
        split: Optional[str] = None,
25
        data_dir: Optional[str] = None,
26
        data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
27
        download_mode: Optional[DownloadMode] = DownloadMode.REUSE_DATASET_IF_EXISTS,
28
        cache_dir: Optional[str] = MS_DATASETS_CACHE,
29
        features: Optional[Features] = None,
30
        use_streaming: Optional[bool] = False,
31
        stream_batch_size: Optional[int] = 1,
32
        custom_cfg: Optional[Config] = Config(),
33
        token: Optional[str] = None,
34
        dataset_info_only: Optional[bool] = False,
35
        trust_remote_code: Optional[bool] = False,
36
        **config_kwargs,
37
    ) -> Union[dict, 'MsDataset', NativeIterableDataset]:
38
        """
39
        Load dataset from ModelScope Hub or local source.
40
        
41
        Parameters:
42
        - dataset_name: Dataset identifier(s) on ModelScope Hub or local path(s)
43
        - namespace: Dataset namespace (default: DEFAULT_DATASET_NAMESPACE)
44
        - target: Target platform or format
45
        - version: Dataset version/revision (default: DEFAULT_DATASET_REVISION)
46
        - hub: Hub source (default: Hubs.modelscope)
47
        - subset_name: Subset/configuration name within the dataset
48
        - split: Dataset split ('train', 'test', 'validation')
49
        - data_dir: Directory containing local dataset files
50
        - data_files: Specific data files to load
51
        - download_mode: Download behavior (default: REUSE_DATASET_IF_EXISTS)
52
        - cache_dir: Directory for caching downloaded datasets (default: MS_DATASETS_CACHE)
53
        - features: Dataset features schema
54
        - use_streaming: Whether to use streaming mode
55
        - stream_batch_size: Batch size for streaming (default: 1)
56
        - custom_cfg: Custom configuration object
57
        - token: Authentication token
58
        - dataset_info_only: Whether to load only dataset info
59
        - trust_remote_code: Whether to trust remote code execution
60
        - **config_kwargs: Additional configuration parameters
61
        
62
        Returns:
63
        MsDataset instance, dict, or NativeIterableDataset
64
        """
65
    
66
    def __init__(
67
        self,
68
        ds_instance: Union[Dataset, IterableDataset, ExternalDataset, NativeIterableDataset],
69
        target: Optional[str] = None
70
    ):
71
        """
72
        Initialize dataset with data.
73
        
74
        Parameters:
75
        - ds_instance: Dataset instance (Dataset, IterableDataset, ExternalDataset, or NativeIterableDataset)
76
        - target: Target platform or format (optional)
77
        """
78
    
79
    @classmethod
80
    def to_ms_dataset(
81
        cls,
82
        ds_instance: Union[Dataset, DatasetDict, ExternalDataset, NativeIterableDataset, IterableDataset, IterableDatasetDict],
83
        target: str = None
84
    ) -> Union[dict, 'MsDataset']:
85
        """
86
        Convert dataset instance to MsDataset format.
87
        
88
        Parameters:
89
        - ds_instance: Dataset instance to convert
90
        - target: Target platform or format (optional)
91
        
92
        Returns:
93
        MsDataset instance or dict of MsDataset instances
94
        """
95
    
96
    def __len__(self) -> int:
97
        """
98
        Get dataset length.
99
        
100
        Returns:
101
        Number of samples in the dataset
102
        """
103
    
104
    def __getitem__(self, index):
105
        """
106
        Get dataset item by index.
107
        
108
        Parameters:
109
        - index: Sample index or slice
110
        
111
        Returns:
112
        Dataset sample or samples
113
        """
114
    
115
    def to_hf_dataset(self):
116
        """
117
        Convert to HuggingFace Dataset format.
118
        
119
        Returns:
120
        HuggingFace Dataset instance
121
        """
122
    
123
    def map(
124
        self,
125
        function,
126
        batched: bool = False,
127
        batch_size: int = 1000,
128
        **kwargs
129
    ):
130
        """
131
        Apply function to all dataset samples.
132
        
133
        Parameters:
134
        - function: Function to apply to each sample
135
        - batched: Whether to process samples in batches
136
        - batch_size: Size of batches for processing
137
        - **kwargs: Additional mapping parameters
138
        
139
        Returns:
140
        New MsDataset with transformed data
141
        """
142
    
143
    def filter(
144
        self,
145
        function,
146
        batched: bool = False,
147
        **kwargs
148
    ):
149
        """
150
        Filter dataset samples based on condition.
151
        
152
        Parameters:
153
        - function: Function that returns True for samples to keep
154
        - batched: Whether to process samples in batches
155
        - **kwargs: Additional filtering parameters
156
        
157
        Returns:
158
        New MsDataset with filtered data
159
        """
160
    
161
    def select(self, indices):
162
        """
163
        Select subset of dataset by indices.
164
        
165
        Parameters:
166
        - indices: List of indices to select
167
        
168
        Returns:
169
        New MsDataset with selected samples
170
        """
171
    
172
    def split(
173
        self,
174
        test_size: float = 0.2,
175
        shuffle: bool = True,
176
        seed: int = None
177
    ):
178
        """
179
        Split dataset into train and test sets.
180
        
181
        Parameters:
182
        - test_size: Fraction of data for test set
183
        - shuffle: Whether to shuffle before splitting
184
        - seed: Random seed for reproducibility
185
        
186
        Returns:
187
        Dictionary with 'train' and 'test' MsDataset instances
188
        """
189
    
190
    def shuffle(self, seed: int = None):
191
        """
192
        Shuffle dataset samples.
193
        
194
        Parameters:
195
        - seed: Random seed for reproducibility
196
        
197
        Returns:
198
        New shuffled MsDataset
199
        """
200
    
201
    def take(self, num_samples: int):
202
        """
203
        Take first N samples from dataset.
204
        
205
        Parameters:
206
        - num_samples: Number of samples to take
207
        
208
        Returns:
209
        New MsDataset with first N samples
210
        """
211
    
212
    def skip(self, num_samples: int):
213
        """
214
        Skip first N samples from dataset.
215
        
216
        Parameters:
217
        - num_samples: Number of samples to skip
218
        
219
        Returns:
220
        New MsDataset with remaining samples
221
        """
222
    
223
    def batch(self, batch_size: int):
224
        """
225
        Create batched version of dataset.
226
        
227
        Parameters:
228
        - batch_size: Size of each batch
229
        
230
        Returns:
231
        New MsDataset that yields batches
232
        """
233
    
234
    def save_to_disk(self, dataset_path: str):
235
        """
236
        Save dataset to local disk.
237
        
238
        Parameters:
239
        - dataset_path: Path to save dataset
240
        """
241
    
242
    @classmethod
243
    def load_from_disk(cls, dataset_path: str):
244
        """
245
        Load dataset from local disk.
246
        
247
        Parameters:
248
        - dataset_path: Path to saved dataset
249
        
250
        Returns:
251
        MsDataset instance
252
        """
253
```
254

255
## Usage Examples
256

257
### Loading Datasets from ModelScope Hub
258

259
```python
260
from modelscope import MsDataset
261

262
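# Load complete dataset (no `split` given: per the documented return type of `load`, the result may be a dict, presumably keyed by split, rather than a single MsDataset, in which case len() is not a sample count)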
263
dataset = MsDataset.load('clue', subset_name='afqmc')
264
print(f"Dataset size: {len(dataset)}")
265

266
# Load specific split
267
train_dataset = MsDataset.load('clue', subset_name='afqmc', split='train')
268
test_dataset = MsDataset.load('clue', subset_name='afqmc', split='test')
269

270
print(f"Train size: {len(train_dataset)}")
271
print(f"Test size: {len(test_dataset)}")
272

273
# Inspect sample
274
sample = train_dataset[0]
275
print(f"Sample: {sample}")
276
```
277

278
### Loading Local Datasets
279

280
```python
281
from modelscope import MsDataset
282

283
# Load from local directory
284
local_dataset = MsDataset.load(
285
    'path/to/local/dataset',
286
    data_dir='./data',
287
    cache_dir='./cache'
288
)
289

290
# Load from local files
291
import json
292

293
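# Load JSON file (NOTE: the MsDataset constructor is documented as taking a Dataset, IterableDataset, ExternalDataset, or NativeIterableDataset instance, so raw parsed JSON likely needs to be wrapped in one of those first; verify before relying on this example)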
294
with open('data.json', 'r') as f:
295
    data = json.load(f)
296

297
dataset = MsDataset(data)
298
```
299

300
### Dataset Transformation and Processing
301

302
```python
303
from modelscope import MsDataset
304

305
# Load dataset
306
dataset = MsDataset.load('clue', subset_name='afqmc', split='train')
307

308
# Transform data with map
309
def preprocess_text(example):
310
    example['text'] = example['sentence1'] + ' [SEP] ' + example['sentence2']
311
    return example
312

313
processed_dataset = dataset.map(preprocess_text)
314

315
# Filter samples
316
def filter_long_texts(example):
317
    return len(example['text']) < 512
318

319
filtered_dataset = processed_dataset.filter(filter_long_texts)
320

321
print(f"Original size: {len(dataset)}")
322
print(f"After filtering: {len(filtered_dataset)}")
323
```
324

325
### Batch Processing
326

327
```python
328
from modelscope import MsDataset
329

330
dataset = MsDataset.load('dataset_name')
331

332
# Process in batches
333
def batch_preprocess(batch):
334
    # Process multiple samples at once
335
    batch['processed_text'] = [text.lower() for text in batch['text']]
336
    return batch
337

338
batch_processed = dataset.map(
339
    batch_preprocess,
340
    batched=True,
341
    batch_size=1000
342
)
343

344
# Create batched dataset for training
345
batched_dataset = dataset.batch(batch_size=32)
346

347
# Iterate through batches
348
for batch in batched_dataset:
349
    print(f"Batch size: {len(batch)}")
350
    break  # Just show first batch
351
```
352

353
### Dataset Splitting and Sampling
354

355
```python
356
from modelscope import MsDataset
357

358
# Load dataset
359
full_dataset = MsDataset.load('dataset_name')
360

361
# Split into train/test
362
splits = full_dataset.split(test_size=0.2, shuffle=True, seed=42)
363
train_data = splits['train']
364
test_data = splits['test']
365

366
print(f"Train size: {len(train_data)}")
367
print(f"Test size: {len(test_data)}")
368

369
# Take subset for quick testing
370
small_dataset = full_dataset.take(1000)
371
print(f"Small dataset size: {len(small_dataset)}")
372

373
# Skip samples
374
remaining_dataset = full_dataset.skip(1000)
375
print(f"Remaining size: {len(remaining_dataset)}")
376

377
# Shuffle dataset
378
shuffled_dataset = full_dataset.shuffle(seed=42)
379
```
380

381
### Dataset Selection and Indexing
382

383
```python
384
from modelscope import MsDataset
385

386
dataset = MsDataset.load('dataset_name')
387

388
# Select specific indices
389
indices = [0, 5, 10, 15, 20]
390
subset = dataset.select(indices)
391
print(f"Selected subset size: {len(subset)}")
392

393
# Slice dataset
394
first_100 = dataset[:100]
395
last_50 = dataset[-50:]
396
every_10th = dataset[::10]
397

398
print(f"First 100: {len(first_100)}")
399
print(f"Last 50: {len(last_50)}")
400
print(f"Every 10th: {len(every_10th)}")
401
```
402

403
### Converting to HuggingFace Format
404

405
```python
406
from modelscope import MsDataset
407

408
# Load ModelScope dataset
409
ms_dataset = MsDataset.load('clue', subset_name='afqmc')
410

411
# Convert to HuggingFace format
412
hf_dataset = ms_dataset.to_hf_dataset()
413

414
print(f"HF Dataset type: {type(hf_dataset)}")
415
print(f"HF Dataset features: {hf_dataset.features}")
416

417
# Use with HuggingFace ecosystem
418
from transformers import AutoTokenizer
419
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
420

421
def tokenize_function(examples):
422
    return tokenizer(examples['sentence1'], examples['sentence2'], 
423
                    truncation=True, padding='max_length', max_length=128)
424

425
tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)
426
```
427

428
### Saving and Loading Datasets
429

430
```python
431
from modelscope import MsDataset
432

433
# Load and process dataset
434
dataset = MsDataset.load('dataset_name')
435
processed_dataset = dataset.map(lambda x: {'processed': x['text'].lower()})
436

437
# Save processed dataset
438
processed_dataset.save_to_disk('./processed_dataset')
439

440
# Load saved dataset later
441
loaded_dataset = MsDataset.load_from_disk('./processed_dataset')
442
print(f"Loaded dataset size: {len(loaded_dataset)}")
443
```
444

445
### Complex Data Processing Pipeline
446

447
```python
448
from modelscope import MsDataset
449

450
# Load raw dataset
451
dataset = MsDataset.load('text_classification_data')
452

453
# Define processing pipeline
454
def clean_text(example):
455
    import re
456
    # Remove special characters
457
    example['text'] = re.sub(r'[^\w\s]', '', example['text'])
458
    # Convert to lowercase
459
    example['text'] = example['text'].lower()
460
    return example
461

462
def add_length_feature(example):
463
    example['text_length'] = len(example['text'])
464
    return example
465

466
def filter_by_length(example):
467
    return 10 <= example['text_length'] <= 500
468

469
# Apply processing pipeline
470
processed_dataset = (dataset
471
                    .map(clean_text)
472
                    .map(add_length_feature)
473
                    .filter(filter_by_length)
474
                    .shuffle(seed=42))
475

476
print(f"Original size: {len(dataset)}")
477
print(f"After processing: {len(processed_dataset)}")
478

479
# Create train/validation splits
480
splits = processed_dataset.split(test_size=0.2, seed=42)
481
train_dataset = splits['train']
482
val_dataset = splits['test']
483

484
# Create batched iterators for training
485
train_batches = train_dataset.batch(32)
486
val_batches = val_dataset.batch(32)
487
```
488

489
### Custom Dataset Class
490

491
```python
492
from modelscope import MsDataset
493

494
class CustomTextDataset(MsDataset):
495
    def __init__(self, texts, labels, tokenizer=None):
496
        self.texts = texts
497
        self.labels = labels
498
        self.tokenizer = tokenizer
499
        super().__init__(list(zip(texts, labels)))
500
    
501
    def __getitem__(self, index):
502
        text, label = self.texts[index], self.labels[index]
503
        
504
        if self.tokenizer:
505
            encoded = self.tokenizer(text, truncation=True, padding='max_length')
506
            return {
507
                'input_ids': encoded['input_ids'],
508
                'attention_mask': encoded['attention_mask'],
509
                'labels': label
510
            }
511
        
512
        return {'text': text, 'label': label}
513
    
514
    def __len__(self):
515
        return len(self.texts)
516

517
# Use custom dataset
518
texts = ["Text 1", "Text 2", "Text 3"]
519
labels = [0, 1, 0]
520

521
custom_dataset = CustomTextDataset(texts, labels)
522
print(f"Custom dataset size: {len(custom_dataset)}")
523
print(f"Sample: {custom_dataset[0]}")
524
```

Version

Tile

Files

docs/datasets.md

docs/datasets.md