ModelScope brings the notion of Model-as-a-Service to life with unified interfaces for state-of-the-art machine learning models.
---
ModelScope's dataset handling provides unified interfaces for working with datasets from the ModelScope ecosystem and local data sources. The MsDataset class offers powerful data manipulation and transformation capabilities.
Main dataset interface for loading and manipulating datasets.
class MsDataset:
    """
    Main dataset interface for ModelScope datasets.

    Wraps a backing dataset object (Dataset, IterableDataset,
    ExternalDataset, or NativeIterableDataset) and exposes a uniform API
    for loading, transforming, splitting, and persisting data.
    """

    @staticmethod
    def load(
        dataset_name: Union[str, list],
        namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
        target: Optional[str] = None,
        version: Optional[str] = DEFAULT_DATASET_REVISION,
        hub: Optional[Hubs] = Hubs.modelscope,
        subset_name: Optional[str] = None,
        split: Optional[str] = None,
        data_dir: Optional[str] = None,
        data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
        download_mode: Optional[DownloadMode] = DownloadMode.REUSE_DATASET_IF_EXISTS,
        cache_dir: Optional[str] = MS_DATASETS_CACHE,
        features: Optional[Features] = None,
        use_streaming: Optional[bool] = False,
        stream_batch_size: Optional[int] = 1,
        custom_cfg: Optional[Config] = Config(),  # NOTE(review): mutable default Config() is evaluated once and shared across calls — consider a None sentinel; confirm Config is immutable
        token: Optional[str] = None,
        dataset_info_only: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        **config_kwargs,
    ) -> Union[dict, 'MsDataset', NativeIterableDataset]:
        """
        Load a dataset from the ModelScope Hub or a local source.

        Parameters:
        - dataset_name: Dataset identifier(s) on the ModelScope Hub, or local path(s)
        - namespace: Dataset namespace on the Hub (default: DEFAULT_DATASET_NAMESPACE)
        - target: Target platform or format
        - version: Dataset version/revision (default: DEFAULT_DATASET_REVISION)
        - hub: Hub source to load from (default: Hubs.modelscope)
        - subset_name: Subset/configuration name within the dataset
        - split: Dataset split to load ('train', 'test', 'validation')
        - data_dir: Directory containing local dataset files
        - data_files: Specific data file(s) to load; may be a single path,
          a sequence of paths, or a mapping of split name to path(s)
        - download_mode: Download/caching behavior (default: REUSE_DATASET_IF_EXISTS)
        - cache_dir: Directory for caching downloaded datasets (default: MS_DATASETS_CACHE)
        - features: Dataset features schema
        - use_streaming: If True, stream samples instead of materializing the dataset
        - stream_batch_size: Batch size used when streaming (default: 1)
        - custom_cfg: Custom configuration object
        - token: Authentication token for private datasets
        - dataset_info_only: If True, return only dataset metadata
        - trust_remote_code: If True, allow executing dataset-provided loading code
          (only enable for sources you trust)
        - **config_kwargs: Additional configuration parameters
        Returns:
            MsDataset instance, dict of instances, or NativeIterableDataset
            (the latter presumably when use_streaming is True — confirm against implementation)
        """

    def __init__(
        self,
        ds_instance: Union[Dataset, IterableDataset, ExternalDataset, NativeIterableDataset],
        target: Optional[str] = None
    ):
        """
        Initialize an MsDataset around an existing dataset object.

        Parameters:
        - ds_instance: Backing dataset instance (Dataset, IterableDataset,
          ExternalDataset, or NativeIterableDataset)
        - target: Target platform or format (optional)
        """

    @classmethod
    def to_ms_dataset(
        cls,
        ds_instance: Union[Dataset, DatasetDict, ExternalDataset, NativeIterableDataset, IterableDataset, IterableDatasetDict],
        target: str = None
    ) -> Union[dict, 'MsDataset']:
        """
        Convert a dataset instance to MsDataset format.

        Parameters:
        - ds_instance: Dataset instance to convert; dict-like inputs
          (DatasetDict, IterableDatasetDict) yield a dict of MsDataset
        - target: Target platform or format (optional)
        Returns:
            MsDataset instance, or dict mapping split names to MsDataset instances
        """

    def __len__(self) -> int:
        """
        Return the number of samples in the dataset.
        """

    def __getitem__(self, index):
        """
        Return the sample(s) at the given position.

        Parameters:
        - index: Sample index or slice
        Returns:
            A single sample, or multiple samples for a slice
        """

    def to_hf_dataset(self):
        """
        Convert this dataset to a HuggingFace Dataset.

        Returns:
            HuggingFace Dataset instance
        """

    def map(
        self,
        function,
        batched: bool = False,
        batch_size: int = 1000,
        **kwargs
    ):
        """
        Apply a transformation function to every sample.

        Parameters:
        - function: Callable applied to each sample (or each batch when
          batched is True)
        - batched: Whether to pass batches of samples to `function`
        - batch_size: Number of samples per batch when batched is True
        - **kwargs: Additional mapping parameters
        Returns:
            New MsDataset with the transformed data
        """

    def filter(
        self,
        function,
        batched: bool = False,
        **kwargs
    ):
        """
        Keep only the samples for which `function` returns True.

        Parameters:
        - function: Predicate returning True for samples to keep
        - batched: Whether to pass batches of samples to `function`
        - **kwargs: Additional filtering parameters
        Returns:
            New MsDataset containing only the retained samples
        """

    def select(self, indices):
        """
        Build a subset of the dataset from explicit indices.

        Parameters:
        - indices: Indices of the samples to select
        Returns:
            New MsDataset containing the selected samples
        """

    def split(
        self,
        test_size: float = 0.2,
        shuffle: bool = True,
        seed: int = None
    ):
        """
        Split the dataset into train and test partitions.

        Parameters:
        - test_size: Fraction of the data assigned to the test set
        - shuffle: Whether to shuffle before splitting
        - seed: Random seed for reproducibility
        Returns:
            Dict with 'train' and 'test' MsDataset instances
        """

    def shuffle(self, seed: int = None):
        """
        Return a shuffled copy of the dataset.

        Parameters:
        - seed: Random seed for reproducibility
        Returns:
            New shuffled MsDataset
        """

    def take(self, num_samples: int):
        """
        Return the first `num_samples` samples.

        Parameters:
        - num_samples: Number of samples to take
        Returns:
            New MsDataset with the first N samples
        """

    def skip(self, num_samples: int):
        """
        Drop the first `num_samples` samples.

        Parameters:
        - num_samples: Number of samples to skip
        Returns:
            New MsDataset with the remaining samples
        """

    def batch(self, batch_size: int):
        """
        Group samples so that iteration yields batches.

        Parameters:
        - batch_size: Number of samples per batch
        Returns:
            New MsDataset that yields batches
        """

    def save_to_disk(self, dataset_path: str):
        """
        Persist the dataset to local disk.

        Parameters:
        - dataset_path: Destination path for the saved dataset
        """

    @classmethod
    def load_from_disk(cls, dataset_path: str):
        """
        Load a dataset previously saved with `save_to_disk`.

        Parameters:
        - dataset_path: Path to the saved dataset
        Returns:
            MsDataset instance
        """

from modelscope import MsDataset
# Fetch every split of the dataset in one call.
full_ds = MsDataset.load('clue', subset_name='afqmc')
print(f"Dataset size: {len(full_ds)}")

# Request the train and test splits individually.
train_ds = MsDataset.load('clue', subset_name='afqmc', split='train')
test_ds = MsDataset.load('clue', subset_name='afqmc', split='test')
print(f"Train size: {len(train_ds)}")
print(f"Test size: {len(test_ds)}")

# Peek at the first training example.
first_example = train_ds[0]
print(f"Sample: {first_example}")

from modelscope import MsDataset
# Point MsDataset.load at a dataset stored on the local filesystem.
local_dataset = MsDataset.load(
    'path/to/local/dataset',
    data_dir='./data',
    cache_dir='./cache',
)

# Alternatively, construct a dataset directly from a local JSON file.
import json

with open('data.json', 'r') as f:
    records = json.load(f)
dataset = MsDataset(records)

from modelscope import MsDataset
# Load the training split.
dataset = MsDataset.load('clue', subset_name='afqmc', split='train')

def join_sentence_pair(example):
    # Concatenate the sentence pair with a separator token into 'text'.
    example['text'] = example['sentence1'] + ' [SEP] ' + example['sentence2']
    return example

processed_dataset = dataset.map(join_sentence_pair)

def keep_short_texts(example):
    # Retain only examples under 512 characters.
    return len(example['text']) < 512

filtered_dataset = processed_dataset.filter(keep_short_texts)

print(f"Original size: {len(dataset)}")
print(f"After filtering: {len(filtered_dataset)}")

from modelscope import MsDataset
dataset = MsDataset.load('dataset_name')

def lowercase_batch(batch):
    # Batched mode: the function receives many samples per call.
    batch['processed_text'] = [text.lower() for text in batch['text']]
    return batch

batch_processed = dataset.map(lowercase_batch, batched=True, batch_size=1000)

# Wrap the dataset so iteration yields fixed-size batches for training.
batched_dataset = dataset.batch(batch_size=32)
for batch in batched_dataset:
    print(f"Batch size: {len(batch)}")
    break  # only inspect the first batch

from modelscope import MsDataset
# Load the whole dataset once, then carve it up.
full_dataset = MsDataset.load('dataset_name')

# Hold out 20% as a test set, shuffling deterministically.
splits = full_dataset.split(test_size=0.2, shuffle=True, seed=42)
train_data, test_data = splits['train'], splits['test']
print(f"Train size: {len(train_data)}")
print(f"Test size: {len(test_data)}")

# First 1000 samples for quick experiments.
small_dataset = full_dataset.take(1000)
print(f"Small dataset size: {len(small_dataset)}")

# Everything after the first 1000 samples.
remaining_dataset = full_dataset.skip(1000)
print(f"Remaining size: {len(remaining_dataset)}")

# Reproducible shuffle of the full dataset.
shuffled_dataset = full_dataset.shuffle(seed=42)

from modelscope import MsDataset
dataset = MsDataset.load('dataset_name')

# Pick out specific rows by explicit index.
wanted = [0, 5, 10, 15, 20]
subset = dataset.select(wanted)
print(f"Selected subset size: {len(subset)}")

# Standard Python slicing is supported via __getitem__.
first_100 = dataset[:100]
last_50 = dataset[-50:]
every_10th = dataset[::10]
print(f"First 100: {len(first_100)}")
print(f"Last 50: {len(last_50)}")
print(f"Every 10th: {len(every_10th)}")

from modelscope import MsDataset
# Load a ModelScope dataset and bridge it into the HuggingFace ecosystem.
ms_dataset = MsDataset.load('clue', subset_name='afqmc')
hf_dataset = ms_dataset.to_hf_dataset()
print(f"HF Dataset type: {type(hf_dataset)}")
print(f"HF Dataset features: {hf_dataset.features}")

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

def tokenize_function(examples):
    # Tokenize the sentence pair, padded/truncated to 128 tokens.
    return tokenizer(examples['sentence1'], examples['sentence2'],
                     truncation=True, padding='max_length', max_length=128)

tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)

from modelscope import MsDataset
# Transform a dataset, persist it, then reload it from disk.
dataset = MsDataset.load('dataset_name')
processed_dataset = dataset.map(lambda x: {'processed': x['text'].lower()})

processed_dataset.save_to_disk('./processed_dataset')

loaded_dataset = MsDataset.load_from_disk('./processed_dataset')
print(f"Loaded dataset size: {len(loaded_dataset)}")

from modelscope import MsDataset
import re  # hoisted: the original re-ran `import re` inside clean_text on every call

# Load raw dataset.
dataset = MsDataset.load('text_classification_data')

def clean_text(example):
    """Remove non-word/non-space characters and lowercase the text field."""
    example['text'] = re.sub(r'[^\w\s]', '', example['text'])
    example['text'] = example['text'].lower()
    return example

def add_length_feature(example):
    """Record the character length of the (cleaned) text."""
    example['text_length'] = len(example['text'])
    return example

def filter_by_length(example):
    """Keep texts between 10 and 500 characters inclusive."""
    return 10 <= example['text_length'] <= 500

# Apply processing pipeline: clean -> feature -> filter -> shuffle.
processed_dataset = (dataset
    .map(clean_text)
    .map(add_length_feature)
    .filter(filter_by_length)
    .shuffle(seed=42))
print(f"Original size: {len(dataset)}")
print(f"After processing: {len(processed_dataset)}")

# Create train/validation splits.
splits = processed_dataset.split(test_size=0.2, seed=42)
train_dataset = splits['train']
val_dataset = splits['test']

# Create batched iterators for training.
train_batches = train_dataset.batch(32)
val_batches = val_dataset.batch(32)

from modelscope import MsDataset
class CustomTextDataset(MsDataset):
    """MsDataset subclass serving (text, label) pairs, optionally tokenized."""

    def __init__(self, texts, labels, tokenizer=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        super().__init__(list(zip(texts, labels)))

    def __getitem__(self, index):
        text, label = self.texts[index], self.labels[index]
        # Guard clause: without a tokenizer, return the raw pair.
        if not self.tokenizer:
            return {'text': text, 'label': label}
        encoded = self.tokenizer(text, truncation=True, padding='max_length')
        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'labels': label
        }

    def __len__(self):
        return len(self.texts)

# Use custom dataset
texts = ["Text 1", "Text 2", "Text 3"]
labels = [0, 1, 0]
custom_dataset = CustomTextDataset(texts, labels)
print(f"Custom dataset size: {len(custom_dataset)}")
print(f"Sample: {custom_dataset[0]}")

# Install with Tessl CLI
npx tessl i tessl/pypi-modelscope