CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-modelscope

ModelScope brings the notion of Model-as-a-Service to life with unified interfaces for state-of-the-art machine learning models.

Pending
Overview
Eval results
Files

docs/datasets.md

Datasets

ModelScope's dataset handling provides unified interfaces for working with datasets from the ModelScope ecosystem and local data sources. The MsDataset class offers powerful data manipulation and transformation capabilities.

Capabilities

MsDataset Class

Main dataset interface for loading and manipulating datasets.

class MsDataset:
    """
    Main dataset interface for ModelScope datasets.

    Wraps a backing dataset object (Dataset, IterableDataset,
    ExternalDataset, or NativeIterableDataset) and exposes unified
    helpers for loading, transformation, splitting, conversion,
    and persistence.
    """

    @staticmethod
    def load(
        dataset_name: Union[str, list],
        namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
        target: Optional[str] = None,
        version: Optional[str] = DEFAULT_DATASET_REVISION,
        hub: Optional[Hubs] = Hubs.modelscope,
        subset_name: Optional[str] = None,
        split: Optional[str] = None,
        data_dir: Optional[str] = None,
        data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
        download_mode: Optional[DownloadMode] = DownloadMode.REUSE_DATASET_IF_EXISTS,
        cache_dir: Optional[str] = MS_DATASETS_CACHE,
        features: Optional[Features] = None,
        use_streaming: Optional[bool] = False,
        stream_batch_size: Optional[int] = 1,
        # NOTE(review): Config() is a mutable default evaluated once at
        # definition time and shared across calls — confirm it is never
        # mutated in place by the implementation.
        custom_cfg: Optional[Config] = Config(),
        token: Optional[str] = None,
        dataset_info_only: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        **config_kwargs,
    ) -> Union[dict, 'MsDataset', NativeIterableDataset]:
        """
        Load a dataset from the ModelScope Hub or a local source.

        Parameters:
        - dataset_name: Dataset identifier(s) on ModelScope Hub or local path(s)
        - namespace: Dataset namespace (default: DEFAULT_DATASET_NAMESPACE)
        - target: Target platform or format
        - version: Dataset version/revision (default: DEFAULT_DATASET_REVISION)
        - hub: Hub source (default: Hubs.modelscope)
        - subset_name: Subset/configuration name within the dataset
        - split: Dataset split ('train', 'test', 'validation')
        - data_dir: Directory containing local dataset files
        - data_files: Specific data files to load
        - download_mode: Download behavior (default: REUSE_DATASET_IF_EXISTS)
        - cache_dir: Directory for caching downloaded datasets (default: MS_DATASETS_CACHE)
        - features: Dataset features schema
        - use_streaming: Whether to stream samples instead of materializing the dataset
        - stream_batch_size: Batch size used when streaming (default: 1)
        - custom_cfg: Custom configuration object
        - token: Authentication token (e.g. for private datasets)
        - dataset_info_only: Whether to return only dataset metadata
        - trust_remote_code: Whether to trust remote code execution; this is
          security-sensitive — only enable for sources you trust
        - **config_kwargs: Additional configuration parameters

        Returns:
        MsDataset instance, dict, or NativeIterableDataset (presumably the
        streaming case — confirm against the implementation)
        """

    def __init__(
        self,
        ds_instance: Union[Dataset, IterableDataset, ExternalDataset, NativeIterableDataset],
        target: Optional[str] = None
    ):
        """
        Wrap an existing dataset instance.

        Parameters:
        - ds_instance: Backing dataset instance (Dataset, IterableDataset,
          ExternalDataset, or NativeIterableDataset)
        - target: Target platform or format (optional)
        """

    @classmethod
    def to_ms_dataset(
        cls,
        ds_instance: Union[Dataset, DatasetDict, ExternalDataset, NativeIterableDataset, IterableDataset, IterableDatasetDict],
        target: str = None
    ) -> Union[dict, 'MsDataset']:
        """
        Convert a dataset instance to MsDataset format.

        Parameters:
        - ds_instance: Dataset instance to convert; dict-like inputs
          (DatasetDict / IterableDatasetDict) presumably yield a dict of
          MsDataset instances keyed by split — confirm
        - target: Target platform or format (optional)

        Returns:
        MsDataset instance or dict of MsDataset instances
        """

    def __len__(self) -> int:
        """
        Return the number of samples in the dataset.

        Returns:
        Number of samples in the dataset
        """

    def __getitem__(self, index):
        """
        Return the sample(s) at the given index or slice.

        Parameters:
        - index: Sample index or slice

        Returns:
        Dataset sample or samples
        """

    def to_hf_dataset(self):
        """
        Convert to HuggingFace Dataset format.

        Returns:
        HuggingFace Dataset instance backing / equivalent to this dataset
        """

    def map(
        self,
        function,
        batched: bool = False,
        batch_size: int = 1000,
        **kwargs
    ):
        """
        Apply a function to every sample (or batch of samples).

        Parameters:
        - function: Function applied to each sample; when batched=True it
          receives a batch (columns mapped to lists of values)
        - batched: Whether to process samples in batches
        - batch_size: Size of batches for processing (used when batched=True)
        - **kwargs: Additional mapping parameters

        Returns:
        New MsDataset with transformed data
        """

    def filter(
        self,
        function,
        batched: bool = False,
        **kwargs
    ):
        """
        Keep only the samples for which the predicate is true.

        Parameters:
        - function: Function that returns True for samples to keep
        - batched: Whether to process samples in batches
        - **kwargs: Additional filtering parameters

        Returns:
        New MsDataset with filtered data
        """

    def select(self, indices):
        """
        Select a subset of the dataset by index.

        Parameters:
        - indices: List of indices to select

        Returns:
        New MsDataset containing only the selected samples
        """

    def split(
        self,
        test_size: float = 0.2,
        shuffle: bool = True,
        seed: int = None
    ):
        """
        Split the dataset into train and test sets.

        Parameters:
        - test_size: Fraction of data reserved for the test set
        - shuffle: Whether to shuffle before splitting
        - seed: Random seed for reproducibility

        Returns:
        Dictionary with 'train' and 'test' MsDataset instances
        """

    def shuffle(self, seed: int = None):
        """
        Shuffle dataset samples.

        Parameters:
        - seed: Random seed for reproducibility

        Returns:
        New shuffled MsDataset
        """

    def take(self, num_samples: int):
        """
        Take the first N samples of the dataset.

        Parameters:
        - num_samples: Number of samples to take

        Returns:
        New MsDataset with the first N samples
        """

    def skip(self, num_samples: int):
        """
        Skip the first N samples of the dataset.

        Parameters:
        - num_samples: Number of samples to skip

        Returns:
        New MsDataset with the remaining samples
        """

    def batch(self, batch_size: int):
        """
        Create a batched view of the dataset.

        Parameters:
        - batch_size: Size of each batch

        Returns:
        New MsDataset that yields batches instead of single samples
        """

    def save_to_disk(self, dataset_path: str):
        """
        Save the dataset to local disk.

        Parameters:
        - dataset_path: Path to save the dataset under
        """

    @classmethod
    def load_from_disk(cls, dataset_path: str):
        """
        Load a dataset previously saved with save_to_disk.

        Parameters:
        - dataset_path: Path to the saved dataset

        Returns:
        MsDataset instance
        """

Usage Examples

Loading Datasets from ModelScope Hub

from modelscope import MsDataset

# Fetch the whole dataset in one call.
full_ds = MsDataset.load('clue', subset_name='afqmc')
print(f"Dataset size: {len(full_ds)}")

# Request individual splits explicitly.
train_ds = MsDataset.load('clue', subset_name='afqmc', split='train')
test_ds = MsDataset.load('clue', subset_name='afqmc', split='test')

print(f"Train size: {len(train_ds)}")
print(f"Test size: {len(test_ds)}")

# Peek at the first training example.
first = train_ds[0]
print(f"Sample: {first}")

Loading Local Datasets

from modelscope import MsDataset

# Load from a local directory, caching any processed output separately.
local_dataset = MsDataset.load(
    'path/to/local/dataset',
    data_dir='./data',
    cache_dir='./cache'
)

# Load from local files
import json

# Specify the encoding explicitly so the example behaves the same
# regardless of the platform's locale default; `with` guarantees the
# file handle is closed.
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# NOTE(review): MsDataset.__init__ is documented to take a dataset
# instance (Dataset/IterableDataset/...), not raw decoded JSON —
# confirm this constructor usage against the implementation.
dataset = MsDataset(data)

Dataset Transformation and Processing

from modelscope import MsDataset

# Pull the training split.
dataset = MsDataset.load('clue', subset_name='afqmc', split='train')

def join_sentence_pair(example):
    # Merge the two sentences into one field, separator token between.
    example['text'] = ' [SEP] '.join((example['sentence1'], example['sentence2']))
    return example

processed_dataset = dataset.map(join_sentence_pair)

def keep_short_texts(example):
    # Drop anything 512 characters or longer.
    return len(example['text']) < 512

filtered_dataset = processed_dataset.filter(keep_short_texts)

print(f"Original size: {len(dataset)}")
print(f"After filtering: {len(filtered_dataset)}")

Batch Processing

from modelscope import MsDataset

dataset = MsDataset.load('dataset_name')

def lowercase_batch(batch):
    # Batched mode: each column arrives as a list of values.
    batch['processed_text'] = [text.lower() for text in batch['text']]
    return batch

# Apply the batched transform, 1000 samples at a time.
batch_processed = dataset.map(
    lowercase_batch,
    batched=True,
    batch_size=1000
)

# Group samples into training-sized batches.
batched_dataset = dataset.batch(batch_size=32)

# Show just the first batch.
first_batch = next(iter(batched_dataset))
print(f"Batch size: {len(first_batch)}")

Dataset Splitting and Sampling

from modelscope import MsDataset

# Load dataset
full_dataset = MsDataset.load('dataset_name')

# Carve out a deterministic 80/20 train/test split.
partitions = full_dataset.split(test_size=0.2, shuffle=True, seed=42)
train_part = partitions['train']
test_part = partitions['test']

print(f"Train size: {len(train_part)}")
print(f"Test size: {len(test_part)}")

# Grab a small prefix for quick experiments.
small_subset = full_dataset.take(1000)
print(f"Small dataset size: {len(small_subset)}")

# Everything after that prefix.
rest = full_dataset.skip(1000)
print(f"Remaining size: {len(rest)}")

# Deterministically reorder the samples.
reordered = full_dataset.shuffle(seed=42)

Dataset Selection and Indexing

from modelscope import MsDataset

dataset = MsDataset.load('dataset_name')

# Pick out specific sample positions.
wanted = [0, 5, 10, 15, 20]
picked = dataset.select(wanted)
print(f"Selected subset size: {len(picked)}")

# Standard slice syntax also works on the dataset.
first_100 = dataset[:100]
last_50 = dataset[-50:]
every_10th = dataset[::10]

print(f"First 100: {len(first_100)}")
print(f"Last 50: {len(last_50)}")
print(f"Every 10th: {len(every_10th)}")

Converting to HuggingFace Format

from modelscope import MsDataset

# Load ModelScope dataset
ms_dataset = MsDataset.load('clue', subset_name='afqmc')

# Bridge into the HuggingFace datasets world.
hf_dataset = ms_dataset.to_hf_dataset()

print(f"HF Dataset type: {type(hf_dataset)}")
print(f"HF Dataset features: {hf_dataset.features}")

# From here the usual HuggingFace tooling applies.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

def tokenize_function(examples):
    # Tokenize the sentence pair, padded/truncated to a fixed length.
    return tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        truncation=True,
        padding='max_length',
        max_length=128,
    )

tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)

Saving and Loading Datasets

from modelscope import MsDataset

# Load and process dataset
dataset = MsDataset.load('dataset_name')

def lowercase_text(sample):
    # Add a lowercased copy of the text field.
    return {'processed': sample['text'].lower()}

processed_dataset = dataset.map(lowercase_text)

# Persist the processed result.
processed_dataset.save_to_disk('./processed_dataset')

# Restore it in a later session.
loaded_dataset = MsDataset.load_from_disk('./processed_dataset')
print(f"Loaded dataset size: {len(loaded_dataset)}")

Complex Data Processing Pipeline

import re

from modelscope import MsDataset

# Load raw dataset
dataset = MsDataset.load('text_classification_data')

# Compile the pattern once instead of running `import re` and a pattern
# lookup inside the map callback on every single sample.
_NON_WORD_RE = re.compile(r'[^\w\s]')

def clean_text(example):
    """Strip special characters and lowercase the text field."""
    example['text'] = _NON_WORD_RE.sub('', example['text'])
    example['text'] = example['text'].lower()
    return example

def add_length_feature(example):
    """Record the (cleaned) text length as a feature."""
    example['text_length'] = len(example['text'])
    return example

def filter_by_length(example):
    """Keep only samples between 10 and 500 characters inclusive."""
    return 10 <= example['text_length'] <= 500

# Apply the processing pipeline: clean, annotate, filter, then shuffle
# deterministically.
processed_dataset = (dataset
                    .map(clean_text)
                    .map(add_length_feature)
                    .filter(filter_by_length)
                    .shuffle(seed=42))

print(f"Original size: {len(dataset)}")
print(f"After processing: {len(processed_dataset)}")

# Create train/validation splits
splits = processed_dataset.split(test_size=0.2, seed=42)
train_dataset = splits['train']
val_dataset = splits['test']

# Create batched iterators for training
train_batches = train_dataset.batch(32)
val_batches = val_dataset.batch(32)

Custom Dataset Class

from modelscope import MsDataset

class CustomTextDataset(MsDataset):
    """In-memory text/label dataset with optional on-the-fly tokenization."""

    def __init__(self, texts, labels, tokenizer=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        # Hand the paired records to the MsDataset base class.
        super().__init__(list(zip(texts, labels)))

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        if not self.tokenizer:
            # No tokenizer configured: return the raw record.
            return {'text': text, 'label': label}

        encoded = self.tokenizer(text, truncation=True, padding='max_length')
        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'labels': label
        }

    def __len__(self):
        return len(self.texts)

# Exercise the custom dataset without a tokenizer.
sample_texts = ["Text 1", "Text 2", "Text 3"]
sample_labels = [0, 1, 0]

custom_dataset = CustomTextDataset(sample_texts, sample_labels)
print(f"Custom dataset size: {len(custom_dataset)}")
print(f"Sample: {custom_dataset[0]}")

Install with Tessl CLI

npx tessl i tessl/pypi-modelscope

docs

datasets.md

export.md

hub.md

index.md

metrics.md

models.md

pipelines.md

preprocessors.md

training.md

utilities.md

tile.json