or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

adapters.md, configuration.md, datasets.md, evaluation.md, index.md, language-models.md, modules.md, optimization.md, prediction.md, retrieval.md, signatures.md, streaming.md, utilities.md
tile.json

docs/datasets.md

Datasets

Dataset loading and management utilities for DSPy. Provides base classes for creating custom datasets and loaders for various data formats including HuggingFace, CSV, JSON, and Parquet.

Capabilities

Dataset Base Class

Base class for creating structured train/dev/test splits with seed control and sampling.

class Dataset:
    """
    Base class for structured datasets with train/dev/test splits.

    Provides automatic shuffling, sampling, and Example conversion with
    reproducible seeding for train/dev/test splits.
    """

    def __init__(
        self,
        train_seed: int = 0,
        train_size: int = None,
        eval_seed: int = 0,
        dev_size: int = None,
        test_size: int = None,
        input_keys: list = None
    ):
        """
        Initialize dataset with split configuration.

        Args:
            train_seed (int): Random seed for train split (default: 0)
            train_size (int | None): Number of training examples
            eval_seed (int): Random seed for dev/test splits (default: 0)
            dev_size (int | None): Number of dev examples
            test_size (int | None): Number of test examples
            input_keys (list | None): Fields to mark as inputs in Examples
        """
        pass

    @property
    def train(self) -> list:
        """
        Get training split.

        Returns:
            List of Example instances for training
        """
        pass

    @property
    def dev(self) -> list:
        """
        Get development/validation split.

        Returns:
            List of Example instances for validation
        """
        pass

    @property
    def test(self) -> list:
        """
        Get test split.

        Returns:
            List of Example instances for testing
        """
        pass

    def reset_seeds(
        self,
        train_seed: int = None,
        train_size: int = None,
        eval_seed: int = None,
        dev_size: int = None,
        test_size: int = None
    ):
        """
        Reset random seeds and resample splits.

        Args:
            train_seed (int | None): New train seed
            train_size (int | None): New train size
            eval_seed (int | None): New eval seed
            dev_size (int | None): New dev size
            test_size (int | None): New test size
        """
        pass

    @classmethod
    def prepare_by_seed(
        cls,
        train_seeds: list = None,
        train_size: int = 16,
        dev_size: int = 1000,
        divide_eval_per_seed: bool = True,
        eval_seed: int = 2023,
        **kwargs
    ):
        """
        Prepare multiple train/eval splits with different seeds.

        Useful for cross-validation and multi-seed experiments.

        Args:
            train_seeds (list | None): List of training seeds (default: [1,2,3,4,5])
            train_size (int): Training examples per seed (default: 16)
            dev_size (int): Total dev examples (default: 1000)
            divide_eval_per_seed (bool): Divide eval set among seeds (default: True)
            eval_seed (int): Seed for eval set (default: 2023)
            **kwargs: Additional dataset-specific arguments

        Returns:
            dict with 'train_sets' and 'eval_sets' lists
        """
        pass

Usage:

import dspy
from dspy.datasets import Dataset

# Create custom dataset
class MyDataset(Dataset):
    """Minimal Dataset subclass that supplies raw records for each split."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Load your raw data
        self._train = [{"question": "Q1", "answer": "A1"}, ...]
        self._dev = [{"question": "Q2", "answer": "A2"}, ...]
        self._test = [{"question": "Q3", "answer": "A3"}, ...]

# Initialize with size limits and seeds
dataset = MyDataset(
    train_seed=42,
    train_size=100,
    eval_seed=2023,
    dev_size=50,
    test_size=50,
    input_keys=["question"]
)

# Access splits (automatically converted to Examples)
train_data = dataset.train  # List of 100 Examples
dev_data = dataset.dev      # List of 50 Examples
test_data = dataset.test    # List of 50 Examples

# Reset and resample
dataset.reset_seeds(train_seed=43, train_size=200)
new_train = dataset.train   # Different 200 examples

# Multi-seed preparation for experiments
splits = MyDataset.prepare_by_seed(
    train_seeds=[1, 2, 3, 4, 5],
    train_size=16,
    dev_size=1000
)
# prepare_by_seed is documented to return a dict, so index it by key.
# The loop names avoid shadowing the builtin `eval`.
for i, (train_set, eval_set) in enumerate(zip(splits["train_sets"], splits["eval_sets"])):
    print(f"Seed {i+1}: {len(train_set)} train, {len(eval_set)} eval")

DataLoader

Flexible data loading from multiple sources including HuggingFace, CSV, JSON, Parquet, pandas DataFrames, and retrieval models.

class DataLoader:
    """
    Universal data loader for various file formats and sources.

    Supports HuggingFace datasets, CSV, JSON, Parquet files,
    pandas DataFrames, and retrieval model data.
    """

    def from_huggingface(
        self,
        dataset_name: str,
        *args,
        input_keys: tuple = (),
        fields: tuple = None,
        **kwargs
    ):
        """
        Load dataset from HuggingFace Hub.

        Args:
            dataset_name (str): HuggingFace dataset identifier
            *args: Additional arguments for load_dataset
            input_keys (tuple): Field names to mark as inputs
            fields (tuple | None): Specific fields to extract (default: all)
            **kwargs: Additional arguments (split, name, etc.)

        Returns:
            dict of split_name -> list[Example] or list[Example]
        """
        pass

    def from_csv(
        self,
        file_path: str,
        fields: list = None,
        input_keys: tuple = ()
    ) -> list:
        """
        Load dataset from CSV file.

        Args:
            file_path (str): Path to CSV file
            fields (list | None): Columns to include (default: all)
            input_keys (tuple): Field names to mark as inputs

        Returns:
            List of Example instances
        """
        pass

    def from_json(
        self,
        file_path: str,
        fields: list = None,
        input_keys: tuple = ()
    ) -> list:
        """
        Load dataset from JSON file.

        Args:
            file_path (str): Path to JSON file
            fields (list | None): Fields to include (default: all)
            input_keys (tuple): Field names to mark as inputs

        Returns:
            List of Example instances
        """
        pass

    def from_parquet(
        self,
        file_path: str,
        fields: list = None,
        input_keys: tuple = ()
    ) -> list:
        """
        Load dataset from Parquet file.

        Args:
            file_path (str): Path to Parquet file
            fields (list | None): Fields to include (default: all)
            input_keys (tuple): Field names to mark as inputs

        Returns:
            List of Example instances
        """
        pass

    def from_pandas(
        self,
        df,
        fields: list = None,
        input_keys: tuple = ()
    ) -> list:
        """
        Load dataset from pandas DataFrame.

        Args:
            df (pd.DataFrame): pandas DataFrame
            fields (list | None): Columns to include (default: all)
            input_keys (tuple): Field names to mark as inputs

        Returns:
            List of Example instances
        """
        pass

    def from_rm(
        self,
        num_samples: int,
        fields: list,
        input_keys: list
    ) -> list:
        """
        Load dataset from configured retrieval model.

        Requires dspy.configure(rm=...) to be set with a retrieval
        model that supports get_objects().

        Args:
            num_samples (int): Number of samples to retrieve
            fields (list): Fields to extract from objects
            input_keys (list): Field names to mark as inputs

        Returns:
            List of Example instances
        """
        pass

    def sample(
        self,
        dataset: list,
        n: int,
        *args,
        **kwargs
    ) -> list:
        """
        Randomly sample examples from dataset.

        Args:
            dataset (list[Example]): Dataset to sample from
            n (int): Number of samples
            *args: Additional arguments for random.sample
            **kwargs: Additional keyword arguments

        Returns:
            List of n sampled Examples
        """
        pass

    def train_test_split(
        self,
        dataset: list,
        train_size: int | float = 0.75,
        test_size: int | float = None,
        random_state: int = None
    ):
        """
        Split dataset into train and test sets.

        Args:
            dataset (list[Example]): Dataset to split
            train_size (int | float): Number or proportion for train (default: 0.75)
            test_size (int | float | None): Number or proportion for test
            random_state (int | None): Random seed for reproducibility

        Returns:
            dict with 'train' and 'test' keys containing Example lists
        """
        pass

Usage:

import dspy
from dspy.datasets import DataLoader

loader = DataLoader()

# Load from HuggingFace
data = loader.from_huggingface(
    "squad",
    split="train",
    input_keys=("question", "context")
)
# A single split was requested, so `data` is used here as a plain list of
# Examples and is not indexed by split name.
print(len(data))  # Number of Examples in the train split

# Load from CSV
csv_data = loader.from_csv(
    "data.csv",
    fields=["question", "answer"],
    input_keys=("question",)
)

# Load from JSON
json_data = loader.from_json(
    "data.json",
    input_keys=("query",)
)

# Load from pandas
import pandas as pd
df = pd.read_csv("data.csv")
df_data = loader.from_pandas(
    df,
    fields=["text", "label"],
    input_keys=("text",)
)

# Split data
splits = loader.train_test_split(
    json_data,
    train_size=0.8,
    random_state=42
)
train = splits["train"]
test = splits["test"]

# Sample subset
sample = loader.sample(train, n=100)

HotPotQA Dataset

Multi-hop question answering dataset with supporting facts.

class HotPotQA(Dataset):
    """
    Multi-hop question answering dataset (HotPotQA).

    Loads the fullwiki variant of HotPotQA, restricted to hard examples.
    Each example carries a question, its answer, and supporting facts.
    """

    def __init__(
        self,
        *args,
        only_hard_examples: bool = True,
        keep_details: str | bool = "dev_titles",
        unofficial_dev: bool = True,
        **kwargs
    ):
        """
        Set up the HotPotQA dataset.

        Args:
            *args: Positional arguments (presumably forwarded to the Dataset
                base class; confirm against the implementation)
            only_hard_examples (bool): Restrict to hard-difficulty examples
                (documented as required; default: True)
            keep_details (str | bool): How much of each record to keep
                (default: "dev_titles"):
                - True: all fields (id, question, answer, type,
                  supporting_facts, context)
                - "dev_titles": question, answer, gold_titles
                - False: question and answer only
            unofficial_dev (bool): Carve the dev split out of the train split
                (default: True)
            **kwargs: Arguments for the Dataset base class
        """
        pass

Usage:

from dspy.datasets import HotPotQA

# Load HotPotQA with default settings
dataset = HotPotQA(
    train_seed=1,
    train_size=100,
    eval_seed=2023,
    dev_size=500
)

# Access examples
train_examples = dataset.train
dev_examples = dataset.dev

print(train_examples[0].question)
print(train_examples[0].answer)

# With gold titles for evaluation
dataset_with_titles = HotPotQA(
    train_size=100,
    dev_size=500,
    keep_details="dev_titles"
)
# Read from the dataset built just above, not from the earlier `dev_examples`
print(dataset_with_titles.dev[0].gold_titles)  # Set of supporting document titles

MATH Dataset

Mathematical reasoning dataset with step-by-step solutions.

class MATH(Dataset):
    """
    Mathematical problem-solving dataset (MATH).

    Holds math problems spanning several categories and difficulty
    levels, each paired with a detailed solution.
    """

    def __init__(self, *args, **kwargs):
        """
        Set up the MATH dataset.

        Args:
            *args: Positional arguments (presumably forwarded to the Dataset
                base class; confirm against the implementation)
            **kwargs: Arguments for the Dataset base class (train_size, etc.)
        """
        pass

Usage:

from dspy.datasets import MATH

# Build the MATH dataset, capping the train and dev split sizes
dataset = MATH(train_size=1000, dev_size=500)

# Show the problem and the solution of the first training example
train_examples = dataset.train
first_example = train_examples[0]
print(first_example.problem)
print(first_example.solution)

Colors Dataset

Simple dataset for testing and demonstration purposes.

class Colors(Dataset):
    """
    Small color-themed dataset for tests and examples.

    A simple collection of color-related examples, intended for
    demonstrating and testing DSPy functionality.
    """

    def __init__(self, *args, **kwargs):
        """
        Set up the Colors dataset.

        Args:
            *args: Positional arguments (presumably forwarded to the Dataset
                base class; confirm against the implementation)
            **kwargs: Arguments for the Dataset base class
        """
        pass

Usage:

from dspy.datasets import Colors

# Build a small Colors dataset for testing
size_limits = {"train_size": 20, "dev_size": 10}
dataset = Colors(**size_limits)

# Print every training example
for train_example in dataset.train:
    print(train_example)

Dataset Patterns

Creating Custom Datasets

Extend the Dataset class for custom data:

import json

import dspy
from dspy.datasets import Dataset

class CustomQADataset(Dataset):
    """Question-answering dataset read from one JSON file and split 70/15/15."""

    def __init__(self, data_path, **kwargs):
        """
        Load records from a JSON file and assign them to the three splits.

        Args:
            data_path: Path to a JSON file holding a sequence of records
            **kwargs: Arguments for the Dataset base class (train_size, etc.)
        """
        super().__init__(**kwargs)

        # Load raw data (explicit encoding keeps the read platform-independent)
        with open(data_path, encoding="utf-8") as f:
            data = json.load(f)

        # Split into train/dev/test: first 70% / next 15% / remaining 15%
        total = len(data)
        train_end = int(total * 0.7)
        dev_end = int(total * 0.85)
        self._train = data[:train_end]
        self._dev = data[train_end:dev_end]
        self._test = data[dev_end:]

        # Set input keys
        self.input_keys = ["question", "context"]

# Use custom dataset
dataset = CustomQADataset(
    "my_data.json",
    train_size=500,
    dev_size=100
)

Loading Multiple Formats

Combine data from different sources:

from dspy.datasets import DataLoader

loader = DataLoader()

# Both sources mark the same field as the input
question_input = ("question",)

# Read the training data from a CSV file and from a JSON file
train_csv = loader.from_csv("train.csv", input_keys=question_input)
train_json = loader.from_json("train.json", input_keys=question_input)

# Concatenate the two example lists
train_data = train_csv + train_json

# Split the combined data into train and test portions
splits = loader.train_test_split(train_data, train_size=0.8, random_state=42)

Cross-Validation Setup

Use prepare_by_seed for k-fold-like experiments:

import dspy
from dspy.datasets import HotPotQA

# NOTE: `program` (the DSPy module being optimized) and `my_metric` (the
# scoring function) are placeholders; define them before running this snippet.

# Prepare 5 different train/eval splits
splits = HotPotQA.prepare_by_seed(
    train_seeds=[1, 2, 3, 4, 5],
    train_size=16,
    dev_size=1000,
    divide_eval_per_seed=True
)

# Train and evaluate on each split.
# prepare_by_seed is documented to return a dict, so index it by key; the loop
# names avoid shadowing the builtin `eval`.
split_pairs = zip(splits["train_sets"], splits["eval_sets"])
for i, (train_set, eval_set) in enumerate(split_pairs, start=1):
    optimizer = dspy.BootstrapFewShot(metric=my_metric)
    compiled = optimizer.compile(program, trainset=train_set)

    evaluator = dspy.Evaluate(devset=eval_set, metric=my_metric)
    score = evaluator(compiled)
    print(f"Split {i}: {score}")

Efficient Data Loading

Load only needed fields for efficiency:

from dspy.datasets import DataLoader

loader = DataLoader()

# Request only the fields that are needed from HuggingFace;
# this reduces memory usage for large datasets
wanted_fields = ("question", "context", "answers")
data = loader.from_huggingface(
    "squad",
    split="train",
    fields=wanted_fields,
    input_keys=("question", "context")
)