Dataset loading and management utilities for DSPy. Provides base classes for creating custom datasets and loaders for various data formats including HuggingFace, CSV, JSON, and Parquet.
Base class for creating structured train/dev/test splits with seed control and sampling.
class Dataset:
"""
Base class for structured datasets with train/dev/test splits.
Provides automatic shuffling, sampling, and Example conversion with
reproducible seeding for train/dev/test splits.
"""
def __init__(
self,
train_seed: int = 0,
train_size: int = None,
eval_seed: int = 0,
dev_size: int = None,
test_size: int = None,
input_keys: list = None
):
"""
Initialize dataset with split configuration.
Args:
train_seed (int): Random seed for train split (default: 0)
train_size (int | None): Number of training examples
eval_seed (int): Random seed for dev/test splits (default: 0)
dev_size (int | None): Number of dev examples
test_size (int | None): Number of test examples
input_keys (list | None): Fields to mark as inputs in Examples
"""
pass
@property
def train(self) -> list:
"""
Get training split.
Returns:
List of Example instances for training
"""
pass
@property
def dev(self) -> list:
"""
Get development/validation split.
Returns:
List of Example instances for validation
"""
pass
@property
def test(self) -> list:
"""
Get test split.
Returns:
List of Example instances for testing
"""
pass
def reset_seeds(
self,
train_seed: int = None,
train_size: int = None,
eval_seed: int = None,
dev_size: int = None,
test_size: int = None
):
"""
Reset random seeds and resample splits.
Args:
train_seed (int | None): New train seed
train_size (int | None): New train size
eval_seed (int | None): New eval seed
dev_size (int | None): New dev size
test_size (int | None): New test size
"""
pass
@classmethod
def prepare_by_seed(
cls,
train_seeds: list = None,
train_size: int = 16,
dev_size: int = 1000,
divide_eval_per_seed: bool = True,
eval_seed: int = 2023,
**kwargs
):
"""
Prepare multiple train/eval splits with different seeds.
Useful for cross-validation and multi-seed experiments.
Args:
train_seeds (list | None): List of training seeds (default: [1,2,3,4,5])
train_size (int): Training examples per seed (default: 16)
dev_size (int): Total dev examples (default: 1000)
divide_eval_per_seed (bool): Divide eval set among seeds (default: True)
eval_seed (int): Seed for eval set (default: 2023)
**kwargs: Additional dataset-specific arguments
Returns:
dict-like object holding 'train_sets' and 'eval_sets' lists (the usage examples read them as attributes, e.g. splits.train_sets)
"""
pass

Usage:
import dspy
from dspy.datasets import Dataset
# Create custom dataset
class MyDataset(Dataset):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# Load your raw data
self._train = [{"question": "Q1", "answer": "A1"}, ...]
self._dev = [{"question": "Q2", "answer": "A2"}, ...]
self._test = [{"question": "Q3", "answer": "A3"}, ...]
# Initialize with size limits and seeds
dataset = MyDataset(
train_seed=42,
train_size=100,
eval_seed=2023,
dev_size=50,
test_size=50,
input_keys=["question"]
)
# Access splits (automatically converted to Examples)
train_data = dataset.train # List of 100 Examples
dev_data = dataset.dev # List of 50 Examples
test_data = dataset.test # List of 50 Examples
# Reset and resample
dataset.reset_seeds(train_seed=43, train_size=200)
new_train = dataset.train # Different 200 examples
# Multi-seed preparation for experiments
splits = MyDataset.prepare_by_seed(
train_seeds=[1, 2, 3, 4, 5],
train_size=16,
dev_size=1000
)
for i, (train, eval) in enumerate(zip(splits.train_sets, splits.eval_sets)):
print(f"Seed {i+1}: {len(train)} train, {len(eval)} eval")

Flexible data loading from multiple sources including HuggingFace, CSV, JSON, Parquet, pandas DataFrames, and retrieval models.
class DataLoader:
"""
Universal data loader for various file formats and sources.
Supports HuggingFace datasets, CSV, JSON, Parquet files,
pandas DataFrames, and retrieval model data.
"""
def from_huggingface(
self,
dataset_name: str,
*args,
input_keys: tuple = (),
fields: tuple = None,
**kwargs
):
"""
Load dataset from HuggingFace Hub.
Args:
dataset_name (str): HuggingFace dataset identifier
*args: Additional arguments for load_dataset
input_keys (tuple): Field names to mark as inputs
fields (tuple | None): Specific fields to extract (default: all)
**kwargs: Additional arguments (split, name, etc.)
Returns:
dict of split_name -> list[Example] when several splits are loaded, or a plain list[Example] when a single split is requested
"""
pass
def from_csv(
self,
file_path: str,
fields: list = None,
input_keys: tuple = ()
) -> list:
"""
Load dataset from CSV file.
Args:
file_path (str): Path to CSV file
fields (list | None): Columns to include (default: all)
input_keys (tuple): Field names to mark as inputs
Returns:
List of Example instances
"""
pass
def from_json(
self,
file_path: str,
fields: list = None,
input_keys: tuple = ()
) -> list:
"""
Load dataset from JSON file.
Args:
file_path (str): Path to JSON file
fields (list | None): Fields to include (default: all)
input_keys (tuple): Field names to mark as inputs
Returns:
List of Example instances
"""
pass
def from_parquet(
self,
file_path: str,
fields: list = None,
input_keys: tuple = ()
) -> list:
"""
Load dataset from Parquet file.
Args:
file_path (str): Path to Parquet file
fields (list | None): Fields to include (default: all)
input_keys (tuple): Field names to mark as inputs
Returns:
List of Example instances
"""
pass
def from_pandas(
self,
df,
fields: list = None,
input_keys: tuple = ()
) -> list:
"""
Load dataset from pandas DataFrame.
Args:
df (pd.DataFrame): pandas DataFrame
fields (list | None): Columns to include (default: all)
input_keys (tuple): Field names to mark as inputs
Returns:
List of Example instances
"""
pass
def from_rm(
self,
num_samples: int,
fields: list,
input_keys: list
) -> list:
"""
Load dataset from configured retrieval model.
Requires dspy.configure(rm=...) to be set with a retrieval
model that supports get_objects().
Args:
num_samples (int): Number of samples to retrieve
fields (list): Fields to extract from objects
input_keys (list): Field names to mark as inputs
Returns:
List of Example instances
"""
pass
def sample(
self,
dataset: list,
n: int,
*args,
**kwargs
) -> list:
"""
Randomly sample examples from dataset.
Args:
dataset (list[Example]): Dataset to sample from
n (int): Number of samples
*args: Additional arguments for random.sample
**kwargs: Additional keyword arguments
Returns:
List of n sampled Examples
"""
pass
def train_test_split(
self,
dataset: list,
train_size: int | float = 0.75,
test_size: int | float = None,
random_state: int = None
):
"""
Split dataset into train and test sets.
Args:
dataset (list[Example]): Dataset to split
train_size (int | float): Number or proportion for train (default: 0.75)
test_size (int | float | None): Number or proportion for test
random_state (int | None): Random seed for reproducibility
Returns:
dict with 'train' and 'test' keys containing Example lists
"""
pass

Usage:
import dspy
from dspy.datasets import DataLoader
loader = DataLoader()
# Load from HuggingFace
# Requesting a single split (split="train") yields a flat list of Examples
# rather than a dict keyed by split name, so the result is used directly.
data = loader.from_huggingface(
    "squad",
    split="train",
    input_keys=("question", "context")
)
print(len(data))  # Number of Examples in the train split
# Load from CSV
csv_data = loader.from_csv(
"data.csv",
fields=["question", "answer"],
input_keys=("question",)
)
# Load from JSON
json_data = loader.from_json(
"data.json",
input_keys=("query",)
)
# Load from pandas
import pandas as pd
df = pd.read_csv("data.csv")
df_data = loader.from_pandas(
df,
fields=["text", "label"],
input_keys=("text",)
)
# Split data
splits = loader.train_test_split(
json_data,
train_size=0.8,
random_state=42
)
train = splits["train"]
test = splits["test"]
# Sample subset
sample = loader.sample(train, n=100)

Multi-hop question answering dataset with supporting facts.
class HotPotQA(Dataset):
"""
HotPotQA multi-hop question answering dataset.
Loads the HotPotQA fullwiki dataset with hard examples only.
Includes question, answer, and supporting facts.
"""
def __init__(
self,
*args,
only_hard_examples: bool = True,
keep_details: str | bool = "dev_titles",
unofficial_dev: bool = True,
**kwargs
):
"""
Initialize HotPotQA dataset.
Args:
only_hard_examples (bool): Use only hard difficulty examples (must be True; the dataset is loaded with hard examples only)
keep_details (str | bool): Level of detail to keep:
- True: Keep all fields (id, question, answer, type, supporting_facts, context)
- "dev_titles": Keep question, answer, gold_titles
- False: Keep only question, answer
unofficial_dev (bool): Create dev split from train split (default: True)
**kwargs: Arguments for Dataset base class
"""
pass

Usage:
from dspy.datasets import HotPotQA
# Load HotPotQA with default settings
dataset = HotPotQA(
train_seed=1,
train_size=100,
eval_seed=2023,
dev_size=500
)
# Access examples
train_examples = dataset.train
dev_examples = dataset.dev
print(train_examples[0].question)
print(train_examples[0].answer)
# With gold titles for evaluation
dataset_with_titles = HotPotQA(
train_size=100,
dev_size=500,
keep_details="dev_titles"
)
print(dev_examples[0].gold_titles) # Set of supporting document titles

Mathematical reasoning dataset with step-by-step solutions.
class MATH(Dataset):
"""
MATH dataset for mathematical problem solving.
Contains math problems across various difficulty levels and
categories with detailed solutions.
"""
def __init__(self, *args, **kwargs):
"""
Initialize MATH dataset.
Args:
**kwargs: Arguments for Dataset base class (train_size, etc.)
"""
pass

Usage:
from dspy.datasets import MATH
# Load MATH dataset
dataset = MATH(
train_size=1000,
dev_size=500
)
train_examples = dataset.train
print(train_examples[0].problem)
print(train_examples[0].solution)

Simple dataset for testing and demonstration purposes.
class Colors(Dataset):
"""
Colors dataset for testing and examples.
Simple dataset with color-related examples, useful for
demonstration and testing DSPy functionality.
"""
def __init__(self, *args, **kwargs):
"""
Initialize Colors dataset.
Args:
**kwargs: Arguments for Dataset base class
"""
pass

Usage:
from dspy.datasets import Colors
# Load Colors dataset for testing
dataset = Colors(train_size=20, dev_size=10)
for example in dataset.train:
print(example)

Extend the Dataset class for custom data:
import dspy
from dspy.datasets import Dataset
class CustomQADataset(Dataset):
    """
    Example question-answering dataset backed by a single JSON file.
    The loaded records are split 70% / 15% / 15% into train / dev / test,
    in file order.
    """
    def __init__(self, data_path, **kwargs):
        """
        Load the raw records and assign the train/dev/test splits.
        Args:
            data_path (str): Path to the JSON file; its top-level value must
                be sliceable (presumably a JSON array of records)
            **kwargs: Arguments for Dataset base class (train_size, etc.)
        """
        # Imported here so the snippet runs without relying on a module-level
        # `import json` that the surrounding example does not provide.
        import json

        super().__init__(**kwargs)
        # Load raw data; an explicit encoding avoids locale-dependent decoding
        with open(data_path, encoding="utf-8") as f:
            data = json.load(f)
        # Split into train/dev/test (boundaries computed once and reused)
        total = len(data)
        train_end = int(total * 0.7)
        dev_end = int(total * 0.85)
        self._train = data[:train_end]
        self._dev = data[train_end:dev_end]
        self._test = data[dev_end:]
        # Set input keys
        self.input_keys = ["question", "context"]
# Use custom dataset
dataset = CustomQADataset(
"my_data.json",
train_size=500,
dev_size=100
)

Combine data from different sources:
from dspy.datasets import DataLoader
loader = DataLoader()
# Load from multiple sources
train_csv = loader.from_csv("train.csv", input_keys=("question",))
train_json = loader.from_json("train.json", input_keys=("question",))
# Combine
train_data = train_csv + train_json
# Split combined data
splits = loader.train_test_split(
train_data,
train_size=0.8,
random_state=42
)

Use prepare_by_seed for k-fold-like experiments:
from dspy.datasets import HotPotQA
# Prepare 5 different train/eval splits
splits = HotPotQA.prepare_by_seed(
train_seeds=[1, 2, 3, 4, 5],
train_size=16,
dev_size=1000,
divide_eval_per_seed=True
)
# Train and evaluate on each split
for i, (train, eval) in enumerate(zip(splits.train_sets, splits.eval_sets)):
optimizer = dspy.BootstrapFewShot(metric=my_metric)
compiled = optimizer.compile(program, trainset=train)
evaluator = dspy.Evaluate(devset=eval, metric=my_metric)
score = evaluator(compiled)
print(f"Split {i+1}: {score}")

Load only needed fields for efficiency:
from dspy.datasets import DataLoader
loader = DataLoader()
# Load only specific fields from HuggingFace
data = loader.from_huggingface(
"squad",
split="train",
fields=("question", "context", "answers"), # Only these fields
input_keys=("question", "context")
)
# Reduces memory usage for large datasets