# Data Utilities

Dataset preparation, sampling, and templating utilities for few-shot learning scenarios. These functions help create balanced training sets, generate synthetic examples, and prepare data in the format expected by SetFit models.

## Capabilities

### Dataset Sampling

Create balanced few-shot datasets by sampling equal numbers of examples per class.

```python { .api }
def sample_dataset(
    dataset: Dataset,
    label_column: str = "label",
    num_samples: int = 8,
    seed: int = 42
) -> Dataset:
    """
    Sample a Dataset to create an equal number of samples per class.

    Parameters:
    - dataset: HuggingFace Dataset to sample from
    - label_column: Name of the column containing labels
    - num_samples: Number of samples per class to select
    - seed: Random seed for reproducible sampling

    Returns:
    New Dataset with balanced samples per class
    """

def create_samples(
    df: "pd.DataFrame",
    sample_size: int,
    seed: int = 42
) -> "pd.DataFrame":
    """
    Sample DataFrame with equal samples per class.

    Parameters:
    - df: Input pandas DataFrame
    - sample_size: Number of samples per class
    - seed: Random seed for reproducibility

    Returns:
    Sampled DataFrame with balanced classes
    """

def create_samples_multilabel(
    df: "pd.DataFrame",
    sample_size: int,
    seed: int = 42
) -> "pd.DataFrame":
    """
    Sample DataFrame for multilabel classification scenarios.

    Parameters:
    - df: Input pandas DataFrame with multilabel targets
    - sample_size: Number of samples to select
    - seed: Random seed for reproducibility

    Returns:
    Sampled DataFrame for multilabel training
    """
```

### Few-Shot Split Creation

Generate training splits with different sample sizes for few-shot learning experiments.

```python { .api }
def create_fewshot_splits(
    dataset: Dataset,
    sample_sizes: List[int] = [2, 4, 8, 16, 32, 64],
    add_data_augmentation: bool = False,
    dataset_name: Optional[str] = None
) -> DatasetDict:
    """
    Create training splits with equal samples per class for different shot sizes.

    Parameters:
    - dataset: Source dataset to create splits from
    - sample_sizes: List of sample sizes to create splits for
    - add_data_augmentation: Whether to add data augmentation
    - dataset_name: Name of the dataset for tracking

    Returns:
    DatasetDict with splits for each sample size
    """

def create_fewshot_splits_multilabel(
    dataset: Dataset,
    sample_sizes: List[int] = [2, 4, 8, 16]
) -> DatasetDict:
    """
    Create multilabel training splits with different sample sizes.

    Parameters:
    - dataset: Source multilabel dataset
    - sample_sizes: List of sample sizes to create splits for

    Returns:
    DatasetDict with multilabel splits for each sample size
    """
```

### Templated Dataset Generation

Generate synthetic training examples using templates and candidate labels.

```python { .api }
def get_templated_dataset(
    dataset: Optional[Dataset] = None,
    candidate_labels: Optional[List[str]] = None,
    reference_dataset: Optional[str] = None,
    template: str = "This example is {}",
    sample_size: int = 2,
    text_column: str = "text",
    label_column: str = "label",
    multi_label: bool = False,
    label_names_column: str = "label_text"
) -> Dataset:
    """
    Create templated examples for a reference dataset or reference labels.

    Parameters:
    - dataset: Source dataset to template (optional)
    - candidate_labels: List of label names to create templates for
    - reference_dataset: Name of reference dataset to use
    - template: Template string with {} placeholder for label
    - sample_size: Number of examples per label to generate
    - text_column: Name of text column in dataset
    - label_column: Name of label column in dataset
    - multi_label: Whether this is multi-label classification
    - label_names_column: Column containing label names

    Returns:
    Dataset with templated examples
    """

def get_candidate_labels(
    dataset_name: str,
    label_names_column: str = "label_text"
) -> List[str]:
    """
    Extract candidate labels from a dataset.

    Parameters:
    - dataset_name: Name of the dataset to extract labels from
    - label_names_column: Column containing label names

    Returns:
    List of unique label names
    """
```

### SetFit Dataset Class

Custom dataset class for training differentiable heads with PyTorch.

```python { .api }
class SetFitDataset:
    def __init__(
        self,
        x: List[str],
        y: Union[List[int], List[List[int]]],
        tokenizer: "PreTrainedTokenizerBase",
        max_length: int = 512
    ):
        """
        Dataset for training differentiable head on text classification.

        Parameters:
        - x: List of input texts
        - y: List of labels (integers for single-label, lists for multi-label)
        - tokenizer: HuggingFace tokenizer for text processing
        - max_length: Maximum sequence length for tokenization
        """

    def __len__(self) -> int:
        """Return the number of examples in the dataset."""

    def __getitem__(self, index: int) -> Dict[str, Any]:
        """
        Get a single example from the dataset.

        Parameters:
        - index: Index of the example to retrieve

        Returns:
        Dictionary with input_ids, attention_mask, and labels
        """

    @staticmethod
    def collate_fn(batch: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """
        Collate function for batching examples.

        Parameters:
        - batch: List of examples from __getitem__

        Returns:
        Batched tensors ready for model input
        """
```


## Constants

```python { .api }
# Default seeds for reproducible sampling
SEEDS: List[int] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Default sample sizes for few-shot experiments
SAMPLE_SIZES: List[int] = [2, 4, 8, 16, 32, 64]

# Type alias for tokenizer output
TokenizerOutput = Dict[str, List[int]]
```

## Usage Examples

### Creating Balanced Few-Shot Datasets

```python
from setfit import sample_dataset
from datasets import load_dataset

# Load a dataset
dataset = load_dataset("emotion", split="train")
print(f"Original dataset size: {len(dataset)}")

# Create balanced 8-shot dataset
few_shot_dataset = sample_dataset(
    dataset=dataset,
    label_column="label",
    num_samples=8,
    seed=42
)

print(f"Few-shot dataset size: {len(few_shot_dataset)}")
print(f"Samples per class: 8")

# Check distribution
from collections import Counter
label_dist = Counter(few_shot_dataset["label"])
print(f"Label distribution: {dict(label_dist)}")
```

### Generating Multiple Training Splits

```python
from setfit import create_fewshot_splits
from datasets import load_dataset

# Load dataset
dataset = load_dataset("imdb", split="train")

# Create multiple few-shot splits
splits = create_fewshot_splits(
    dataset=dataset,
    sample_sizes=[2, 4, 8, 16, 32],
    add_data_augmentation=False
)

print(f"Created splits: {list(splits.keys())}")
for split_name, split_data in splits.items():
    print(f"{split_name}: {len(split_data)} examples")

# Use a specific split for training
# (splits are named "train-{sample_size}-{seed_index}", one per seed in SEEDS)
train_2_shot = splits["train-2-0"]
print(f"2-shot training set: {len(train_2_shot)} examples")
```

### Creating Templated Datasets

```python
from setfit import get_templated_dataset

# Create templated dataset from labels
candidate_labels = ["positive", "negative", "neutral"]

templated_dataset = get_templated_dataset(
    candidate_labels=candidate_labels,
    template="This text expresses {} sentiment.",
    sample_size=4
)

print("Generated templated examples:")
for i, example in enumerate(templated_dataset):
    print(f"{i}: {example['text']} -> {example['label']}")
```

### Using SetFitDataset for PyTorch Training

```python
from setfit import SetFitDataset
from transformers import AutoTokenizer
import torch

# Prepare data
texts = ["I love this!", "This is terrible.", "Amazing work!", "Not good."]
labels = [1, 0, 1, 0]
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Create dataset
dataset = SetFitDataset(
    x=texts,
    y=labels,
    tokenizer=tokenizer,
    max_length=256
)

# Create dataloader
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    collate_fn=dataset.collate_fn,
    shuffle=True
)

# Iterate through batches
for batch in dataloader:
    print(f"Batch shape: input_ids={batch['input_ids'].shape}")
    print(f"Labels: {batch['labels']}")
    break
```

### Multi-label Dataset Handling

```python
from setfit import create_fewshot_splits_multilabel, create_samples_multilabel
from datasets import Dataset
import pandas as pd

# Create multi-label dataset
data = {
    "text": ["Great action movie", "Romantic comedy", "Scary thriller", "Funny drama"],
    "labels": [[1, 0, 0], [0, 1, 1], [0, 0, 1], [1, 1, 0]]  # [action, comedy, drama]
}
dataset = Dataset.from_dict(data)

# Create few-shot splits for multi-label
ml_splits = create_fewshot_splits_multilabel(
    dataset=dataset,
    sample_sizes=[2, 4]
)

print(f"Multi-label splits: {list(ml_splits.keys())}")

# Or use with pandas DataFrame
df = pd.DataFrame(data)
sampled_df = create_samples_multilabel(df, sample_size=2, seed=42)
print(f"Sampled DataFrame shape: {sampled_df.shape}")
```

### Benchmarking Different Sample Sizes

```python
from setfit import SetFitModel, Trainer, TrainingArguments, create_fewshot_splits
from datasets import load_dataset
import numpy as np

# Load dataset
dataset = load_dataset("SetFit/sst2", split="train")
test_dataset = load_dataset("SetFit/sst2", split="test")

# Create multiple training splits
splits = create_fewshot_splits(
    dataset=dataset,
    sample_sizes=[2, 4, 8, 16],
    add_data_augmentation=False
)

# Benchmark performance across sample sizes
results = {}
model_name = "sentence-transformers/all-MiniLM-L6-v2"

for split_name, train_split in splits.items():
    print(f"\nTraining on {split_name}...")

    # Initialize fresh model for each experiment
    model = SetFitModel.from_pretrained(model_name)

    args = TrainingArguments(
        batch_size=16,
        num_epochs=4,
        eval_strategy="epoch"
    )

    # `Trainer` accepts `args`; the deprecated `SetFitTrainer` does not
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_split,
        eval_dataset=test_dataset
    )

    trainer.train()
    eval_results = trainer.evaluate()

    # evaluate() keys its results by metric name (default metric: "accuracy")
    results[split_name] = eval_results["accuracy"]
    print(f"{split_name} accuracy: {eval_results['accuracy']:.3f}")

print("\nFinal Results:")
for split_name, accuracy in results.items():
    print(f"{split_name}: {accuracy:.3f}")
```