or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

absa.md, core-model-training.md, data-utilities.md, index.md, knowledge-distillation.md, model-cards.md, model-export.md

docs/data-utilities.md

0

# Data Utilities

1

2

Dataset preparation, sampling, and templating utilities for few-shot learning scenarios. These functions help create balanced training sets, generate synthetic examples, and prepare data in the format expected by SetFit models.

3

4

## Capabilities

5

6

### Dataset Sampling

7

8

Create balanced few-shot datasets by sampling equal numbers of examples per class.

9

10

```python { .api }

11

def sample_dataset(

12

dataset: Dataset,

13

label_column: str = "label",

14

num_samples: int = 8,

15

seed: int = 42

16

) -> Dataset:

17

"""

18

Sample a Dataset to create an equal number of samples per class.

19

20

Parameters:

21

- dataset: HuggingFace Dataset to sample from

22

- label_column: Name of the column containing labels

23

- num_samples: Number of samples per class to select

24

- seed: Random seed for reproducible sampling

25

26

Returns:

27

New Dataset with balanced samples per class

28

"""

29

30

def create_samples(

31

df: "pd.DataFrame",

32

sample_size: int,

33

seed: int = 42

34

) -> "pd.DataFrame":

35

"""

36

Sample DataFrame with equal samples per class.

37

38

Parameters:

39

- df: Input pandas DataFrame

40

- sample_size: Number of samples per class

41

- seed: Random seed for reproducibility

42

43

Returns:

44

Sampled DataFrame with balanced classes

45

"""

46

47

def create_samples_multilabel(

48

df: "pd.DataFrame",

49

sample_size: int,

50

seed: int = 42

51

) -> "pd.DataFrame":

52

"""

53

Sample DataFrame for multilabel classification scenarios.

54

55

Parameters:

56

- df: Input pandas DataFrame with multilabel targets

57

- sample_size: Number of samples to select

58

- seed: Random seed for reproducibility

59

60

Returns:

61

Sampled DataFrame for multilabel training

62

"""

63

```

64

65

### Few-Shot Split Creation

66

67

Generate training splits with different sample sizes for few-shot learning experiments.

68

69

```python { .api }

70

def create_fewshot_splits(

71

dataset: Dataset,

72

sample_sizes: List[int] = [2, 4, 8, 16, 32, 64],

73

add_data_augmentation: bool = False,

74

dataset_name: Optional[str] = None

75

) -> DatasetDict:

76

"""

77

Create training splits with equal samples per class for different shot sizes.

78

79

Parameters:

80

- dataset: Source dataset to create splits from

81

- sample_sizes: List of sample sizes to create splits for

82

- add_data_augmentation: Whether to add data augmentation

83

- dataset_name: Name of the dataset for tracking

84

85

Returns:

86

DatasetDict with splits for each sample size

87

"""

88

89

def create_fewshot_splits_multilabel(

90

dataset: Dataset,

91

sample_sizes: List[int] = [2, 4, 8, 16]

92

) -> DatasetDict:

93

"""

94

Create multilabel training splits with different sample sizes.

95

96

Parameters:

97

- dataset: Source multilabel dataset

98

- sample_sizes: List of sample sizes to create splits for

99

100

Returns:

101

DatasetDict with multilabel splits for each sample size

102

"""

103

```

104

105

### Templated Dataset Generation

106

107

Generate synthetic training examples using templates and candidate labels.

108

109

```python { .api }

110

def get_templated_dataset(

111

dataset: Optional[Dataset] = None,

112

candidate_labels: Optional[List[str]] = None,

113

reference_dataset: Optional[str] = None,

114

template: str = "This example is {}",

115

sample_size: int = 2,

116

text_column: str = "text",

117

label_column: str = "label",

118

multi_label: bool = False,

119

label_names_column: str = "label_text"

120

) -> Dataset:

121

"""

122

Create templated examples for a reference dataset or reference labels.

123

124

Parameters:

125

- dataset: Source dataset to template (optional)

126

- candidate_labels: List of label names to create templates for

127

- reference_dataset: Name of reference dataset to use

128

- template: Template string with {} placeholder for label

129

- sample_size: Number of examples per label to generate

130

- text_column: Name of text column in dataset

131

- label_column: Name of label column in dataset

132

- multi_label: Whether this is multi-label classification

133

- label_names_column: Column containing label names

134

135

Returns:

136

Dataset with templated examples

137

"""

138

139

def get_candidate_labels(

140

dataset_name: str,

141

label_names_column: str = "label_text"

142

) -> List[str]:

143

"""

144

Extract candidate labels from a dataset.

145

146

Parameters:

147

- dataset_name: Name of the dataset to extract labels from

148

- label_names_column: Column containing label names

149

150

Returns:

151

List of unique label names

152

"""

153

```

154

155

### SetFit Dataset Class

156

157

Custom dataset class for training differentiable heads with PyTorch.

158

159

```python { .api }

160

class SetFitDataset:

161

def __init__(

162

self,

163

x: List[str],

164

y: Union[List[int], List[List[int]]],

165

tokenizer: "PreTrainedTokenizerBase",

166

max_length: int = 512

167

):

168

"""

169

Dataset for training differentiable head on text classification.

170

171

Parameters:

172

- x: List of input texts

173

- y: List of labels (integers for single-label, lists for multi-label)

174

- tokenizer: HuggingFace tokenizer for text processing

175

- max_length: Maximum sequence length for tokenization

176

"""

177

178

def __len__(self) -> int:

179

"""Return the number of examples in the dataset."""

180

181

def __getitem__(self, index: int) -> Dict[str, Any]:

182

"""

183

Get a single example from the dataset.

184

185

Parameters:

186

- index: Index of the example to retrieve

187

188

Returns:

189

Dictionary with input_ids, attention_mask, and labels

190

"""

191

192

@staticmethod

193

def collate_fn(batch: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:

194

"""

195

Collate function for batching examples.

196

197

Parameters:

198

- batch: List of examples from __getitem__

199

200

Returns:

201

Batched tensors ready for model input

202

"""

203

```

204

205

206

## Constants

207

208

```python { .api }

209

# Default seeds for reproducible sampling

210

SEEDS: List[int] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

211

212

# Default sample sizes for few-shot experiments

213

SAMPLE_SIZES: List[int] = [2, 4, 8, 16, 32, 64]

214

215

# Type alias for tokenizer output

216

TokenizerOutput = Dict[str, List[int]]

217

```

218

219

## Usage Examples

220

221

### Creating Balanced Few-Shot Datasets

222

223

```python

224

from setfit import sample_dataset

225

from datasets import load_dataset

226

227

# Load a dataset

228

dataset = load_dataset("emotion", split="train")

229

print(f"Original dataset size: {len(dataset)}")

230

231

# Create balanced 8-shot dataset

232

few_shot_dataset = sample_dataset(

233

dataset=dataset,

234

label_column="label",

235

num_samples=8,

236

seed=42

237

)

238

239

print(f"Few-shot dataset size: {len(few_shot_dataset)}")

240

print(f"Samples per class: 8")

241

242

# Check distribution

243

from collections import Counter

244

label_dist = Counter(few_shot_dataset["label"])

245

print(f"Label distribution: {dict(label_dist)}")

246

```

247

248

### Generating Multiple Training Splits

249

250

```python

251

from setfit import create_fewshot_splits

252

from datasets import load_dataset

253

254

# Load dataset

255

dataset = load_dataset("imdb", split="train")

256

257

# Create multiple few-shot splits

258

splits = create_fewshot_splits(

259

dataset=dataset,

260

sample_sizes=[2, 4, 8, 16, 32],

261

add_data_augmentation=False

262

)

263

264

print(f"Created splits: {list(splits.keys())}")

265

for split_name, split_data in splits.items():

266

print(f"{split_name}: {len(split_data)} examples")

267

268

# Use a specific split for training

269

train_2_shot = splits["train-2"]

270

print(f"2-shot training set: {len(train_2_shot)} examples")

271

```

272

273

### Creating Templated Datasets

274

275

```python

276

from setfit import get_templated_dataset

277

278

# Create templated dataset from labels

279

candidate_labels = ["positive", "negative", "neutral"]

280

281

templated_dataset = get_templated_dataset(

282

candidate_labels=candidate_labels,

283

template="This text expresses {} sentiment.",

284

sample_size=4

285

)

286

287

print("Generated templated examples:")

288

for i, example in enumerate(templated_dataset):

289

print(f"{i}: {example['text']} -> {example['label']}")

290

```

291

292

### Using SetFitDataset for PyTorch Training

293

294

```python

295

from setfit import SetFitDataset

296

from transformers import AutoTokenizer

297

import torch

298

299

# Prepare data

300

texts = ["I love this!", "This is terrible.", "Amazing work!", "Not good."]

301

labels = [1, 0, 1, 0]

302

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

303

304

# Create dataset

305

dataset = SetFitDataset(

306

x=texts,

307

y=labels,

308

tokenizer=tokenizer,

309

max_length=256

310

)

311

312

# Create dataloader

313

dataloader = torch.utils.data.DataLoader(

314

dataset,

315

batch_size=2,

316

collate_fn=dataset.collate_fn,

317

shuffle=True

318

)

319

320

# Iterate through batches

321

for batch in dataloader:

322

print(f"Batch shape: input_ids={batch['input_ids'].shape}")

323

print(f"Labels: {batch['labels']}")

324

break

325

```

326

327

### Multi-label Dataset Handling

328

329

```python

330

from setfit import create_fewshot_splits_multilabel, create_samples_multilabel

331

from datasets import Dataset

332

import pandas as pd

333

334

# Create multi-label dataset

335

data = {

336

"text": ["Great action movie", "Romantic comedy", "Scary thriller", "Funny drama"],

337

"labels": [[1, 0, 0], [0, 1, 1], [0, 0, 1], [1, 1, 0]] # [action, comedy, drama]

338

}

339

dataset = Dataset.from_dict(data)

340

341

# Create few-shot splits for multi-label

342

ml_splits = create_fewshot_splits_multilabel(

343

dataset=dataset,

344

sample_sizes=[2, 4]

345

)

346

347

print(f"Multi-label splits: {list(ml_splits.keys())}")

348

349

# Or use with pandas DataFrame

350

df = pd.DataFrame(data)

351

sampled_df = create_samples_multilabel(df, sample_size=2, seed=42)

352

print(f"Sampled DataFrame shape: {sampled_df.shape}")

353

```

354

355

### Benchmarking Different Sample Sizes

356

357

```python

358

from setfit import SetFitModel, SetFitTrainer, TrainingArguments, create_fewshot_splits

359

from datasets import load_dataset

360

import numpy as np

361

362

# Load dataset

363

dataset = load_dataset("SetFit/sst2", split="train")

364

test_dataset = load_dataset("SetFit/sst2", split="test")

365

366

# Create multiple training splits

367

splits = create_fewshot_splits(

368

dataset=dataset,

369

sample_sizes=[2, 4, 8, 16],

370

add_data_augmentation=False

371

)

372

373

# Benchmark performance across sample sizes

374

results = {}

375

model_name = "sentence-transformers/all-MiniLM-L6-v2"

376

377

for split_name, train_split in splits.items():

378

print(f"\nTraining on {split_name}...")

379

380

# Initialize fresh model for each experiment

381

model = SetFitModel.from_pretrained(model_name)

382

383

args = TrainingArguments(

384

batch_size=16,

385

num_epochs=4,

386

eval_strategy="epoch"

387

)

388

389

trainer = SetFitTrainer(

390

model=model,

391

args=args,

392

train_dataset=train_split,

393

eval_dataset=test_dataset

394

)

395

396

trainer.train()

397

eval_results = trainer.evaluate()

398

399

results[split_name] = eval_results["eval_accuracy"]

400

print(f"{split_name} accuracy: {eval_results['eval_accuracy']:.3f}")

401

402

print("\nFinal Results:")

403

for split_name, accuracy in results.items():

404

print(f"{split_name}: {accuracy:.3f}")

405

```