Tessl Tile for pypi/setfit@1.1.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

absa.md core-model-training.md data-utilities.md index.md knowledge-distillation.md model-cards.md model-export.md

data-utilities.mddocs/

0
# Data Utilities
1

2
Dataset preparation, sampling, and templating utilities for few-shot learning scenarios. These functions help create balanced training sets, generate synthetic examples, and prepare data in the format expected by SetFit models.
3

4
## Capabilities
5

6
### Dataset Sampling
7

8
Create balanced few-shot datasets by sampling equal numbers of examples per class.
9

10
```python { .api }
11
def sample_dataset(
12
    dataset: Dataset,
13
    label_column: str = "label", 
14
    num_samples: int = 8,
15
    seed: int = 42
16
) -> Dataset:
17
    """
18
    Sample a Dataset to create an equal number of samples per class.
19

20
    Parameters:
21
    - dataset: HuggingFace Dataset to sample from
22
    - label_column: Name of the column containing labels
23
    - num_samples: Number of samples per class to select
24
    - seed: Random seed for reproducible sampling
25

26
    Returns:
27
    New Dataset with balanced samples per class
28
    """
29

30
def create_samples(
31
    df: "pd.DataFrame",
32
    sample_size: int,
33
    seed: int = 42
34
) -> "pd.DataFrame":
35
    """
36
    Sample DataFrame with equal samples per class.
37

38
    Parameters:
39
    - df: Input pandas DataFrame
40
    - sample_size: Number of samples per class
41
    - seed: Random seed for reproducibility
42

43
    Returns:
44
    Sampled DataFrame with balanced classes
45
    """
46

47
def create_samples_multilabel(
48
    df: "pd.DataFrame", 
49
    sample_size: int,
50
    seed: int = 42
51
) -> "pd.DataFrame":
52
    """
53
    Sample DataFrame for multilabel classification scenarios.
54

55
    Parameters:
56
    - df: Input pandas DataFrame with multilabel targets
57
    - sample_size: Number of samples to select
58
    - seed: Random seed for reproducibility
59

60
    Returns:
61
    Sampled DataFrame for multilabel training
62
    """
63
```
64

65
### Few-Shot Split Creation
66

67
Generate training splits with different sample sizes for few-shot learning experiments.
68

69
```python { .api }
70
def create_fewshot_splits(
71
    dataset: Dataset,
72
    sample_sizes: List[int] = [2, 4, 8, 16, 32, 64],
73
    add_data_augmentation: bool = False,
74
    dataset_name: Optional[str] = None
75
) -> DatasetDict:
76
    """
77
    Create training splits with equal samples per class for different shot sizes.
78

79
    Parameters:
80
    - dataset: Source dataset to create splits from
81
    - sample_sizes: List of sample sizes to create splits for
82
    - add_data_augmentation: Whether to add data augmentation
83
    - dataset_name: Name of the dataset; required when add_data_augmentation is True
84

85
    Returns:
86
    DatasetDict with one split per sample size and seed, keyed "train-{sample_size}-{seed_index}"
87
    """
88

89
def create_fewshot_splits_multilabel(
90
    dataset: Dataset,
91
    sample_sizes: List[int] = [2, 4, 8, 16]
92
) -> DatasetDict:
93
    """
94
    Create multilabel training splits with different sample sizes.
95

96
    Parameters:
97
    - dataset: Source multilabel dataset
98
    - sample_sizes: List of sample sizes to create splits for
99

100
    Returns:
101
    DatasetDict with multilabel splits for each sample size
102
    """
103
```
104

105
### Templated Dataset Generation
106

107
Generate synthetic training examples using templates and candidate labels.
108

109
```python { .api }
110
def get_templated_dataset(
111
    dataset: Optional[Dataset] = None,
112
    candidate_labels: Optional[List[str]] = None,
113
    reference_dataset: Optional[str] = None,
114
    template: str = "This sentence is {}",
115
    sample_size: int = 2,
116
    text_column: str = "text",
117
    label_column: str = "label",
118
    multi_label: bool = False,
119
    label_names_column: str = "label_text"
120
) -> Dataset:
121
    """
122
    Create templated examples for a reference dataset or reference labels.
123

124
    Parameters:
125
    - dataset: Source dataset to template (optional)
126
    - candidate_labels: List of label names to create templates for
127
    - reference_dataset: Name of reference dataset to use
128
    - template: Template string with {} placeholder for label
129
    - sample_size: Number of examples per label to generate
130
    - text_column: Name of text column in dataset
131
    - label_column: Name of label column in dataset
132
    - multi_label: Whether this is multi-label classification
133
    - label_names_column: Column containing label names
134

135
    Returns:
136
    Dataset with templated examples
137
    """
138

139
def get_candidate_labels(
140
    dataset_name: str,
141
    label_names_column: str = "label_text"
142
) -> List[str]:
143
    """
144
    Extract candidate labels from a dataset.
145

146
    Parameters:
147
    - dataset_name: Name of the dataset to extract labels from
148
    - label_names_column: Column containing label names
149

150
    Returns:
151
    List of unique label names
152
    """
153
```
154

155
### SetFit Dataset Class
156

157
Custom dataset class for training differentiable heads with PyTorch.
158

159
```python { .api }
160
class SetFitDataset:
161
    def __init__(
162
        self,
163
        x: List[str],
164
        y: Union[List[int], List[List[int]]],
165
        tokenizer: "PreTrainedTokenizerBase",
166
        max_length: int = 32
167
    ):
168
        """
169
        Dataset for training differentiable head on text classification.
170

171
        Parameters:
172
        - x: List of input texts
173
        - y: List of labels (integers for single-label, lists for multi-label)
174
        - tokenizer: HuggingFace tokenizer for text processing
175
        - max_length: Maximum sequence length for tokenization
176
        """
177

178
    def __len__(self) -> int:
179
        """Return the number of examples in the dataset."""
180

181
    def __getitem__(self, index: int) -> Dict[str, Any]:
182
        """
183
        Get a single example from the dataset.
184

185
        Parameters:
186
        - index: Index of the example to retrieve
187

188
        Returns:
189
        Dictionary with input_ids, attention_mask, and labels
190
        """
191

192
    @staticmethod
193
    def collate_fn(batch: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
194
        """
195
        Collate function for batching examples.
196

197
        Parameters:
198
        - batch: List of examples from __getitem__
199

200
        Returns:
201
        Batched tensors ready for model input
202
        """
203
```
204

205

206
## Constants
207

208
```python { .api }
209
# Default seeds for reproducible sampling
210
SEEDS: List[int] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
211

212
# Default sample sizes for few-shot experiments
213
SAMPLE_SIZES: List[int] = [2, 4, 8, 16, 32, 64]
214

215
# Type alias for tokenizer output
216
TokenizerOutput = Dict[str, List[int]]
217
```
218

219
## Usage Examples
220

221
### Creating Balanced Few-Shot Datasets
222

223
```python
224
from setfit import sample_dataset
225
from datasets import load_dataset
226

227
# Load a dataset
228
dataset = load_dataset("emotion", split="train")
229
print(f"Original dataset size: {len(dataset)}")
230

231
# Create balanced 8-shot dataset
232
few_shot_dataset = sample_dataset(
233
    dataset=dataset,
234
    label_column="label",
235
    num_samples=8,
236
    seed=42
237
)
238

239
print(f"Few-shot dataset size: {len(few_shot_dataset)}")
240
print(f"Samples per class: 8")
241

242
# Check distribution
243
from collections import Counter
244
label_dist = Counter(few_shot_dataset["label"])
245
print(f"Label distribution: {dict(label_dist)}")
246
```
247

248
### Generating Multiple Training Splits
249

250
```python
251
from setfit.data import create_fewshot_splits
252
from datasets import load_dataset
253

254
# Load dataset
255
dataset = load_dataset("imdb", split="train")
256

257
# Create multiple few-shot splits
258
splits = create_fewshot_splits(
259
    dataset=dataset,
260
    sample_sizes=[2, 4, 8, 16, 32],
261
    add_data_augmentation=False
262
)
263

264
print(f"Created splits: {list(splits.keys())}")
265
for split_name, split_data in splits.items():
266
    print(f"{split_name}: {len(split_data)} examples")
267

268
# Use a specific split for training
269
train_2_shot = splits["train-2-0"]  # keys are "train-{sample_size}-{seed_index}"
270
print(f"2-shot training set: {len(train_2_shot)} examples")
271
```
272

273
### Creating Templated Datasets
274

275
```python
276
from setfit import get_templated_dataset
277

278
# Create templated dataset from labels
279
candidate_labels = ["positive", "negative", "neutral"]
280

281
templated_dataset = get_templated_dataset(
282
    candidate_labels=candidate_labels,
283
    template="This text expresses {} sentiment.",
284
    sample_size=4
285
)
286

287
print("Generated templated examples:")
288
for i, example in enumerate(templated_dataset):
289
    print(f"{i}: {example['text']} -> {example['label']}")
290
```
291

292
### Using SetFitDataset for PyTorch Training
293

294
```python
295
from setfit.data import SetFitDataset
296
from transformers import AutoTokenizer
297
import torch
298

299
# Prepare data
300
texts = ["I love this!", "This is terrible.", "Amazing work!", "Not good."]
301
labels = [1, 0, 1, 0]
302
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
303

304
# Create dataset
305
dataset = SetFitDataset(
306
    x=texts,
307
    y=labels,
308
    tokenizer=tokenizer,
309
    max_length=256
310
)
311

312
# Create dataloader
313
dataloader = torch.utils.data.DataLoader(
314
    dataset,
315
    batch_size=2,
316
    collate_fn=dataset.collate_fn,
317
    shuffle=True
318
)
319

320
# Iterate through batches
321
for batch in dataloader:
322
    print(f"Batch shape: input_ids={batch['input_ids'].shape}")
323
    print(f"Labels: {batch['labels']}")
324
    break
325
```
326

327
### Multi-label Dataset Handling
328

329
```python
330
from setfit.data import create_fewshot_splits_multilabel, create_samples_multilabel
331
from datasets import Dataset
332
import pandas as pd
333

334
# Create multi-label dataset
335
data = {
336
    "text": ["Great action movie", "Romantic comedy", "Scary thriller", "Funny drama"],
337
    "labels": [[1, 0, 0], [0, 1, 1], [0, 0, 1], [1, 1, 0]]  # [action, comedy, drama]
338
}
339
dataset = Dataset.from_dict(data)
340

341
# Create few-shot splits for multi-label
342
ml_splits = create_fewshot_splits_multilabel(
343
    dataset=dataset,
344
    sample_sizes=[2, 4]
345
)
346

347
print(f"Multi-label splits: {list(ml_splits.keys())}")
348

349
# Or use with pandas DataFrame
350
df = pd.DataFrame(data)
351
sampled_df = create_samples_multilabel(df, sample_size=2, seed=42)
352
print(f"Sampled DataFrame shape: {sampled_df.shape}")
353
```
354

355
### Benchmarking Different Sample Sizes
356

357
```python
358
from setfit import SetFitModel, SetFitTrainer, TrainingArguments
from setfit.data import create_fewshot_splits
359
from datasets import load_dataset
360
import numpy as np
361

362
# Load dataset
363
dataset = load_dataset("SetFit/sst2", split="train")
364
test_dataset = load_dataset("SetFit/sst2", split="test")
365

366
# Create multiple training splits
367
splits = create_fewshot_splits(
368
    dataset=dataset,
369
    sample_sizes=[2, 4, 8, 16],
370
    add_data_augmentation=False
371
)
372

373
# Benchmark performance across sample sizes
374
results = {}
375
model_name = "sentence-transformers/all-MiniLM-L6-v2"
376

377
for split_name, train_split in splits.items():
378
    print(f"\nTraining on {split_name}...")
379
    
380
    # Initialize fresh model for each experiment
381
    model = SetFitModel.from_pretrained(model_name)
382
    
383
    args = TrainingArguments(
384
        batch_size=16,
385
        num_epochs=4,
386
        eval_strategy="epoch"
387
    )
388
    
389
    trainer = SetFitTrainer(
390
        model=model,
391
        args=args,
392
        train_dataset=train_split,
393
        eval_dataset=test_dataset
394
    )
395
    
396
    trainer.train()
397
    eval_results = trainer.evaluate()
398
    
399
    results[split_name] = eval_results["accuracy"]
400
    print(f"{split_name} accuracy: {eval_results['accuracy']:.3f}")
401

402
print("\nFinal Results:")
403
for split_name, accuracy in results.items():
404
    print(f"{split_name}: {accuracy:.3f}")
405
```

Version

Tile

Files

data-utilities.md docs/

data-utilities.mddocs/