or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

datasets.md · evaluation.md · index.md · indexing.md · java.md · retrieval.md · text-processing.md · transformers.md · utilities.md

docs/datasets.md

0

# Datasets

1

2

PyTerrier's dataset management system provides built-in access to standard information retrieval test collections and supports creating custom datasets. Each dataset can provide topics (queries), relevance judgments (qrels), a document corpus, and pre-built indexes.

3

4

## Capabilities

5

6

### Dataset Access Functions

7

8

Core functions for discovering, accessing, and managing IR test collections.

9

10

```python { .api }

11

def get_dataset(name: str) -> 'Dataset':

12

"""

13

Retrieve a specific dataset by name.

14

15

Parameters:

16

- name: Dataset name (e.g., 'vaswani', 'msmarco-passage', 'trec-covid')

17

18

Returns:

19

- Dataset object providing access to topics, qrels, corpus, and indexes

20

"""

21

22

def find_datasets(query: str = None, **kwargs) -> List[str]:

23

"""

24

Find datasets matching search criteria.

25

26

Parameters:

27

- query: Search query for dataset names or descriptions

28

- **kwargs: Additional search filters

29

30

Returns:

31

- List of matching dataset names

32

"""

33

34

def list_datasets() -> List[str]:

35

"""

36

List all available datasets.

37

38

Returns:

39

- List of all dataset names

40

"""

41

42

def transformer_from_dataset(dataset_name: str, variant: str = None, **kwargs) -> 'Transformer':

43

"""

44

Create a transformer (typically retriever) from a dataset.

45

46

Parameters:

47

- dataset_name: Name of the dataset

48

- variant: Specific variant or index type

49

- **kwargs: Additional transformer parameters

50

51

Returns:

52

- Configured transformer for the dataset

53

"""

54

```

55

56

**Usage Examples:**

57

58

```python

59

# Get specific dataset

60

vaswani = pt.get_dataset('vaswani')

61

msmarco = pt.get_dataset('msmarco-passage')

62

63

# Find datasets by query

64

covid_datasets = pt.find_datasets('covid')

65

passage_datasets = pt.find_datasets('passage')

66

67

# List all available datasets

68

all_datasets = pt.list_datasets()

69

print(f"Available datasets: {len(all_datasets)}")

70

71

# Create retriever from dataset

72

bm25_retriever = pt.transformer_from_dataset('vaswani', 'terrier_stemmed', wmodel='BM25')

73

```

74

75

### Dataset Class

76

77

Core dataset class providing access to all components of an IR test collection.

78

79

```python { .api }

80

class Dataset:

81

"""

82

Represents an information retrieval test collection with topics, qrels, corpus, and indexes.

83

"""

84

85

def get_topics(self, variant: str = None) -> pd.DataFrame:

86

"""

87

Get query topics for the dataset.

88

89

Parameters:

90

- variant: Specific topic variant (e.g., 'title', 'description', 'narrative')

91

92

Returns:

93

- DataFrame with 'qid' and 'query' columns

94

"""

95

96

def get_qrels(self, variant: str = None) -> pd.DataFrame:

97

"""

98

Get relevance judgments (qrels) for the dataset.

99

100

Parameters:

101

- variant: Specific qrels variant if multiple available

102

103

Returns:

104

- DataFrame with 'qid', 'docno', and 'label' columns

105

"""

106

107

def get_corpus_iter(self, verbose: bool = True) -> Iterator[Dict[str, Any]]:

108

"""

109

Get iterator over document corpus.

110

111

Parameters:

112

- verbose: Show progress information

113

114

Returns:

115

- Iterator yielding documents with 'docno' and 'text' fields

116

"""

117

118

def get_corpus(self) -> pd.DataFrame:

119

"""

120

Get entire document corpus as DataFrame.

121

122

Returns:

123

- DataFrame with 'docno' and 'text' columns

124

"""

125

126

def get_index(self, variant: str = None) -> Any:

127

"""

128

Get pre-built index for the dataset.

129

130

Parameters:

131

- variant: Index variant (e.g., 'terrier_stemmed', 'terrier_unstemmed')

132

133

Returns:

134

- IndexRef object for the dataset index

135

"""

136

137

def info(self) -> Dict[str, Any]:

138

"""

139

Get metadata information about the dataset.

140

141

Returns:

142

- Dictionary with dataset metadata

143

"""

144

```

145

146

**Usage Examples:**

147

148

```python

149

# Get dataset components

150

dataset = pt.get_dataset('vaswani')

151

152

# Get topics (queries)

153

topics = dataset.get_topics()

154

print(f"Number of topics: {len(topics)}")

155

156

# Get relevance judgments

157

qrels = dataset.get_qrels()

158

print(f"Number of qrels: {len(qrels)}")

159

160

# Get document corpus

161

corpus_iter = dataset.get_corpus_iter()

162

for doc in corpus_iter:

163

print(f"Document {doc['docno']}: {doc['text'][:100]}...")

164

break

165

166

# Get pre-built index

167

index_ref = dataset.get_index('terrier_stemmed')

168

169

# Get dataset information

170

info = dataset.info()

171

print(f"Dataset info: {info}")

172

```

173

174

### Remote Dataset Support

175

176

Extended dataset class for handling remote datasets that are downloaded on demand.

177

178

```python { .api }

179

class RemoteDataset(Dataset):

180

"""

181

Dataset stored remotely and downloaded on first access.

182

183

Inherits all Dataset methods and adds remote download capabilities.

184

"""

185

186

def download(self, force: bool = False) -> None:

187

"""

188

Download dataset components.

189

190

Parameters:

191

- force: Force re-download even if already cached

192

"""

193

194

def is_downloaded(self) -> bool:

195

"""

196

Check if dataset has been downloaded.

197

198

Returns:

199

- True if dataset is locally available

200

"""

201

```

202

203

### Dataset Providers

204

205

Provider classes for different dataset sources and formats.

206

207

```python { .api }

208

class DatasetProvider:

209

"""

210

Abstract base class for dataset providers.

211

"""

212

213

def get_dataset(self, name: str) -> Dataset: ...

214

def list_datasets(self) -> List[str]: ...

215

def find_datasets(self, query: str) -> List[str]: ...

216

217

class BuiltinDatasetProvider(DatasetProvider):

218

"""

219

Provider for built-in PyTerrier datasets.

220

"""

221

222

class IRDSDatasetProvider(DatasetProvider):

223

"""

224

Provider for ir-datasets integration.

225

Provides access to datasets from the ir-datasets library.

226

"""

227

```

228

229

### Dataset Registry

230

231

Global registry for managing available datasets across different providers.

232

233

```python { .api }

234

DATASET_MAP: Dict[str, Dataset] # Global dataset registry mapping names to Dataset objects

235

```

236

237

## Common Datasets

238

239

### Built-in Test Collections

240

241

```python

242

# Small test collections for development

243

vaswani = pt.get_dataset('vaswani') # Classic Vaswani collection (11,429 docs)

244

antique = pt.get_dataset('antique') # ANTIQUE non-factoid QA dataset

245

246

# TREC collections

247

robust04 = pt.get_dataset('trec-robust-2004') # TREC Robust 2004

248

covid = pt.get_dataset('trec-covid') # TREC-COVID dataset

249

250

# Web collections

251

msmarco_passage = pt.get_dataset('msmarco-passage') # MS MARCO passage ranking

252

msmarco_document = pt.get_dataset('msmarco-document') # MS MARCO document ranking

253

254

# Academic collections

255

cord19 = pt.get_dataset('cord19') # CORD-19 COVID-19 research papers

256

```

257

258

### Dataset Variants

259

260

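Many datasets provide multiple variants for different use cases. Variant names are dataset-specific, so check which variants a given dataset actually offers before requesting one: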

261

262

```python

263

# Get different topic variants

264

vaswani = pt.get_dataset('vaswani')

265

title_topics = vaswani.get_topics('title') # Title-only queries

266

desc_topics = vaswani.get_topics('description') # Description queries

267

narrative_topics = vaswani.get_topics('narrative') # Full narrative queries

268

269

# Get different index variants

270

stemmed_index = vaswani.get_index('terrier_stemmed') # Stemmed index

271

unstemmed_index = vaswani.get_index('terrier_unstemmed') # Unstemmed index

272

```

273

274

## Advanced Dataset Usage

275

276

### Custom Dataset Creation

277

278

```python

279

# Create custom dataset from local files

280

class CustomDataset(pt.datasets.Dataset):

281

def __init__(self, topics_file, qrels_file, corpus_path):

282

self.topics_file = topics_file

283

self.qrels_file = qrels_file

284

self.corpus_path = corpus_path

285

286

def get_topics(self):

287

return pd.read_csv(self.topics_file)

288

289

def get_qrels(self):

290

return pd.read_csv(self.qrels_file)

291

292

def get_corpus_iter(self):

293

# Custom corpus loading logic

294

pass

295

296

# Register custom dataset

297

custom_dataset = CustomDataset('/path/to/topics.csv', '/path/to/qrels.csv', '/path/to/corpus/')

298

pt.datasets.DATASET_MAP['my-custom'] = custom_dataset

299

```

300

301

### Dataset Integration with Pipelines

302

303

```python

304

# Create retrieval pipeline from dataset

305

dataset = pt.get_dataset('vaswani')

306

retriever = pt.terrier.Retriever.from_dataset('vaswani', 'terrier_stemmed')

307

308

# Evaluate on dataset

309

topics = dataset.get_topics()

310

qrels = dataset.get_qrels()

311

312

results = retriever.transform(topics)

313

evaluation = pt.Experiment([retriever], topics, qrels, ['map', 'ndcg'])

314

```

315

316

### Corpus Processing

317

318

```python

319

# Process large corpus efficiently

320

dataset = pt.get_dataset('msmarco-passage')

321

corpus_iter = dataset.get_corpus_iter()

322

323

# Create custom indexer for corpus

324

indexer = pt.IterDictIndexer('/path/to/custom_index')

325

index_ref = indexer.index(corpus_iter)

326

```

327

328

### Multi-Dataset Experiments

329

330

```python

331

# Compare across multiple datasets

332

datasets = ['vaswani', 'antique', 'trec-robust-2004']

333

results = []

334

335

for dataset_name in datasets:

336

dataset = pt.get_dataset(dataset_name)

337

retriever = pt.transformer_from_dataset(dataset_name, wmodel='BM25')

338

339

topics = dataset.get_topics()

340

qrels = dataset.get_qrels()

341

342

result = pt.Experiment([retriever], topics, qrels, ['map'])

343

result['dataset'] = dataset_name

344

results.append(result)

345

346

# Combine results

347

combined_results = pd.concat(results)

348

```

349

350

## Types

351

352

```python { .api }

353

from typing import Dict, List, Any, Iterator, Optional, Union

354

import pandas as pd

355

356

# Dataset-specific types

357

DatasetName = str # Dataset identifier

358

TopicVariant = str # Topic variant name ('title', 'description', etc.)

359

QrelsVariant = str # Qrels variant name

360

IndexVariant = str # Index variant name ('terrier_stemmed', etc.)

361

DatasetInfo = Dict[str, Any] # Dataset metadata

362

DocumentIterator = Iterator[Dict[str, Any]] # Document corpus iterator

363

DatasetProvider = Any # Dataset provider instance

364

```