or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

datasets.md, evaluation.md, index.md, indexing.md, java.md, retrieval.md, text-processing.md, transformers.md, utilities.md

docs/indexing.md

0

# Indexing

1

2

PyTerrier's indexing components provide comprehensive functionality for creating searchable indexes from various document formats. The indexing system supports multiple input formats, customizable text processing pipelines, and flexible index configurations.

3

4

## Capabilities

5

6

### Index Factory

7

8

Central factory class for creating and managing indexes from various sources including datasets, file collections, and custom iterators.

9

10

```python { .api }
class IndexFactory:
    """
    Factory class for creating and managing Terrier indexes.
    """

    @staticmethod
    def from_dataset(dataset_name: str, variant: str = None, **kwargs) -> Any: ...

    @staticmethod
    def from_trec(path: str, single_file: bool = False, **kwargs) -> Any: ...

    @staticmethod
    def from_xml(path: str, **kwargs) -> Any: ...

    @staticmethod
    def memory(documents: List[Dict[str, Any]], **kwargs) -> Any: ...
```

28

29

**Usage Examples:**

30

31

```python
# Create index from dataset
vaswani_index = pt.terrier.IndexFactory.from_dataset('vaswani')

# Create index from TREC collection
trec_index = pt.terrier.IndexFactory.from_trec('/path/to/trec/files')

# Create in-memory index for small collections
documents = [
    {'docno': 'doc1', 'text': 'This is the first document'},
    {'docno': 'doc2', 'text': 'This is the second document'}
]
memory_index = pt.terrier.IndexFactory.memory(documents)
```

45

46

### Generic Indexers

47

48

Base indexer classes for creating indexes from different input sources and formats.

49

50

```python { .api }
class TerrierIndexer(Indexer):
    """
    Generic Terrier indexer with configurable text processing pipeline.

    Parameters:
    - index_path: Path where index will be created
    - blocks: Whether to record block information for phrase queries
    - overwrite: Whether to overwrite existing index
    - verbose: Enable verbose output
    - meta: Dictionary mapping metadata field names to lengths
    - stemmer: Stemmer to use ('porter', 'weak_porter', etc.)
    - stopwords: Stopword list to use ('terrier', 'smart', etc.)
    - tokeniser: Tokeniser configuration
    """
    def __init__(self, index_path: str, blocks: bool = False,
                 overwrite: bool = False, verbose: bool = False,
                 meta: Dict[str, int] = None, stemmer: str = None,
                 stopwords: str = None, tokeniser: str = None, **kwargs): ...
```

70

71

### File-Based Indexers

72

73

Specialized indexers for processing file collections and directories.

74

75

```python { .api }
class FilesIndexer(Indexer):
    """
    Index files from a directory or file list.

    Parameters:
    - index_path: Path where index will be created
    - blocks: Whether to record block information
    - verbose: Enable verbose output
    - meta: Metadata field configuration
    - type: File type ('txt', 'pdf', 'docx', etc.)
    """
    def __init__(self, index_path: str, blocks: bool = False,
                 verbose: bool = False, meta: Dict[str, int] = None,
                 type: str = 'txt', **kwargs): ...
```

91

92

**Usage Example:**

93

94

```python
# Index text files from directory
files_indexer = pt.FilesIndexer('/path/to/index', verbose=True)
index_ref = files_indexer.index('/path/to/documents/')

# Index PDF files with metadata
pdf_indexer = pt.FilesIndexer(
    '/path/to/pdf_index',
    type='pdf',
    meta={'title': 100, 'author': 50}
)
index_ref = pdf_indexer.index('/path/to/pdfs/')
```

107

108

### TREC Collection Indexer

109

110

Specialized indexer for TREC-formatted document collections with support for various TREC formats.

111

112

```python { .api }
class TRECCollectionIndexer(Indexer):
    """
    Index TREC-formatted document collections.

    Parameters:
    - index_path: Path where index will be created
    - collection: List of TREC collection files or single file path
    - blocks: Whether to record block information
    - verbose: Enable verbose output
    - meta: Metadata field configuration
    """
    def __init__(self, index_path: str, collection: Union[str, List[str]] = None,
                 blocks: bool = False, verbose: bool = False,
                 meta: Dict[str, int] = None, **kwargs): ...
```

128

129

**Usage Example:**

130

131

```python
# Index single TREC file
trec_indexer = pt.TRECCollectionIndexer(
    '/path/to/trec_index',
    collection='/path/to/collection.trec'
)
index_ref = trec_indexer.index()

# Index multiple TREC files
multi_trec_indexer = pt.TRECCollectionIndexer(
    '/path/to/multi_index',
    collection=['/path/to/file1.trec', '/path/to/file2.trec']
)
index_ref = multi_trec_indexer.index()
```

146

147

### DataFrame Indexer

148

149

Indexer for creating indexes directly from pandas DataFrames, enabling in-memory document processing.

150

151

```python { .api }
class DFIndexer(Indexer):
    """
    Index documents from pandas DataFrame.

    Parameters:
    - index_path: Path where index will be created
    - blocks: Whether to record block information
    - verbose: Enable verbose output
    - meta: Metadata field configuration
    - text_attr: Name of column containing document text (default: 'text')
    - docno_attr: Name of column containing document IDs (default: 'docno')
    """
    def __init__(self, index_path: str, blocks: bool = False,
                 verbose: bool = False, meta: Dict[str, int] = None,
                 text_attr: str = 'text', docno_attr: str = 'docno', **kwargs): ...

class DFIndexUtils:
    """Utilities for DataFrame indexing operations."""

    @staticmethod
    def create_df(documents: List[Dict[str, Any]]) -> pd.DataFrame: ...

    @staticmethod
    def validate_df(df: pd.DataFrame) -> bool: ...
```

177

178

**Usage Example:**

179

180

```python
# Create DataFrame with documents
documents_df = pd.DataFrame([
    {'docno': 'doc1', 'text': 'First document content', 'title': 'Document 1'},
    {'docno': 'doc2', 'text': 'Second document content', 'title': 'Document 2'}
])

# Index DataFrame
df_indexer = pt.DFIndexer(
    '/path/to/df_index',
    meta={'title': 100},  # Include title metadata with max length 100
    verbose=True
)
index_ref = df_indexer.index(documents_df)
```

195

196

### Iterator Dictionary Indexer

197

198

Indexer for processing document iterators, useful for streaming large collections without loading everything into memory.

199

200

```python { .api }
class IterDictIndexer(Indexer):
    """
    Index documents from iterator of dictionaries.

    Parameters:
    - index_path: Path where index will be created
    - blocks: Whether to record block information
    - verbose: Enable verbose output
    - meta: Metadata field configuration
    - text_attr: Name of field containing document text (default: 'text')
    - docno_attr: Name of field containing document IDs (default: 'docno')
    """
    def __init__(self, index_path: str, blocks: bool = False,
                 verbose: bool = False, meta: Dict[str, int] = None,
                 text_attr: str = 'text', docno_attr: str = 'docno', **kwargs): ...
```

217

218

**Usage Example:**

219

220

```python
# Define document iterator
def document_iterator():
    for i in range(1000):
        yield {
            'docno': f'doc_{i}',
            'text': f'This is document number {i} with some content.',
            'category': f'category_{i % 10}'
        }

# Index iterator
iter_indexer = pt.IterDictIndexer(
    '/path/to/iter_index',
    meta={'category': 20},
    verbose=True
)
index_ref = iter_indexer.index(document_iterator())
```

238

239

### Indexing Configuration

240

241

Enumeration and utilities for configuring indexing behavior and text processing pipelines.

242

243

```python { .api }
class IndexingType:
    """Enumeration of indexing types and configurations."""
    CLASSIC = 'classic'
    SINGLEPASS = 'singlepass'
    MEMORY = 'memory'

# High-level indexing function
def index(iter_dict_or_df, index_path: str = None,
          indexer_class = None, **kwargs) -> Any:
    """
    High-level function for creating indexes from various input types.

    Parameters:
    - iter_dict_or_df: Input data (DataFrame, iterator, or file path)
    - index_path: Where to create the index
    - indexer_class: Specific indexer class to use
    - **kwargs: Additional indexer parameters

    Returns:
    - IndexRef object for the created index
    """
```

266

267

**Usage Example:**

268

269

```python
# High-level indexing function
documents = [
    {'docno': 'doc1', 'text': 'Document 1 content'},
    {'docno': 'doc2', 'text': 'Document 2 content'}
]

# Simple indexing
index_ref = pt.index(documents, '/path/to/simple_index')

# Indexing with custom parameters
index_ref = pt.index(
    documents,
    '/path/to/custom_index',
    stemmer='porter',
    stopwords='smart',
    blocks=True
)
```

288

289

## Advanced Indexing Patterns

290

291

### Custom Text Processing Pipeline

292

293

```python
# Configure custom text processing
custom_indexer = pt.DFIndexer(
    '/path/to/custom_index',
    stemmer='weak_porter',           # Use weak Porter stemmer
    stopwords='smart',               # Use SMART stopword list
    blocks=True,                     # Enable block information for phrases
    meta={'title': 100, 'url': 200}  # Include metadata fields
)
```

303

304

### Multi-Field Indexing

305

306

```python
# Index documents with multiple text fields
documents = pd.DataFrame([
    {
        'docno': 'doc1',
        'text': 'Main document content here',
        'title': 'Document Title',
        'abstract': 'Document abstract or summary'
    }
])

# Configure indexer to handle multiple fields
multi_field_indexer = pt.DFIndexer(
    '/path/to/multi_field_index',
    meta={'title': 100, 'abstract': 500},
    verbose=True
)
```

324

325

### Incremental Indexing

326

327

```python
# Create base index
base_indexer = pt.DFIndexer('/path/to/base_index')
base_index = base_indexer.index(initial_documents)

# Add more documents (typically requires rebuilding)
additional_indexer = pt.DFIndexer('/path/to/updated_index')
updated_index = additional_indexer.index(all_documents)  # Full rebuild
```

336

337

## Types

338

339

```python { .api }
from typing import Dict, List, Any, Union, Iterator, Optional
import pandas as pd

# Indexing-specific types
IndexRef = Any                               # Java IndexRef object
IndexPath = str                              # File system path for index
DocumentIterator = Iterator[Dict[str, Any]]  # Document iterator type
MetadataConfig = Dict[str, int]              # Metadata field name to max length mapping
TextProcessingConfig = Dict[str, str]        # Text processing configuration
```