or run

`npx @tessl/cli init`
Log in

Version

Tile

Overview

Evals

Files

Files

docs

datasets.md, evaluation.md, index.md, indexing.md, java.md, retrieval.md, text-processing.md, transformers.md, utilities.md

docs/utilities.md

0

# Utilities

1

2

PyTerrier's utility modules provide supporting functionality for DataFrame manipulation, I/O operations, progress tracking, debugging, and general helper functions that support the core information retrieval capabilities.

3

4

## Capabilities

5

6

### General Utilities (`pyterrier.utils`)

7

8

Core utility functions for progress tracking, system information, and general helper functionality.

9

10

```python { .api }

11

def set_tqdm(tqdm_type: str = None) -> None:

12

"""

13

Configure progress bar type for PyTerrier operations.

14

15

Parameters:

16

- tqdm_type: Progress bar type ('tqdm', 'notebook', 'auto', or None to disable)

17

"""

18

19

def entry_points(group: str) -> List[Any]:

20

"""

21

Get package entry points for specified group.

22

23

Parameters:

24

- group: Entry point group name

25

26

Returns:

27

- List of entry point objects

28

"""

29

30

def is_windows() -> bool:

31

"""

32

Check if running on Windows operating system.

33

34

Returns:

35

- True if Windows, False otherwise

36

"""

37

38

def noop(*args, **kwargs) -> None:

39

"""

40

No-operation function that accepts any arguments and does nothing.

41

"""

42

43

def once() -> Callable:

44

"""

45

Decorator that ensures a function can only be called once.

46

47

Returns:

48

- Decorator function

49

"""

50

51

def get_class_methods(cls: type) -> List[str]:

52

"""

53

Get list of methods defined by a class (not inherited).

54

55

Parameters:

56

- cls: Class to inspect

57

58

Returns:

59

- List of method names

60

"""

61

62

def pre_invocation_decorator(decorator: Callable) -> Callable:

63

"""

64

Create decorator that runs before method invocation.

65

66

Parameters:

67

- decorator: Decorator function to apply

68

69

Returns:

70

- Pre-invocation decorator

71

"""

72

73

def byte_count_to_human_readable(byte_count: int) -> str:

74

"""

75

Convert byte count to human-readable format.

76

77

Parameters:

78

- byte_count: Number of bytes

79

80

Returns:

81

- Human-readable string (e.g., '1.2 GB', '45.3 MB')

82

"""

83

84

def temp_env(key: str, value: str):

85

"""

86

Context manager for temporarily setting environment variable.

87

88

Parameters:

89

- key: Environment variable name

90

- value: Temporary value

91

92

Returns:

93

- Context manager

94

"""

95

96

class GeneratorLen:

97

"""

98

Wrapper for generator that tracks length.

99

100

Provides len() support for generators by pairing the generator with a known (supplied) length.

101

"""

102

def __init__(self, generator: Iterator, length: int = None): ...

103

def __len__(self) -> int: ...

104

def __iter__(self) -> Iterator: ...

105

```

106

107

**Usage Examples:**

108

109

```python

110

# Configure progress bars

111

pt.utils.set_tqdm('notebook') # For Jupyter notebooks

112

pt.utils.set_tqdm('tqdm') # For command line

113

pt.utils.set_tqdm(None) # Disable progress bars

114

115

# System information

116

if pt.utils.is_windows():

117

print("Running on Windows")

118

119

# Temporary environment variable

120

with pt.utils.temp_env('JAVA_HOME', '/custom/java/path'):

121

pt.java.init() # Uses custom Java path

122

123

# Human-readable byte counts

124

size_str = pt.utils.byte_count_to_human_readable(1073741824) # "1.0 GB"

125

126

# Generator with length

127

def doc_generator():

128

for i in range(1000):

129

yield {'docno': f'doc_{i}', 'text': f'Document {i}'}

130

131

gen_with_len = pt.utils.GeneratorLen(doc_generator(), 1000)

132

print(f"Generator length: {len(gen_with_len)}")

133

```

134

135

### DataFrame Model Utilities (`pyterrier.model`)

136

137

Utilities for manipulating PyTerrier DataFrames and data structures.

138

139

```python { .api }

140

def add_ranks(df: pd.DataFrame, single_query: bool = False) -> pd.DataFrame:

141

"""

142

Add rank column to DataFrame based on score values.

143

144

Parameters:

145

- df: DataFrame with score column

146

- single_query: Whether DataFrame contains single query (default: False)

147

148

Returns:

149

- DataFrame with added 'rank' column

150

"""

151

152

def document_columns(df: pd.DataFrame) -> List[str]:

153

"""

154

Get document-related column names from DataFrame.

155

156

Parameters:

157

- df: DataFrame to analyze

158

159

Returns:

160

- List of document-related column names

161

"""

162

163

def query_columns(df: pd.DataFrame, qid: bool = True) -> List[str]:

164

"""

165

Get query-related column names from DataFrame.

166

167

Parameters:

168

- df: DataFrame to analyze

169

- qid: Whether to include 'qid' column (default: True)

170

171

Returns:

172

- List of query-related column names

173

"""

174

175

def push_queries(df: pd.DataFrame) -> pd.DataFrame:

176

"""

177

Push query columns (rename query -> query_0, etc.).

178

179

Parameters:

180

- df: DataFrame with query columns

181

182

Returns:

183

- DataFrame with pushed query columns

184

"""

185

186

def push_queries_dict(input_dict: Dict[str, Any]) -> Dict[str, Any]:

187

"""

188

Dictionary version of push_queries.

189

190

Parameters:

191

- input_dict: Dictionary with query fields

192

193

Returns:

194

- Dictionary with pushed query fields

195

"""

196

197

def pop_queries(df: pd.DataFrame) -> pd.DataFrame:

198

"""

199

Pop query columns (reverse of push_queries).

200

201

Parameters:

202

- df: DataFrame with pushed query columns

203

204

Returns:

205

- DataFrame with restored query columns

206

"""

207

208

def ranked_documents_to_queries(topics_and_res: pd.DataFrame) -> pd.DataFrame:

209

"""

210

Extract unique queries from ranked documents DataFrame.

211

212

Parameters:

213

- topics_and_res: DataFrame with queries and results

214

215

Returns:

216

- DataFrame with unique queries

217

"""

218

219

def coerce_queries_dataframe(query: Union[str, Dict, pd.DataFrame]) -> pd.DataFrame:

220

"""

221

Convert various input types to standard queries DataFrame.

222

223

Parameters:

224

- query: Query in various formats (string, dict, DataFrame)

225

226

Returns:

227

- Standard queries DataFrame with 'qid' and 'query' columns

228

"""

229

230

def coerce_dataframe_types(dataframe: pd.DataFrame) -> pd.DataFrame:

231

"""

232

Ensure proper column data types for PyTerrier DataFrames.

233

234

Parameters:

235

- dataframe: DataFrame to type-check

236

237

Returns:

238

- DataFrame with corrected types

239

"""

240

241

def split_df(df: pd.DataFrame, N: int = None, batch_size: int = None) -> Iterator[pd.DataFrame]:

242

"""

243

Split DataFrame into chunks for batch processing.

244

245

Parameters:

246

- df: DataFrame to split

247

- N: Number of chunks (alternative to batch_size)

248

- batch_size: Size of each chunk

249

250

Returns:

251

- Iterator of DataFrame chunks

252

"""

253

254

# Constants

255

FIRST_RANK: int = 0 # Starting rank value

256

STRICT_SORT: bool = False # Whether to enforce strict sorting

257

```

258

259

**Usage Examples:**

260

261

```python

262

# Add ranks to results

263

results_with_ranks = pt.model.add_ranks(retrieval_results)

264

265

# Get column information

266

doc_cols = pt.model.document_columns(results)

267

query_cols = pt.model.query_columns(results)

268

269

# Query manipulation

270

pushed_queries = pt.model.push_queries(topics)

271

restored_queries = pt.model.pop_queries(pushed_queries)

272

273

# Convert various query formats

274

query_df = pt.model.coerce_queries_dataframe("information retrieval")

275

query_df = pt.model.coerce_queries_dataframe({'qid': '1', 'query': 'search'})

276

277

# Batch processing

278

for batch in pt.model.split_df(large_dataframe, batch_size=1000):

279

processed_batch = some_transformer.transform(batch)

280

```

281

282

### I/O Utilities (`pyterrier.io`)

283

284

File input/output utilities with support for various formats and compression.

285

286

```python { .api }

287

def autoopen(filename: str, mode: str = 'r', **kwargs):

288

"""

289

Automatically handle file opening with compression detection.

290

291

Parameters:

292

- filename: File path (supports .gz, .bz2, .xz compression)

293

- mode: File opening mode ('r', 'w', 'rb', 'wb', etc.)

294

- **kwargs: Additional arguments for file opening

295

296

Returns:

297

- File handle with appropriate compression handling

298

"""

299

300

def finalized_open(path: str, mode: str):

301

"""

302

Atomic file writing context manager.

303

304

Parameters:

305

- path: Target file path

306

- mode: File opening mode

307

308

Returns:

309

- Context manager for atomic file writing

310

"""

311

312

def find_files(directory: str) -> List[str]:

313

"""

314

Recursively find all files in directory.

315

316

Parameters:

317

- directory: Directory path to search

318

319

Returns:

320

- List of file paths

321

"""

322

323

def coerce_dataframe(obj: Any) -> pd.DataFrame:

324

"""

325

Convert various object types to DataFrame.

326

327

Parameters:

328

- obj: Object to convert (dict, list, etc.)

329

330

Returns:

331

- Converted DataFrame

332

"""

333

```

334

335

**Usage Examples:**

336

337

```python

338

# Automatic compression handling

339

with pt.io.autoopen('data.txt.gz', 'r') as f:

340

content = f.read()

341

342

with pt.io.autoopen('results.json.bz2', 'w') as f:

343

json.dump(data, f)

344

345

# Atomic file writing

346

with pt.io.finalized_open('important_results.txt', 'w') as f:

347

f.write("Critical data") # Only written if no exceptions

348

349

# File discovery

350

all_files = pt.io.find_files('/path/to/documents')

351

text_files = [f for f in all_files if f.endswith('.txt')]

352

353

# DataFrame conversion

354

df = pt.io.coerce_dataframe([{'docno': 'doc1', 'text': 'content'}])

355

```

356

357

### Debugging Utilities (`pyterrier.debug`)

358

359

Debugging and inspection utilities for PyTerrier pipelines.

360

361

```python { .api }

362

def print_columns(by_query: bool = False, message: str = None) -> Transformer:

363

"""

364

Debug transformer that prints DataFrame column information.

365

366

Parameters:

367

- by_query: Whether to group output by query (default: False)

368

- message: Optional message to print with column information

369

370

Returns:

371

- Transformer that prints column info and passes data through

372

"""

373

```

374

375

**Usage Example:**

376

377

```python

378

# Debug pipeline by printing column information

379

debug_pipeline = (

380

retriever >>

381

pt.debug.print_columns(message="After retrieval") >>

382

reranker >>

383

pt.debug.print_columns(message="After reranking", by_query=True) >>

384

(lambda df: df.head(10)) # Final cutoff

385

)

386

387

results = debug_pipeline.transform(topics)

388

```

389

390

### DataFrame Creation Utilities (`pyterrier.new`)

391

392

Utilities for creating standard PyTerrier DataFrames.

393

394

```python { .api }

395

def empty_Q() -> pd.DataFrame:

396

"""

397

Create empty queries DataFrame with standard columns.

398

399

Returns:

400

- Empty DataFrame with 'qid' and 'query' columns

401

"""

402

403

def queries(queries: Union[List[str], Dict[str, str]], qid: str = None, **others) -> pd.DataFrame:

404

"""

405

Create queries DataFrame from various input formats.

406

407

Parameters:

408

- queries: Query data (list of strings, dict mapping qid->query, etc.)

409

- qid: Base qid for auto-generated IDs

410

- **others: Additional columns to include

411

412

Returns:

413

- Standard queries DataFrame

414

"""

415

416

def empty_R() -> pd.DataFrame:

417

"""

418

Create empty results DataFrame with standard columns.

419

420

Returns:

421

- Empty DataFrame with 'qid', 'docno', 'score', 'rank' columns

422

"""

423

424

def ranked_documents(topics: pd.DataFrame = None, docnos: List[str] = None,

425

scores: List[float] = None, **others) -> pd.DataFrame:

426

"""

427

Create ranked documents DataFrame.

428

429

Parameters:

430

- topics: Topics DataFrame to associate with documents

431

- docnos: List of document IDs

432

- scores: List of relevance scores

433

- **others: Additional columns to include

434

435

Returns:

436

- Standard ranked documents DataFrame

437

"""

438

```

439

440

**Usage Examples:**

441

442

```python

443

# Create queries DataFrame

444

topics = pt.new.queries([

445

"information retrieval",

446

"search engines",

447

"natural language processing"

448

])

449

450

topics_with_ids = pt.new.queries({

451

'q1': 'machine learning',

452

'q2': 'deep learning',

453

'q3': 'neural networks'

454

})

455

456

# Create results DataFrame

457

results = pt.new.ranked_documents(

458

topics=topics,

459

docnos=['doc1', 'doc2', 'doc3'],

460

scores=[0.95, 0.87, 0.76]

461

)

462

463

# Empty DataFrames for initialization

464

empty_queries = pt.new.empty_Q()

465

empty_results = pt.new.empty_R()

466

```

467

468

### Inspection Utilities (`pyterrier.inspect`)

469

470

Utilities for inspecting PyTerrier objects and artifacts.

471

472

```python { .api }

473

def artifact_type_format(artifact: Any) -> Tuple[str, str]:

474

"""

475

Get artifact type and format information.

476

477

Parameters:

478

- artifact: Artifact object to inspect

479

480

Returns:

481

- Tuple of (type_name, format_name)

482

"""

483

```

484

485

### Learning-to-Rank Utilities (`pyterrier.ltr`)

486

487

Utilities for learning-to-rank applications and feature analysis.

488

489

```python { .api }

490

class AblateFeatures(Transformer):

491

"""

492

Feature ablation transformer for analyzing feature importance.

493

494

Systematically removes features to study their impact on ranking performance.

495

"""

496

def __init__(self, features_to_ablate: List[str] = None): ...

497

```

498

499

### Parallel Processing Utilities (`pyterrier.parallel`)

500

501

Utilities for parallel processing and distributed execution.

502

503

```python { .api }

504

class PoolParallelTransformer(Transformer):

505

"""

506

Wrapper transformer for parallel execution using process pools.

507

508

Parameters:

509

- transformer: Base transformer to parallelize

510

- jobs: Number of parallel processes

511

- backend: Parallel backend ('joblib', 'ray')

512

"""

513

def __init__(self, transformer: Transformer, jobs: int = 2, backend: str = 'joblib'): ...

514

```

515

516

## Types

517

518

```python { .api }

519

from typing import Dict, List, Any, Union, Optional, Iterator, Callable, Tuple

520

import pandas as pd

521

522

# Utility types

523

ProgressBarType = str # Progress bar implementation name

524

EnvironmentVariable = Tuple[str, str] # Environment variable key-value pair

525

ByteCount = int # Number of bytes

526

HumanReadableSize = str # Human-readable size string (e.g., '1.2 GB')

527

ColumnNames = List[str] # List of DataFrame column names

528

BatchSize = int # Batch processing size

529

ChunkCount = int # Number of chunks for splitting

530

QueryInput = Union[str, Dict[str, str], pd.DataFrame] # Various query input formats

531

DataFrameChunk = pd.DataFrame # DataFrame chunk for batch processing

532

ArtifactInfo = Tuple[str, str] # Artifact type and format information

533

```