or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

aggregation.mdbasic-transformations.mddata-io.mddata-reshaping.mdindex.mdsorting-joins.mdtable-operations.mdvalidation-analysis.md

sorting-joins.mddocs/

0

# Sorting and Joins

1

2

Advanced operations for sorting data and combining multiple tables through various join types. PETL provides both memory-based and disk-based sorting for large datasets, along with comprehensive join operations for data integration and analysis.

3

4

## Capabilities

5

6

### Sorting Operations

7

8

Sort table data using various keys and algorithms, with support for large datasets through external sorting.

9

10

```python { .api }

11

def sort(table, key=None, reverse=False, buffersize=None, tempdir=None, cache=True) -> Table:

12

"""

13

Sort table rows by key.

14

15

Parameters:

16

- table: Input table

17

- key: Sort key (field name, index, function, or tuple of keys)

18

- reverse: If True, sort in descending order

19

- buffersize: Buffer size for external sorting (default: 100000)

20

- tempdir: Directory for temporary files during external sorting

21

- cache: Whether to cache sorted results

22

23

Returns:

24

Sorted table

25

"""

26

27

def mergesort(*tables, **kwargs) -> Table:

28

"""

29

Merge multiple pre-sorted tables.

30

31

Parameters:

32

- tables: Pre-sorted tables to merge

33

- key: Sort key used for merging

34

- reverse: Whether tables are sorted in reverse order

35

- kwargs: Additional sorting options

36

37

Returns:

38

Single merged and sorted table

39

"""

40

41

def issorted(table, key=None, reverse=False, strict=False) -> bool:

42

"""

43

Test if table is sorted by key.

44

45

Parameters:

46

- table: Input table

47

- key: Sort key to test

48

- reverse: Whether to test for reverse sort order

49

- strict: If True, require strict ordering (no equal values)

50

51

Returns:

52

Boolean indicating if table is sorted

53

"""

54

```

55

56

### Inner Joins

57

58

Combine tables by matching key values, returning only rows with matches in both tables.

59

60

```python { .api }

61

def join(table1, table2, key=None, lkey=None, rkey=None, presorted=False,

62

buffersize=None, tempdir=None, cache=True) -> Table:

63

"""

64

Join two tables on matching key values.

65

66

Parameters:

67

- table1: Left table

68

- table2: Right table

69

- key: Join key (used for both tables if lkey/rkey not specified)

70

- lkey: Left table join key

71

- rkey: Right table join key

72

- presorted: If True, tables are already sorted by join key

73

- buffersize: Buffer size for sorting

74

- tempdir: Directory for temporary files

75

- cache: Whether to cache results

76

77

Returns:

78

Table with matching rows from both tables

79

"""

80

81

def hashjoin(left, right, key=None, lkey=None, rkey=None, cache=True,

82

lprefix=None, rprefix=None) -> Table:

83

"""

84

Hash-based inner join (memory efficient for large tables).

85

86

Parameters:

87

- left: Left table

88

- right: Right table

89

- key: Join key

90

- lkey: Left table join key

91

- rkey: Right table join key

92

- cache: Whether to cache hash table

93

- lprefix: Prefix for left table field names in conflicts

94

- rprefix: Prefix for right table field names in conflicts

95

96

Returns:

97

Table with hash-joined results

98

"""

99

```

100

101

### Outer Joins

102

103

Include rows that may not have matches in both tables.

104

105

```python { .api }

106

def leftjoin(left, right, key=None, lkey=None, rkey=None, missing=None,

107

presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:

108

"""

109

Left outer join two tables.

110

111

Parameters:

112

- left: Left table (all rows preserved)

113

- right: Right table

114

- key: Join key

115

- lkey: Left table join key

116

- rkey: Right table join key

117

- missing: Value for missing fields from right table

118

- presorted: If True, tables are pre-sorted

119

- buffersize: Buffer size for sorting

120

- tempdir: Directory for temporary files

121

- cache: Whether to cache results

122

123

Returns:

124

Table with all left rows plus matching right rows

125

"""

126

127

def rightjoin(left, right, key=None, lkey=None, rkey=None, missing=None,

128

presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:

129

"""

130

Right outer join two tables.

131

132

Parameters:

133

- left: Left table

134

- right: Right table (all rows preserved)

135

- key: Join key

136

- lkey: Left table join key

137

- rkey: Right table join key

138

- missing: Value for missing fields from left table

139

- presorted: If True, tables are pre-sorted

140

- buffersize: Buffer size for sorting

141

- tempdir: Directory for temporary files

142

- cache: Whether to cache results

143

144

Returns:

145

Table with all right rows plus matching left rows

146

"""

147

148

def outerjoin(left, right, key=None, lkey=None, rkey=None, missing=None,

149

presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:

150

"""

151

Full outer join two tables.

152

153

Parameters:

154

- left: Left table

155

- right: Right table

156

- key: Join key

157

- lkey: Left table join key

158

- rkey: Right table join key

159

- missing: Value for missing fields

160

- presorted: If True, tables are pre-sorted

161

- buffersize: Buffer size for sorting

162

- tempdir: Directory for temporary files

163

- cache: Whether to cache results

164

165

Returns:

166

Table with all rows from both tables

167

"""

168

169

def hashleftjoin(left, right, key=None, lkey=None, rkey=None, missing=None,

170

cache=True, lprefix=None, rprefix=None) -> Table:

171

"""Hash-based left outer join."""

172

173

def hashrightjoin(left, right, key=None, lkey=None, rkey=None, missing=None,

174

cache=True, lprefix=None, rprefix=None) -> Table:

175

"""Hash-based right outer join."""

176

```

177

178

### Anti-Joins and Lookup Joins

179

180

Specialized join operations for data analysis and lookup scenarios.

181

182

```python { .api }

183

def antijoin(left, right, key=None, lkey=None, rkey=None, presorted=False,

184

buffersize=None, tempdir=None, cache=True) -> Table:

185

"""

186

Return rows from left table with no match in right table.

187

188

Parameters:

189

- left: Left table

190

- right: Right table

191

- key: Join key

192

- lkey: Left table join key

193

- rkey: Right table join key

194

- presorted: If True, tables are pre-sorted

195

- buffersize: Buffer size for sorting

196

- tempdir: Directory for temporary files

197

- cache: Whether to cache results

198

199

Returns:

200

Table with left rows that have no match in right table

201

"""

202

203

def hashantijoin(left, right, key=None, lkey=None, rkey=None) -> Table:

204

"""Hash-based anti-join."""

205

206

def lookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None,

207

presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:

208

"""

209

Join using lookup from right table.

210

211

Parameters:

212

- left: Left table

213

- right: Right table

214

- key: Join key

215

- lkey: Left table join key

216

- rkey: Right table join key

217

- missing: Value for missing lookups

218

- presorted: If True, tables are pre-sorted

219

- buffersize: Buffer size for sorting

220

- tempdir: Directory for temporary files

221

- cache: Whether to cache results

222

223

Returns:

224

Table with lookup values from right table

225

"""

226

227

def hashlookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None,

228

lprefix=None, rprefix=None) -> Table:

229

"""Hash-based lookup join."""

230

```

231

232

### Cross Joins and Table Splitting

233

234

Operations for Cartesian products and table decomposition.

235

236

```python { .api }

237

def crossjoin(*tables, **kwargs) -> Table:

238

"""

239

Cartesian product of tables.

240

241

Parameters:

242

- tables: Tables to cross join

243

- kwargs: Additional options

244

245

Returns:

246

Table with all possible row combinations

247

"""

248

249

def unjoin(table, value, key=None, autoincrement=None, presorted=False,

250

buffersize=None, tempdir=None, cache=True) -> tuple:

251

"""

252

Split a table into two by extracting key and value fields.

253

254

Parameters:

255

- table: Input table

256

- value: Value field name

257

- key: Key field name (default: remaining fields)

258

- autoincrement: If True, add auto-incrementing key

259

- presorted: If True, table is pre-sorted

260

- buffersize: Buffer size for sorting

261

- tempdir: Directory for temporary files

262

- cache: Whether to cache results

263

264

Returns:

265

Tuple of (key_table, value_table)

266

"""

267

```

268

269

### Set Operations

270

271

Operations treating tables as sets for data comparison and analysis.

272

273

```python { .api }

274

def complement(a, b, presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:

275

"""

276

Return rows in table a but not in table b.

277

278

Parameters:

279

- a: First table

280

- b: Second table

281

- presorted: If True, tables are pre-sorted

282

- buffersize: Buffer size for sorting

283

- tempdir: Directory for temporary files

284

- cache: Whether to cache results

285

286

Returns:

287

Table with rows in a but not in b

288

"""

289

290

def intersection(a, b, presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:

291

"""

292

Return rows present in both tables.

293

294

Parameters:

295

- a: First table

296

- b: Second table

297

- presorted: If True, tables are pre-sorted

298

- buffersize: Buffer size for sorting

299

- tempdir: Directory for temporary files

300

- cache: Whether to cache results

301

302

Returns:

303

Table with rows present in both tables

304

"""

305

306

def diff(a, b, presorted=False, buffersize=None, tempdir=None, cache=True, strict=False) -> Table:

307

"""

308

Return rows that differ between tables.

309

310

Parameters:

311

- a: First table

312

- b: Second table

313

- presorted: If True, tables are pre-sorted

314

- buffersize: Buffer size for sorting

315

- tempdir: Directory for temporary files

316

- cache: Whether to cache results

317

- strict: If True, use strict comparison

318

319

Returns:

320

Table with differing rows

321

"""

322

323

def hashcomplement(a, b, strict=False) -> Table:

324

"""Hash-based complement operation."""

325

326

def hashintersection(a, b) -> Table:

327

"""Hash-based intersection operation."""

328

```

329

330

## Usage Examples

331

332

### Basic Sorting

333

334

```python

335

import petl as etl

336

337

table = etl.fromcsv('employees.csv')

338

339

# Sort by single field

340

sorted_by_name = etl.sort(table, 'name')

341

342

# Sort by multiple fields

343

sorted_multi = etl.sort(table, ('department', 'salary'))

344

345

# Sort in reverse order

346

sorted_desc = etl.sort(table, 'salary', reverse=True)

347

348

# Sort with custom function

349

sorted_custom = etl.sort(table, lambda row: (row.department, -row.salary))

350

351

# Check if table is sorted

352

is_sorted = etl.issorted(table, 'name')

353

```

354

355

### Inner Joins

356

357

```python

358

import petl as etl

359

360

employees = etl.fromcsv('employees.csv') # id, name, dept_id

361

departments = etl.fromcsv('departments.csv') # id, dept_name

362

363

# Simple join on matching field names

364

joined = etl.join(employees, departments, 'id')

365

366

# Join with different key names

367

joined = etl.join(employees, departments,

368

lkey='dept_id', rkey='id')

369

370

# Hash join for better performance with large tables

371

hash_joined = etl.hashjoin(employees, departments,

372

lkey='dept_id', rkey='id')

373

```

374

375

### Outer Joins

376

377

```python

378

import petl as etl

379

380

customers = etl.fromcsv('customers.csv')

381

orders = etl.fromcsv('orders.csv')

382

383

# Left join - all customers, with orders where they exist

384

customer_orders = etl.leftjoin(customers, orders,

385

lkey='id', rkey='customer_id',

386

missing='No orders')

387

388

# Right join - all orders with customer details

389

order_details = etl.rightjoin(customers, orders,

390

lkey='id', rkey='customer_id')

391

392

# Full outer join - all customers and all orders

393

full_join = etl.outerjoin(customers, orders,

394

lkey='id', rkey='customer_id')

395

```

396

397

### Anti-Joins and Lookups

398

399

```python

400

import petl as etl

401

402

all_products = etl.fromcsv('all_products.csv')

403

sold_products = etl.fromcsv('sold_products.csv')

404

405

# Find products that haven't been sold

406

unsold = etl.antijoin(all_products, sold_products, 'product_id')

407

408

# Lookup product names

409

orders = etl.fromcsv('orders.csv')

410

products = etl.fromcsv('products.csv')

411

412

orders_with_names = etl.lookupjoin(orders, products,

413

lkey='product_id', rkey='id')

414

```

415

416

### Set Operations

417

418

```python

419

import petl as etl

420

421

current_employees = etl.fromcsv('current_employees.csv')

422

former_employees = etl.fromcsv('former_employees.csv')

423

424

# Find employees who are only current (not former)

425

only_current = etl.complement(current_employees, former_employees)

426

427

# Find employees who have been both current and former

428

both = etl.intersection(current_employees, former_employees)

429

430

# Find all differences between two employee lists

431

differences = etl.diff(current_employees, former_employees)

432

```

433

434

### Complex Join Scenarios

435

436

```python

437

import petl as etl

438

439

# Multi-table joins

440

employees = etl.fromcsv('employees.csv')

441

departments = etl.fromcsv('departments.csv')

442

locations = etl.fromcsv('locations.csv')

443

444

# Chain joins for multi-table relationships

445

emp_dept = etl.join(employees, departments,

446

lkey='dept_id', rkey='id')

447

full_info = etl.join(emp_dept, locations,

448

lkey='location_id', rkey='id')

449

450

# Self-join for hierarchical data

451

managers = etl.join(employees, employees,

452

lkey='manager_id', rkey='id',

453

rprefix='mgr_')

454

455

# Conditional joins with preprocessing

456

high_performers = etl.select(employees, 'rating', lambda x: x > 4.0)

457

bonus_eligible = etl.join(high_performers, departments,

458

lkey='dept_id', rkey='id')

459

```

460

461

### Performance Optimization

462

463

```python

464

import petl as etl

465

466

large_table1 = etl.fromcsv('large_data1.csv')

467

large_table2 = etl.fromcsv('large_data2.csv')

468

469

# Use hash joins for better performance

470

fast_join = etl.hashjoin(large_table1, large_table2, 'id')

471

472

# Pre-sort for sort-merge joins

473

sorted1 = etl.sort(large_table1, 'id')

474

sorted2 = etl.sort(large_table2, 'id')

475

efficient_join = etl.join(sorted1, sorted2, 'id', presorted=True)

476

477

# External sorting for very large datasets

478

huge_table = etl.fromcsv('huge_data.csv')

479

sorted_huge = etl.sort(huge_table, 'timestamp',

480

buffersize=50000, # Smaller buffer

481

tempdir='/tmp/sorting')

482

```