0
# Sorting and Joins
1
2
Advanced operations for sorting data and combining multiple tables through various join types. PETL provides both memory-based and disk-based sorting for large datasets, along with comprehensive join operations for data integration and analysis.
3
4
## Capabilities
5
6
### Sorting Operations
7
8
Sort table data using various keys and algorithms, with support for large datasets through external sorting.
9
10
```python { .api }
11
def sort(table, key=None, reverse=False, buffersize=None, tempdir=None, cache=True) -> Table:
12
"""
13
Sort table rows by key.
14
15
Parameters:
16
- table: Input table
17
- key: Sort key (field name, index, function, or tuple of keys)
18
- reverse: If True, sort in descending order
19
- buffersize: Buffer size for external sorting (default: 100000)
20
- tempdir: Directory for temporary files during external sorting
21
- cache: Whether to cache sorted results
22
23
Returns:
24
Sorted table
25
"""
26
27
def mergesort(*tables, **kwargs) -> Table:
28
"""
29
Merge multiple pre-sorted tables.
30
31
Parameters:
32
- tables: Pre-sorted tables to merge
33
- key: Sort key used for merging
34
- reverse: Whether tables are sorted in reverse order
35
- kwargs: Additional sorting options
36
37
Returns:
38
Single merged and sorted table
39
"""
40
41
def issorted(table, key=None, reverse=False, strict=False) -> bool:
42
"""
43
Test if table is sorted by key.
44
45
Parameters:
46
- table: Input table
47
- key: Sort key to test
48
- reverse: Whether to test for reverse sort order
49
- strict: If True, require strict ordering (no equal values)
50
51
Returns:
52
Boolean indicating if table is sorted
53
"""
54
```
55
56
### Inner Joins
57
58
Combine tables by matching key values, returning only rows with matches in both tables.
59
60
```python { .api }
61
def join(table1, table2, key=None, lkey=None, rkey=None, presorted=False,
62
buffersize=None, tempdir=None, cache=True) -> Table:
63
"""
64
Join two tables on matching key values.
65
66
Parameters:
67
- table1: Left table
68
- table2: Right table
69
- key: Join key (used for both tables if lkey/rkey not specified)
70
- lkey: Left table join key
71
- rkey: Right table join key
72
- presorted: If True, tables are already sorted by join key
73
- buffersize: Buffer size for sorting
74
- tempdir: Directory for temporary files
75
- cache: Whether to cache results
76
77
Returns:
78
Table with matching rows from both tables
79
"""
80
81
def hashjoin(left, right, key=None, lkey=None, rkey=None, cache=True,
82
lprefix=None, rprefix=None) -> Table:
83
"""
84
Hash-based inner join (memory efficient for large tables).
85
86
Parameters:
87
- left: Left table
88
- right: Right table
89
- key: Join key
90
- lkey: Left table join key
91
- rkey: Right table join key
92
- cache: Whether to cache hash table
93
- lprefix: Prefix for left table field names in conflicts
94
- rprefix: Prefix for right table field names in conflicts
95
96
Returns:
97
Table with hash-joined results
98
"""
99
```
100
101
### Outer Joins
102
103
Include rows that may not have matches in both tables.
104
105
```python { .api }
106
def leftjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
107
presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
108
"""
109
Left outer join two tables.
110
111
Parameters:
112
- left: Left table (all rows preserved)
113
- right: Right table
114
- key: Join key
115
- lkey: Left table join key
116
- rkey: Right table join key
117
- missing: Value for missing fields from right table
118
- presorted: If True, tables are pre-sorted
119
- buffersize: Buffer size for sorting
120
- tempdir: Directory for temporary files
121
- cache: Whether to cache results
122
123
Returns:
124
Table with all left rows plus matching right rows
125
"""
126
127
def rightjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
128
presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
129
"""
130
Right outer join two tables.
131
132
Parameters:
133
- left: Left table
134
- right: Right table (all rows preserved)
135
- key: Join key
136
- lkey: Left table join key
137
- rkey: Right table join key
138
- missing: Value for missing fields from left table
139
- presorted: If True, tables are pre-sorted
140
- buffersize: Buffer size for sorting
141
- tempdir: Directory for temporary files
142
- cache: Whether to cache results
143
144
Returns:
145
Table with all right rows plus matching left rows
146
"""
147
148
def outerjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
149
presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
150
"""
151
Full outer join two tables.
152
153
Parameters:
154
- left: Left table
155
- right: Right table
156
- key: Join key
157
- lkey: Left table join key
158
- rkey: Right table join key
159
- missing: Value for missing fields
160
- presorted: If True, tables are pre-sorted
161
- buffersize: Buffer size for sorting
162
- tempdir: Directory for temporary files
163
- cache: Whether to cache results
164
165
Returns:
166
Table with all rows from both tables
167
"""
168
169
def hashleftjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
170
cache=True, lprefix=None, rprefix=None) -> Table:
171
"""Hash-based left outer join."""
172
173
def hashrightjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
174
cache=True, lprefix=None, rprefix=None) -> Table:
175
"""Hash-based right outer join."""
176
```
177
178
### Anti-Joins and Lookup Joins
179
180
Specialized join operations for data analysis and lookup scenarios.
181
182
```python { .api }
183
def antijoin(left, right, key=None, lkey=None, rkey=None, presorted=False,
184
buffersize=None, tempdir=None, cache=True) -> Table:
185
"""
186
Return rows from left table with no match in right table.
187
188
Parameters:
189
- left: Left table
190
- right: Right table
191
- key: Join key
192
- lkey: Left table join key
193
- rkey: Right table join key
194
- presorted: If True, tables are pre-sorted
195
- buffersize: Buffer size for sorting
196
- tempdir: Directory for temporary files
197
- cache: Whether to cache results
198
199
Returns:
200
Table with left rows that have no match in right table
201
"""
202
203
def hashantijoin(left, right, key=None, lkey=None, rkey=None) -> Table:
204
"""Hash-based anti-join."""
205
206
def lookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
207
presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
208
"""
209
Join using lookup from right table.
210
211
Parameters:
212
- left: Left table
213
- right: Right table
214
- key: Join key
215
- lkey: Left table join key
216
- rkey: Right table join key
217
- missing: Value for missing lookups
218
- presorted: If True, tables are pre-sorted
219
- buffersize: Buffer size for sorting
220
- tempdir: Directory for temporary files
221
- cache: Whether to cache results
222
223
Returns:
224
Table with lookup values from right table
225
"""
226
227
def hashlookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
228
lprefix=None, rprefix=None) -> Table:
229
"""Hash-based lookup join."""
230
```
231
232
### Cross Joins and Table Splitting
233
234
Operations for Cartesian products and table decomposition.
235
236
```python { .api }
237
def crossjoin(*tables, **kwargs) -> Table:
238
"""
239
Cartesian product of tables.
240
241
Parameters:
242
- tables: Tables to cross join
243
- kwargs: Additional options
244
245
Returns:
246
Table with all possible row combinations
247
"""
248
249
def unjoin(table, value, key=None, autoincrement=None, presorted=False,
250
buffersize=None, tempdir=None, cache=True) -> tuple:
251
"""
252
Split a table into two by extracting key and value fields.
253
254
Parameters:
255
- table: Input table
256
- value: Value field name
257
- key: Key field name (default: remaining fields)
258
- autoincrement: If True, add auto-incrementing key
259
- presorted: If True, table is pre-sorted
260
- buffersize: Buffer size for sorting
261
- tempdir: Directory for temporary files
262
- cache: Whether to cache results
263
264
Returns:
265
Tuple of (key_table, value_table)
266
"""
267
```
268
269
### Set Operations
270
271
Operations treating tables as sets for data comparison and analysis.
272
273
```python { .api }
274
def complement(a, b, presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
275
"""
276
Return rows in table a but not in table b.
277
278
Parameters:
279
- a: First table
280
- b: Second table
281
- presorted: If True, tables are pre-sorted
282
- buffersize: Buffer size for sorting
283
- tempdir: Directory for temporary files
284
- cache: Whether to cache results
285
286
Returns:
287
Table with rows in a but not in b
288
"""
289
290
def intersection(a, b, presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
291
"""
292
Return rows present in both tables.
293
294
Parameters:
295
- a: First table
296
- b: Second table
297
- presorted: If True, tables are pre-sorted
298
- buffersize: Buffer size for sorting
299
- tempdir: Directory for temporary files
300
- cache: Whether to cache results
301
302
Returns:
303
Table with rows present in both tables
304
"""
305
306
def diff(a, b, presorted=False, buffersize=None, tempdir=None, cache=True, strict=False) -> Table:
307
"""
308
Return rows that differ between tables.
309
310
Parameters:
311
- a: First table
312
- b: Second table
313
- presorted: If True, tables are pre-sorted
314
- buffersize: Buffer size for sorting
315
- tempdir: Directory for temporary files
316
- cache: Whether to cache results
317
- strict: If True, use strict comparison
318
319
Returns:
320
Table with differing rows
321
"""
322
323
def hashcomplement(a, b, strict=False) -> Table:
324
"""Hash-based complement operation."""
325
326
def hashintersection(a, b) -> Table:
327
"""Hash-based intersection operation."""
328
```
329
330
## Usage Examples
331
332
### Basic Sorting
333
334
```python
335
import petl as etl
336
337
table = etl.fromcsv('employees.csv')
338
339
# Sort by single field
340
sorted_by_name = etl.sort(table, 'name')
341
342
# Sort by multiple fields
343
sorted_multi = etl.sort(table, ('department', 'salary'))
344
345
# Sort in reverse order
346
sorted_desc = etl.sort(table, 'salary', reverse=True)
347
348
# Sort with custom function
349
sorted_custom = etl.sort(table, lambda row: (row.department, -row.salary))
350
351
# Check if table is sorted
352
is_sorted = etl.issorted(table, 'name')
353
```
354
355
### Inner Joins
356
357
```python
358
import petl as etl
359
360
employees = etl.fromcsv('employees.csv') # id, name, dept_id
361
departments = etl.fromcsv('departments.csv') # id, dept_name
362
363
# Simple join on matching field names
364
joined = etl.join(employees, departments, 'id')
365
366
# Join with different key names
367
joined = etl.join(employees, departments,
368
lkey='dept_id', rkey='id')
369
370
# Hash join for better performance with large tables
371
hash_joined = etl.hashjoin(employees, departments,
372
lkey='dept_id', rkey='id')
373
```
374
375
### Outer Joins
376
377
```python
378
import petl as etl
379
380
customers = etl.fromcsv('customers.csv')
381
orders = etl.fromcsv('orders.csv')
382
383
# Left join - all customers, with orders where they exist
384
customer_orders = etl.leftjoin(customers, orders,
385
lkey='id', rkey='customer_id',
386
missing='No orders')
387
388
# Right join - all orders with customer details
389
order_details = etl.rightjoin(customers, orders,
390
lkey='id', rkey='customer_id')
391
392
# Full outer join - all customers and all orders
393
full_join = etl.outerjoin(customers, orders,
394
lkey='id', rkey='customer_id')
395
```
396
397
### Anti-Joins and Lookups
398
399
```python
400
import petl as etl
401
402
all_products = etl.fromcsv('all_products.csv')
403
sold_products = etl.fromcsv('sold_products.csv')
404
405
# Find products that haven't been sold
406
unsold = etl.antijoin(all_products, sold_products, 'product_id')
407
408
# Lookup product names
409
orders = etl.fromcsv('orders.csv')
410
products = etl.fromcsv('products.csv')
411
412
orders_with_names = etl.lookupjoin(orders, products,
413
lkey='product_id', rkey='id')
414
```
415
416
### Set Operations
417
418
```python
419
import petl as etl
420
421
current_employees = etl.fromcsv('current_employees.csv')
422
former_employees = etl.fromcsv('former_employees.csv')
423
424
# Find employees who are only current (not former)
425
only_current = etl.complement(current_employees, former_employees)
426
427
# Find employees who have been both current and former
428
both = etl.intersection(current_employees, former_employees)
429
430
# Find all differences between two employee lists
431
differences = etl.diff(current_employees, former_employees)
432
```
433
434
### Complex Join Scenarios
435
436
```python
437
import petl as etl
438
439
# Multi-table joins
440
employees = etl.fromcsv('employees.csv')
441
departments = etl.fromcsv('departments.csv')
442
locations = etl.fromcsv('locations.csv')
443
444
# Chain joins for multi-table relationships
445
emp_dept = etl.join(employees, departments,
446
lkey='dept_id', rkey='id')
447
full_info = etl.join(emp_dept, locations,
448
lkey='location_id', rkey='id')
449
450
# Self-join for hierarchical data
451
managers = etl.join(employees, employees,
452
lkey='manager_id', rkey='id',
453
rprefix='mgr_')
454
455
# Conditional joins with preprocessing
456
high_performers = etl.select(employees, 'rating', lambda x: x > 4.0)
457
bonus_eligible = etl.join(high_performers, departments,
458
lkey='dept_id', rkey='id')
459
```
460
461
### Performance Optimization
462
463
```python
464
import petl as etl
465
466
large_table1 = etl.fromcsv('large_data1.csv')
467
large_table2 = etl.fromcsv('large_data2.csv')
468
469
# Use hash joins for better performance
470
fast_join = etl.hashjoin(large_table1, large_table2, 'id')
471
472
# Pre-sort for sort-merge joins
473
sorted1 = etl.sort(large_table1, 'id')
474
sorted2 = etl.sort(large_table2, 'id')
475
efficient_join = etl.join(sorted1, sorted2, 'id', presorted=True)
476
477
# External sorting for very large datasets
478
huge_table = etl.fromcsv('huge_data.csv')
479
sorted_huge = etl.sort(huge_table, 'timestamp',
480
buffersize=50000, # Smaller buffer
481
tempdir='/tmp/sorting')
482
```