Tessl Tile for pypi/petl@1.7.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

aggregation.md basic-transformations.md data-io.md data-reshaping.md index.md sorting-joins.md table-operations.md validation-analysis.md

sorting-joins.mddocs/

0
# Sorting and Joins
1

2
Advanced operations for sorting data and combining multiple tables through various join types. PETL provides both memory-based and disk-based sorting for large datasets, along with comprehensive join operations for data integration and analysis.
3

4
## Capabilities
5

6
### Sorting Operations
7

8
Sort table data using various keys and algorithms, with support for large datasets through external sorting.
9

10
```python { .api }
11
def sort(table, key=None, reverse=False, buffersize=None, tempdir=None, cache=True) -> Table:
12
    """
13
    Sort table rows by key.
14
    
15
    Parameters:
16
    - table: Input table
17
    - key: Sort key (field name, index, function, or tuple of keys)
18
    - reverse: If True, sort in descending order
19
    - buffersize: Buffer size for external sorting (default: 100000)
20
    - tempdir: Directory for temporary files during external sorting
21
    - cache: Whether to cache sorted results
22
    
23
    Returns:
24
    Sorted table
25
    """
26

27
def mergesort(*tables, **kwargs) -> Table:
28
    """
29
    Merge multiple pre-sorted tables.
30
    
31
    Parameters:
32
    - tables: Pre-sorted tables to merge
33
    - key: Sort key used for merging
34
    - reverse: Whether tables are sorted in reverse order
35
    - kwargs: Additional sorting options
36
    
37
    Returns:
38
    Single merged and sorted table
39
    """
40

41
def issorted(table, key=None, reverse=False, strict=False) -> bool:
42
    """
43
    Test if table is sorted by key.
44
    
45
    Parameters:
46
    - table: Input table
47
    - key: Sort key to test
48
    - reverse: Whether to test for reverse sort order
49
    - strict: If True, require strict ordering (no equal values)
50
    
51
    Returns:
52
    Boolean indicating if table is sorted
53
    """
54
```
55

56
### Inner Joins
57

58
Combine tables by matching key values, returning only rows with matches in both tables.
59

60
```python { .api }
61
def join(table1, table2, key=None, lkey=None, rkey=None, presorted=False, 
62
         buffersize=None, tempdir=None, cache=True) -> Table:
63
    """
64
    Join two tables on matching key values.
65
    
66
    Parameters:
67
    - table1: Left table
68
    - table2: Right table
69
    - key: Join key (used for both tables if lkey/rkey not specified)
70
    - lkey: Left table join key
71
    - rkey: Right table join key
72
    - presorted: If True, tables are already sorted by join key
73
    - buffersize: Buffer size for sorting
74
    - tempdir: Directory for temporary files
75
    - cache: Whether to cache results
76
    
77
    Returns:
78
    Table with matching rows from both tables
79
    """
80

81
def hashjoin(left, right, key=None, lkey=None, rkey=None, cache=True, 
82
             lprefix=None, rprefix=None) -> Table:
83
    """
84
    Hash-based inner join (memory efficient for large tables).
85
    
86
    Parameters:
87
    - left: Left table
88
    - right: Right table
89
    - key: Join key
90
    - lkey: Left table join key
91
    - rkey: Right table join key
92
    - cache: Whether to cache hash table
93
    - lprefix: Prefix for left table field names in conflicts
94
    - rprefix: Prefix for right table field names in conflicts
95
    
96
    Returns:
97
    Table with hash-joined results
98
    """
99
```
100

101
### Outer Joins
102

103
Include rows that may not have matches in both tables.
104

105
```python { .api }
106
def leftjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
107
             presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
108
    """
109
    Left outer join two tables.
110
    
111
    Parameters:
112
    - left: Left table (all rows preserved)
113
    - right: Right table
114
    - key: Join key
115
    - lkey: Left table join key
116
    - rkey: Right table join key
117
    - missing: Value for missing fields from right table
118
    - presorted: If True, tables are pre-sorted
119
    - buffersize: Buffer size for sorting
120
    - tempdir: Directory for temporary files
121
    - cache: Whether to cache results
122
    
123
    Returns:
124
    Table with all left rows plus matching right rows
125
    """
126

127
def rightjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
128
              presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
129
    """
130
    Right outer join two tables.
131
    
132
    Parameters:
133
    - left: Left table
134
    - right: Right table (all rows preserved)
135
    - key: Join key
136
    - lkey: Left table join key
137
    - rkey: Right table join key
138
    - missing: Value for missing fields from left table
139
    - presorted: If True, tables are pre-sorted
140
    - buffersize: Buffer size for sorting
141
    - tempdir: Directory for temporary files
142
    - cache: Whether to cache results
143
    
144
    Returns:
145
    Table with all right rows plus matching left rows
146
    """
147

148
def outerjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
149
              presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
150
    """
151
    Full outer join two tables.
152
    
153
    Parameters:
154
    - left: Left table
155
    - right: Right table
156
    - key: Join key
157
    - lkey: Left table join key
158
    - rkey: Right table join key
159
    - missing: Value for missing fields
160
    - presorted: If True, tables are pre-sorted
161
    - buffersize: Buffer size for sorting
162
    - tempdir: Directory for temporary files
163
    - cache: Whether to cache results
164
    
165
    Returns:
166
    Table with all rows from both tables
167
    """
168

169
def hashleftjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
170
                 cache=True, lprefix=None, rprefix=None) -> Table:
171
    """Hash-based left outer join."""
172

173
def hashrightjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
174
                  cache=True, lprefix=None, rprefix=None) -> Table:
175
    """Hash-based right outer join."""
176
```
177

178
### Anti-Joins and Lookup Joins
179

180
Specialized join operations for data analysis and lookup scenarios.
181

182
```python { .api }
183
def antijoin(left, right, key=None, lkey=None, rkey=None, presorted=False, 
184
             buffersize=None, tempdir=None, cache=True) -> Table:
185
    """
186
    Return rows from left table with no match in right table.
187
    
188
    Parameters:
189
    - left: Left table
190
    - right: Right table
191
    - key: Join key
192
    - lkey: Left table join key
193
    - rkey: Right table join key
194
    - presorted: If True, tables are pre-sorted
195
    - buffersize: Buffer size for sorting
196
    - tempdir: Directory for temporary files
197
    - cache: Whether to cache results
198
    
199
    Returns:
200
    Table with left rows that have no match in right table
201
    """
202

203
def hashantijoin(left, right, key=None, lkey=None, rkey=None) -> Table:
204
    """Hash-based anti-join."""
205

206
def lookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
207
               presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
208
    """
209
    Join using lookup from right table.
210
    
211
    Parameters:
212
    - left: Left table
213
    - right: Right table
214
    - key: Join key
215
    - lkey: Left table join key
216
    - rkey: Right table join key
217
    - missing: Value for missing lookups
218
    - presorted: If True, tables are pre-sorted
219
    - buffersize: Buffer size for sorting
220
    - tempdir: Directory for temporary files
221
    - cache: Whether to cache results
222
    
223
    Returns:
224
    Table with lookup values from right table
225
    """
226

227
def hashlookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
228
                   lprefix=None, rprefix=None) -> Table:
229
    """Hash-based lookup join."""
230
```
231

232
### Cross Joins and Table Splitting
233

234
Operations for Cartesian products and table decomposition.
235

236
```python { .api }
237
def crossjoin(*tables, **kwargs) -> Table:
238
    """
239
    Cartesian product of tables.
240
    
241
    Parameters:
242
    - tables: Tables to cross join
243
    - kwargs: Additional options
244
    
245
    Returns:
246
    Table with all possible row combinations
247
    """
248

249
def unjoin(table, value, key=None, autoincrement=None, presorted=False, 
250
           buffersize=None, tempdir=None, cache=True) -> tuple:
251
    """
252
    Split a table into two by extracting key and value fields.
253
    
254
    Parameters:
255
    - table: Input table
256
    - value: Value field name
257
    - key: Key field name (default: remaining fields)
258
    - autoincrement: If True, add auto-incrementing key
259
    - presorted: If True, table is pre-sorted
260
    - buffersize: Buffer size for sorting
261
    - tempdir: Directory for temporary files
262
    - cache: Whether to cache results
263
    
264
    Returns:
265
    Tuple of (key_table, value_table)
266
    """
267
```
268

269
### Set Operations
270

271
Operations treating tables as sets for data comparison and analysis.
272

273
```python { .api }
274
def complement(a, b, presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
275
    """
276
    Return rows in table a but not in table b.
277
    
278
    Parameters:
279
    - a: First table
280
    - b: Second table
281
    - presorted: If True, tables are pre-sorted
282
    - buffersize: Buffer size for sorting
283
    - tempdir: Directory for temporary files
284
    - cache: Whether to cache results
285
    
286
    Returns:
287
    Table with rows in a but not in b
288
    """
289

290
def intersection(a, b, presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
291
    """
292
    Return rows present in both tables.
293
    
294
    Parameters:
295
    - a: First table
296
    - b: Second table
297
    - presorted: If True, tables are pre-sorted
298
    - buffersize: Buffer size for sorting
299
    - tempdir: Directory for temporary files
300
    - cache: Whether to cache results
301
    
302
    Returns:
303
    Table with rows present in both tables
304
    """
305

306
def diff(a, b, presorted=False, buffersize=None, tempdir=None, cache=True, strict=False) -> Table:
307
    """
308
    Return rows that differ between tables.
309
    
310
    Parameters:
311
    - a: First table
312
    - b: Second table
313
    - presorted: If True, tables are pre-sorted
314
    - buffersize: Buffer size for sorting
315
    - tempdir: Directory for temporary files
316
    - cache: Whether to cache results
317
    - strict: If True, use strict comparison
318
    
319
    Returns:
320
    Table with differing rows
321
    """
322

323
def hashcomplement(a, b, strict=False) -> Table:
324
    """Hash-based complement operation."""
325

326
def hashintersection(a, b) -> Table:
327
    """Hash-based intersection operation."""
328
```
329

330
## Usage Examples
331

332
### Basic Sorting
333

334
```python
335
import petl as etl
336

337
table = etl.fromcsv('employees.csv')
338

339
# Sort by single field
340
sorted_by_name = etl.sort(table, 'name')
341

342
# Sort by multiple fields
343
sorted_multi = etl.sort(table, ('department', 'salary'))
344

345
# Sort in reverse order
346
sorted_desc = etl.sort(table, 'salary', reverse=True)
347

348
# Sort with custom function
349
sorted_custom = etl.sort(table, lambda row: (row.department, -row.salary))
350

351
# Check if table is sorted
352
is_sorted = etl.issorted(table, 'name')
353
```
354

355
### Inner Joins
356

357
```python
358
import petl as etl
359

360
employees = etl.fromcsv('employees.csv')  # id, name, dept_id
361
departments = etl.fromcsv('departments.csv')  # id, dept_name
362

363
# Simple join on matching field names
364
joined = etl.join(employees, departments, 'id')
365

366
# Join with different key names
367
joined = etl.join(employees, departments, 
368
                 lkey='dept_id', rkey='id')
369

370
# Hash join for better performance with large tables
371
hash_joined = etl.hashjoin(employees, departments, 
372
                          lkey='dept_id', rkey='id')
373
```
374

375
### Outer Joins
376

377
```python
378
import petl as etl
379

380
customers = etl.fromcsv('customers.csv')
381
orders = etl.fromcsv('orders.csv')
382

383
# Left join - all customers, with orders where they exist
384
customer_orders = etl.leftjoin(customers, orders, 
385
                              lkey='id', rkey='customer_id',
386
                              missing='No orders')
387

388
# Right join - all orders with customer details
389
order_details = etl.rightjoin(customers, orders,
390
                             lkey='id', rkey='customer_id')
391

392
# Full outer join - all customers and all orders
393
full_join = etl.outerjoin(customers, orders,
394
                         lkey='id', rkey='customer_id')
395
```
396

397
### Anti-Joins and Lookups
398

399
```python
400
import petl as etl
401

402
all_products = etl.fromcsv('all_products.csv')
403
sold_products = etl.fromcsv('sold_products.csv')
404

405
# Find products that haven't been sold
406
unsold = etl.antijoin(all_products, sold_products, 'product_id')
407

408
# Lookup product names
409
orders = etl.fromcsv('orders.csv')
410
products = etl.fromcsv('products.csv')
411

412
orders_with_names = etl.lookupjoin(orders, products,
413
                                  lkey='product_id', rkey='id')
414
```
415

416
### Set Operations
417

418
```python
419
import petl as etl
420

421
current_employees = etl.fromcsv('current_employees.csv')
422
former_employees = etl.fromcsv('former_employees.csv')
423

424
# Find employees who are only current (not former)
425
only_current = etl.complement(current_employees, former_employees)
426

427
# Find employees who have been both current and former
428
both = etl.intersection(current_employees, former_employees)
429

430
# Find all differences between two employee lists
431
differences = etl.diff(current_employees, former_employees)
432
```
433

434
### Complex Join Scenarios
435

436
```python
437
import petl as etl
438

439
# Multi-table joins
440
employees = etl.fromcsv('employees.csv')
441
departments = etl.fromcsv('departments.csv')
442
locations = etl.fromcsv('locations.csv')
443

444
# Chain joins for multi-table relationships
445
emp_dept = etl.join(employees, departments, 
446
                   lkey='dept_id', rkey='id')
447
full_info = etl.join(emp_dept, locations,
448
                    lkey='location_id', rkey='id')
449

450
# Self-join for hierarchical data
451
managers = etl.join(employees, employees, 
452
                   lkey='manager_id', rkey='id',
453
                   rprefix='mgr_')
454

455
# Conditional joins with preprocessing
456
high_performers = etl.select(employees, 'rating', lambda x: x > 4.0)
457
bonus_eligible = etl.join(high_performers, departments,
458
                         lkey='dept_id', rkey='id')
459
```
460

461
### Performance Optimization
462

463
```python
464
import petl as etl
465

466
large_table1 = etl.fromcsv('large_data1.csv')
467
large_table2 = etl.fromcsv('large_data2.csv')
468

469
# Use hash joins for better performance
470
fast_join = etl.hashjoin(large_table1, large_table2, 'id')
471

472
# Pre-sort for sort-merge joins
473
sorted1 = etl.sort(large_table1, 'id')
474
sorted2 = etl.sort(large_table2, 'id')
475
efficient_join = etl.join(sorted1, sorted2, 'id', presorted=True)
476

477
# External sorting for very large datasets
478
huge_table = etl.fromcsv('huge_data.csv')
479
sorted_huge = etl.sort(huge_table, 'timestamp', 
480
                      buffersize=50000,  # Smaller buffer
481
                      tempdir='/tmp/sorting')
482
```

Version

Tile

Files

sorting-joins.md.css-3qkkll{font-size:var(--chakra-font-sizes-sm);font-weight:var(--chakra-font-weights-normal);color:var(--chakra-colors-gray-300);}docs/

sorting-joins.mddocs/