0
# Data Manipulation
1
2
Comprehensive functions for combining, transforming, and reshaping data frames with high-performance operations.
3
4
## Capabilities
5
6
### Frame Binding Operations
7
8
Combine multiple frames by columns or rows with flexible options for handling mismatched structures.
9
10
```python { .api }
11
def cbind(*frames, force=False) -> Frame:
12
"""
13
Bind frames column-wise (horizontally).
14
15
Parameters:
16
- *frames: Frame objects to bind
17
- force: Allow binding frames with mismatched row counts; shorter frames are padded with NAs (single-row frames are repeated)
18
19
Returns:
20
Frame with columns from all input frames
21
"""
22
23
def rbind(*frames, force=False, bynames=True) -> Frame:
24
"""
25
Bind frames row-wise (vertically).
26
27
Parameters:
28
- *frames: Frame objects to bind
29
- force: Allow binding frames with mismatched sets of columns (gaps are filled with NAs) or unrelated column types
30
- bynames: Match columns by name (True) or position (False)
31
32
Returns:
33
Frame with rows from all input frames
34
"""
35
```
36
37
### Sorting and Uniqueness
38
39
Sort frames and extract unique values with flexible column specifications.
40
41
```python { .api }
42
def sort(frame, *cols, reverse=False, na_position='first') -> Frame:
43
"""
44
Sort frame by specified columns.
45
46
Parameters:
47
- frame: Frame to sort
48
- *cols: Column expressions or names to sort by
49
- reverse: Sort in descending order
50
- na_position: Position of NA values ('first' or 'last')
51
52
Returns:
53
Sorted Frame
54
"""
55
56
def unique(frame, *cols) -> Frame:
57
"""
58
Return unique rows based on specified columns.
59
60
Parameters:
61
- frame: Frame to process
62
- *cols: Columns to consider for uniqueness (all if none specified)
63
64
Returns:
65
Frame with unique rows
66
"""
67
```
68
69
### Utility Functions
70
71
Helper functions for data transformation and manipulation.
72
73
```python { .api }
74
def repeat(frame, n) -> Frame:
75
"""
76
Repeat frame rows n times.
77
78
Parameters:
79
- frame: Frame to repeat
80
- n: Number of repetitions
81
82
Returns:
83
Frame with repeated rows
84
"""
85
86
def shift(column, n=1) -> FExpr:
87
"""
88
Shift column values by n positions.
89
90
Parameters:
91
- column: Column expression to shift
92
- n: Number of positions to shift (positive=down, negative=up)
93
94
Returns:
95
Expression with shifted values
96
"""
97
98
def fillna(column, value) -> FExpr:
99
"""
100
Fill missing values in column with specified value.
101
102
Parameters:
103
- column: Column expression with missing values
104
- value: Value to use for filling NAs
105
106
Returns:
107
Expression with filled values
108
"""
109
110
def ifelse(condition, true_value, false_value) -> FExpr:
111
"""
112
Conditional expression returning different values based on condition.
113
114
Parameters:
115
- condition: Boolean expression
116
- true_value: Value when condition is True
117
- false_value: Value when condition is False
118
119
Returns:
120
Expression with conditional values
121
"""
122
```
123
124
### Type Conversion
125
126
Convert between data types with explicit control over the conversion process, and derive new columns through update specifications, binning, and n-hot encoding.
127
128
```python { .api }
129
def as_type(frame_or_column, new_type) -> Frame:
130
"""
131
Convert frame or column to specified type.
132
133
Parameters:
134
- frame_or_column: Frame or column expression to convert
135
- new_type: Target stype or Type object
136
137
Returns:
138
Frame or expression with converted types
139
"""
140
141
def update(**kwargs) -> UpdateExpr:
142
"""
143
Create update specification for adding or modifying columns.
144
145
Parameters:
146
- **kwargs: column_name=expression pairs
147
148
Returns:
149
Update expression for use in Frame operations
150
"""
151
152
def cut(column, bins, right=True, labels=None) -> FExpr:
153
"""
154
Bin values into discrete intervals.
155
156
Parameters:
157
- column: Column expression to bin
158
- bins: Number of bins or sequence of bin edges
159
- right: Include right edge of intervals
160
- labels: Labels for bins
161
162
Returns:
163
Categorical column with binned values
164
"""
165
166
def qcut(column, q, labels=None) -> FExpr:
167
"""
168
Quantile-based binning of values.
169
170
Parameters:
171
- column: Column expression to bin
172
- q: Number of quantiles or sequence of quantile boundaries
173
- labels: Labels for bins
174
175
Returns:
176
Categorical column with quantile-based bins
177
"""
178
179
def split_into_nhot(frame, delimiter=",") -> Frame:
180
"""
181
N-hot encoding for delimited string values (a row may set several indicator columns).
182
183
Parameters:
184
- frame: Frame containing delimited strings
185
- delimiter: Character used to separate values
186
187
Returns:
188
Frame with binary columns for each unique value
189
"""
190
```
191
192
## Data Binding Examples
193
194
### Column Binding
195
196
```python
197
import datatable as dt
from datatable import f  # column selector used throughout the examples below
198
199
# Create sample frames
200
DT1 = dt.Frame({'A': [1, 2, 3], 'B': [4, 5, 6]})
201
DT2 = dt.Frame({'C': [7, 8, 9], 'D': [10, 11, 12]})
202
DT3 = dt.Frame({'E': [13, 14, 15]})
203
204
# Bind columns
205
result = dt.cbind(DT1, DT2, DT3)
206
# Result: Frame with columns A, B, C, D, E
207
208
# Force binding with mismatched row counts
209
DT4 = dt.Frame({'F': [16, 17]}) # Only 2 rows
210
result = dt.cbind(DT1, DT4, force=True) # Shorter frame is padded with NAs
211
```
212
213
### Row Binding
214
215
```python
216
# Create compatible frames
217
DT1 = dt.Frame({'X': [1, 2], 'Y': ['a', 'b']})
218
DT2 = dt.Frame({'X': [3, 4], 'Y': ['c', 'd']})
219
DT3 = dt.Frame({'X': [5, 6], 'Y': ['e', 'f']})
220
221
# Bind rows
222
result = dt.rbind(DT1, DT2, DT3)
223
# Result: Frame with 6 rows and columns X, Y
224
225
# Bind with different column orders
226
DT4 = dt.Frame({'Y': ['g', 'h'], 'X': [7, 8]})
227
result = dt.rbind(DT1, DT4, bynames=True) # Matches by column names
228
229
# Force binding with type mismatches
230
DT5 = dt.Frame({'X': [1.1, 2.2], 'Y': ['i', 'j']}) # X is float
231
result = dt.rbind(DT1, DT5, force=True) # Forces compatible types
232
```
233
234
## Sorting Examples
235
236
### Basic Sorting
237
238
```python
239
DT = dt.Frame({
240
'A': [3, 1, 4, 1, 5],
241
'B': ['c', 'a', 'd', 'a', 'e'],
242
'C': [3.3, 1.1, 4.4, 1.2, 5.5]
243
})
244
245
# Sort by single column
246
sorted_DT = dt.sort(DT, f.A) # Sort by A ascending
247
sorted_DT = dt.sort(DT, -f.A) # Sort by A descending
248
sorted_DT = dt.sort(DT, f.A, reverse=True) # Alternative descending
249
250
# Sort by multiple columns
251
sorted_DT = dt.sort(DT, f.B, f.A) # Sort by B, then A
252
sorted_DT = dt.sort(DT, f.B, -f.C) # Sort by B asc, C desc
253
254
# Sort with NA handling
255
DT_na = dt.Frame({'X': [3, None, 1, None, 2]})
256
sorted_DT = dt.sort(DT_na, f.X, na_position='last')
257
```
258
259
### Sorting in Frame Operations
260
261
```python
262
# Sort as part of selection
263
result = DT[:, :, dt.sort(f.A)]
264
result = DT[f.A > 2, :, dt.sort(f.B)]
265
266
# Sort within groups
267
result = DT[:, :, dt.sort(f.C), dt.by(f.B)]
268
```
269
270
## Uniqueness Examples
271
272
### Basic Unique Operations
273
274
```python
275
DT = dt.Frame({
276
'A': [1, 2, 2, 3, 3, 3],
277
'B': ['x', 'y', 'y', 'z', 'z', 'w'],
278
'C': [1.1, 2.2, 2.2, 3.3, 3.4, 3.5]
279
})
280
281
# Unique rows (all columns)
282
unique_DT = dt.unique(DT)
283
284
# Unique based on specific columns
285
unique_DT = dt.unique(DT, f.A) # Unique values of A
286
unique_DT = dt.unique(DT, f.A, f.B) # Unique combinations of A and B
287
288
# Unique in Frame operations
289
result = DT[:, :, dt.unique(f.A)]
290
```
291
292
## Transformation Examples
293
294
### Conditional Logic
295
296
```python
297
DT = dt.Frame({
298
'score': [85, 92, 78, 95, 67],
299
'category': ['A', 'B', 'A', 'B', 'C']
300
})
301
302
# Simple conditional
303
result = DT[:, dt.update(
304
grade=dt.ifelse(f.score >= 90, "A", "B")
305
)]
306
307
# Nested conditionals
308
result = DT[:, dt.update(
309
grade=dt.ifelse(f.score >= 90, "A",
310
dt.ifelse(f.score >= 80, "B",
311
dt.ifelse(f.score >= 70, "C", "F")))
312
)]
313
314
# Conditional aggregation
315
result = DT[:, dt.sum(dt.ifelse(f.score >= 80, 1, 0)), dt.by(f.category)]
316
```
317
318
### Missing Value Handling
319
320
```python
321
DT = dt.Frame({
322
'A': [1, None, 3, None, 5],
323
'B': [1.1, 2.2, None, 4.4, None]
324
})
325
326
# Fill missing values
327
result = DT[:, dt.update(
328
A_filled=dt.fillna(f.A, 0),
329
B_filled=dt.fillna(f.B, dt.mean(f.B))
330
)]
331
332
# Forward fill from the previous row only (runs of consecutive NAs are not fully filled)
333
result = DT[:, dt.update(
334
A_ffill=dt.fillna(f.A, dt.shift(f.A, 1))
335
)]
336
337
# Conditional filling
338
result = DT[:, dt.update(
339
A_smart=dt.ifelse(dt.isna(f.A), dt.mean(f.A), f.A)
340
)]
341
```
342
343
### Data Shifting
344
345
```python
346
DT = dt.Frame({
347
'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
348
'value': [10, 15, 12, 18]
349
})
350
351
# Lag values (shift down)
352
result = DT[:, dt.update(
353
prev_value=dt.shift(f.value, 1), # Previous value
354
prev2_value=dt.shift(f.value, 2) # Value 2 periods ago
355
)]
356
357
# Lead values (shift up)
358
result = DT[:, dt.update(
359
next_value=dt.shift(f.value, -1), # Next value
360
next2_value=dt.shift(f.value, -2) # Value 2 periods ahead
361
)]
362
363
# Calculate differences
364
result = DT[:, dt.update(
365
diff=f.value - dt.shift(f.value, 1),
366
pct_change=((f.value - dt.shift(f.value, 1)) / dt.shift(f.value, 1)) * 100
367
)]
368
```
369
370
### Repetition and Expansion
371
372
```python
373
DT = dt.Frame({'A': [1, 2], 'B': ['x', 'y']})
374
375
# Repeat entire frame
376
repeated = dt.repeat(DT, 3) # 6 rows total
377
378
# Repeat with expressions
379
result = DT[:, dt.repeat(f.A, 2)] # Each value repeated twice
380
381
# Create expanding sequences
382
base = dt.Frame({'seq': [1]})
383
expanded = dt.repeat(base, 5)[:, dt.update(seq=range(1, 6))]
384
```
385
386
## Type Conversion Examples
387
388
### Basic Type Conversion
389
390
```python
391
DT = dt.Frame({
392
'A': [1, 2, 3], # int32 by default (int64 only when values require it)
393
'B': [1.1, 2.2, 3.3], # float64 by default
394
'C': ['1', '2', '3'] # str32 by default
395
})
396
397
# Convert single column
398
result = DT[:, dt.update(A_float=dt.as_type(f.A, dt.float32))]
399
400
# Convert multiple columns
401
result = DT[:, dt.update(
402
A_str=dt.as_type(f.A, dt.str32),
403
C_int=dt.as_type(f.C, dt.int32)
404
)]
405
406
# Convert entire frame
407
DT_float = dt.as_type(DT, dt.float64)
408
```
409
410
### Advanced Type Operations
411
412
```python
413
# Conditional type conversion
414
result = DT[:, dt.update(
415
A_converted=dt.ifelse(f.A > 2,
416
dt.as_type(f.A, dt.float32),
417
dt.as_type(f.A, dt.int32))
418
)]
419
420
# Safe conversion with error handling
421
try:
422
result = DT[:, dt.update(C_numeric=dt.as_type(f.C, dt.float64))]
423
except dt.exceptions.TypeError as e:
424
# Handle conversion errors
425
result = DT[:, dt.update(C_numeric=dt.fillna(dt.as_type(f.C, dt.float64), 0))]
426
```