0
# User-Defined Functions
1
2
Comprehensive UDF system supporting scalar, aggregate, and analytic functions with type safety and backend compatibility.
3
4
## Capabilities
5
6
### Scalar UDFs
7
8
Create custom scalar functions that operate on individual values.
9
10
```python { .api }
11
@ibis.udf.scalar(signature, **kwargs)
12
def function_name(arg1, arg2, ...):
13
"""
14
Decorator for scalar UDFs.
15
16
Parameters:
17
- signature: function signature with input and output types
18
- **kwargs: additional UDF configuration
19
20
Returns:
21
Decorated function that can be used in expressions
22
"""
23
```
24
25
**Usage Examples:**
26
```python
27
import ibis
28
from ibis import udf
29
30
# Simple scalar UDF
31
@udf.scalar((int, int) -> int)
32
def add_one(x):
33
return x + 1
34
35
# Use in expressions
36
result = table.select(
37
'value',
38
incremented=add_one(table.value)
39
)
40
41
# UDF with multiple parameters
42
@udf.scalar((str, str) -> str)
43
def concat_with_separator(a, b):
44
return f"{a}|{b}"
45
46
result = table.select(
47
combined=concat_with_separator(table.first_name, table.last_name)
48
)
49
```
50
51
### String manipulation UDF:
52
```python
53
@udf.scalar(str -> str)
54
def reverse_string(s):
55
return s[::-1] if s else None
56
57
# Mathematical UDF
58
@udf.scalar(float -> float)
59
def sigmoid(x):
60
import math
61
return 1 / (1 + math.exp(-x))
62
63
result = table.select(
64
'text',
65
reversed=reverse_string(table.text),
66
probability=sigmoid(table.score)
67
)
68
```
69
70
### Aggregate UDFs
71
72
Create custom aggregation functions.
73
74
```python { .api }
75
@ibis.udf.aggregate(signature, **kwargs)
76
def aggregate_function_name():
77
"""
78
Decorator for aggregate UDFs.
79
80
Parameters:
81
- signature: function signature with input and output types
82
- **kwargs: additional UDF configuration
83
84
Returns:
85
Decorated aggregation function
86
"""
87
```
88
89
**Usage Examples:**
90
```python
91
# Custom aggregation
92
@udf.aggregate([int] -> float)
93
def geometric_mean(values):
94
import math
95
if not values:
96
return None
97
98
product = 1
99
for v in values:
100
if v <= 0:
101
return None
102
product *= v
103
104
return math.pow(product, 1/len(values))
105
106
# Use in aggregations
107
result = table.group_by('category').aggregate(
108
count=table.count(),
109
avg=table.value.mean(),
110
geom_mean=geometric_mean(table.value)
111
)
112
113
# Custom median implementation
114
@udf.aggregate([float] -> float)
115
def custom_median(values):
116
if not values:
117
return None
118
119
sorted_vals = sorted(values)
120
n = len(sorted_vals)
121
122
if n % 2 == 0:
123
return (sorted_vals[n//2 - 1] + sorted_vals[n//2]) / 2
124
else:
125
return sorted_vals[n//2]
126
```
127
128
### Elementwise UDFs
129
130
Apply functions element-wise to array or column values.
131
132
```python { .api }
133
@ibis.udf.elementwise(signature, **kwargs)
134
def elementwise_function():
135
"""
136
Decorator for elementwise UDFs.
137
138
Parameters:
139
- signature: function signature
140
- **kwargs: additional configuration
141
142
Returns:
143
Decorated elementwise function
144
"""
145
```
146
147
**Usage Examples:**
148
```python
149
# Array transformation
150
@udf.elementwise([int] -> [int])
151
def double_array_elements(arr):
152
return [x * 2 for x in arr] if arr else None
153
154
result = table.select(
155
'numbers',
156
doubled=double_array_elements(table.numbers)
157
)
158
159
# String array processing
160
@udf.elementwise([str] -> [str])
161
def uppercase_strings(strings):
162
return [s.upper() for s in strings] if strings else None
163
```
164
165
### Reduction UDFs
166
167
Create custom reduction operations.
168
169
```python { .api }
170
@ibis.udf.reduction(signature, **kwargs)
171
def reduction_function():
172
"""
173
Decorator for reduction UDFs.
174
175
Parameters:
176
- signature: function signature
177
- **kwargs: additional configuration
178
179
Returns:
180
Decorated reduction function
181
"""
182
```
183
184
**Usage Examples:**
185
```python
186
# Custom reduction
187
@udf.reduction([float] -> dict)
188
def statistics_summary(values):
189
if not values:
190
return None
191
192
import statistics
193
return {
194
'mean': statistics.mean(values),
195
'median': statistics.median(values),
196
'mode': statistics.mode(values) if len(set(values)) < len(values) else None,
197
'std': statistics.stdev(values) if len(values) > 1 else 0
198
}
199
200
result = table.aggregate(
201
stats=statistics_summary(table.scores)
202
)
203
```
204
205
### Analytic UDFs
206
207
Create custom analytic/window functions.
208
209
```python { .api }
210
@ibis.udf.analytic(signature, **kwargs)
211
def analytic_function():
212
"""
213
Decorator for analytic UDFs.
214
215
Parameters:
216
- signature: function signature
217
- **kwargs: additional configuration
218
219
Returns:
220
Decorated analytic function
221
"""
222
```
223
224
**Usage Examples:**
225
```python
226
# Custom ranking function
227
@udf.analytic([float] -> [int])
228
def custom_rank(values):
229
"""Custom ranking with specific tie-breaking logic."""
230
sorted_pairs = sorted(enumerate(values), key=lambda x: (-x[1], x[0]))
231
ranks = [0] * len(values)
232
233
for rank, (orig_idx, _) in enumerate(sorted_pairs, 1):
234
ranks[orig_idx] = rank
235
236
return ranks
237
238
# Use with window
239
result = table.select(
240
'name', 'score',
241
custom_ranking=custom_rank(table.score).over(
242
group_by='category',
243
order_by='score'
244
)
245
)
246
```
247
248
### UDF Type System
249
250
Specify precise types for UDF inputs and outputs.
251
252
```python { .api }
253
# Type specification examples
254
str -> str # String input, string output
255
(int, float) -> bool # Two inputs, boolean output
256
[int] -> float # Array input, scalar output
257
{str: int} -> [str] # Map input, array output
258
Optional[str] -> str # Nullable input
259
```
260
261
**Usage Examples:**
262
```python
263
# Complex type signatures
264
@udf.scalar((Optional[str], int) -> Optional[str])
265
def truncate_string(s, max_len):
266
if s is None:
267
return None
268
return s[:max_len] if len(s) > max_len else s
269
270
# Array operations
271
@udf.scalar([float] -> float)
272
def array_variance(arr):
273
if not arr or len(arr) < 2:
274
return None
275
276
mean = sum(arr) / len(arr)
277
return sum((x - mean) ** 2 for x in arr) / (len(arr) - 1)
278
279
# Map operations
280
@udf.scalar({str: int} -> int)
281
def map_sum(mapping):
282
return sum(mapping.values()) if mapping else 0
283
```
284
285
### Backend-Specific UDFs
286
287
Create UDFs optimized for specific backends.
288
289
**Usage Examples:**
290
```python
291
# Backend-specific implementation
292
@udf.scalar(str -> str, backend='postgres')
293
def postgres_upper(s):
294
# PostgreSQL-specific implementation
295
return s.upper() if s else None
296
297
@udf.scalar(str -> str, backend='bigquery')
298
def bigquery_upper(s):
299
# BigQuery-specific implementation
300
return s.upper() if s else None
301
302
# SQL-based UDF (for SQL backends)
303
@udf.scalar(int -> int, sql="CASE WHEN {0} > 0 THEN {0} ELSE 0 END")
304
def positive_only(x):
305
pass # Implementation provided via SQL
306
307
result = table.select(
308
positive_value=positive_only(table.value)
309
)
310
```
311
312
### UDF with External Dependencies
313
314
Use external libraries in UDFs.
315
316
**Usage Examples:**
317
```python
318
# UDF with external dependencies
319
@udf.scalar(str -> float)
320
def sentiment_score(text):
321
# This would require textblob to be available
322
try:
323
from textblob import TextBlob
324
if not text:
325
return 0.0
326
blob = TextBlob(text)
327
return blob.sentiment.polarity
328
except ImportError:
329
return 0.0 # Fallback if library not available
330
331
# JSON processing UDF
332
@udf.scalar(str -> dict)
333
def parse_json_safe(json_str):
334
import json
335
try:
336
return json.loads(json_str) if json_str else {}
337
except json.JSONDecodeError:
338
return {}
339
340
result = table.select(
341
'review_text',
342
sentiment=sentiment_score(table.review_text),
343
metadata=parse_json_safe(table.metadata_json)
344
)
345
```
346
347
### UDF Error Handling
348
349
Handle errors gracefully in UDFs.
350
351
**Usage Examples:**
352
```python
353
# Safe division UDF
354
@udf.scalar((float, float) -> Optional[float])
355
def safe_divide(a, b):
356
try:
357
return a / b if b != 0 else None
358
except (TypeError, ZeroDivisionError):
359
return None
360
361
# String processing with error handling
362
@udf.scalar(str -> Optional[str])
363
def extract_domain(email):
364
try:
365
if email and '@' in email:
366
return email.split('@')[1].lower()
367
return None
368
except (AttributeError, IndexError):
369
return None
370
371
result = table.select(
372
'email',
373
domain=extract_domain(table.email),
374
ratio=safe_divide(table.numerator, table.denominator)
375
)
376
```
377
378
### Performance Considerations
379
380
Tips for efficient UDF implementation.
381
382
**Usage Examples:**
383
```python
384
# Vectorized operations when possible
385
@udf.scalar([float] -> float)
386
def efficient_mean(values):
387
# Use numpy for better performance if available
388
try:
389
import numpy as np
390
return float(np.mean(values)) if values else None
391
except ImportError:
392
return sum(values) / len(values) if values else None
393
394
# Caching expensive computations
395
@udf.scalar(str -> str)
396
def expensive_transformation(text):
397
# Cache results for repeated calls
398
if not hasattr(expensive_transformation, 'cache'):
399
expensive_transformation.cache = {}
400
401
if text in expensive_transformation.cache:
402
return expensive_transformation.cache[text]
403
404
# Expensive computation here
405
result = text.upper() # Simplified example
406
expensive_transformation.cache[text] = result
407
return result
408
```