# Outlier Detection and Handling

Transformers for identifying and handling outliers with statistical methods, including Winsorization, capping, and trimming, to improve data quality and model robustness.

## Capabilities

### Winsorizer

Caps outliers at automatically learned boundaries, replacing extreme values with the boundary value. Boundaries can be derived with the Gaussian, IQR, MAD, or quantile method.

```python { .api }
class Winsorizer:
    def __init__(self, capping_method='gaussian', tail='right', fold=3, variables=None, missing_values='raise'):
        """
        Initialize Winsorizer.

        Parameters:
        - capping_method (str): Method to identify outliers - 'gaussian', 'iqr', 'mad', or 'quantiles'
        - tail (str): Which tail to cap - 'right', 'left', or 'both'
        - fold (int/float): Factor for outlier boundary calculation. With 'gaussian', 'iqr', and 'mad' it multiplies the spread statistic; with 'quantiles' it is the percentile capped at each tail (e.g. 0.05)
        - variables (list): List of numerical variables to process. If None, selects all numerical variables
        - missing_values (str): How to handle missing values - 'raise' or 'ignore'
        """

    def fit(self, X, y=None):
        """
        Learn outlier boundaries for each variable.

        Parameters:
        - X (pandas.DataFrame): Training dataset
        - y (pandas.Series, optional): Target variable (not used)

        Returns:
        - self
        """

    def transform(self, X):
        """
        Cap outliers to learned boundaries.

        Parameters:
        - X (pandas.DataFrame): Dataset to transform

        Returns:
        - pandas.DataFrame: Dataset with outliers capped to boundary values
        """

    def fit_transform(self, X, y=None):
        """Fit to data, then transform it."""
```

**Usage Example**:
```python
from feature_engine.outliers import Winsorizer
import pandas as pd
import numpy as np

# Sample data with outliers
np.random.seed(42)
data = {
    'var1': np.concatenate([np.random.normal(50, 10, 95), [120, 130, 140, 150, 160]]),
    'var2': np.concatenate([np.random.normal(100, 20, 97), [200, 220, 250]])
}
df = pd.DataFrame(data)

# Gaussian method (mean ± 3*std)
winsorizer = Winsorizer(capping_method='gaussian', fold=3, tail='both')
df_capped = winsorizer.fit_transform(df)

# IQR method (Q1 - 1.5*IQR, Q3 + 1.5*IQR)
winsorizer = Winsorizer(capping_method='iqr', fold=1.5, tail='both')
df_capped = winsorizer.fit_transform(df)

# Only cap right tail (upper outliers)
winsorizer = Winsorizer(capping_method='gaussian', fold=2, tail='right')
df_capped = winsorizer.fit_transform(df)

# Access learned boundaries
print(winsorizer.right_tail_caps_)  # Upper boundaries per variable
print(winsorizer.left_tail_caps_)   # Lower boundaries per variable (empty dict when tail='right')
```

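Because the boundaries are learned during `fit`, the same caps are reused on unseen data. A minimal sketch, assuming the `df` built above and using scikit-learn's `train_test_split`:

```python
from sklearn.model_selection import train_test_split

# Learn boundaries on the training split only, then apply them to new data
X_train, X_test = train_test_split(df, test_size=0.3, random_state=0)

winsorizer = Winsorizer(capping_method='iqr', fold=1.5, tail='both')
winsorizer.fit(X_train)                       # boundaries computed from X_train only
X_test_capped = winsorizer.transform(X_test)  # the same boundaries cap X_test
```

This avoids leaking information from the test set into the capping thresholds.
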
### Arbitrary Outlier Capper

Caps outliers to arbitrary values defined by the user.

```python { .api }
class ArbitraryOutlierCapper:
    def __init__(self, max_capping_dict=None, min_capping_dict=None, variables=None, missing_values='raise'):
        """
        Initialize ArbitraryOutlierCapper.

        Parameters:
        - max_capping_dict (dict): Dictionary mapping variables to maximum allowed values
        - min_capping_dict (dict): Dictionary mapping variables to minimum allowed values
        - variables (list): List of numerical variables to process. If None, uses variables from capping dictionaries
        - missing_values (str): How to handle missing values - 'raise' or 'ignore'
        """

    def fit(self, X, y=None):
        """
        Validate capping dictionaries and variables.

        Parameters:
        - X (pandas.DataFrame): Training dataset
        - y (pandas.Series, optional): Target variable (not used)

        Returns:
        - self
        """

    def transform(self, X):
        """
        Cap outliers using user-defined boundaries.

        Parameters:
        - X (pandas.DataFrame): Dataset to transform

        Returns:
        - pandas.DataFrame: Dataset with outliers capped to specified values
        """

    def fit_transform(self, X, y=None):
        """Fit to data, then transform it."""
```

**Usage Example**:
```python
from feature_engine.outliers import ArbitraryOutlierCapper

# Define custom capping values
max_capping_dict = {'var1': 80, 'var2': 150}
min_capping_dict = {'var1': 20, 'var2': 50}

capper = ArbitraryOutlierCapper(
    max_capping_dict=max_capping_dict,
    min_capping_dict=min_capping_dict
)
df_capped = capper.fit_transform(df)

# Cap only maximum values
capper = ArbitraryOutlierCapper(max_capping_dict={'var1': 100})
df_capped = capper.fit_transform(df)

# Access capping dictionaries
print(capper.right_tail_caps_)  # Maximum capping values
print(capper.left_tail_caps_)   # Minimum capping values
```

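The capping dictionaries can also be built programmatically, for instance from training-set percentiles; a hedged sketch assuming the `df` from the earlier examples:

```python
# Derive capping values from training percentiles (illustrative choice of 1st/99th)
max_capping_dict = df.quantile(0.99).to_dict()  # e.g. {'var1': ..., 'var2': ...}
min_capping_dict = df.quantile(0.01).to_dict()

capper = ArbitraryOutlierCapper(
    max_capping_dict=max_capping_dict,
    min_capping_dict=min_capping_dict
)
df_capped = capper.fit_transform(df)
```
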
### Outlier Trimmer

Removes outlier observations from the dataset instead of capping them.

```python { .api }
class OutlierTrimmer:
    def __init__(self, capping_method='gaussian', tail='right', fold=3, variables=None, missing_values='raise'):
        """
        Initialize OutlierTrimmer.

        Parameters:
        - capping_method (str): Method to identify outliers - 'gaussian', 'iqr', 'mad', or 'quantiles'
        - tail (str): Which tail to consider for outlier detection - 'right', 'left', or 'both'
        - fold (int/float): Factor for outlier boundary calculation
        - variables (list): List of numerical variables to evaluate. If None, selects all numerical variables
        - missing_values (str): How to handle missing values - 'raise' or 'ignore'
        """

    def fit(self, X, y=None):
        """
        Learn outlier boundaries for each variable.

        Parameters:
        - X (pandas.DataFrame): Training dataset
        - y (pandas.Series, optional): Target variable (not used)

        Returns:
        - self
        """

    def transform(self, X):
        """
        Remove observations that are outliers in any of the specified variables.

        Parameters:
        - X (pandas.DataFrame): Dataset to transform

        Returns:
        - pandas.DataFrame: Dataset with outlier observations removed
        """

    def fit_transform(self, X, y=None):
        """Fit to data, then transform it."""

    def return_outliers(self, X):
        """
        Return observations that would be removed as outliers.

        Parameters:
        - X (pandas.DataFrame): Dataset to evaluate

        Returns:
        - pandas.DataFrame: Outlier observations that would be removed
        """
```

**Usage Example**:
```python
from feature_engine.outliers import OutlierTrimmer

# Remove outliers using IQR method
trimmer = OutlierTrimmer(capping_method='iqr', fold=1.5, tail='both')
df_trimmed = trimmer.fit_transform(df)

# Remove only upper outliers
trimmer = OutlierTrimmer(capping_method='gaussian', fold=3, tail='right')
df_trimmed = trimmer.fit_transform(df)

# See which observations would be removed
outliers = trimmer.return_outliers(df)
print(f"Number of outliers detected: {len(outliers)}")

# Access outlier boundaries
print(trimmer.right_tail_caps_)  # Upper boundaries
print(trimmer.left_tail_caps_)   # Lower boundaries
```

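Because the trimmer drops rows, any target variable must be realigned with the trimmed dataset. A minimal sketch, where `y` is a hypothetical pandas Series sharing `df`'s index:

```python
# Keep features and target aligned after trimming
trimmer = OutlierTrimmer(capping_method='iqr', fold=1.5, tail='both')
X_trimmed = trimmer.fit_transform(df)
y_trimmed = y.loc[X_trimmed.index]  # y is a hypothetical target Series indexed like df

print(df.shape, X_trimmed.shape)    # fewer rows after trimming
```
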
## Outlier Detection Methods

### Gaussian Method
Based on mean and standard deviation: `mean ± fold * std`

```python
# Example: 3-sigma rule
winsorizer = Winsorizer(capping_method='gaussian', fold=3, tail='both')
# Outliers: values beyond mean ± 3*std
```

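For intuition, the gaussian boundaries can be reproduced by hand; a sketch assuming the `df` from the earlier examples (feature-engine's exact variance estimator may differ slightly from pandas' default):

```python
# Manual gaussian boundaries for one column (pandas uses ddof=1 for std by default)
mean, std = df['var1'].mean(), df['var1'].std()
print(mean - 3 * std, mean + 3 * std)  # should be close to the learned caps for 'var1'
```
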
### IQR Method
Based on the interquartile range: `Q1 - fold * IQR` and `Q3 + fold * IQR`

```python
# Example: Standard IQR rule
winsorizer = Winsorizer(capping_method='iqr', fold=1.5, tail='both')
# Outliers: values beyond Q1 - 1.5*IQR or Q3 + 1.5*IQR
```

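The IQR boundaries can likewise be checked by hand; a sketch assuming the same `df`:

```python
# Manual IQR boundaries for one column
q1, q3 = df['var1'].quantile(0.25), df['var1'].quantile(0.75)
iqr = q3 - q1
print(q1 - 1.5 * iqr, q3 + 1.5 * iqr)  # compare with the learned caps for 'var1'
```
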
### MAD Method
Based on the median absolute deviation: `median ± fold * MAD`

```python
# Example: MAD-based detection
winsorizer = Winsorizer(capping_method='mad', fold=3.5, tail='both')
# More robust to outliers than the gaussian method
```

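A hand computation of MAD-style boundaries, assuming the same `df`. Note that some implementations rescale the MAD by a consistency constant (about 1.4826) so it estimates the standard deviation under normality, so the learned caps may differ from this raw version:

```python
# Manual MAD-style boundaries for one column (raw, unscaled MAD)
median = df['var1'].median()
mad = (df['var1'] - median).abs().median()
print(median - 3.5 * mad, median + 3.5 * mad)
```
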
### Quantiles Method
Based on specific percentiles; `fold` is the percentile capped at each tail:

```python
# Example: 5th and 95th percentiles
winsorizer = Winsorizer(capping_method='quantiles', fold=0.05, tail='both')
# Caps values below the 5th percentile and above the 95th percentile
```

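The quantile caps are simply the empirical percentiles; a sketch assuming the same `df`:

```python
# Manual 5th/95th percentile caps for one column
print(df['var1'].quantile(0.05), df['var1'].quantile(0.95))
```
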
## Usage Patterns

### Comparing Outlier Detection Methods

```python
methods = {
    'gaussian': Winsorizer(capping_method='gaussian', fold=3, tail='both'),
    'iqr': Winsorizer(capping_method='iqr', fold=1.5, tail='both'),
    'mad': Winsorizer(capping_method='mad', fold=3.5, tail='both'),
    'quantiles': Winsorizer(capping_method='quantiles', fold=0.05, tail='both')
}

results = {}
for name, method in methods.items():
    method.fit(df)
    results[name] = {
        'lower': method.left_tail_caps_,
        'upper': method.right_tail_caps_
    }

# Compare boundaries for each method
for var in df.columns:
    print(f"\n{var} boundaries:")
    for method_name, boundaries in results.items():
        lower = boundaries['lower'][var]
        upper = boundaries['upper'][var]
        print(f"  {method_name}: [{lower:.2f}, {upper:.2f}]")
```

### Pipeline Integration

```python
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.outliers import Winsorizer
from sklearn.preprocessing import StandardScaler

# Preprocessing pipeline with outlier handling
pipeline = Pipeline([
    ('imputer', MeanMedianImputer()),
    ('outlier_capper', Winsorizer(capping_method='iqr', fold=1.5)),
    ('scaler', StandardScaler())
])

# Note: because the final step is scikit-learn's StandardScaler, the output is a
# NumPy array by default rather than a DataFrame
X_processed = pipeline.fit_transform(df)
```

### Outlier Analysis

```python
from feature_engine.outliers import OutlierTrimmer

# Analyze outlier patterns
trimmer = OutlierTrimmer(capping_method='iqr', tail='both')
trimmer.fit(df)

# Get outlier observations
outliers = trimmer.return_outliers(df)

# Analyze outlier characteristics
print("Outlier Statistics:")
print(outliers.describe())

# Count outliers per variable
outlier_counts = {}
for var in df.columns:
    lower_bound = trimmer.left_tail_caps_.get(var, float('-inf'))
    upper_bound = trimmer.right_tail_caps_.get(var, float('inf'))

    outliers_count = ((df[var] < lower_bound) | (df[var] > upper_bound)).sum()
    outlier_counts[var] = outliers_count

print("\nOutliers per variable:", outlier_counts)
```

### Robust vs Non-Robust Methods

```python
# Compare gaussian (sensitive to outliers) vs MAD (robust) boundaries
gaussian_winsorizer = Winsorizer(capping_method='gaussian', fold=3, tail='both')
mad_winsorizer = Winsorizer(capping_method='mad', fold=3.5, tail='both')

# Fit both methods
gaussian_winsorizer.fit(df)
mad_winsorizer.fit(df)

# Compare how many observations would be capped
for var in df.columns:
    # Gaussian boundaries
    g_lower = gaussian_winsorizer.left_tail_caps_[var]
    g_upper = gaussian_winsorizer.right_tail_caps_[var]
    g_outliers = ((df[var] < g_lower) | (df[var] > g_upper)).sum()

    # MAD boundaries
    m_lower = mad_winsorizer.left_tail_caps_[var]
    m_upper = mad_winsorizer.right_tail_caps_[var]
    m_outliers = ((df[var] < m_lower) | (df[var] > m_upper)).sum()

    print(f"{var}: Gaussian={g_outliers}, MAD={m_outliers} outliers")
```

## Common Attributes

All outlier transformers share these fitted attributes:

- `variables_` (list): Variables that will be processed
- `n_features_in_` (int): Number of features in the training set
- `right_tail_caps_` (dict): Upper boundary values per variable
- `left_tail_caps_` (dict): Lower boundary values per variable

The boundaries define the thresholds beyond which observations are considered outliers and will be capped (Winsorizer, ArbitraryOutlierCapper) or removed (OutlierTrimmer).
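
A minimal sketch of inspecting these attributes after fitting, assuming the `df` from the earlier examples:

```python
winsorizer = Winsorizer(capping_method='iqr', fold=1.5, tail='both')
winsorizer.fit(df)

print(winsorizer.variables_)        # e.g. ['var1', 'var2']
print(winsorizer.n_features_in_)    # 2
print(winsorizer.right_tail_caps_)  # upper boundary per variable
print(winsorizer.left_tail_caps_)   # lower boundary per variable
```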