# Data Preprocessing

Orange3 provides comprehensive data preprocessing capabilities for preparing datasets for machine learning, including transformation, normalization, discretization, and feature selection.

## Capabilities

### Discretization

Convert continuous variables into discrete (categorical) variables.

```python { .api }
class Discretize:
    """
    Discretize continuous attributes.

    Args:
        method: Discretization method
        n_intervals: Number of intervals for equal-width/frequency
        remove_const: Remove constant attributes
    """
    def __init__(self, method=None, n_intervals=4, remove_const=True): ...

    def __call__(self, data):
        """Apply discretization to data."""

class EqualFreq:
    """Equal frequency discretization."""
    def __init__(self, n=4): ...

class EqualWidth:
    """Equal width discretization."""
    def __init__(self, n=4): ...

class EntropyMDL:
    """Entropy-based discretization with MDL criterion."""
    def __call__(self, data, attribute): ...
```
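
For example, the classes above can be combined to bin the iris measurements into three equal-width intervals. This is a minimal sketch against the API shown here; the exact interval labels depend on the data.

```python
from Orange.data import Table
from Orange.preprocess import Discretize, EqualWidth

data = Table("iris")

# Bin every continuous attribute into three equal-width intervals
discretizer = Discretize(method=EqualWidth(n=3))
discrete = discretizer(data)

# The attributes are now categorical; inspect their interval labels
for var in discrete.domain.attributes:
    print(var.name, var.values)
```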

### Continuization

Convert discrete variables into continuous representations.

```python { .api }
class Continuize:
    """
    Convert discrete attributes to continuous.

    Args:
        zero_based: Use 0-based encoding
        multinomial_treatment: How to handle multinomial variables
    """
    def __init__(self, zero_based=False, multinomial_treatment=None): ...

    def __call__(self, data):
        """Apply continuization to data."""

class DomainContinuizer:
    """Domain-level continuization utilities."""
    def __init__(self, zero_based=False): ...

    def __call__(self, data):
        """Transform domain to continuous representation."""
```
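
A short sketch against the API above: continuizing a discretized table maps discrete attributes back to numeric columns (how multi-valued attributes are expanded depends on `multinomial_treatment`). The effect is easiest to see by comparing domain sizes.

```python
from Orange.data import Table
from Orange.preprocess import Discretize, EqualFreq, Continuize

# Start from a purely discrete version of iris
data = Table("iris")
discrete = Discretize(method=EqualFreq(n=4))(data)

# Convert the discrete attributes into numeric columns
continuizer = Continuize(zero_based=True)
continuous = continuizer(discrete)

# Indicator-style encodings typically produce more columns than the discrete domain
print(len(discrete.domain.attributes), "discrete attributes")
print(len(continuous.domain.attributes), "numeric columns after continuization")
```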

### Missing Value Imputation

Handle missing values in datasets.

```python { .api }
class Impute:
    """
    Impute missing values.

    Args:
        method: Imputation method
    """
    def __init__(self, method=None): ...

    def __call__(self, data):
        """Apply imputation to data."""

class Average:
    """
    Impute with mean (continuous) or mode (discrete).
    """
    def __call__(self, data, variable): ...

class DoNotImpute:
    """Leave missing values as-is."""
    def __call__(self, data, variable): ...

class DropInstances:
    """Remove instances with missing values."""
    def __call__(self, data, variable): ...

class ReplaceUnknowns:
    """Replace unknown values with specified value."""
    def __init__(self, value): ...

    def __call__(self, data, variable): ...
```
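
A minimal imputation sketch, assuming the API above and that the bundled `heart_disease` dataset (which contains missing values) is available:

```python
import numpy as np
from Orange.data import Table
from Orange.preprocess import Impute, Average, ReplaceUnknowns

data = Table("heart_disease")
print("missing before:", np.isnan(data.X).sum())

# Mean imputation for continuous variables, mode for discrete ones
imputed = Impute(method=Average())(data)
print("missing after:", np.isnan(imputed.X).sum())

# Or substitute a fixed value for every unknown entry
zero_filled = Impute(method=ReplaceUnknowns(value=0))(data)
```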

### Data Cleaning

Remove problematic rows and columns.

```python { .api }
class RemoveNaNRows:
    """Remove rows containing missing values."""
    def __call__(self, data):
        """Remove rows with NaN values."""

class RemoveNaNColumns:
    """Remove columns containing missing values."""
    def __call__(self, data):
        """Remove columns with NaN values."""
```
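
The two cleaners can be chained. A sketch assuming the API above; the dataset name is illustrative and assumed to contain missing values:

```python
from Orange.data import Table
from Orange.preprocess import RemoveNaNRows, RemoveNaNColumns

data = Table("heart_disease")
print("rows before:", len(data))

# Drop rows that contain any missing value, then columns that still do
no_nan_rows = RemoveNaNRows()(data)
no_nan_cols = RemoveNaNColumns()(no_nan_rows)

print("rows after:", len(no_nan_rows))
print("columns after:", len(no_nan_cols.domain.attributes))
```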

### Normalization and Scaling

Scale and normalize feature values.

```python { .api }
class Normalizer:
    """
    Normalize data features.

    Args:
        norm_type: Normalization type ('l1', 'l2', 'max')
        transform_class: Apply to class variables
        zero_based: Use zero-based scaling
    """
    def __init__(self, norm_type='l2', transform_class=False, zero_based=True): ...

    def __call__(self, data):
        """Apply normalization to data."""
```
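
A quick check of the effect, assuming (as the `'l1'`/`'l2'`/`'max'` options suggest) that normalization rescales each row of `X`:

```python
import numpy as np
from Orange.data import Table
from Orange.preprocess import Normalizer

data = Table("iris")
normalized = Normalizer(norm_type='l2')(data)

# If rows are L2-normalized, each row norm should be close to 1
print(np.linalg.norm(normalized.X, axis=1)[:5])
```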

### Feature Selection

Select the most relevant features for analysis.

```python { .api }
class SelectBestFeatures:
    """
    Select the k best features based on a scoring function.

    Args:
        method: Feature scoring method
        k: Number of features to select
    """
    def __init__(self, method=None, k=5): ...

    def __call__(self, data):
        """Select best features from data."""

class SelectRandomFeatures:
    """
    Randomly select features.

    Args:
        k: Number of features to select
        random_state: Random seed
    """
    def __init__(self, k=5, random_state=None): ...

    def __call__(self, data):
        """Randomly select features."""
```
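
A sketch contrasting the two selectors on iris, assuming the API above (with `method=None`, `SelectBestFeatures` is assumed to fall back to a default scorer):

```python
from Orange.data import Table
from Orange.preprocess import SelectBestFeatures, SelectRandomFeatures

data = Table("iris")

# Keep the two highest-scoring attributes under the default scoring method
best = SelectBestFeatures(k=2)(data)
print([var.name for var in best.domain.attributes])

# Keep two attributes chosen at random, reproducibly via the seed
random_subset = SelectRandomFeatures(k=2, random_state=42)(data)
print([var.name for var in random_subset.domain.attributes])
```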

### Preprocessing Pipelines

Combine multiple preprocessing steps.

```python { .api }
class Preprocess:
    """
    Preprocessing pipeline container.

    Args:
        preprocessors: List of preprocessing steps
    """
    def __init__(self, preprocessors=None): ...

    def __call__(self, data):
        """Apply all preprocessing steps sequentially."""
```
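
If, as in Orange, the resulting table's domain carries the applied transformations, the same preprocessing can be reapplied to new data through `Table.transform`. A sketch under that assumption, using the API above:

```python
from Orange.data import Table
from Orange.preprocess import Preprocess, Impute, Average, Discretize, EqualFreq

# Impute first, then discretize; steps run in the order given
pipeline = Preprocess([
    Impute(method=Average()),
    Discretize(method=EqualFreq(n=3)),
])

train = Table("iris")
train_prep = pipeline(train)

# Map new data into the preprocessed domain so it receives identical treatment
new_data = Table("iris")
new_prep = new_data.transform(train_prep.domain)
```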

### Feature Construction

Create new features from existing ones.

```python { .api }
class FeatureConstructor:
    """Base class for feature construction."""
    def __call__(self, data): ...

class Polynomial:
    """Create polynomial features."""
    def __init__(self, degree=2): ...

    def __call__(self, data):
        """Generate polynomial features."""
```
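
A sketch of polynomial feature construction, assuming the `Polynomial` class above is importable from `Orange.preprocess` and returns a table whose domain includes the generated terms:

```python
from Orange.data import Table
from Orange.preprocess import Polynomial

data = Table("iris")

# Add degree-2 terms (squares and pairwise products) to the original features
expanded = Polynomial(degree=2)(data)

print("attributes before:", len(data.domain.attributes))
print("attributes after:", len(expanded.domain.attributes))
```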

### Usage Examples

```python
# Basic preprocessing workflow
from Orange.data import Table
from Orange.preprocess import (
    Discretize, EqualFreq, Impute, Average, Normalizer, SelectBestFeatures
)

# Load data
data = Table("iris")

# Discretization: three equal-frequency intervals per attribute
discretizer = Discretize(method=EqualFreq(n=3))
discrete_data = discretizer(data)

# Missing value imputation: mean for continuous, mode for discrete variables
imputer = Impute(method=Average())
clean_data = imputer(data)

# Normalization
normalizer = Normalizer(norm_type='l2')
normalized_data = normalizer(data)

# Feature selection
selector = SelectBestFeatures(k=3)
selected_data = selector(data)

# Preprocessing pipeline
from Orange.preprocess import Preprocess, RemoveNaNRows
pipeline = Preprocess([
    RemoveNaNRows(),
    Impute(method=Average()),
    Normalizer(norm_type='l2'),
    SelectBestFeatures(k=10)
])
processed_data = pipeline(data)

# Custom discretization
from Orange.preprocess import EqualWidth, EntropyMDL
equal_width = Discretize(method=EqualWidth(n=5))
equal_freq = Discretize(method=EqualFreq(n=4))
entropy_disc = Discretize(method=EntropyMDL())

# Continuization example
from Orange.preprocess import Continuize
continuizer = Continuize(zero_based=True)
continuous_data = continuizer(discrete_data)

# Advanced imputation
from Orange.preprocess import ReplaceUnknowns, DropInstances
replace_imputer = Impute(method=ReplaceUnknowns(value=0))
drop_imputer = Impute(method=DropInstances())

# Feature selection with different scoring methods
# Note: the set of available scoring methods depends on the actual implementation
chi2_selector = SelectBestFeatures(method='chi2', k=5)
f_score_selector = SelectBestFeatures(method='f_classif', k=8)

print(f"Original data shape: {data.X.shape}")
print(f"Processed data shape: {processed_data.X.shape}")
print(f"Selected features: {[var.name for var in selected_data.domain.attributes]}")
```
```