# Data Preprocessing

Orange3 provides comprehensive data preprocessing capabilities for preparing datasets for machine learning, including transformation, normalization, discretization, and feature selection.

## Capabilities

### Discretization

Convert continuous variables into discrete (categorical) variables.

```python { .api }
class Discretize:
    """
    Discretize continuous attributes.

    Args:
        method: Discretization method
        n_intervals: Number of intervals for equal-width/frequency
        remove_const: Remove constant attributes
    """
    def __init__(self, method=None, n_intervals=4, remove_const=True): ...

    def __call__(self, data):
        """Apply discretization to data."""

class EqualFreq:
    """Equal frequency discretization."""
    def __init__(self, n=4): ...

class EqualWidth:
    """Equal width discretization."""
    def __init__(self, n=4): ...

class EntropyMDL:
    """Entropy-based discretization with MDL criterion."""
    def __call__(self, data, attribute): ...
```

### Continuization

Convert discrete variables into continuous representations.

```python { .api }
class Continuize:
    """
    Convert discrete attributes to continuous.

    Args:
        zero_based: Use 0-based encoding
        multinomial_treatment: How to handle multinomial variables
    """
    def __init__(self, zero_based=False, multinomial_treatment=None): ...

    def __call__(self, data):
        """Apply continuization to data."""

class DomainContinuizer:
    """Domain-level continuization utilities."""
    def __init__(self, zero_based=False): ...

    def __call__(self, data):
        """Transform domain to continuous representation."""
```

### Missing Value Imputation

Handle missing values in datasets.

```python { .api }
class Impute:
    """
    Impute missing values.

    Args:
        method: Imputation method
    """
    def __init__(self, method=None): ...

    def __call__(self, data):
        """Apply imputation to data."""

class Average:
    """
    Impute with mean (continuous) or mode (discrete).
    """
    def __call__(self, data, variable): ...

class DoNotImpute:
    """Leave missing values as-is."""
    def __call__(self, data, variable): ...

class DropInstances:
    """Remove instances with missing values."""
    def __call__(self, data, variable): ...

class ReplaceUnknowns:
    """Replace unknown values with specified value."""
    def __init__(self, value): ...

    def __call__(self, data, variable): ...
```

### Data Cleaning

Remove problematic rows and columns.

```python { .api }
class RemoveNaNRows:
    """Remove rows containing missing values."""
    def __call__(self, data):
        """Remove rows with NaN values."""

class RemoveNaNColumns:
    """Remove columns containing missing values."""
    def __call__(self, data):
        """Remove columns with NaN values."""
```

### Normalization and Scaling

Scale and normalize feature values.

```python { .api }
class Normalizer:
    """
    Normalize data features.

    Args:
        norm_type: Normalization type ('l1', 'l2', 'max')
        transform_class: Apply to class variables
        zero_based: Use zero-based scaling
    """
    def __init__(self, norm_type='l2', transform_class=False, zero_based=True): ...

    def __call__(self, data):
        """Apply normalization to data."""
```

### Feature Selection

Select most relevant features for analysis.

```python { .api }
class SelectBestFeatures:
    """
    Select k best features based on scoring function.

    Args:
        method: Feature scoring method
        k: Number of features to select
    """
    def __init__(self, method=None, k=5): ...

    def __call__(self, data):
        """Select best features from data."""

class SelectRandomFeatures:
    """
    Randomly select features.

    Args:
        k: Number of features to select
        random_state: Random seed
    """
    def __init__(self, k=5, random_state=None): ...

    def __call__(self, data):
        """Randomly select features."""
```

### Preprocessing Pipelines

Combine multiple preprocessing steps.

```python { .api }
class Preprocess:
    """
    Preprocessing pipeline container.

    Args:
        preprocessors: List of preprocessing steps
    """
    def __init__(self, preprocessors=None): ...

    def __call__(self, data):
        """Apply all preprocessing steps sequentially."""
```

### Feature Construction

Create new features from existing ones.

```python { .api }
class FeatureConstructor:
    """Base class for feature construction."""
    def __call__(self, data): ...

class Polynomial:
    """Create polynomial features."""
    def __init__(self, degree=2): ...

    def __call__(self, data):
        """Generate polynomial features."""
```

### Usage Examples

```python
# Basic preprocessing workflow
from Orange.data import Table
from Orange.preprocess import Discretize, Impute, Normalizer, SelectBestFeatures

# Load data
data = Table("iris")

# Discretization
discretizer = Discretize(method=Discretize.EqualFreq, n_intervals=3)
discrete_data = discretizer(data)

# Missing value imputation
imputer = Impute(method=Impute.Average())
clean_data = imputer(data)

# Normalization
normalizer = Normalizer(norm_type='l2')
normalized_data = normalizer(data)

# Feature selection
selector = SelectBestFeatures(k=3)
selected_data = selector(data)

# Preprocessing pipeline
from Orange.preprocess import Preprocess, RemoveNaNRows
pipeline = Preprocess([
    RemoveNaNRows(),
    Impute(method=Impute.Average()),
    Normalizer(norm_type='l2'),
    SelectBestFeatures(k=10)
])
processed_data = pipeline(data)

# Custom discretization
from Orange.preprocess import EqualWidth, EqualFreq, EntropyMDL
equal_width = Discretize(method=EqualWidth(n=5))
equal_freq = Discretize(method=EqualFreq(n=4))
entropy_disc = Discretize(method=EntropyMDL())

# Continuization example
from Orange.preprocess import Continuize
continuizer = Continuize(zero_based=True)
continuous_data = continuizer(discrete_data)

# Advanced imputation
from Orange.preprocess import ReplaceUnknowns, DropInstances
replace_imputer = Impute(method=ReplaceUnknowns(value=0))
drop_imputer = Impute(method=DropInstances())

# Feature selection with different methods
from Orange.preprocess import SelectBestFeatures
# Note: Different scoring methods would be available in actual implementation
chi2_selector = SelectBestFeatures(method='chi2', k=5)
f_score_selector = SelectBestFeatures(method='f_classif', k=8)

print(f"Original data shape: {data.X.shape}")
print(f"Processed data shape: {processed_data.X.shape}")
print(f"Selected features: {[var.name for var in selected_data.domain.attributes]}")
```