Orange, a component-based data mining framework.
---
Quality: Pending (Does it follow best practices?)
Impact: Pending (No eval scenarios have been run)
Orange3 provides comprehensive data preprocessing capabilities for preparing datasets for machine learning, including transformation, normalization, discretization, and feature selection.
Convert continuous variables into discrete (categorical) variables.
class Discretize:
    """
    Discretize continuous attributes.

    Args:
        method: Discretization method (e.g. an EqualFreq/EqualWidth/EntropyMDL instance)
        n_intervals: Number of intervals for equal-width/frequency
        remove_const: Remove constant attributes
    """

    def __init__(self, method=None, n_intervals=4, remove_const=True): ...

    def __call__(self, data):
        """Apply discretization to data."""
class EqualFreq:
    """Equal frequency discretization.

    Args:
        n: Number of intervals, each holding roughly the same number of instances.
    """

    def __init__(self, n=4): ...
class EqualWidth:
    """Equal width discretization.

    Args:
        n: Number of equally wide intervals to split the value range into.
    """

    def __init__(self, n=4): ...
class EntropyMDL:
"""Entropy-based discretization with MDL criterion."""
def __call__(self, data, attribute): ...Convert discrete variables into continuous representations.
class Continuize:
    """
    Convert discrete attributes to continuous.

    Args:
        zero_based: Use 0-based encoding
        multinomial_treatment: How to handle multinomial variables
    """

    def __init__(self, zero_based=False, multinomial_treatment=None): ...

    def __call__(self, data):
        """Apply continuization to data."""
class DomainContinuizer:
"""Domain-level continuization utilities."""
def __init__(self, zero_based=False): ...
def __call__(self, data):
"""Transform domain to continuous representation."""Handle missing values in datasets.
class Impute:
    """
    Impute missing values.

    Args:
        method: Imputation method (e.g. an Average/ReplaceUnknowns/DropInstances instance)
    """

    def __init__(self, method=None): ...

    def __call__(self, data):
        """Apply imputation to data."""
class Average:
    """
    Impute with mean (continuous) or mode (discrete).
    """

    def __call__(self, data, variable): ...
class DoNotImpute:
    """Leave missing values as-is."""

    def __call__(self, data, variable): ...
class DropInstances:
    """Remove instances with missing values."""

    def __call__(self, data, variable): ...
class ReplaceUnknowns:
"""Replace unknown values with specified value."""
def __init__(self, value): ...
def __call__(self, data, variable): ...Remove problematic rows and columns.
class RemoveNaNRows:
    """Remove rows containing missing values."""

    def __call__(self, data):
        """Remove rows with NaN values."""
class RemoveNaNColumns:
"""Remove columns containing missing values."""
def __call__(self, data):
"""Remove columns with NaN values."""Scale and normalize feature values.
class Normalizer:
"""
Normalize data features.
Args:
norm_type: Normalization type ('l1', 'l2', 'max')
transform_class: Apply to class variables
zero_based: Use zero-based scaling
"""
def __init__(self, norm_type='l2', transform_class=False, zero_based=True): ...
def __call__(self, data):
"""Apply normalization to data."""Select most relevant features for analysis.
class SelectBestFeatures:
    """
    Select k best features based on scoring function.

    Args:
        method: Feature scoring method
        k: Number of features to select
    """

    def __init__(self, method=None, k=5): ...

    def __call__(self, data):
        """Select best features from data."""
class SelectRandomFeatures:
"""
Randomly select features.
Args:
k: Number of features to select
random_state: Random seed
"""
def __init__(self, k=5, random_state=None): ...
def __call__(self, data):
"""Randomly select features."""Combine multiple preprocessing steps.
class Preprocess:
"""
Preprocessing pipeline container.
Args:
preprocessors: List of preprocessing steps
"""
def __init__(self, preprocessors=None): ...
def __call__(self, data):
"""Apply all preprocessing steps sequentially."""Create new features from existing ones.
class FeatureConstructor:
    """Base class for feature construction."""

    def __call__(self, data): ...
class Polynomial:
    """Create polynomial features.

    Args:
        degree: Maximum polynomial degree of the generated features.
    """

    def __init__(self, degree=2): ...

    def __call__(self, data):
        """Generate polynomial features."""

# Basic preprocessing workflow
from Orange.data import Table
from Orange.preprocess import Discretize, Impute, Normalizer, SelectBestFeatures
# Load data
data = Table("iris")
# Discretization
discretizer = Discretize(method=Discretize.EqualFreq, n_intervals=3)
discrete_data = discretizer(data)
# Missing value imputation
imputer = Impute(method=Impute.Average())
clean_data = imputer(data)
# Normalization
normalizer = Normalizer(norm_type='l2')
normalized_data = normalizer(data)
# Feature selection
selector = SelectBestFeatures(k=3)
selected_data = selector(data)
# Preprocessing pipeline
from Orange.preprocess import Preprocess
pipeline = Preprocess([
RemoveNaNRows(),
Impute(method=Impute.Average()),
Normalizer(norm_type='l2'),
SelectBestFeatures(k=10)
])
processed_data = pipeline(data)
# Custom discretization
from Orange.preprocess import EqualWidth, EqualFreq, EntropyMDL
equal_width = Discretize(method=EqualWidth(n=5))
equal_freq = Discretize(method=EqualFreq(n=4))
entropy_disc = Discretize(method=EntropyMDL())
# Continuization example
from Orange.preprocess import Continuize
continuizer = Continuize(zero_based=True)
continuous_data = continuizer(discrete_data)
# Advanced imputation
from Orange.preprocess import ReplaceUnknowns, DropInstances
replace_imputer = Impute(method=ReplaceUnknowns(value=0))
drop_imputer = Impute(method=DropInstances())
# Feature selection with different methods
from Orange.preprocess import SelectBestFeatures
# Note: Different scoring methods would be available in actual implementation
chi2_selector = SelectBestFeatures(method='chi2', k=5)
f_score_selector = SelectBestFeatures(method='f_classif', k=8)
print(f"Original data shape: {data.X.shape}")
print(f"Processed data shape: {processed_data.X.shape}")
print(f"Selected features: {[var.name for var in selected_data.domain.attributes]}")Install with Tessl CLI
npx tessl i tessl/pypi-orange3