Skip to content

Commit

Permalink
Merge pull request #4466 from janezd/owcontinuize-simplify-normalize
Browse files Browse the repository at this point in the history
[ENH] OWContinuize: Provide the same options as in Preprocess/Normalize
  • Loading branch information
janezd authored Mar 2, 2020
2 parents 6e010ef + a89beee commit e572020
Show file tree
Hide file tree
Showing 2 changed files with 185 additions and 159 deletions.
223 changes: 98 additions & 125 deletions Orange/widgets/data/owcontinuize.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from functools import reduce
from types import SimpleNamespace

from AnyQt.QtCore import Qt

import Orange.data
from Orange.util import Reprable
from Orange.statistics import distribution
from Orange.preprocess import Continuize, Normalize
from Orange.preprocess.transformation import \
Identity, Indicator, Indicator1, Normalizer
from Orange.preprocess import Continuize
from Orange.preprocess.transformation import Identity, Indicator, Normalizer
from Orange.data.table import Table
from Orange.widgets import gui, widget
from Orange.widgets.settings import Setting
Expand All @@ -34,16 +34,13 @@ class Outputs:
buttons_area_orientation = Qt.Vertical
resizing_enabled = False

# continuous treats
Leave, NormalizeBySpan, NormalizeBySD = range(3)
Normalize = SimpleNamespace(Leave=0, Standardize=1, Center=2, Scale=3,
Normalize11=4, Normalize01=5)

settings_version = 2
multinomial_treatment = Setting(0)
zero_based = Setting(1)
continuous_treatment = Setting(Leave)
continuous_treatment = Setting(Normalize.Leave)
class_treatment = Setting(0)

transform_class = Setting(False)

autosend = Setting(True)

multinomial_treats = (
Expand All @@ -56,9 +53,13 @@ class Outputs:
("Divide by number of values", Continuize.AsNormalizedOrdinal))

continuous_treats = (
("Leave them as they are", Continuize.Leave),
("Normalize by span", Normalize.NormalizeBySpan),
("Normalize by standard deviation", Normalize.NormalizeBySD))
("Leave them as they are", True),
("Standardize to μ=0, σ²=1", False),
("Center to μ=0", False),
("Scale to σ²=1", True),
("Normalize to interval [-1, 1]", False),
("Normalize to interval [0, 1]", False)
)

class_treats = (
("Leave it as it is", Continuize.Leave),
Expand All @@ -67,8 +68,6 @@ class Outputs:
("One class per value", Continuize.Indicators),
)

value_ranges = ["From -1 to 1", "From 0 to 1"]

def __init__(self):
super().__init__()

Expand All @@ -84,19 +83,12 @@ def __init__(self):
btnLabels=[x[0] for x in self.continuous_treats],
callback=self.settings_changed)

box = gui.vBox(self.controlArea, "Categorical Outcomes")
box = gui.vBox(self.controlArea, "Categorical Outcome(s)")
gui.radioButtonsInBox(
box, self, "class_treatment",
btnLabels=[t[0] for t in self.class_treats],
callback=self.settings_changed)

zbbox = gui.vBox(self.controlArea, "Value Range")

gui.radioButtonsInBox(
zbbox, self, "zero_based",
btnLabels=self.value_ranges,
callback=self.settings_changed)

gui.auto_apply(self.buttonsArea, self, "autosend", box=False)

self.data = None
Expand All @@ -120,31 +112,27 @@ def setData(self, data):
self.unconditional_commit()

def enable_normalization(self):
enable = not (self.data and self.data.is_sparse())
if not enable and self.continuous_treatment in (self.NormalizeBySpan,
self.NormalizeBySD):
self.continuous_treatment = self.Leave
buttons = self.controls.continuous_treatment.buttons
buttons[self.NormalizeBySpan].setEnabled(enable)
buttons[self.NormalizeBySD].setEnabled(enable)
if self.data is not None and self.data.is_sparse():
if self.continuous_treatment == self.Normalize.Standardize:
self.continuous_treatment = self.Normalize.Scale
else:
self.continuous_treatment = self.Normalize.Leave
for button, (_, supports_sparse) \
in zip(buttons, self.continuous_treats):
button.setEnabled(supports_sparse)
else:
for button in buttons:
button.setEnabled(True)

def constructContinuizer(self):
conzer = DomainContinuizer(
zero_based=self.zero_based,
multinomial_treatment=self.multinomial_treats[self.multinomial_treatment][1],
continuous_treatment=self.continuous_treats[self.continuous_treatment][1],
continuous_treatment=self.continuous_treatment,
class_treatment=self.class_treats[self.class_treatment][1]
)
return conzer

# def sendPreprocessor(self):
# continuizer = self.constructContinuizer()
# self.send("Preprocessor", PreprocessedLearner(
# lambda data, weightId=0, tc=(self.targetValue if self.classTreatment else -1):
# Table(continuizer(data, weightId, tc)
# if data.domain.has_discrete_class
# else continuizer(data, weightId), data)))

def commit(self):
continuizer = self.constructContinuizer()
if self.data:
Expand All @@ -155,16 +143,28 @@ def commit(self):
else:
self.Outputs.data.send(self.data) # None or empty data


def send_report(self):
self.report_items(
"Settings",
[("Categorical features",
self.multinomial_treats[self.multinomial_treatment][0]),
("Numeric features",
self.continuous_treats[self.continuous_treatment][0]),
("Class", self.class_treats[self.class_treatment][0]),
("Value range", self.value_ranges[self.zero_based])])
("Class", self.class_treats[self.class_treatment][0])])

@classmethod
def migrate_settings(cls, settings, version):
if version < 2:
Normalize = cls.Normalize
cont_treat = settings.pop("continuous_treatment", 0)
zero_based = settings.pop("zero_based", True)
if cont_treat == 1:
if zero_based:
settings["continuous_treatment"] = Normalize.Normalize01
else:
settings["continuous_treatment"] = Normalize.Normalize11
elif cont_treat == 2:
settings["continuous_treatment"] = Normalize.Standardize


class WeightedIndicator(Indicator):
Expand All @@ -179,56 +179,33 @@ def transform(self, c):
return t


class WeightedIndicator1(Indicator1):
def __init__(self, variable, value, weight=1.0):
super().__init__(variable, value)
self.weight = weight

def transform(self, c):
t = super().transform(c) * self.weight
if self.weight != 1.0:
t *= self.weight
return t


def make_indicator_var(source, value_ind, weight=None, zero_based=True):
if zero_based and weight is None:
def make_indicator_var(source, value_ind, weight=None):
if weight is None:
indicator = Indicator(source, value=value_ind)
elif zero_based:
indicator = WeightedIndicator(source, value=value_ind, weight=weight)
elif weight is None:
indicator = Indicator1(source, value=value_ind)
else:
indicator = WeightedIndicator1(source, value=value_ind, weight=weight)
indicator = WeightedIndicator(source, value=value_ind, weight=weight)
return Orange.data.ContinuousVariable(
"{}={}".format(source.name, source.values[value_ind]),
compute_value=indicator
)


def dummy_coding(var, base_value=0, zero_based=True):
def dummy_coding(var, base_value=0):
N = len(var.values)
return [make_indicator_var(var, i, zero_based=zero_based)
return [make_indicator_var(var, i)
for i in range(N) if i != base_value]


def one_hot_coding(var, zero_based=True):
def one_hot_coding(var):
N = len(var.values)
return [make_indicator_var(var, i, zero_based=zero_based)
for i in range(N)]
return [make_indicator_var(var, i) for i in range(N)]


def continuize_domain(data_or_domain,
def continuize_domain(data,
multinomial_treatment=Continuize.Indicators,
continuous_treatment=Continuize.Leave,
class_treatment=Continuize.Leave,
zero_based=True):

if isinstance(data_or_domain, Orange.data.Domain):
data, domain = None, data_or_domain
else:
data, domain = data_or_domain, data_or_domain.domain

class_treatment=Continuize.Leave):
domain = data.domain
def needs_dist(var, mtreat, ctreat):
"Does the `var` need a distribution given specified flags"
if var.is_discrete:
Expand Down Expand Up @@ -258,14 +235,11 @@ def needs_dist(var, mtreat, ctreat):
dist_iter = iter(dist)

newattrs = [continuize_var(var, next(dist_iter) if needs_dist else None,
multinomial_treatment, continuous_treatment,
zero_based)
multinomial_treatment, continuous_treatment)
for var, needs_dist in zip(domain.attributes, attr_needs_dist)]

newclass = [continuize_var(var,
next(dist_iter) if needs_dist else None,
class_treatment, Continuize.Remove,
zero_based)
class_treatment, Continuize.Remove)
for var, needs_dist in zip(domain.class_vars, cls_needs_dist)]

newattrs = reduce(list.__iadd__, newattrs, [])
Expand All @@ -276,16 +250,16 @@ def needs_dist(var, mtreat, ctreat):
def continuize_var(var,
data_or_dist=None,
multinomial_treatment=Continuize.Indicators,
continuous_treatment=Continuize.Leave,
zero_based=True):

continuous_treatment=Continuize.Leave):
def continuize_continuous():
if continuous_treatment == Normalize.NormalizeBySpan:
return [normalize_by_span(var, data_or_dist, zero_based)]
elif continuous_treatment == Normalize.NormalizeBySD:
return [normalize_by_sd(var, data_or_dist)]
else:
dist = _ensure_dist(var, data_or_dist)
treatments = [lambda var, _: var,
normalize_by_sd, center_to_mean, divide_by_sd,
normalize_to_11, normalize_to_01]
if dist.shape[1] == 0:
return [var]
new_var = treatments[continuous_treatment](var, dist)
return [new_var]

def continuize_discrete():
if len(var.values) > 2 and \
Expand All @@ -299,16 +273,16 @@ def continuize_discrete():
elif multinomial_treatment == Continuize.AsOrdinal:
return [ordinal_to_continuous(var)]
elif multinomial_treatment == Continuize.AsNormalizedOrdinal:
return [ordinal_to_norm_continuous(var, zero_based)]
return [ordinal_to_norm_continuous(var)]
elif multinomial_treatment == Continuize.Indicators:
return one_hot_coding(var, zero_based)
return one_hot_coding(var)
elif multinomial_treatment in (
Continuize.FirstAsBase, Continuize.RemoveMultinomial):
return dummy_coding(var, zero_based=zero_based)
return dummy_coding(var)
elif multinomial_treatment == Continuize.FrequentAsBase:
dist = _ensure_dist(var, data_or_dist)
modus = dist.modus()
return dummy_coding(var, base_value=modus, zero_based=zero_based)
return dummy_coding(var, base_value=modus)
elif multinomial_treatment == Continuize.Leave:
return [var]
raise ValueError("Invalid value of `multinomial_treatment`")
Expand Down Expand Up @@ -345,68 +319,67 @@ def ordinal_to_continuous(var):
compute_value=Identity(var))


def ordinal_to_norm_continuous(var, zero_based=True):
def ordinal_to_norm_continuous(var):
n_values = len(var.values)
if zero_based:
return normalized_var(var, 0, 1 / (n_values - 1))
else:
return normalized_var(var, (n_values - 1) / 2, 2 / (n_values - 1))
return normalized_var(var, 0, 1 / (n_values - 1))


def normalize_by_span(var, data_or_dist, zero_based=True):
dist = _ensure_dist(var, data_or_dist)
if dist.shape[1] > 0:
v_max, v_min = dist.max(), dist.min()
else:
v_max, v_min = 0, 0
def normalize_by_sd(var, dist):
mean, sd = dist.mean(), dist.standard_deviation()
sd = sd if sd > 1e-10 else 1
return normalized_var(var, mean, 1 / sd)


def center_to_mean(var, dist):
return normalized_var(var, dist.mean(), 1)


def divide_by_sd(var, dist):
sd = dist.standard_deviation()
sd = sd if sd > 1e-10 else 1
return normalized_var(var, 0, 1 / sd)


def normalize_to_11(var, dist):
return normalize_by_span(var, dist, False)


def normalize_to_01(var, dist):
return normalize_by_span(var, dist, True)


def normalize_by_span(var, dist, zero_based=True):
v_max, v_min = dist.max(), dist.min()
span = (v_max - v_min)
if span < 1e-15:
span = 1

if zero_based:
return normalized_var(var, v_min, 1 / span)
else:
return normalized_var(var, (v_min + v_max) / 2, 2 / span)


def normalize_by_sd(var, data_or_dist):
dist = _ensure_dist(var, data_or_dist)
if dist.shape[1] > 0:
mean, sd = dist.mean(), dist.standard_deviation()
else:
mean, sd = 0, 1
sd = sd if sd > 1e-10 else 1
return normalized_var(var, mean, 1 / sd)


class DomainContinuizer(Reprable):
def __init__(self, zero_based=True,
def __init__(self,
multinomial_treatment=Continuize.Indicators,
continuous_treatment=Continuize.Leave,
class_treatment=Continuize.Leave):
self.zero_based = zero_based
self.multinomial_treatment = multinomial_treatment
self.continuous_treatment = continuous_treatment
self.class_treatment = class_treatment

def __call__(self, data):
treat = self.multinomial_treatment
if isinstance(data, Orange.data.Domain):
domain, data = data, None
else:
domain = data.domain

domain = data.domain
if (treat == Continuize.ReportError and
any(var.is_discrete and len(var.values) > 2 for var in domain)):
raise ValueError("Domain has multinomial attributes")

newdomain = continuize_domain(
data or domain,
data,
self.multinomial_treatment,
self.continuous_treatment,
self.class_treatment,
self.zero_based
)
self.class_treatment)
return newdomain


Expand Down
Loading

0 comments on commit e572020

Please sign in to comment.