Skip to content

Commit

Permalink
Merge pull request #2144 from jerneju/zerodivision_index-continuize
Browse files: browse the repository at this point in the history
[FIX] Continuize: prevent crashing - column with equal and NaN values
  • Loading branch information
astaric authored Apr 21, 2017
2 parents 9274a20 + e8d303f commit 1d37f63
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 14 deletions.
35 changes: 21 additions & 14 deletions Orange/widgets/data/owcontinuize.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
from functools import reduce

import numpy as np

from AnyQt import QtWidgets
from AnyQt.QtCore import Qt

import Orange.data
from Orange.util import Reprable
from Orange.statistics import distribution
from Orange.preprocess import Continuize, Normalize
from Orange.preprocess.transformation import \
Identity, Indicator, Indicator1, Normalizer
from Orange.data.table import Table
from Orange.widgets import gui, widget
from Orange.widgets.settings import Setting
Expand Down Expand Up @@ -138,12 +144,6 @@ def send_report(self):
("Value range", self.value_ranges[self.zero_based])])


from Orange.preprocess.transformation import \
Identity, Indicator, Indicator1, Normalizer

from functools import reduce


class WeightedIndicator(Indicator):
def __init__(self, variable, value, weight=1.0):
super().__init__(variable, value)
Expand All @@ -156,7 +156,7 @@ def transform(self, c):
return t


class WeightedIndicator_1(Indicator1):
class WeightedIndicator1(Indicator1):
def __init__(self, variable, value, weight=1.0):
super().__init__(variable, value)
self.weight = weight
Expand All @@ -176,7 +176,7 @@ def make_indicator_var(source, value_ind, weight=None, zero_based=True):
elif weight is None:
indicator = Indicator1(source, value=value_ind)
else:
indicator = WeightedIndicator_1(source, value=value_ind, weight=weight)
indicator = WeightedIndicator1(source, value=value_ind, weight=weight)
return Orange.data.ContinuousVariable(
"{}={}".format(source.name, source.values[value_ind]),
compute_value=indicator
Expand Down Expand Up @@ -279,7 +279,7 @@ def continuize_var(var,
elif multinomial_treatment == Continuize.AsOrdinal:
return [ordinal_to_continuous(var)]
elif multinomial_treatment == Continuize.AsNormalizedOrdinal:
return [ordinal_to_normalized_continuous(var, zero_based)]
return [ordinal_to_norm_continuous(var, zero_based)]
elif multinomial_treatment == Continuize.Indicators:
return one_hot_coding(var, zero_based)
elif multinomial_treatment == Continuize.FirstAsBase or \
Expand Down Expand Up @@ -320,7 +320,7 @@ def ordinal_to_continuous(var):
compute_value=Identity(var))


def ordinal_to_normalized_continuous(var, zero_based=True):
def ordinal_to_norm_continuous(var, zero_based=True):
n_values = len(var.values)
if zero_based:
return normalized_var(var, 0, 1 / (n_values - 1))
Expand All @@ -330,8 +330,11 @@ def ordinal_to_normalized_continuous(var, zero_based=True):

def normalize_by_span(var, data_or_dist, zero_based=True):
dist = _ensure_dist(var, data_or_dist)
v_max, v_min = dist.max(), dist.min()
span = v_max - v_min
if dist.shape[1] > 0:
v_max, v_min = dist.max(), dist.min()
else:
v_max, v_min = 0, 0
span = (v_max - v_min)
if span < 1e-15:
span = 1

Expand All @@ -343,7 +346,11 @@ def normalize_by_span(var, data_or_dist, zero_based=True):

def normalize_by_sd(var, data_or_dist):
    """Return `var` normalized by the mean and standard deviation of its data.

    The distribution is obtained through `_ensure_dist(var, data_or_dist)`,
    so `data_or_dist` may be whatever that helper accepts.

    The statistics are read only when the distribution is non-empty along
    its second axis (`dist.shape[1] > 0`); otherwise the neutral values
    mean=0, sd=1 are used.  Per the tests accompanying this change (GH-2144),
    this covers a column consisting solely of NaN values.

    Returns the result of `normalized_var(var, mean, 1 / sd)` -- presumably a
    variable computing `(x - mean) / sd`; confirm against `normalized_var`.
    """
    dist = _ensure_dist(var, data_or_dist)
    # Query the statistics only behind the emptiness guard: an unconditional
    # call before this check would make the guard pointless.
    if dist.shape[1] > 0:
        mean, sd = dist.mean(), dist.standard_deviation()
    else:
        mean, sd = 0, 1
    # A (near-)zero deviation, e.g. a column of equal values, would make
    # 1 / sd blow up.  NaN also fails the `>` comparison, so it falls back
    # to 1 as well.
    sd = sd if sd > 1e-10 else 1
    return normalized_var(var, mean, 1 / sd)


Expand All @@ -365,7 +372,7 @@ def __call__(self, data):
domain = data.domain

if (treat == Continuize.ReportError and
any(var.is_discrete and len(var.values) > 2 for var in domain)):
any(var.is_discrete and len(var.values) > 2 for var in domain)):
raise ValueError("Domain has multinomial attributes")

newdomain = continuize_domain(
Expand Down
50 changes: 50 additions & 0 deletions Orange/widgets/data/tests/test_owcontinuize.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,53 @@ def test_empty_data(self):
widget.unconditional_commit()
imp_data = self.get_output("Data")
self.assertIsNone(imp_data)

def test_one_column_equal_values(self):
    """
    A single-column table in which every value is identical must not
    crash the widget when normalization by standard deviation is selected.
    GH-2144
    """
    constant_column = Table("iris")[:, 1]
    constant_column[:] = 42.0
    self.send_signal("Data", constant_column)
    # Normalize.NormalizeBySD
    self.widget.continuous_treatment = 2
    self.widget.unconditional_commit()

def test_one_column_nan_values_normalize_sd(self):
    """
    No crash on a column with NaN values when normalization by standard
    deviation is selected (a different failure than the equal-values
    case).

    Two inputs are sent: one where column 2 is entirely NaN, and one
    where only a single cell of column 2 is NaN.
    GH-2144
    """
    # `np.nan` is the canonical spelling; the `np.NaN` alias was removed
    # in NumPy 2.0.
    table = Table("iris")
    table[:, 2] = np.nan
    self.send_signal("Data", table)
    # Normalize.NormalizeBySD
    self.widget.continuous_treatment = 2
    self.widget.unconditional_commit()

    # Same treatment, but only one missing value in the column.
    table = Table("iris")
    table[1, 2] = np.nan
    self.send_signal("Data", table)
    self.widget.unconditional_commit()


def test_one_column_nan_values_normalize_span(self):
    """
    No crash on a column with NaN values when normalization by span is
    selected.

    Two inputs are sent: one where column 2 is entirely NaN, and one
    where only a single cell of column 2 is NaN.
    GH-2144
    """
    # `np.nan` is the canonical spelling; the `np.NaN` alias was removed
    # in NumPy 2.0.
    table = Table("iris")
    table[:, 2] = np.nan
    self.send_signal("Data", table)
    # Normalize.NormalizeBySpan
    self.widget.continuous_treatment = 1
    self.widget.unconditional_commit()

    # Same treatment, but only one missing value in the column.
    table = Table("iris")
    table[1, 2] = np.nan
    self.send_signal("Data", table)
    self.widget.unconditional_commit()

0 comments on commit 1d37f63

Please sign in to comment.