diff --git a/Orange/preprocess/transformation.py b/Orange/preprocess/transformation.py index 20b55bd7c11..a99de426022 100644 --- a/Orange/preprocess/transformation.py +++ b/Orange/preprocess/transformation.py @@ -89,6 +89,12 @@ class Identity(Transformation): def transform(self, c): return c + def __eq__(self, other): # pylint: disable=useless-parent-delegation + return super().__eq__(other) + + def __hash__(self): + return super().__hash__() + # pylint: disable=abstract-method class _Indicator(Transformation): @@ -145,6 +151,12 @@ def transform(self, c): c = c.toarray().ravel() return self._nan_fixed(c, c == self.value) + def __eq__(self, other): # pylint: disable=useless-parent-delegation + return super().__eq__(other) + + def __hash__(self): + return super().__hash__() + class Indicator1(_Indicator): """ diff --git a/Orange/widgets/data/owcontinuize.py b/Orange/widgets/data/owcontinuize.py index 6b1dfc54523..806ae11c0e1 100644 --- a/Orange/widgets/data/owcontinuize.py +++ b/Orange/widgets/data/owcontinuize.py @@ -1,159 +1,724 @@ -from functools import reduce +from functools import partial from types import SimpleNamespace +from typing import NamedTuple, Dict, List -from AnyQt.QtWidgets import QGridLayout +import numpy as np +import scipy.sparse as sp -import Orange.data -from Orange.util import Reprable -from Orange.statistics import distribution -from Orange.preprocess import Continuize +from AnyQt.QtCore import Qt, QSize, QAbstractListModel, QObject, \ + QItemSelectionModel +from AnyQt.QtGui import QColor +from AnyQt.QtWidgets import QButtonGroup, QRadioButton, QListView + +from orangewidget.utils import listview +from orangewidget.utils.itemmodels import SeparatedListDelegate, \ + LabelledSeparator + +from Orange.data import DiscreteVariable, ContinuousVariable, Domain, Table +from Orange.preprocess import Continuize as Continuizer from Orange.preprocess.transformation import Identity, Indicator, Normalizer -from Orange.data.table import Table from 
Orange.widgets import gui, widget from Orange.widgets.settings import Setting +from Orange.widgets.utils.itemmodels import DomainModel from Orange.widgets.utils.sql import check_sql_input from Orange.widgets.utils.widgetpreview import WidgetPreview from Orange.widgets.widget import Input, Output +class MethodDesc(NamedTuple): + id_: int + label: str # Label used for radio button + short_desc: str # Short description for list views + tooltip: str # Tooltip for radio button + supports_sparse: bool = True + + +DefaultKey = "" +DefaultId = 99 +BackCompatClass = object() + +Continuize = SimpleNamespace( + Default=DefaultId, + **{v.name: v.value for v in Continuizer.MultinomialTreatment}) + +DiscreteOptions: Dict[int, MethodDesc] = { + method.id_: method + for method in ( + MethodDesc( + Continuize.Default, "Use preset", "preset", + "Treat the variable as defined in preset"), + MethodDesc( + Continuize.Leave, "Keep categorical", "keep as is", + "Keep the variable discrete"), + MethodDesc( + Continuize.FirstAsBase, "First value as base", "first as base", + "One indicator variable for each value except the first"), + MethodDesc( + Continuize.FrequentAsBase, "Most frequent as base", "frequent as base", + "One indicator variable for each value except the most frequent", + False), + MethodDesc( + Continuize.Indicators, "One-hot encoding", "one-hot", + "One indicator variable for each value", + False), + MethodDesc( + Continuize.RemoveMultinomial, "Remove if more than 2 values", "remove if >2", + "Remove variables with more than two values; indicator otherwise"), + MethodDesc( + Continuize.Remove, "Remove", "remove", + "Remove variable"), + MethodDesc( + Continuize.AsOrdinal, "Treat as ordinal", "as ordinal", + "Each value gets a consecutive number from 0 to number of values - 1"), + MethodDesc( + Continuize.AsNormalizedOrdinal, "Treat as normalized ordinal", "as norm. 
ordinal", + "Same as above, but scaled to [0, 1]") + )} + +ContinuizationDefault = Continuize.FirstAsBase + + +Normalize = SimpleNamespace(Default=DefaultId, + Leave=0, Standardize=1, Center=2, Scale=3, + Normalize11=4, Normalize01=5) + +ContinuousOptions: Dict[int, MethodDesc] = { + method.id_: method + for method in ( + MethodDesc( + Normalize.Default, "Use preset", "preset", + "Treat the variable as defined in 'default setting'"), + MethodDesc( + Normalize.Leave, "Keep as it is", "no change", + "Keep the variable as it is"), + MethodDesc( + Normalize.Standardize, "Standardize to μ=0, σ²=1", "standardize", + "Subtract the mean and divide by standard deviation", + False), + MethodDesc( + Normalize.Center, "Center to μ=0", "center", + "Subtract the mean", + False), + MethodDesc( + Normalize.Scale, "Scale to σ²=1", "scale", + "Divide by standard deviation"), + MethodDesc( + Normalize.Normalize11, "Normalize to interval [-1, 1]", "to [-1, 1]", + "Linear transformation into interval [-1, 1]", + False), + MethodDesc( + Normalize.Normalize01, "Normalize to interval [0, 1]", "to [0, 1]", + "Linear transformation into interval [0, 1]", + False), + )} + +NormalizationDefault = Normalize.Leave + + +class ContDomainModel(DomainModel): + HintRole = next(gui.OrangeUserRole) + FilterRole = next(gui.OrangeUserRole) + """Domain model that adds description of chosen methods""" + def __init__(self, valid_type): + super().__init__( + order=(DomainModel.ATTRIBUTES, + LabelledSeparator("Meta attributes"), DomainModel.METAS, + LabelledSeparator("Targets"), DomainModel.CLASSES), + valid_types=(valid_type, ), strict_type=True) + + def data(self, index, role=Qt.DisplayRole): + if role == Qt.ToolTipRole: + return None + if role == self.FilterRole: + name = super().data(index, Qt.DisplayRole) + if not isinstance(name, str): + return None + hint = index.data(self.HintRole) + if hint is None: + return name + return f"{name} {hint[0]}" + value = super().data(index, role) + if role == 
Qt.DisplayRole: + if isinstance(value, LabelledSeparator): + return None + return value, *(index.data(self.HintRole) or ("", False)) + return value + + +class DefaultContModel(QAbstractListModel): + """A model used for showing "Default settings" above the list view""" + icon = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if DefaultContModel.icon is None: + DefaultContModel.icon = gui.createAttributePixmap( + "★", QColor(0, 0, 0, 0), Qt.black) + self.method = "" + + @staticmethod + def rowCount(parent): + return 0 if parent.isValid() else 1 + + @staticmethod + def columnCount(parent): + return 0 if parent.isValid() else 1 + + def data(self, _, role=Qt.DisplayRole): + if role == Qt.DisplayRole: + return f"Preset: {self.method}" + elif role == Qt.DecorationRole: + return self.icon + elif role == Qt.ToolTipRole: + return "Default for variables without specific settings" + return None + + def setMethod(self, method): + self.method = method + self.dataChanged.emit(self.index(0, 0), self.index(0, 0)) + + +class ListViewSearch(listview.ListViewSearch): + """ + A list view with two components shown above it: + - a listview containing a single item representing default settings + - a filter for search + + The class is based on listview.ListViewSearch and needs to have the same + name in order to override its private method __layout. + + Inherited __init__ calls __layout, so `default_view` must be constructed + there. Construction before calling super().__init__ doesn't work because + PyQt does not allow it. 
+ """ + class Delegate(SeparatedListDelegate): + """ + A delegate that shows items (variables) with specific settings in bold + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._default_hints = False + + def initStyleOption(self, option, index): + super().initStyleOption(option, index) + hint = index.data(ContDomainModel.HintRole) + option.font.setBold(hint is not None and hint[1]) + + def set_default_hints(self, show): + self._default_hints = show + + def displayText(self, value, _): + if value is None: + return None + name, hint, nondefault = value + if self._default_hints or nondefault: + name += f": {hint}" + return name + + def __init__(self, *args, **kwargs): + self.default_view = None + super().__init__(preferred_size=QSize(350, -1), *args, **kwargs) + self.setItemDelegate(self.Delegate(self)) + self.force_hints = False + + def select_default(self): + """Select the item representing default settings""" + self.default_view.selectionModel().select( + self.default_view.model().index(0), + QItemSelectionModel.Select) + + def set_default_method(self, method): + self.default_view.model().setMethod(method) + + # pylint: disable=unused-private-member + def __layout(self): + if self.default_view is None: # __layout was called from __init__ + view = self.default_view = QListView(self) + view.setModel(DefaultContModel()) + self.filterProxyModel().setFilterRole(ContDomainModel.FilterRole) + view.verticalScrollBar().setDisabled(True) + view.horizontalScrollBar().setDisabled(True) + view.setHorizontalScrollBarPolicy( + Qt.ScrollBarPolicy.ScrollBarAlwaysOff) + view.setVerticalScrollBarPolicy( + Qt.ScrollBarPolicy.ScrollBarAlwaysOff) + font = view.font() + font.setBold(True) + view.setFont(font) + else: + view = self.default_view + + # Put the list view with default on top + margins = self.viewportMargins() + def_height = view.sizeHintForRow(0) + 2 * view.spacing() + 2 + view.setGeometry(0, 0, self.geometry().width(), def_height) + 
view.setFixedHeight(def_height) + + # Then search + search = self.__search + src_height = search.sizeHint().height() + size = self.size() + search.setGeometry(0, def_height + 2, size.width(), src_height) + + # Then the real list view + margins.setTop(def_height + 2 + src_height) + self.setViewportMargins(margins) + + def event(self, ev): + if ev.type() == ev.ToolTip: + self.itemDelegate().set_default_hints(True) + self.viewport().update() + return True + return super().event(ev) + + def leaveEvent(self, _): + self.itemDelegate().set_default_hints(False) + self.viewport().update() + + class OWContinuize(widget.OWWidget): + # Many false positives for `hints`; pylint ignores type annotations + # pylint: disable=unsubscriptable-object,unsupported-assignment-operation + # pylint: disable=unsupported-membership-test, unsupported-delete-operation name = "Continuize" description = ("Transform categorical attributes into numeric and, " + - "optionally, normalize numeric values.") + "optionally, scale numeric values.") icon = "icons/Continuize.svg" category = "Transform" keywords = "continuize, encode, dummy, numeric, one-hot, binary, treatment, contrast" priority = 2120 class Inputs: - data = Input("Data", Orange.data.Table) + data = Input("Data", Table) class Outputs: - data = Output("Data", Orange.data.Table) + data = Output("Data", Table) - want_main_area = False - resizing_enabled = False + class Error(widget.OWWidget.Error): + unsupported_sparse = \ + widget.Msg("Some chosen methods do not support sparse data: {}") - Normalize = SimpleNamespace(Leave=0, Standardize=1, Center=2, Scale=3, - Normalize11=4, Normalize01=5) + want_control_area = False - settings_version = 2 - multinomial_treatment = Setting(0) - continuous_treatment = Setting(Normalize.Leave) - class_treatment = Setting(0) + settings_version = 3 + disc_var_hints: Dict[str, int] = Setting( + {DefaultKey: ContinuizationDefault}, schema_only=True) + cont_var_hints: Dict[str, int] = Setting( + {DefaultKey: 
NormalizationDefault}, schema_only=True) autosend = Setting(True) - multinomial_treats = ( - ("First value as base", Continuize.FirstAsBase), - ("Most frequent value as base", Continuize.FrequentAsBase), - ("One attribute per value", Continuize.Indicators), - ("Ignore multinomial attributes", Continuize.RemoveMultinomial), - ("Remove categorical attributes", Continuize.Remove), - ("Treat as ordinal", Continuize.AsOrdinal), - ("Divide by number of values", Continuize.AsNormalizedOrdinal)) - - continuous_treats = ( - ("Leave them as they are", True), - ("Standardize to μ=0, σ²=1", False), - ("Center to μ=0", False), - ("Scale to σ²=1", True), - ("Normalize to interval [-1, 1]", False), - ("Normalize to interval [0, 1]", False) - ) - - class_treats = ( - ("Leave it as it is", Continuize.Leave), - ("Treat as ordinal", Continuize.AsOrdinal), - ("Divide by number of values", Continuize.AsNormalizedOrdinal), - ("One class per value", Continuize.Indicators), - ) - def __init__(self): super().__init__() - - layout = QGridLayout() - gui.widgetBox(self.controlArea, orientation=layout) - - box = gui.radioButtonsInBox( - None, self, "multinomial_treatment", box="Categorical Features", - btnLabels=[x[0] for x in self.multinomial_treats], - callback=self.commit.deferred) - gui.rubber(box) - layout.addWidget(box, 0, 0, 2, 1) - - box = gui.radioButtonsInBox( - None, self, "continuous_treatment", box = "Numeric Features", - btnLabels=[x[0] for x in self.continuous_treats], - callback=self.commit.deferred) - gui.rubber(box) - layout.addWidget(box, 0, 1, 2, 1) - - box = gui.radioButtonsInBox( - None, self, "class_treatment", box="Categorical Outcome(s)", - btnLabels=[t[0] for t in self.class_treats], - callback=self.commit.deferred) - gui.rubber(box) - layout.addWidget(box, 0, 2, 2, 1) - - gui.auto_apply(self.buttonsArea, self, "autosend") - self.data = None + self._var_cache = {} + + def create(title, vartype, methods): + hbox = gui.hBox(box, title) + view = ListViewSearch( + 
selectionMode=ListViewSearch.ExtendedSelection, + uniformItemSizes=True) + view.setModel(ContDomainModel(vartype)) + view.selectionModel().selectionChanged.connect( + lambda: self._on_var_selection_changed(view)) + view.default_view.selectionModel().selectionChanged.connect( + lambda selected: self._on_default_selected(view, selected)) + hbox.layout().addWidget(view) + + bbox = gui.vBox(hbox) + bgroup = QButtonGroup(self) + bgroup.idClicked.connect(self._on_radio_clicked) + for desc in methods.values(): + button = QRadioButton(desc.label) + button.setToolTip(desc.tooltip) + bgroup.addButton(button, desc.id_) + bbox.layout().addWidget(button) + bbox.layout().addStretch(1) + return hbox, view, bbox, bgroup + + box = gui.vBox(self.mainArea, True, spacing=8) + self.disc_box, self.disc_view, self.disc_radios, self.disc_group = \ + create("Categorical Variables", DiscreteVariable, DiscreteOptions) + self.disc_view.set_default_method( + DiscreteOptions[self.disc_var_hints[DefaultKey]].short_desc) + self.disc_view.select_default() + + self.cont_box, self.cont_view, self.cont_radios, self.cont_group = \ + create("Numeric Variables", ContinuousVariable, ContinuousOptions) + self.cont_view.set_default_method( + ContinuousOptions[self.cont_var_hints[DefaultKey]].short_desc) + self.cont_view.select_default() + + boxes = (self.disc_radios, self.cont_radios) + width = max(box.sizeHint().width() for box in boxes) + for box in boxes: + box.setFixedWidth(width) + + box = gui.hBox(self.mainArea) + gui.button( + box, self, "Reset All", callback=self._on_reset_hints, + autoDefault=False) + gui.rubber(box) + gui.auto_apply(box, self, "autosend") + + def _on_var_selection_changed(self, view): + if not view.selectionModel().selectedIndexes(): + # Prevent infinite recursion (with _on_default_selected) + return + view.default_view.selectionModel().clearSelection() + self._update_radios(view) + + def _on_default_selected(self, view, selected): + if not selected: + # Prevent infinite 
recursion (with _var_selection_selected) + return + view.selectionModel().clearSelection() + self._update_radios(view) + + def selected_vars(self, view) -> List[str]: + """ + Return selected variables + + If 'Default settings' are selected, this returns DefaultKey + """ + model = view.model() + return [model[index.row()] + for index in view.selectionModel().selectedRows()] + + def _update_radios(self, view): + if view is self.disc_view: + group, hints = self.disc_group, self.disc_var_hints + else: + group, hints = self.cont_group, self.cont_var_hints + + selvars = self.selected_vars(view) + if not selvars: + self._check_button(group, hints[DefaultKey], True) + self._set_radio_enabled(group, DefaultId, False) + return + + self._set_radio_enabled(group, DefaultId, True) + options = {hints.get(var.name, self.default_for_var(var)) + for var in selvars} + if len(options) == 1: + self._check_button(group, options.pop(), True) + else: + self._uncheck_all_buttons(group) + + @staticmethod + def _uncheck_all_buttons(group: QButtonGroup): + button = group.checkedButton() + if button is not None: + group.setExclusive(False) + button.setChecked(False) + group.setExclusive(True) + + def _set_radio_enabled( + self, group: QButtonGroup, method_id: int, value: bool): + if group.button(method_id).isChecked() and not value: + self._uncheck_all_buttons(group) + group.button(method_id).setEnabled(value) + + @staticmethod + def _check_button(group: QButtonGroup, method_id: int, checked: bool): + group.button(method_id).setChecked(checked) + + def _on_radio_clicked(self, method_id: int): + if QObject().sender() is self.disc_group: + view, hints, methods = \ + self.disc_view, self.disc_var_hints, DiscreteOptions + leave_id = Continuize.Leave + else: + view, hints, methods = \ + self.cont_view, self.cont_var_hints, ContinuousOptions + leave_id = Normalize.Leave + selvars = self.selected_vars(view) + if not selvars: + hints[DefaultKey] = method_id + 
view.set_default_method(methods[method_id].short_desc) + else: + keys = [var.name for var in selvars] + indexes = view.selectionModel().selectedIndexes() + model = view.model() + # These two keys may delete values from dict, hence we must loop + if method_id in (DefaultId, leave_id): + for key in keys: + # Attributes do not store the hint if it equals Default; + # metas and targets do not store it if it is Leave + if method_id == (DefaultId if self.is_attr(key) else leave_id): + if key in hints: + del hints[key] + else: + hints[key] = method_id + else: + hints.update(dict.fromkeys(keys, method_id)) + desc = methods[method_id].short_desc + for index, var in zip(indexes, selvars): + show = method_id != self.default_for_var(var) + model.setData(index, (desc, show), model.HintRole) + self.commit.deferred() @Inputs.data @check_sql_input - def setData(self, data): + def set_data(self, data): self.data = data - self.enable_normalization() - if data is None: - self.Outputs.data.send(None) - else: - self.commit.now() + self._var_cache.clear() + domain = data.domain if data else None + self.disc_view.model().set_domain(domain) + self.cont_view.model().set_domain(domain) + if data: + # Clean up hints only when receiving new data, not on disconnection + self._set_hints() + self.commit.now() + + def _set_hints(self): + assert self.data + + # Backward compatibility for settings < 3 + class_treatment = self.disc_var_hints.get(BackCompatClass, None) + if class_treatment is not None \ + and self.data.domain.class_var is not None: + self.disc_var_hints[self.data.domain.class_var.name] \ + = class_treatment + + for hints, model, options in ( + (self.cont_var_hints, self.cont_view.model(), ContinuousOptions), + (self.disc_var_hints, self.disc_view.model(), DiscreteOptions)): + filtered = {DefaultKey: hints[DefaultKey]} + for i, var in enumerate(model): + if isinstance(var, LabelledSeparator): + continue + default = self.default_for_var(var) + method_id = hints.get(var.name, default) + 
nondefault = method_id != default + if nondefault: + filtered[var.name] = method_id + model.setData( + model.index(i, 0), + (options[method_id].short_desc, nondefault), + model.HintRole) + hints.clear() + hints.update(filtered) + + def _on_reset_hints(self): + if not self.data: + return + self.cont_var_hints.clear() + self.disc_var_hints.clear() + self.disc_var_hints[DefaultKey] = ContinuizationDefault + self.cont_var_hints[DefaultKey] = NormalizationDefault + self._set_hints() + self.cont_view.set_default_method( + ContinuousOptions[NormalizationDefault].short_desc) + self.disc_view.set_default_method( + DiscreteOptions[ContinuizationDefault].short_desc) - def enable_normalization(self): - buttons = self.controls.continuous_treatment.buttons - if self.data is not None and self.data.is_sparse(): - if self.continuous_treatment == self.Normalize.Standardize: - self.continuous_treatment = self.Normalize.Scale + @gui.deferred + def commit(self): + self.Outputs.data.send(self._prepare_output()) + + def _prepare_output(self): + self.Error.unsupported_sparse.clear() + if not self.data: + return None + if unsupp_sparse := self._unsupported_sparse(): + if len(unsupp_sparse) == 1: + self.Error.unsupported_sparse(unsupp_sparse[0]) else: - self.continuous_treatment = self.Normalize.Leave - for button, (_, supports_sparse) \ - in zip(buttons, self.continuous_treats): - button.setEnabled(supports_sparse) + self.Error.unsupported_sparse("\n" + ", ".join(unsupp_sparse)) + return None + + domain = self.data.domain + attrs = self._create_vars(domain.attributes) + class_vars = self._create_vars(domain.class_vars) + metas = self._create_vars(domain.metas) + return self.data.transform(Domain(attrs, class_vars, metas)) + + def _unsupported_sparse(self): + # time is not continuous, pylint: disable=unidiomatic-typecheck + domain = self.data.domain + disc = set() + cont = set() + # At the time of writing, self.data.Y cannot be sparse (setter for + # `Y` converts it to dense, as done in + 
# https://github.com/biolab/orange3/commit/a18f38059caf37f3b329d6ad688189561959bb24) + # Including it here doesn't hurt, though. + for part, attrs in ((self.data.X, domain.attributes), + (self.data.Y, domain.class_vars), + (self.data.metas, domain.metas)): + if sp.issparse(part): + disc |= {self._hint_for_var(var) + for var in attrs + if var.is_discrete} + cont |= {self._hint_for_var(var) + for var in attrs + if type(var) is ContinuousVariable} + disc &= {method.id_ + for method in DiscreteOptions.values() + if not method.supports_sparse} + cont &= {method.id_ + for method in ContinuousOptions.values() + if not method.supports_sparse} + + # Retrieve them from DiscreteOptions/ContinuousOptions to keep the order + return [method.label + for methods, problems in ((DiscreteOptions, disc), + (ContinuousOptions, cont)) + for method in methods.values() if method.id_ in problems] + + def _create_vars(self, part): + # time is not continuous, pylint: disable=unidiomatic-typecheck + return sum( + (self._continuized_vars(var) if var.is_discrete + else self._scaled_vars(var) if type(var) is ContinuousVariable + else [var] + for var in part), + start=[]) + + def _get(self, var, stat): + def most_frequent(col): + col = col[np.isfinite(col)].astype(int) + counts = np.bincount(col, minlength=len(var.values)) + return np.argmax(counts) + + funcs = {"min": np.nanmin, "max": np.nanmax, + "mean": np.nanmean, "std": np.nanstd, + "major": most_frequent} + name = var.name + cache = self._var_cache.setdefault(name, {}) + if stat not in cache: + cache[stat] = funcs[stat](self.data.get_column(var)) + return cache[stat] + + def is_attr(self, var): + domain = self.data.domain + return 0 <= domain.index(var) < len(domain.attributes) + + def default_for_var(self, var): + if self.is_attr(var): + return DefaultId + return Continuize.Leave if var.is_discrete else Normalize.Leave + + def _hint_for_var(self, var): + if var.is_discrete: + hints, leave_id = self.disc_var_hints, Continuize.Leave else: - 
for button in buttons: - button.setEnabled(True) + hints, leave_id = self.cont_var_hints, Normalize.Leave + + # Default for attributes is given by "default" + if self.is_attr(var): + return hints.get(var.name, hints[DefaultKey]) + + # For metas and targets, default is Leave + # If user changes it to "Default", default is used + hint = hints.get(var.name, leave_id) + if hint == DefaultId: + hint = hints[DefaultKey] + return hint + + def _scaled_vars(self, var): + hint = self._hint_for_var(var) + if hint == Normalize.Leave: + return [var] - def constructContinuizer(self): - conzer = DomainContinuizer( - multinomial_treatment=self.multinomial_treats[self.multinomial_treatment][1], - continuous_treatment=self.continuous_treatment, - class_treatment=self.class_treats[self.class_treatment][1] - ) - return conzer + get = partial(self._get, var) + if hint == Normalize.Standardize: + off, scale = get("mean"), 1 / (get("std") or 1) + elif hint == Normalize.Center: + off, scale = get("mean"), 1 + elif hint == Normalize.Scale: + off, scale = 0, 1 / (get("std") or 1) + else: + assert hint in (Normalize.Normalize11, Normalize.Normalize01), f"hint={hint}?!" 
+ min_, max_ = get("min"), get("max") + span = (max_ - min_) or 1 + if hint == Normalize.Normalize11: + off, scale = (min_ + max_) / 2, 2 / span + else: + off, scale = min_, 1 / span - @gui.deferred - def commit(self): - continuizer = self.constructContinuizer() - if self.data: - domain = continuizer(self.data) - data = self.data.transform(domain) - self.Outputs.data.send(data) + return [ContinuousVariable( + var.name, + compute_value=Normalizer(var, off, scale))] + + def _continuized_vars(self, var, hint=None): + if hint is None: + hint = self._hint_for_var(var) + + # Single variable + if hint == Continuize.Leave: + return [var] + if hint == Continuize.Remove: + return [] + if hint == Continuize.RemoveMultinomial and len(var.values) <= 2 or \ + hint == Continuize.AsOrdinal: + return [ContinuousVariable(var.name, + compute_value=Identity(var))] + if hint == Continuize.RemoveMultinomial: + assert len(var.values) > 2 + return [] + if hint == Continuize.AsOrdinal: + return [ContinuousVariable(var.name, + compute_value=Identity(var))] + if hint == Continuize.AsNormalizedOrdinal: + scale = 1 / (len(var.values) - 1 or 1) + return [ContinuousVariable(var.name, + compute_value=Normalizer(var, 0, scale))] + + # Multiple dummy variables + if hint == Continuize.FirstAsBase: + base = 0 + elif hint == Continuize.FrequentAsBase: + base = self._get(var, "major") + elif hint == Continuize.Indicators: + base = None else: - self.Outputs.data.send(self.data) # None or empty data + assert False, f"hint={hint}?!" 
+ return [ + ContinuousVariable(f"{var.name}={value}", + compute_value=Indicator(var, value=i)) + for i, value in enumerate(var.values) + if i != base + ] def send_report(self): - self.report_items( - "Settings", - [("Categorical features", - self.multinomial_treats[self.multinomial_treatment][0]), - ("Numeric features", - self.continuous_treats[self.continuous_treatment][0]), - ("Class", self.class_treats[self.class_treatment][0])]) + if not self.data: + return + single_disc = len(self.disc_view.model()) > 0 \ + and len(self.disc_var_hints) == 1 \ + and DiscreteOptions[self.disc_var_hints[DefaultKey]].label.lower() + single_cont = len(self.cont_view.model()) > 0 \ + and len(self.cont_var_hints) == 1 \ + and ContinuousOptions[self.cont_var_hints[DefaultKey]].label.lower() + if single_disc and single_cont: + self.report_items( + (("Categorical variables", single_disc), + ("Numeric variables", single_cont)) + ) + else: + if single_disc: + self.report_paragraph("Categorical variables", single_disc) + elif len(self.disc_view.model()) > 0: + self.report_items( + "Categorical variables", + [("Preset" if name == DefaultKey else name, + DiscreteOptions[id_].label.lower()) + for name, id_ in self.disc_var_hints.items()]) + if single_cont: + self.report_paragraph("Numeric variables", single_cont) + elif len(self.cont_view.model()) > 0: + self.report_items( + "Numeric variables", + [("Preset" if name == DefaultKey else name, + ContinuousOptions[id_].label.lower()) + for name, id_ in self.cont_var_hints.items()]) + self.report_paragraph("Unlisted", + "Any unlisted attributes default to preset option, and " + "unlisted meta attributes and target variables are kept " + "as they are") @classmethod def migrate_settings(cls, settings, version): if version < 2: - Normalize = cls.Normalize cont_treat = settings.pop("continuous_treatment", 0) zero_based = settings.pop("zero_based", True) if cont_treat == 1: @@ -163,229 +728,33 @@ def migrate_settings(cls, settings, version): 
settings["continuous_treatment"] = Normalize.Normalize11 elif cont_treat == 2: settings["continuous_treatment"] = Normalize.Standardize + if version < 3: + settings["cont_var_hints"] = \ + {DefaultKey: + settings.pop("continuous_treatment", Normalize.Leave)} + # DISC OPS: Default=99, Indicators=1, FirstAsBase=2, FrequentAsBase=3, Remove=4, + # RemoveMultinomial=5, ReportError=6, AsOrdinal=7, AsNormalizedOrdinal=8, Leave=9 -class WeightedIndicator(Indicator): - def __init__(self, variable, value, weight=1.0): - super().__init__(variable, value) - self.weight = weight - - def transform(self, c): - t = super().transform(c) * self.weight - if self.weight != 1.0: - t *= self.weight - return t - - def __eq__(self, other): - return super().__eq__(other) and self.weight == other.weight - - def __hash__(self): - return hash((type(self), self.variable, self.value, self.weight)) - - -def make_indicator_var(source, value_ind, weight=None): - if weight is None: - indicator = Indicator(source, value=value_ind) - else: - indicator = WeightedIndicator(source, value=value_ind, weight=weight) - return Orange.data.ContinuousVariable( - "{}={}".format(source.name, source.values[value_ind]), - compute_value=indicator - ) - - -def dummy_coding(var, base_value=0): - N = len(var.values) - return [make_indicator_var(var, i) - for i in range(N) if i != base_value] - - -def one_hot_coding(var): - N = len(var.values) - return [make_indicator_var(var, i) for i in range(N)] - - -def continuize_domain(data, - multinomial_treatment=Continuize.Indicators, - continuous_treatment=OWContinuize.Normalize.Leave, - class_treatment=Continuize.Leave): - domain = data.domain - def needs_dist(var, mtreat, ctreat): - "Does the `var` need a distribution given specified flags" - if var.is_discrete: - return mtreat == Continuize.FrequentAsBase - elif var.is_continuous: - return ctreat != OWContinuize.Normalize.Leave - else: - raise ValueError - - # Compute the column indices which need a distribution. 
- attr_needs_dist = [needs_dist(var, multinomial_treatment, - continuous_treatment) - for var in domain.attributes] - cls_needs_dist = [needs_dist(var, class_treatment, OWContinuize.Normalize.Leave) - for var in domain.class_vars] - - columns = [i for i, needs in enumerate(attr_needs_dist + cls_needs_dist) - if needs] - - if columns: - if data is None: - raise TypeError("continuizer requires data") - dist = distribution.get_distributions_for_columns(data, columns) - else: - dist = [] - - dist_iter = iter(dist) - - newattrs = [continuize_var(var, next(dist_iter) if needs_dist else None, - multinomial_treatment, continuous_treatment) - for var, needs_dist in zip(domain.attributes, attr_needs_dist)] - newclass = [continuize_var(var, - next(dist_iter) if needs_dist else None, - class_treatment, OWContinuize.Normalize.Leave) - for var, needs_dist in zip(domain.class_vars, cls_needs_dist)] - - newattrs = reduce(list.__iadd__, newattrs, []) - newclass = reduce(list.__iadd__, newclass, []) - return Orange.data.Domain(newattrs, newclass, domain.metas) - - -def continuize_var(var, - data_or_dist=None, - multinomial_treatment=Continuize.Indicators, - continuous_treatment=OWContinuize.Normalize.Leave): - def continuize_continuous(): - dist = _ensure_dist(var, data_or_dist) if continuous_treatment != OWContinuize.Normalize.Leave else None - treatments = [lambda var, _: var, - normalize_by_sd, center_to_mean, divide_by_sd, - normalize_to_11, normalize_to_01] - if dist is not None and dist.shape[1] == 0: - return [var] - new_var = treatments[continuous_treatment](var, dist) - return [new_var] - - def continuize_discrete(): - if len(var.values) > 2 and \ - multinomial_treatment == Continuize.ReportError: - raise ValueError("{0.name} is a multinomial variable".format(var)) - if len(var.values) < 2 or \ - multinomial_treatment == Continuize.Remove or \ - (multinomial_treatment == Continuize.RemoveMultinomial - and len(var.values) > 2): - return [] - elif multinomial_treatment == 
Continuize.AsOrdinal: - return [ordinal_to_continuous(var)] - elif multinomial_treatment == Continuize.AsNormalizedOrdinal: - return [ordinal_to_norm_continuous(var)] - elif multinomial_treatment == Continuize.Indicators: - return one_hot_coding(var) - elif multinomial_treatment in ( - Continuize.FirstAsBase, Continuize.RemoveMultinomial): - return dummy_coding(var) - elif multinomial_treatment == Continuize.FrequentAsBase: - dist = _ensure_dist(var, data_or_dist) - modus = dist.modus() - return dummy_coding(var, base_value=modus) - elif multinomial_treatment == Continuize.Leave: - return [var] - raise ValueError("Invalid value of `multinomial_treatment`") - - if var.is_continuous: - return continuize_continuous() - elif var.is_discrete: - return continuize_discrete() - raise TypeError("Non-primitive variables cannot be continuized") - - -def _ensure_dist(var, data_or_dist): - if isinstance(data_or_dist, distribution.Discrete): - if not var.is_discrete: - raise TypeError - return data_or_dist - elif isinstance(data_or_dist, distribution.Continuous): - if not var.is_continuous: - raise TypeError - return data_or_dist - elif isinstance(data_or_dist, Orange.data.Storage): - return distribution.get_distribution(data_or_dist, var) - else: - raise ValueError("Need a distribution or data.") - - -def normalized_var(var, translate, scale): - return Orange.data.ContinuousVariable(var.name, - compute_value=Normalizer(var, translate, scale)) - - -def ordinal_to_continuous(var): - return Orange.data.ContinuousVariable(var.name, - compute_value=Identity(var)) - - -def ordinal_to_norm_continuous(var): - n_values = len(var.values) - return normalized_var(var, 0, 1 / (n_values - 1)) - - -def normalize_by_sd(var, dist): - mean, sd = dist.mean(), dist.standard_deviation() - sd = sd if sd > 1e-10 else 1 - return normalized_var(var, mean, 1 / sd) - - -def center_to_mean(var, dist): - return normalized_var(var, dist.mean(), 1) - - -def divide_by_sd(var, dist): - sd = 
dist.standard_deviation() - sd = sd if sd > 1e-10 else 1 - return normalized_var(var, 0, 1 / sd) - - -def normalize_to_11(var, dist): - return normalize_by_span(var, dist, False) - - -def normalize_to_01(var, dist): - return normalize_by_span(var, dist, True) - + # OLD ORDER: [FirstAsBase, FrequentAsBase, Indicators, RemoveMultinomial, Remove, + # AsOrdinal, AsNormalizedOrdinal] + old_to_new = [2, 3, 1, 5, 4, 7, 8] -def normalize_by_span(var, dist, zero_based=True): - v_max, v_min = dist.max(), dist.min() - span = (v_max - v_min) - if span < 1e-15: - span = 1 - if zero_based: - return normalized_var(var, v_min, 1 / span) - else: - return normalized_var(var, (v_min + v_max) / 2, 2 / span) + settings["disc_var_hints"] = \ + {DefaultKey: + old_to_new[settings.pop("multinomial_treatment", 0)]} + # OLD ORDER: [Leave, AsOrdinal, AsNormalizedOrdinal, Indicators] + old_to_new = [9, 7, 8, 1] -class DomainContinuizer(Reprable): - def __init__(self, - multinomial_treatment=Continuize.Indicators, - continuous_treatment=OWContinuize.Normalize.Leave, - class_treatment=Continuize.Leave): - self.multinomial_treatment = multinomial_treatment - self.continuous_treatment = continuous_treatment - self.class_treatment = class_treatment + class_treatment = old_to_new[settings.pop("class_treatment", 0)] + if class_treatment != Continuize.Leave: + settings["disc_var_hints"][BackCompatClass] = class_treatment - def __call__(self, data): - treat = self.multinomial_treatment - domain = data.domain - if (treat == Continuize.ReportError and - any(var.is_discrete and len(var.values) > 2 for var in domain)): - raise ValueError("Domain has multinomial attributes") - newdomain = continuize_domain( - data, - self.multinomial_treatment, - self.continuous_treatment, - self.class_treatment) - return newdomain +# Backward compatibility for unpickling settings +OWContinuize.Normalize = Normalize if __name__ == "__main__": # pragma: no cover - WidgetPreview(OWContinuize).run(Table("iris")) + 
WidgetPreview(OWContinuize).run(Table("heart_disease")) diff --git a/Orange/widgets/data/tests/test_owcontinuize.py b/Orange/widgets/data/tests/test_owcontinuize.py index 56816c27cbe..af84bd1d526 100644 --- a/Orange/widgets/data/tests/test_owcontinuize.py +++ b/Orange/widgets/data/tests/test_owcontinuize.py @@ -1,13 +1,19 @@ # Test methods with long descriptive names can omit docstrings -# pylint: disable=missing-docstring,unsubscriptable-object +# pylint: disable=missing-docstring,unsubscriptable-object,protected-access import unittest +from unittest.mock import Mock, patch import numpy as np +from AnyQt.QtCore import Qt, QModelIndex, QItemSelectionModel +from AnyQt.QtTest import QSignalSpy + +from orangewidget.tests.base import GuiTest +from orangewidget.utils.itemmodels import SeparatedListDelegate from Orange.data import Table, DiscreteVariable, ContinuousVariable, Domain -from Orange.preprocess import transformation -from Orange.widgets.data import owcontinuize -from Orange.widgets.data.owcontinuize import OWContinuize, WeightedIndicator +from Orange.widgets.data.owcontinuize import OWContinuize, DefaultKey, \ + ContinuousOptions, Normalize, Continuize, DiscreteOptions, \ + ContDomainModel, DefaultContModel, ListViewSearch, DefaultId from Orange.widgets.tests.base import WidgetTest @@ -19,25 +25,18 @@ def test_empty_data(self): """No crash on empty data""" data = Table("iris") widget = self.widget - widget.multinomial_treatment = 1 self.send_signal(self.widget.Inputs.data, data) widget.commit.now() - imp_data = self.get_output(self.widget.Outputs.data) - np.testing.assert_equal(imp_data.X, data.X) - np.testing.assert_equal(imp_data.Y, data.Y) - widget.continuous_treatment = 1 self.send_signal(self.widget.Inputs.data, Table.from_domain(data.domain)) widget.commit.now() - imp_data = self.get_output(self.widget.Outputs.data) - self.assertEqual(len(imp_data), 0) + self.assertIsNone(self.get_output(self.widget.Outputs.data)) 
self.send_signal(self.widget.Inputs.data, None) widget.commit.now() - imp_data = self.get_output(self.widget.Outputs.data) - self.assertIsNone(imp_data) + self.assertIsNone(self.get_output(self.widget.Outputs.data)) def test_continuous(self): table = Table("housing") @@ -45,11 +44,6 @@ def test_continuous(self): self.widget.commit.now() def test_one_column_equal_values(self): - """ - No crash on a column with equal values and with selected option - normalize by standard deviation. - GH-2144 - """ table = Table("iris") table = table[:, 1].copy() with table.unlocked(): @@ -60,12 +54,6 @@ def test_one_column_equal_values(self): self.widget.commit.now() def test_one_column_nan_values_normalize_sd(self): - """ - No crash on a column with NaN values and with selected option - normalize by standard deviation (Not the same issue which is - tested above). - GH-2144 - """ table = Table("iris") with table.unlocked(): table[:, 2] = np.NaN @@ -81,11 +69,6 @@ def test_one_column_nan_values_normalize_sd(self): self.widget.commit.now() def test_one_column_nan_values_normalize_span(self): - """ - No crash on a column with NaN values and with selected option - normalize by span. - GH-2144 - """ table = Table("iris") with table.unlocked(): table[:, 2] = np.NaN @@ -100,193 +83,791 @@ def test_one_column_nan_values_normalize_span(self): self.send_signal(self.widget.Inputs.data, table) self.widget.commit.now() - def test_disable_normalize_sparse(self): - def assert_enabled(enabled): - for button, (method, supports_sparse) in \ - zip(buttons, w.continuous_treats): - self.assertEqual(button.isEnabled(), enabled or supports_sparse, - msg=f"Error in {method}") - buttons[w.Normalize.Leave].click() - buttons[w.Normalize.Standardize].click() - + def test_commit_calls_prepare_output(self): + # This test ensures that commit returns the result of _prepare_output, + # so further tests can just check the latter. 
If this is changed, the + # test will fail, which is OK - test can be removed, but other tests + # then have to check the output and not just _prepare_output. + out = object() + self.widget._prepare_output = lambda: out + self.widget.Outputs.data.send = Mock() + self.widget.commit.now() + self.widget.Outputs.data.send.assert_called_with(out) + + def test_check_unsuppoerted_sparse_continuous(self): + # This test checks response at two points: + # - when scaling sparse data with a method that does not support it, + # the widget must show an error and output nothing + # - the above is tested via method _unsupported_sparse, so we also + # directly check this method w = self.widget - buttons = w.controls.continuous_treatment.buttons + hints = w.cont_var_hints iris = Table("iris") - sparse_iris = iris.to_sparse() + iris = iris.transform(Domain(iris.domain[:2], + iris.domain.class_var, + iris.domain.attributes[2:])) + sparse_iris = iris.to_sparse(sparse_class=True, sparse_metas=True) + + for attr in (iris.domain.attributes[0], iris.domain.metas[0]): + for key in (DefaultKey, attr.name): + hints[DefaultKey] = Normalize.Leave + for hints[key], desc in ContinuousOptions.items(): + if desc.id_ == Normalize.Default: + continue + msg = f"at {attr} = {desc.label}, " \ + + ("default" if key is DefaultKey else key) + + # input dense + self.send_signal(w.Inputs.data, iris) + self.assertFalse(w._unsupported_sparse(), msg) + self.assertFalse(w.Error.unsupported_sparse.is_shown(), msg) + self.assertIsNotNone(self.get_output(w.Outputs.data), msg) + + # input sparse + self.send_signal(w.Inputs.data, sparse_iris) + self.assertIsNot(w._unsupported_sparse(), + desc.supports_sparse, msg) + self.assertIsNot(w.Error.unsupported_sparse.is_shown(), + desc.supports_sparse, msg) + if desc.supports_sparse: + self.assertIsNotNone(self.get_output(w.Outputs.data), + msg) + else: + self.assertIsNone(self.get_output(w.Outputs.data), + msg) + self.send_signal(w.Inputs.data, None) + 
self.assertFalse(w.Error.unsupported_sparse.is_shown(), + msg) + del hints[key] + + def test_check_unsuppoerted_sparse_discrete(self): + # This test checks response at two points: + # - when scaling sparse data with a method that does not support it, + # the widget must show an error and output nothing + # - the above is tested via method _unsupported_sparse, so we also + # directly check this method + w = self.widget + hints = w.disc_var_hints + zoo = Table("zoo") + zoo = zoo.transform(Domain(zoo.domain.attributes[:2], + None, + zoo.domain.attributes[2:])) + sparse_zoo = zoo.to_sparse(sparse_metas=True) # input dense - self.send_signal(w.Inputs.data, iris) - assert_enabled(True) - self.assertEqual(w.continuous_treatment, w.Normalize.Standardize) + for attr in (zoo.domain[0], zoo.domain.metas[0]): + for key in (DefaultKey, attr.name): + hints[DefaultKey] = Continuize.Leave + for hints[key], desc in DiscreteOptions.items(): + if desc.id_ == Continuize.Default: + continue + msg = f"at {key} = {desc.label}, " \ + + ("default" if key is DefaultKey else key) + + self.send_signal(w.Inputs.data, zoo) + self.assertFalse(w._unsupported_sparse(), msg) + self.assertFalse(w.Error.unsupported_sparse.is_shown(), msg) + self.assertIsNotNone(self.get_output(w.Outputs.data), msg) + + self.send_signal(w.Inputs.data, sparse_zoo) + self.assertIsNot(w._unsupported_sparse(), + desc.supports_sparse, msg) + self.assertIsNot(w.Error.unsupported_sparse.is_shown(), + desc.supports_sparse, msg) + if desc.supports_sparse: + self.assertIsNotNone(self.get_output(w.Outputs.data), msg) + else: + self.assertIsNone(self.get_output(w.Outputs.data), msg) + self.send_signal(w.Inputs.data, None) + self.assertFalse(w.Error.unsupported_sparse.is_shown(), msg) + del hints[key] + + def test_update_cont_radio_buttons(self): + w = self.widget + w.disc_var_hints[DefaultKey] = Continuize.AsOrdinal + w.disc_var_hints["chest pain"] \ + = w.disc_var_hints["rest ECG"] \ + = Continuize.Remove + 
w.disc_var_hints["exerc ind ang"] = Continuize.FirstAsBase + + w.cont_var_hints[DefaultKey] = Normalize.Center + w.cont_var_hints["cholesterol"] = Normalize.Scale + + self.send_signal(w.Inputs.data, Table("heart_disease")) + + dview = w.disc_view + dmod = dview.model() + dselmod = dview.selectionModel() + dgroup = w.disc_group + + with patch.object(w, "_update_radios") as upd: + w._on_var_selection_changed(dview) + upd.assert_not_called() + + dselmod.select(dmod.index(1, 0), + QItemSelectionModel.ClearAndSelect) # chest_pain + self.assertEqual(dgroup.checkedId(), Continuize.Remove) + self.assertTrue(dgroup.button(99).isEnabled()) + + dselmod.select(dmod.index(2, 0), + QItemSelectionModel.ClearAndSelect) # blood sugar + self.assertEqual(dgroup.checkedId(), Continuize.Default) + + dselmod.select(dmod.index(3, 0), + QItemSelectionModel.ClearAndSelect) # rest ECG + self.assertEqual(dgroup.checkedId(), Continuize.Remove) + + dselmod.select(dmod.index(4, 0), + QItemSelectionModel.ClearAndSelect) # exerc ind ang + self.assertEqual(dgroup.checkedId(), Continuize.FirstAsBase) + + dselmod.select(dmod.index(3, 0), + QItemSelectionModel.Select) # rest ECG and exerc ind ang + self.assertEqual(dgroup.checkedId(), -1) + + dview.select_default() + self.assertEqual(dgroup.checkedId(), Continuize.AsOrdinal) + self.assertFalse(dgroup.button(99).isEnabled()) + + cview = w.cont_view + cmod = cview.model() + cselmod = cview.selectionModel() + cgroup = w.cont_group + + cselmod.select(cmod.index(2, 0), + QItemSelectionModel.ClearAndSelect) # cholesterol + self.assertEqual(cgroup.checkedId(), Normalize.Scale) + self.assertEqual(dgroup.checkedId(), Continuize.AsOrdinal) + self.assertTrue(cgroup.button(99).isEnabled()) + + cview.select_default() + self.assertEqual(cgroup.checkedId(), Normalize.Center) + self.assertEqual(dgroup.checkedId(), Continuize.AsOrdinal) + self.assertFalse(cgroup.button(99).isEnabled()) + + w._uncheck_all_buttons(cgroup) + self.assertEqual(cgroup.checkedId(), -1) + 
self.assertEqual(dgroup.checkedId(), Continuize.AsOrdinal) + + w._uncheck_all_buttons(dgroup) + self.assertEqual(dgroup.checkedId(), -1) + + def test_update_disc_radio_buttons_mixed(self): + def select(xs): + dselmod.select(dmod.index(xs[0], 0), + QItemSelectionModel.ClearAndSelect) + for x in xs[1:]: + dselmod.select(dmod.index(x, 0), + QItemSelectionModel.Select) + + w = self.widget + dview = w.disc_view + dmod = dview.model() + dselmod = dview.selectionModel() + dgroup = w.disc_group + + domain = Domain( + [DiscreteVariable(x, values=["0", "1"]) for x in "abc"], + DiscreteVariable("d", values=["0", "1"]), + [DiscreteVariable(x, values=["0", "1"]) for x in "efg"]) + data = Table.from_list(domain, [[1] * 7] * 2) + + w.disc_var_hints = {DefaultKey: Continuize.FirstAsBase} + self.send_signal(w.Inputs.data, data) + select([1, 4, 8]) + self.assertEqual(dgroup.checkedId(), -1) + select([1]) + self.assertEqual(dgroup.checkedId(), DefaultId) + select([4, 8]) + self.assertEqual(dgroup.checkedId(), Continuize.Leave) + + w.disc_var_hints = {DefaultKey: Continuize.FirstAsBase, + "b": Continuize.Leave} + self.send_signal(w.Inputs.data, data) + select([1, 4, 8]) + self.assertEqual(dgroup.checkedId(), Continuize.Leave) + + w.disc_var_hints = {DefaultKey: Continuize.FirstAsBase, + "e": DefaultId, "d": DefaultId} + self.send_signal(w.Inputs.data, data) + select([1, 4, 8]) + self.assertEqual(dgroup.checkedId(), DefaultId) + + def test_update_cont_radio_buttons_mixed(self): + def select(xs): + cselmod.select(cmod.index(xs[0], 0), + QItemSelectionModel.ClearAndSelect) + for x in xs[1:]: + cselmod.select(cmod.index(x, 0), + QItemSelectionModel.Select) - # input sparse - self.send_signal(w.Inputs.data, sparse_iris) - self.assertEqual(w.continuous_treatment, w.Normalize.Scale) - assert_enabled(False) - self.assertEqual(w.continuous_treatment, w.Normalize.Leave) + w = self.widget + cview = w.cont_view + cmod = cview.model() + cselmod = cview.selectionModel() + cgroup = w.cont_group + + 
domain = Domain([ContinuousVariable(x) for x in "abc"], + ContinuousVariable("d"), + [ContinuousVariable(x) for x in "efg"]) + data = Table.from_list(domain, [[1] * 7] * 2) + + w.cont_var_hints = {DefaultKey: Normalize.Center} + self.send_signal(w.Inputs.data, data) + select([1, 4, 8]) + self.assertEqual(cgroup.checkedId(), -1) + select([1]) + self.assertEqual(cgroup.checkedId(), DefaultId) + select([4, 8]) + self.assertEqual(cgroup.checkedId(), Normalize.Leave) + + w.cont_var_hints = {DefaultKey: Normalize.Center, + "b": Normalize.Leave} + self.send_signal(w.Inputs.data, data) + select([1, 4, 8]) + self.assertEqual(cgroup.checkedId(), Normalize.Leave) + + w.cont_var_hints = {DefaultKey: Normalize.Center, + "e": DefaultId, "d": DefaultId} + self.send_signal(w.Inputs.data, data) + select([1, 4, 8]) + self.assertEqual(cgroup.checkedId(), DefaultId) + + def test_set_hints_on_new_data(self): + w = self.widget + domain = Domain([ContinuousVariable(c) for c in "abc"] + + [DiscreteVariable("m", values=tuple("xy"))], + ContinuousVariable("d"), + [ContinuousVariable(c) for c in "ef"]) + data = Table.from_list(domain, [[0] * 6]) + + w.cont_var_hints["b"] = Normalize.Leave + w.cont_var_hints["f"] = Normalize.Normalize11 + w.cont_var_hints["x"] = Normalize.Normalize11 - # remove data self.send_signal(w.Inputs.data, None) - assert_enabled(True) + self.send_signal(w.Inputs.data, data) + + model = w.cont_view.model() + self.assertEqual(model.index(0, 0).data(model.HintRole), + ("preset", False)) + self.assertEqual(model.index(1, 0).data(model.HintRole), + (ContinuousOptions[Normalize.Leave].short_desc, True)) + self.assertEqual(model.index(5, 0).data(model.HintRole), + (ContinuousOptions[Normalize.Normalize11].short_desc, True)) + self.assertNotIn("x", w.cont_var_hints) + + def test_reset_hints(self): + w = self.widget + domain = Domain([ContinuousVariable(c) for c in "abc"] + + [DiscreteVariable("m", values=tuple("xy"))], + ContinuousVariable("d"), + [ContinuousVariable(c) for c 
in "ef"]) + data = Table.from_list(domain, [[0] * 6]) + + w.cont_var_hints[DefaultKey] = Normalize.Center + w.cont_var_hints["b"] = Normalize.Leave + w.cont_var_hints["f"] = Normalize.Normalize11 + w.cont_var_hints["x"] = Normalize.Normalize11 + w.disc_var_hints[DefaultKey] = Continuize.Indicators + w.cont_var_hints["m"] = Continuize.Remove + + self.send_signal(w.Inputs.data, data) + w._on_reset_hints() + + self.assertEqual(w.cont_var_hints[DefaultKey], Normalize.Leave) + self.assertEqual(w.disc_var_hints[DefaultKey], Continuize.FirstAsBase) + + def test_change_hints_disc(self): + w = self.widget + w.disc_var_hints[DefaultKey] = Continuize.AsOrdinal + w.disc_var_hints["chest pain"] \ + = w.disc_var_hints["rest ECG"] \ + = Continuize.Remove + w.disc_var_hints["exerc ind ang"] = Continuize.FirstAsBase + + dview = w.disc_view + dmod = dview.model() + dselmod = dview.selectionModel() + dgroup = w.disc_group + + self.send_signal(w.Inputs.data, Table("heart_disease")) + self.assertEqual( + dmod.index(3, 0).data(dmod.HintRole), + (DiscreteOptions[Continuize.Remove].short_desc, True)) + + dselmod.select(dmod.index(1, 0), + QItemSelectionModel.ClearAndSelect) # chest pain + dselmod.select(dmod.index(4, 0), + QItemSelectionModel.Select) # exerc ind ang + dgroup.button(Continuize.AsOrdinal).setChecked(True) + dgroup.idClicked.emit(Continuize.AsOrdinal) + + self.assertFalse("gender" in w.disc_var_hints) + self.assertEqual(w.disc_var_hints["chest pain"], Continuize.AsOrdinal) + self.assertEqual(w.disc_var_hints["exerc ind ang"], Continuize.AsOrdinal) + self.assertEqual(w.disc_var_hints["rest ECG"], Continuize.Remove) + + dselmod.select(dmod.index(1, 0), + QItemSelectionModel.ClearAndSelect) # chest pain + dselmod.select(dmod.index(0, 0), + QItemSelectionModel.Select) # gender + dgroup.button(99).setChecked(True) + dgroup.idClicked.emit(99) + self.assertFalse("chest pain" in w.disc_var_hints) + self.assertFalse("gender" in w.disc_var_hints) + 
self.assertEqual(w.disc_var_hints["rest ECG"], Continuize.Remove) + + self.assertEqual(dmod.index(0, 0).data(dmod.HintRole), + ("preset", False)) + self.assertEqual( + dmod.index(3, 0).data(dmod.HintRole), + (DiscreteOptions[Continuize.Remove].short_desc, True)) + + dview.select_default() + dgroup.button(Continuize.AsOrdinal).setChecked(True) + dgroup.idClicked.emit(Continuize.AsOrdinal) + self.assertEqual(w.disc_var_hints[DefaultKey], Continuize.AsOrdinal) + + def test_change_hints_disc_class_meta(self): + w = self.widget + dview = w.disc_view + dmod = dview.model() + dselmod = dview.selectionModel() + dgroup = w.disc_group + + domain = Domain([DiscreteVariable(x, values=["0", "1"]) for x in "abc"], + DiscreteVariable("d", values=["0", "1"]), + [DiscreteVariable(x, values=["0", "1"]) for x in "efg"]) + data = Table.from_list(domain, [[1] * 7] * 2) + self.send_signal(w.Inputs.data, data) + + dselmod.select(dmod.index(1, 0), + QItemSelectionModel.ClearAndSelect) # attribute b + dselmod.select(dmod.index(4, 0), + QItemSelectionModel.Select) # meta e + dselmod.select(dmod.index(8, 0), + QItemSelectionModel.Select) # class d + dgroup.button(Continuize.Remove).setChecked(True) + dgroup.idClicked.emit(Continuize.Remove) + self.assertEqual(w.disc_var_hints["b"], Continuize.Remove) + self.assertEqual(w.disc_var_hints["e"], Continuize.Remove) + self.assertEqual(w.disc_var_hints["d"], Continuize.Remove) + self.assertEqual( + dmod.index(1, 0).data(dmod.HintRole), + (DiscreteOptions[Continuize.Remove].short_desc, True)) + self.assertEqual( + dmod.index(4, 0).data(dmod.HintRole), + (DiscreteOptions[Continuize.Remove].short_desc, True)) + self.assertEqual( + dmod.index(8, 0).data(dmod.HintRole), + (DiscreteOptions[Continuize.Remove].short_desc, True)) + + dgroup.button(DefaultId).setChecked(True) + dgroup.idClicked.emit(DefaultId) + self.assertNotIn("b", w.disc_var_hints) + self.assertEqual(w.disc_var_hints["e"], DefaultId) + self.assertEqual(w.disc_var_hints["d"], DefaultId) + 
self.assertEqual( + dmod.index(1, 0).data(dmod.HintRole), + (DiscreteOptions[DefaultId].short_desc, False)) + self.assertEqual( + dmod.index(4, 0).data(dmod.HintRole), + (DiscreteOptions[DefaultId].short_desc, True)) + self.assertEqual( + dmod.index(8, 0).data(dmod.HintRole), + (DiscreteOptions[DefaultId].short_desc, True)) + + dgroup.button(Continuize.Leave).setChecked(True) + dgroup.idClicked.emit(Continuize.Leave) + self.assertEqual(w.disc_var_hints["b"], Continuize.Leave) + self.assertNotIn("e", w.disc_var_hints) + self.assertNotIn("d", w.disc_var_hints) + self.assertEqual( + dmod.index(1, 0).data(dmod.HintRole), + (DiscreteOptions[Continuize.Leave].short_desc, True)) + self.assertEqual( + dmod.index(4, 0).data(dmod.HintRole), + (DiscreteOptions[Continuize.Leave].short_desc, False)) + self.assertEqual( + dmod.index(8, 0).data(dmod.HintRole), + (DiscreteOptions[Continuize.Leave].short_desc, False)) + + def test_change_hints_cont(self): + w = self.widget + w.cont_var_hints[DefaultKey] = Normalize.Center + w.cont_var_hints["cholesterol"] = Normalize.Scale + + self.send_signal(w.Inputs.data, Table("heart_disease")) + + cview = w.cont_view + cmod = cview.model() + cselmod = cview.selectionModel() + cgroup = w.cont_group + + cselmod.select(cmod.index(2, 0), + QItemSelectionModel.ClearAndSelect) # cholesterol + cselmod.select(cmod.index(3, 0), + QItemSelectionModel.Select) # max HR + cgroup.button(Normalize.Normalize11).setChecked(True) + cgroup.idClicked.emit(Normalize.Normalize11) + + self.assertFalse("age" in w.cont_var_hints) + self.assertEqual(w.cont_var_hints["cholesterol"], Normalize.Normalize11) + self.assertEqual(w.cont_var_hints["max HR"], Normalize.Normalize11) + + cselmod.select(cmod.index(2, 0), + QItemSelectionModel.ClearAndSelect) # cholesterol + cselmod.select(cmod.index(0, 0), + QItemSelectionModel.Select) # age + cgroup.button(99).setChecked(True) + cgroup.idClicked.emit(99) + self.assertFalse("age" in w.cont_var_hints) + 
self.assertFalse("cholesterol" in w.cont_var_hints) + self.assertEqual(w.cont_var_hints["max HR"], Normalize.Normalize11) + + self.assertEqual(cmod.index(0, 0).data(cmod.HintRole), + ("preset", False)) + self.assertEqual( + cmod.index(3, 0).data(cmod.HintRole), + (ContinuousOptions[Normalize.Normalize11].short_desc, True)) + + def test_change_hints_cont_class_meta(self): + w = self.widget + cview = w.cont_view + cmod = cview.model() + cselmod = cview.selectionModel() + cgroup = w.cont_group + + domain = Domain([ContinuousVariable(x) for x in "abc"], + ContinuousVariable("d"), + [ContinuousVariable(x) for x in "efg"]) + data = Table.from_list(domain, [[1] * 7] * 2) + self.send_signal(w.Inputs.data, data) + + cselmod.select(cmod.index(1, 0), + QItemSelectionModel.ClearAndSelect) # attribute b + cselmod.select(cmod.index(4, 0), + QItemSelectionModel.Select) # meta e + cselmod.select(cmod.index(8, 0), + QItemSelectionModel.Select) # class d + cgroup.button(Normalize.Center).setChecked(True) + cgroup.idClicked.emit(Normalize.Center) + self.assertEqual(w.cont_var_hints["b"], Normalize.Center) + self.assertEqual(w.cont_var_hints["e"], Normalize.Center) + self.assertEqual(w.cont_var_hints["d"], Normalize.Center) + self.assertEqual( + cmod.index(1, 0).data(cmod.HintRole), + (ContinuousOptions[Normalize.Center].short_desc, True)) + self.assertEqual( + cmod.index(4, 0).data(cmod.HintRole), + (ContinuousOptions[Normalize.Center].short_desc, True)) + self.assertEqual( + cmod.index(8, 0).data(cmod.HintRole), + (ContinuousOptions[Normalize.Center].short_desc, True)) + + cgroup.button(DefaultId).setChecked(True) + cgroup.idClicked.emit(DefaultId) + self.assertNotIn("b", w.cont_var_hints) + self.assertEqual(w.cont_var_hints["e"], DefaultId) + self.assertEqual(w.cont_var_hints["d"], DefaultId) + self.assertEqual( + cmod.index(1, 0).data(cmod.HintRole), + (ContinuousOptions[DefaultId].short_desc, False)) + self.assertEqual( + cmod.index(4, 0).data(cmod.HintRole), + 
(ContinuousOptions[DefaultId].short_desc, True)) + self.assertEqual( + cmod.index(8, 0).data(cmod.HintRole), + (ContinuousOptions[DefaultId].short_desc, True)) + + cgroup.button(Normalize.Leave).setChecked(True) + cgroup.idClicked.emit(Normalize.Leave) + self.assertEqual(w.cont_var_hints["b"], Normalize.Leave) + self.assertNotIn("e", w.cont_var_hints) + self.assertNotIn("d", w.cont_var_hints) + self.assertEqual( + cmod.index(1, 0).data(cmod.HintRole), + (ContinuousOptions[Normalize.Leave].short_desc, True)) + self.assertEqual( + cmod.index(4, 0).data(cmod.HintRole), + (ContinuousOptions[Normalize.Leave].short_desc, False)) + self.assertEqual( + cmod.index(8, 0).data(cmod.HintRole), + (ContinuousOptions[Normalize.Leave].short_desc, False)) + + def test_is_attr_and_default(self): + w = self.widget + a, b, d, e, f = (ContinuousVariable(x) for x in "abdef") + c, g = (DiscreteVariable(x, values=["0", "1"]) for x in "cg") + domain = Domain([a, b, c], d, [e, f, g]) + data = Table.from_list(domain, [[1] * 7] * 2) + self.send_signal(w.Inputs.data, data) + + self.assertTrue(w.is_attr(a)) + self.assertTrue(w.is_attr(b)) + self.assertTrue(w.is_attr(c)) + self.assertFalse(w.is_attr(d)) + self.assertFalse(w.is_attr(e)) + self.assertFalse(w.is_attr(f)) + self.assertFalse(w.is_attr(g)) + + self.assertEqual(w.default_for_var(a), DefaultId) + self.assertEqual(w.default_for_var(c), DefaultId) + self.assertEqual(w.default_for_var(d), Normalize.Leave) + self.assertEqual(w.default_for_var(e), Normalize.Leave) + self.assertEqual(w.default_for_var(g), Continuize.Leave) + + def test_hint_for_var(self): + w = self.widget + c1, c2, c3, c4, c5 = (ContinuousVariable(f"c{x}") for x in range(1, 6)) + d1, d2, d3, d4 = (DiscreteVariable(f"d{x}", values=["0", "1"]) for x in range(1, 5)) + domain = Domain([c1, c2, d1, d2], c5, [c3, c4, d3, d4]) + data = Table.from_list(domain, [[1] * 7] * 2) + w.cont_var_hints = { + DefaultKey: Normalize.Center, + "c1": Normalize.Scale, + "c3": 
Normalize.Standardize, + } + w.disc_var_hints = { + DefaultKey: Continuize.FrequentAsBase, + "d1": Continuize.Remove, + "d3": Continuize.Indicators + } + self.send_signal(w.Inputs.data, data) + + self.assertEqual(w._hint_for_var(c1), Normalize.Scale) + self.assertEqual(w._hint_for_var(c2), Normalize.Center) + self.assertEqual(w._hint_for_var(c3), Normalize.Standardize) + self.assertEqual(w._hint_for_var(c4), Normalize.Leave) + + self.assertEqual(w._hint_for_var(d1), Continuize.Remove) + self.assertEqual(w._hint_for_var(d2), Continuize.FrequentAsBase) + self.assertEqual(w._hint_for_var(d3), Continuize.Indicators) + self.assertEqual(w._hint_for_var(d4), Continuize.Leave) + + def test_transformations(self): + domain = Domain([DiscreteVariable(c, values="abc") + for c in ("default", "leave", "first", "frequent", + "one-hot", "remove-if", "remove", "ordinal", + "normordinal")], + DiscreteVariable("y", values="abc"), + [ContinuousVariable(c) + for c in ("cdefault", "cleave", + "cstandardize", "ccenter", "cscale", + "cnormalize11", "cnormalize01")] + ) + data = Table.from_list(domain, + [[x] * 17 for x in range(3)] + [[2] * 17]) - # input sparse - buttons[w.Normalize.Normalize11].click() - self.send_signal(w.Inputs.data, sparse_iris) - self.assertEqual(w.continuous_treatment, w.Normalize.Leave) - assert_enabled(False) + w = self.widget + w.disc_var_hints = { + var.name: id_ + for var, id_ in zip(domain.attributes, DiscreteOptions) + if id_ != 99 + } + w.disc_var_hints[DefaultKey] = Continuize.FrequentAsBase + + w.cont_var_hints = { + var.name: id_ + for var, id_ in zip(domain.metas, ContinuousOptions) + if id_ != 99 + } + w.cont_var_hints[DefaultKey] = Normalize.Center + + self.send_signal(w.Inputs.data, data) + outp = self.get_output(w.Outputs.data) - # input dense - self.send_signal(w.Inputs.data, iris) - assert_enabled(True) + np.testing.assert_almost_equal( + outp.X, + [[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0], + [0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0.5], + [0, 0, 2, 0, 1, 
0, 0, 0, 0, 1, 2, 1], + [0, 0, 2, 0, 1, 0, 0, 0, 0, 1, 2, 1], + ] + ) + np.testing.assert_almost_equal( + outp.Y, + [0, 1, 2, 2] + ) + np.testing.assert_almost_equal( + outp.metas, + [[0, 0, -1.50755672, -1.25, 0, -1, 0], + [1, 1, -0.30151134, -0.25, 1.20604538, 0, 0.5], + [2, 2, 0.90453403, 0.75, 2.41209076, 1, 1], + [2, 2, 0.90453403, 0.75, 2.41209076, 1, 1], + ] + ) - def test_migrate_settings_to_v2(self): - Normalize = OWContinuize.Normalize + def test_send_report(self): + w = self.widget + self.send_signal(w.Inputs.data, Table("heart_disease")) + self.widget.send_report() + w.disc_var_hints[DefaultKey] = Continuize.AsOrdinal + w.disc_var_hints["chest pain"] \ + = w.disc_var_hints["rest ECG"] \ + = Continuize.Remove + w.disc_var_hints["exerc ind ang"] = Continuize.FirstAsBase + + self.send_signal(w.Inputs.data, Table("heart_disease")) + self.widget.send_report() + + w.cont_var_hints[DefaultKey] = Normalize.Center + w.cont_var_hints["cholesterol"] = Normalize.Scale + + self.send_signal(w.Inputs.data, Table("heart_disease")) + self.widget.send_report() + + w.continuize_class = True + w.disc_var_hints[DefaultKey] = Continuize.AsOrdinal + w.disc_var_hints["chest pain"] \ + = w.disc_var_hints["rest ECG"] \ + = Continuize.Remove + w.disc_var_hints["exerc ind ang"] = Continuize.FirstAsBase + + w.cont_var_hints[DefaultKey] = Normalize.Center + w.cont_var_hints["cholesterol"] = Normalize.Scale + + self.send_signal(w.Inputs.data, Table("heart_disease")) + self.widget.send_report() + + def test_migrate_settings_to_v3(self): + # why not?, pylint: disable=use-dict-literal widget = self.create_widget( OWContinuize, stored_settings=dict(continuous_treatment=0)) - self.assertEqual(widget.continuous_treatment, Normalize.Leave) + self.assertEqual(widget.cont_var_hints[DefaultKey], + Normalize.Leave) widget = self.create_widget( OWContinuize, stored_settings=dict(continuous_treatment=1, zero_based=True)) - self.assertEqual(widget.continuous_treatment, Normalize.Normalize01) + 
self.assertEqual(widget.cont_var_hints[DefaultKey], + Normalize.Normalize01) widget = self.create_widget( OWContinuize, stored_settings=dict(continuous_treatment=1, zero_based=False)) - self.assertEqual(widget.continuous_treatment, Normalize.Normalize11) + self.assertEqual(widget.cont_var_hints[DefaultKey], + Normalize.Normalize11) widget = self.create_widget( OWContinuize, stored_settings=dict(continuous_treatment=2)) - self.assertEqual(widget.continuous_treatment, Normalize.Standardize) - - def test_normalizations(self): - buttons = self.widget.controls.continuous_treatment.buttons - Normalize = self.widget.Normalize - - domain = Domain([ContinuousVariable(name) for name in "xyz"]) - col0 = np.arange(0, 10, 2).reshape(5, 1) - col1 = np.ones((5, 1)) - col2 = np.arange(-2, 3).reshape(5, 1) - means = np.array([4, 1, 0]) - sds = np.sqrt(np.array([16 + 4 + 0 + 4 + 16, 5, 4 + 1 + 0 + 1 + 4]) / 5) - - x = np.hstack((col0, col1, col2)) - data = Table.from_numpy(domain, x) - self.send_signal(OWContinuize.Inputs.data, data) - - buttons[Normalize.Leave].click() - out = self.get_output(self.widget.Outputs.data) - np.testing.assert_equal(out.X, x) - - buttons[Normalize.Standardize].click() - out = self.get_output(self.widget.Outputs.data) - np.testing.assert_almost_equal(out.X, (x - means) / sds) - - buttons[Normalize.Center].click() - out = self.get_output(self.widget.Outputs.data) - np.testing.assert_almost_equal(out.X, x - means) - - buttons[Normalize.Scale].click() - out = self.get_output(self.widget.Outputs.data) - np.testing.assert_almost_equal(out.X, x / sds) - - buttons[Normalize.Normalize01].click() - out = self.get_output(self.widget.Outputs.data) - col = (np.arange(5) / 4).reshape(5, 1) - np.testing.assert_almost_equal( - out.X, - np.hstack((col, np.zeros((5, 1)), col)) - ) + self.assertEqual(widget.cont_var_hints[DefaultKey], + Normalize.Standardize) - buttons[Normalize.Normalize11].click() - out = self.get_output(self.widget.Outputs.data) - col = (np.arange(5) / 
2).reshape(5, 1) - 1 - np.testing.assert_almost_equal( - out.X, - np.hstack((col, np.zeros((5, 1)), col)) + widget = self.create_widget( + OWContinuize, + stored_settings=dict(multinomial_treatment=2) ) + self.assertEqual(widget.disc_var_hints[DefaultKey], + Continuize.Indicators) - def test_send_report(self): - self.widget.send_report() + def test_migrate_settings_to_v3_class_treatment(self): + # why not?, pylint: disable=use-dict-literal + domain = Domain([ContinuousVariable(c) for c in "abc"], + DiscreteVariable("y")) + data = Table.from_list(domain, [[0] * 4] * 2) + widget = self.create_widget( + OWContinuize, + stored_settings=dict(multinomial_treatment=4, + class_treatment=3) + ) + self.send_signal(widget.Inputs.data, data) + self.assertEqual(widget.disc_var_hints["y"], Continuize.Indicators) + self.assertEqual(widget.disc_var_hints[DefaultKey], Continuize.Remove) -class TestOWContinuizeUtils(unittest.TestCase): - def test_dummy_coding_zero_based(self): - var = DiscreteVariable("foo", values=tuple("abc")) - - varb, varc = owcontinuize.dummy_coding(var) - - self.assertEqual(varb.name, "foo=b") - self.assertIsInstance(varb.compute_value, transformation.Indicator) - self.assertEqual(varb.compute_value.value, 1) - self.assertIs(varb.compute_value.variable, var) - - self.assertEqual(varc.name, "foo=c") - self.assertIsInstance(varc.compute_value, transformation.Indicator) - self.assertEqual(varc.compute_value.value, 2) - self.assertIs(varc.compute_value.variable, var) - - def test_dummy_coding_base_value(self): - var = DiscreteVariable("foo", values=tuple("abc")) - - varb, varc = owcontinuize.dummy_coding(var, base_value=0) - - self.assertEqual(varb.name, "foo=b") - self.assertIsInstance(varb.compute_value, transformation.Indicator) - self.assertEqual(varb.compute_value.value, 1) - self.assertEqual(varc.name, "foo=c") - self.assertIsInstance(varc.compute_value, transformation.Indicator) - self.assertEqual(varc.compute_value.value, 2) - - varb, varc = 
owcontinuize.dummy_coding(var, base_value=1) - - self.assertEqual(varb.name, "foo=a") - self.assertIsInstance(varb.compute_value, transformation.Indicator) - self.assertEqual(varb.compute_value.value, 0) - self.assertEqual(varc.name, "foo=c") - self.assertIsInstance(varc.compute_value, transformation.Indicator) - self.assertEqual(varc.compute_value.value, 2) - - def test_one_hot_coding(self): - var = DiscreteVariable("foo", values=tuple("abc")) - - new_vars = owcontinuize.one_hot_coding(var) - for i, (c, nvar) in enumerate(zip("abc", new_vars)): - self.assertEqual(nvar.name, f"foo={c}") - self.assertIsInstance(nvar.compute_value, transformation.Indicator) - self.assertEqual(nvar.compute_value.value, i) - self.assertIs(nvar.compute_value.variable, var) - - -class TestWeightedIndicator(unittest.TestCase): - def test_equality(self): - disc1 = DiscreteVariable("d1", values=tuple("abc")) - disc1a = DiscreteVariable("d1", values=tuple("abc")) - disc2 = DiscreteVariable("d2", values=tuple("abc")) - assert disc1 == disc1a - - t1 = WeightedIndicator(disc1, 0, 1) - t1a = WeightedIndicator(disc1a, 0, 1) - t2 = WeightedIndicator(disc2, 0, 1) - self.assertEqual(t1, t1) - self.assertEqual(t1, t1a) - self.assertNotEqual(t1, t2) - - self.assertEqual(hash(t1), hash(t1a)) - self.assertNotEqual(hash(t1), hash(t2)) - - t1 = WeightedIndicator(disc1, 0, 1) - t1a = WeightedIndicator(disc1a, 1, 1) - self.assertNotEqual(t1, t1a) - self.assertNotEqual(hash(t1), hash(t1a)) - - t1 = WeightedIndicator(disc1, 0, 1) - t1a = WeightedIndicator(disc1a, 0, 2) - self.assertNotEqual(t1, t1a) - self.assertNotEqual(hash(t1), hash(t1a)) + widget = self.create_widget( + OWContinuize, + stored_settings=dict(multinomial_treatment=4, + class_treatment=0) + ) + self.send_signal(widget.Inputs.data, data) + self.assertNotIn("y", widget.disc_var_hints) + self.assertEqual(widget.disc_var_hints[DefaultKey], 4) + + widget = self.create_widget( + OWContinuize, + 
stored_settings=dict(multinomial_treatment=Continuize.Remove) + ) + self.send_signal(widget.Inputs.data, data) + self.assertNotIn("y", widget.disc_var_hints) + self.assertEqual(widget.disc_var_hints[DefaultKey], Continuize.Remove) + + +class TestModelsAndViews(GuiTest): + def test_contmodel(self): + domain = Domain([ContinuousVariable(c) for c in "abc"], + ContinuousVariable("y")) + model = ContDomainModel(ContinuousVariable) + model.set_domain(domain) + + ind = model.index(0, 0) + self.assertEqual(ind.data()[0], "a") + self.assertEqual(ind.data(model.FilterRole)[0], "a") + self.assertIsNone(ind.data(Qt.ToolTipRole)) + + ind = model.index(1, 0) + model.setData(ind, ("mega encoding", True), model.HintRole) + self.assertEqual(ind.data(), ("b", "mega encoding", True)) + self.assertEqual(ind.data(model.HintRole), ("mega encoding", True)) + self.assertIn("b", ind.data(model.FilterRole)) + self.assertIn("mega encoding", ind.data(model.FilterRole)) + self.assertNotIn("bmega encoding", ind.data(model.FilterRole)) + self.assertIsNone(ind.data(Qt.ToolTipRole)) + + ind = model.index(3, 0) # separator + self.assertIsNone(ind.data()) + self.assertIsNone(ind.data(model.HintRole)) + self.assertIsNone(ind.data(model.FilterRole)) + + def test_defaultcontmodel(self): + model = DefaultContModel() + self.assertEqual(1, model.rowCount(QModelIndex())) + self.assertEqual(1, model.columnCount(QModelIndex())) + ind = model.index(0, 0) + spy = QSignalSpy(model.dataChanged) + model.setMethod("mega encoding") + self.assertEqual(spy[0][0].row(), 0) + self.assertEqual(ind.data(), "Preset: mega encoding") + self.assertIsNotNone(ind.data(Qt.DecorationRole)) + self.assertIsNotNone(ind.data(Qt.ToolTipRole)) + + +class TestListViewDelegate(unittest.TestCase): + def test_displaytext(self): + delegate = ListViewSearch.Delegate() + self.assertEqual(delegate.displayText(("a", "foo", False), Mock()), + "a") + self.assertEqual(delegate.displayText(("a", "foo", True), Mock()), + "a: foo") + 
delegate.set_default_hints(True) + self.assertEqual(delegate.displayText(("a", "foo", False), Mock()), + "a: foo") + delegate.set_default_hints(False) + self.assertEqual(delegate.displayText(("a", "foo", False), Mock()), + "a") + + @patch.object(SeparatedListDelegate, "initStyleOption") + def test_bold(self, _): + delegate = ListViewSearch.Delegate() + option = Mock() + index = Mock() + index.data = lambda role: ("foo", True) \ + if role == ContDomainModel.HintRole else None + delegate.initStyleOption(option, index) + option.font.setBold.assert_called_with(True) + index.data = lambda role: ("foo", False) \ + if role == ContDomainModel.HintRole else None + delegate.initStyleOption(option, index) + option.font.setBold.assert_called_with(False) + index.data = lambda role: None \ + if role == ContDomainModel.HintRole else None + delegate.initStyleOption(option, index) + option.font.setBold.assert_called_with(False) if __name__ == "__main__": diff --git a/Orange/widgets/utils/itemmodels.py b/Orange/widgets/utils/itemmodels.py index a749b7115c0..2bd996c884a 100644 --- a/Orange/widgets/utils/itemmodels.py +++ b/Orange/widgets/utils/itemmodels.py @@ -22,7 +22,8 @@ import numpy from orangewidget.utils.itemmodels import ( - PyListModel, AbstractSortTableModel as _AbstractSortTableModel + PyListModel, AbstractSortTableModel as _AbstractSortTableModel, + LabelledSeparator, SeparatorItem ) from Orange.widgets.utils.colorpalettes import ContinuousPalettes, ContinuousPalette @@ -451,7 +452,9 @@ class DomainModel(VariableListModel): PRIMITIVE = (DiscreteVariable, ContinuousVariable) def __init__(self, order=SEPARATED, separators=True, placeholder=None, - valid_types=None, alphabetical=False, skip_hidden_vars=True, **kwargs): + valid_types=None, alphabetical=False, skip_hidden_vars=True, + *, strict_type=False, + **kwargs): """ Parameters @@ -465,9 +468,13 @@ def __init__(self, order=SEPARATED, separators=True, placeholder=None, valid_types: tuple (Sub)types of `Variable` that are 
included in the model alphabetical: bool - If true, variables are sorted alphabetically. + If True, variables are sorted alphabetically. skip_hidden_vars: bool - If true, variables marked as "hidden" are skipped. + If True, variables marked as "hidden" are skipped. + strict_type: bool + If True, variable must be one of specified valid_types and not a + derived type (i.e. TimeVariable is not accepted as + ContinuousVariable) """ super().__init__(placeholder=placeholder, **kwargs) if isinstance(order, int): @@ -479,9 +486,10 @@ def __init__(self, order=SEPARATED, separators=True, placeholder=None, (self.Separator, ) * (self.Separator in order) + \ order if not separators: - order = [e for e in order if e is not self.Separator] + order = [e for e in order if not isinstance(e, SeparatorItem)] self.order = order self.valid_types = valid_types + self.strict_type = strict_type self.alphabetical = alphabetical self.skip_hidden_vars = skip_hidden_vars self._within_set_domain = False @@ -493,10 +501,10 @@ def set_domain(self, domain): # The logic related to separators is a bit complicated: it ensures that # even when a section is empty we don't have two separators in a row # or a separator at the end - add_separator = False + add_separator = None for section in self.order: - if section is self.Separator: - add_separator = True + if isinstance(section, SeparatorItem): + add_separator = section continue if isinstance(section, int): if domain is None: @@ -509,7 +517,9 @@ def set_domain(self, domain): to_add = list(filter_visible(to_add)) if self.valid_types is not None: to_add = [var for var in to_add - if isinstance(var, self.valid_types)] + if (type(var) in self.valid_types + if self.strict_type + else isinstance(var, self.valid_types))] if self.alphabetical: to_add = sorted(to_add, key=lambda x: x.name) elif isinstance(section, list): @@ -517,9 +527,10 @@ def set_domain(self, domain): else: to_add = [section] if to_add: - if add_separator and content: - 
content.append(self.Separator) - add_separator = False + if add_separator and ( + content or isinstance(add_separator, LabelledSeparator)): + content.append(add_separator) + add_separator = None content += to_add try: self._within_set_domain = True diff --git a/doc/visual-programming/source/widgets/data/continuize.md b/doc/visual-programming/source/widgets/data/continuize.md index 2097189f127..0a3d85bc714 100644 --- a/doc/visual-programming/source/widgets/data/continuize.md +++ b/doc/visual-programming/source/widgets/data/continuize.md @@ -11,13 +11,21 @@ Turns discrete variables (attributes) into numeric ("continuous") dummy variable - Data: transformed data set -The **Continuize** widget receives a data set in the input and outputs the same data set in which the discrete variables (including binary variables) are replaced with continuous ones. +The **Continuize** widget receives a data set in the input and outputs the same data set in which some or all categorical variables are replaced with continuous ones and numeric variables are scaled. ![](images/Continuize-stamped.png) -1. Define the treatment of non-binary categorical variables. +1. Select a categorical attribute to define its specific treatment, or click the "Default" option above to set the default treatment for all categorical attributes without specific settings. - Examples in this section will assume that we have a discrete attribute status with the values low, middle and high, listed in that order. Options for their transformation are: + Multiple attributes can be chosen. + +2. Define the treatment of categorical variables. + + Examples in this section will assume that we have a categorical attribute *status* with values *low*, *middle* and *high*, listed in that order. Options for their transformation are: + + - **Use default setting**: use the default treatment. + + - **Leave categorical**: leave the attribute as it is.
- **First value as base**: a N-valued categorical variable will be transformed into N-1 numeric variables, each serving as an indicator for one of the original values except for the base value. The base value is the first value in the list. By default, the values are ordered alphabetically; their order can be changed in [Edit Domain](../data/editdomain). @@ -25,31 +33,37 @@ The **Continuize** widget receives a data set in the input and outputs the same - **Most frequent value as base**: similar to the above, except that the most frequent value is used as a base. So, if the most frequent value in the above example is *middle*, then *middle* is considered as the base and the two newly constructed variables are *status=low* and *status=high*. - - **One attribute per value**: this option constructs one numeric variable per each value of the original variable. In the above case, we would get variables *status=low*, *status=middle* and *status=high*. + - **One-hot encoding**: this option constructs one numeric variable per each value of the original variable. In the above case, we would get variables *status=low*, *status=middle* and *status=high*. - - **Ignore multinomial attributes**: removes non-binary categorical variables from the data. + - **Remove if more than 2 values**: removes non-binary categorical variables from the data. - - **Treat as ordinal**: converts the variable into a single numeric variable enumerating the original values. In the above case, the new variable would have the value of 0 for *low*, 1 for *middle* and 2 for *high*. Again note that the order of values can be set in [Edit Domain](../data/editdomain). - - - **Divide by number of values**: same as above, except that values are normalized into range 0-1. In our example, the values of the new variable would be 0, 0.5 and 1. + - **Remove**: removes the attribute. -2. Define the treatment of continuous attributes.
Besised the option to *Leave them as they are*, we can *Normalize by span*, which will subtract the lowest value found in the data and divide by the span, so all values will fit into [0, 1]. Option *Normalize by standard deviation* subtracts the average and divides by the standard deviation. + - **Treat as ordinal**: converts the variable into a single numeric variable enumerating the original values. In the above case, the new variable would have the value of 0 for *low*, 1 for *middle* and 2 for *high*. Again note that the order of values can be set in [Edit Domain](../data/editdomain). -3. Define the treatment of class attributes (outcomes, targets). Besides leaving it as it is, the available options mirror those for multinomial attributes, except for those that would split the outcome into multiple outcome variables. + - **Treat as normalized ordinal**: same as above, except that values are normalized into range 0-1. In our example, the values of the new variable would be 0, 0.5 and 1. -4. This option defines the ranges of new variables. In the above text, we supposed the range *from 0 to 1*. +3. Select attributes to set individual treatments or click "Default" to set the default treatment for numeric attributes. -5. Produce a report. +4. Define the treatment of numeric attributes. -6. If *Apply automatically* is ticked, changes are committed automatically. Otherwise, you have to press *Apply* after each change. + - **Use default setting**: use the general default. + - **Leave as it is**: do not change anything. + - **Standardize**: subtract the mean and divide by the standard deviation (not available for sparse data). + - **Center**: subtract the mean (not available for sparse data). + - **Scale**: divide by standard deviation. 
+ - **Normalize to interval [-1, 1]**: linearly scale the values into interval [-1, 1] (not available for sparse data). + - **Normalize to interval [0, 1]**: linearly scale the values into interval [0, 1] (not available for sparse data). + +5. If checked, the class attribute is converted in the same fashion as categorical attributes that are treated as ordinal (see above). Examples -------- -First, let's see what is the output of the **Continuize** widget. We feed the original data (the *Heart disease* data set) into the [Data Table](../data/datatable) and see how they look like. Then we continuize the discrete values and observe them in another [Data Table](../data/datatable). +First, let's see what the output of the **Continuize** widget is. We feed the original data (the *Heart disease* data set) into the [Data Table](../data/datatable) and see what they look like. Then we continuize the discrete values using various options and observe them in another [Data Table](../data/datatable). ![](images/Continuize-Example1.png) -In the second example, we show a typical use of this widget - in order to properly plot the linear projection of the data, discrete attributes need to be converted to continuous ones and that is why we put the data through the **Continuize** widget before drawing it. The attribute "*chest pain*" originally had four values and was transformed into three continuous attributes; similar happened to gender, which was transformed into a single attribute "*gender=female*". +In the second example, we show a typical use of this widget - in order to properly plot the linear projection of the data, discrete attributes need to be converted to continuous ones and that is why we put the data through the **Continuize** widget before drawing it. Gender, for instance, is transformed into two attributes "*gender=female*" and "*gender=male*".
![](images/Continuize-Example2.png) diff --git a/doc/visual-programming/source/widgets/data/images/Continuize-Example1.png b/doc/visual-programming/source/widgets/data/images/Continuize-Example1.png index 6b4d376060f..cbd915b5f87 100644 Binary files a/doc/visual-programming/source/widgets/data/images/Continuize-Example1.png and b/doc/visual-programming/source/widgets/data/images/Continuize-Example1.png differ diff --git a/doc/visual-programming/source/widgets/data/images/Continuize-Example2.png b/doc/visual-programming/source/widgets/data/images/Continuize-Example2.png index e33032ccae2..5861e307fe7 100644 Binary files a/doc/visual-programming/source/widgets/data/images/Continuize-Example2.png and b/doc/visual-programming/source/widgets/data/images/Continuize-Example2.png differ diff --git a/doc/visual-programming/source/widgets/data/images/Continuize-stamped.png b/doc/visual-programming/source/widgets/data/images/Continuize-stamped.png index 1fb7ae34e61..3bd67ffda14 100644 Binary files a/doc/visual-programming/source/widgets/data/images/Continuize-stamped.png and b/doc/visual-programming/source/widgets/data/images/Continuize-stamped.png differ