
[ENH] OwLouvain: Add normalize data checkbox to PCA preprocessing #3573

Merged (7 commits) on Feb 15, 2019
19 changes: 16 additions & 3 deletions Orange/preprocess/normalize.py
@@ -11,21 +11,24 @@ class Normalizer(Reprable):
     def __init__(self,
                  zero_based=True,
                  norm_type=Normalize.NormalizeBySD,
-                 transform_class=False):
+                 transform_class=False,
+                 center=True):
         self.zero_based = zero_based
         self.norm_type = norm_type
         self.transform_class = transform_class
+        self.center = center

     def __call__(self, data):
         dists = distribution.get_distributions(data)
         new_attrs = [self.normalize(dists[i], var) for
                      (i, var) in enumerate(data.domain.attributes)]

         new_class_vars = data.domain.class_vars
         if self.transform_class:
             attr_len = len(data.domain.attributes)
             new_class_vars = [self.normalize(dists[i + attr_len], var) for
                               (i, var) in enumerate(data.domain.class_vars)]

         domain = Domain(new_attrs, new_class_vars, data.domain.metas)
         return data.transform(domain)

@@ -41,7 +44,17 @@ def normalize_by_sd(self, dist, var):
         avg, sd = (dist.mean(), dist.standard_deviation()) if dist.size else (0, 1)
         if sd == 0:
             sd = 1
-        return ContinuousVariable(var.name, compute_value=Norm(var, avg, 1 / sd), sparse=var.sparse)
+
+        if self.center:
+            compute_val = Norm(var, avg, 1 / sd)
+        else:
+            compute_val = Norm(var, 0, 1 / sd)
+
+        return ContinuousVariable(
+            var.name,
+            compute_value=compute_val,
+            sparse=var.sparse,
+        )

     def normalize_by_span(self, dist, var):
         dma, dmi = dist.max(), dist.min()
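Note: a minimal sketch of what the two `Norm` configurations compute, assuming `Norm(var, offset, factor)` (the `Normalizer` transformation from `Orange.preprocess.transformation`) maps a value x to (x - offset) * factor. Skipping the mean subtraction keeps zero entries at zero, which is why a "normalize, but do not center" option is useful for sparse data in the OWLouvain PCA preprocessing:

    import numpy as np

    # Assumed semantics: Norm(var, offset, factor) computes (x - offset) * factor.
    x = np.array([0.0, 0.0, 3.0, 5.0])   # a mostly-zero column
    avg, sd = x.mean(), x.std()

    centered = (x - avg) / sd     # center=True: zeros are shifted away from zero
    scaled_only = x / sd          # center=False: zeros stay exactly zero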
15 changes: 13 additions & 2 deletions Orange/preprocess/preprocess.py
@@ -269,6 +269,8 @@ class Normalize(Preprocess):
     Parameters
     ----------
     zero_based : bool (default=True)
+        Only used when `norm_type=NormalizeBySpan`.
+
         Determines the value used as the “low” value of the variable.
         It determines the interval for normalized continuous variables
         (either [-1, 1] or [0, 1]).
@@ -286,6 +288,11 @@ class Normalize(Preprocess):
     transform_class : bool (default=False)
         If True the class is normalized as well.

+    center : bool (default=True)
+        Only used when `norm_type=NormalizeBySD`.
+
+        Whether or not to center the data so it has mean zero.

     Examples
     --------
     >>> from Orange.data import Table
@@ -301,10 +308,12 @@ class Normalize(Preprocess):
     def __init__(self,
                  zero_based=True,
                  norm_type=NormalizeBySD,
-                 transform_class=False):
+                 transform_class=False,
+                 center=True):
         self.zero_based = zero_based
         self.norm_type = norm_type
         self.transform_class = transform_class
+        self.center = center

     def __call__(self, data):
         """
@@ -334,7 +343,9 @@ def __call__(self, data):
         normalizer = normalize.Normalizer(
             zero_based=self.zero_based,
             norm_type=self.norm_type,
-            transform_class=self.transform_class)
+            transform_class=self.transform_class,
+            center=self.center,
+        )
         return normalizer(data)


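Note: a hedged usage sketch of the extended preprocessor, following the Examples section of the docstring above (the dataset name is only illustrative):

    from Orange.data import Table
    from Orange.preprocess import Normalize

    data = Table("heart_disease")
    # Scale each attribute by its standard deviation but leave the mean untouched.
    normalizer = Normalize(norm_type=Normalize.NormalizeBySD, center=False)
    normalized_data = normalizer(data)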
17 changes: 17 additions & 0 deletions Orange/statistics/util.py
@@ -13,6 +13,23 @@
from sklearn.utils.sparsefuncs import mean_variance_axis


+def sparse_array_equal(x1, x2):
+    """Check if two sparse arrays are equal."""
+    if not sp.issparse(x1):
+        raise TypeError("`x1` must be sparse.")
+    if not sp.issparse(x2):
+        raise TypeError("`x2` must be sparse.")
+
+    return x1.shape == x2.shape and (x1 != x2).nnz == 0
+
+
+def array_equal(x1, x2):
+    """Equivalent of np.array_equal that properly handles sparse matrices."""
+    if sp.issparse(x1) and sp.issparse(x2):
+        return sparse_array_equal(x1, x2)
+    return np.array_equal(x1, x2)


 def _count_nans_per_row_sparse(X, weights, dtype=None):
     """ Count the number of nans (undefined) values per row. """
     if weights is not None:
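Note: a quick sketch of how the new helper behaves (a minimal illustration, not part of the diff):

    import numpy as np
    import scipy.sparse as sp
    from Orange.statistics.util import array_equal

    dense = np.array([[0, 1], [0, 2]])
    sparse = sp.csr_matrix(dense)

    array_equal(dense, dense.copy())               # True: falls back to np.array_equal
    array_equal(sparse, sparse.copy())             # True: compares shape and differing entries
    array_equal(sparse, sp.csr_matrix(np.eye(2)))  # False: values differ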
21 changes: 20 additions & 1 deletion Orange/tests/test_statistics.py
@@ -9,7 +9,7 @@

 from Orange.statistics.util import bincount, countnans, contingency, digitize, \
     mean, nanmax, nanmean, nanmedian, nanmin, nansum, nanunique, stats, std, \
-    unique, var, nanstd, nanvar, nanmode
+    unique, var, nanstd, nanvar, nanmode, array_equal
from sklearn.utils import check_random_state


@@ -590,6 +590,25 @@ def test_nanunique_ignores_nans_in_counts(self, array):
         np.testing.assert_equal(nanunique(x, return_counts=True)[1], expected)


+class TestArrayEqual(unittest.TestCase):
+    @dense_sparse
+    def test_same_matrices(self, array):
+        x = array([0, 1, 0, 0, 2])
+        self.assertTrue(array_equal(x, x))
+
+    @dense_sparse
+    def test_with_different_shapes(self, array):
+        x = array(np.eye(4))
+        y = array(np.eye(5))
+        self.assertFalse(array_equal(x, y))
+
+    @dense_sparse
+    def test_with_different_values(self, array):
+        x = array([0, 1, 0, 0, 2])
+        y = array([0, 3, 0, 0, 2])
+        self.assertFalse(array_equal(x, y))


 class TestNanModeAppVeyor(unittest.TestCase):
     def test_appveyour_still_not_onscipy_1_2_0(self):
         import scipy
18 changes: 3 additions & 15 deletions Orange/widgets/data/tests/test_owfeaturestatistics.py
@@ -1,9 +1,9 @@
 import datetime
 import warnings
 from collections import namedtuple
-from functools import wraps, partial
+from functools import partial
 from itertools import chain
-from typing import Callable, List
+from typing import List

import numpy as np
from AnyQt.QtCore import QItemSelection, QItemSelectionRange, \
@@ -12,7 +12,7 @@
 from Orange.data import Table, Domain, StringVariable, ContinuousVariable, \
     DiscreteVariable, TimeVariable
 from Orange.widgets.tests.base import WidgetTest, datasets
-from Orange.widgets.tests.utils import simulate
+from Orange.widgets.tests.utils import simulate, table_dense_sparse
 from Orange.widgets.data.owfeaturestatistics import \
     OWFeatureStatistics

@@ -175,18 +175,6 @@ def make_table(attributes, target=None, metas=None):
)


-def table_dense_sparse(test_case):
-    # type: (Callable) -> Callable
-    """Run a single test case on both dense and sparse Orange tables."""
-
-    @wraps(test_case)
-    def _wrapper(self):
-        test_case(self, lambda table: table.to_dense())
-        test_case(self, lambda table: table.to_sparse())
-
-    return _wrapper


 class TestVariousDataSets(WidgetTest):
     def setUp(self):
         self.widget = self.create_widget(
23 changes: 23 additions & 0 deletions Orange/widgets/tests/utils.py
@@ -1,4 +1,6 @@
 import sys
+from functools import wraps
+
 import warnings
 import contextlib

@@ -317,3 +319,24 @@ def mouseMove(widget, pos=QPoint(), delay=-1):  # pragma: no-cover
     QTest.qWait(delay)

     QApplication.sendEvent(widget, me)
+
+
+def table_dense_sparse(test_case):
+    # type: (Callable) -> Callable
+    """Run a single test case on both dense and sparse Orange tables.
+
+    Examples
+    --------
+    >>> @table_dense_sparse
+    ... def test_something(self, prepare_table):
+    ...     data: Table  # The table you want to test on
+    ...     data = prepare_table(data)  # This converts the table to dense/sparse
+
+    """
+
+    @wraps(test_case)
+    def _wrapper(self):
+        test_case(self, lambda table: table.to_dense())
+        test_case(self, lambda table: table.to_sparse())
+
+    return _wrapper
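Note: a hypothetical test sketch showing how the relocated decorator is meant to be used from any widget test module (the iris dataset and the specific assertion are only illustrative):

    from Orange.data import Table
    from Orange.widgets.tests.base import WidgetTest
    from Orange.widgets.tests.utils import table_dense_sparse

    class TestSomething(WidgetTest):
        @table_dense_sparse
        def test_runs_on_dense_and_sparse(self, prepare_table):
            # The decorator runs this body twice: once with a dense table,
            # once with its sparse counterpart.
            data = prepare_table(Table("iris"))
            self.assertEqual(len(data), 150)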