From 4c9885f467d0e2b6cc8acead0c6808012eaff247 Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Wed, 16 Nov 2022 00:26:11 +0100 Subject: [PATCH 1/2] Test statistics.util.stats with unit weights and nans --- Orange/tests/test_statistics.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Orange/tests/test_statistics.py b/Orange/tests/test_statistics.py index 15dca625395..e2db88b6fe6 100644 --- a/Orange/tests/test_statistics.py +++ b/Orange/tests/test_statistics.py @@ -128,6 +128,17 @@ def test_stats_weights(self): X = np.arange(4).reshape(2, 2).astype(object) np.testing.assert_equal(stats(X, weights), stats(X)) + def test_stats_nans_neutral_weights(self): + X = np.arange(4).reshape(2, 2).astype(float) + X[0, 0] = np.nan + np.testing.assert_equal(stats(X, weights=np.array([1, 1])), stats(X)) + + def test_stats_nans_neutral_weights_sparse(self): + X = np.arange(4).reshape(2, 2).astype(float) + X = csr_matrix(X) + X[0, 0] = np.nan + np.testing.assert_equal(stats(X, weights=np.array([1, 1])), stats(X)) + def test_stats_weights_sparse(self): X = np.arange(4).reshape(2, 2).astype(float) X = csr_matrix(X) From 9c69c4dd754b74e2327406799b5686332429f517 Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Tue, 15 Nov 2022 13:26:23 +0100 Subject: [PATCH 2/2] Orange.statistics.util.nanmean extended with weights --- Orange/statistics/util.py | 34 ++++++++++++++++--------- Orange/tests/test_statistics.py | 45 +++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 12 deletions(-) diff --git a/Orange/statistics/util.py b/Orange/statistics/util.py index 36a910029bb..b2042f36223 100644 --- a/Orange/statistics/util.py +++ b/Orange/statistics/util.py @@ -342,20 +342,14 @@ def stats(X, weights=None, compute_variance=False): is_numeric = np.issubdtype(X.dtype, np.number) is_sparse = sp.issparse(X) weighted = weights is not None and X.dtype != object - - def weighted_mean(): - if is_sparse: - w_X = X.multiply(sp.csr_matrix(np.c_[weights] / sum(weights))) - return np.asarray(w_X.sum(axis=0)).ravel() - else: - return bn.nansum(X * np.c_[weights] / sum(weights), axis=0) + weights = weights if weighted else None if X.size and is_numeric and not is_sparse: nans = np.isnan(X).sum(axis=0) return np.column_stack(( np.nanmin(X, axis=0), np.nanmax(X, axis=0), - nanmean(X, axis=0) if not weighted else weighted_mean(), + nanmean(X, axis=0, weights=weights), nanvar(X, axis=0) if compute_variance else \ np.zeros(X.shape[1] if X.ndim == 2 else 1), nans, @@ -369,7 +363,7 @@ def weighted_mean(): return np.column_stack(( nanmin(X, axis=0), nanmax(X, axis=0), - nanmean(X, axis=0) if not weighted else weighted_mean(), + nanmean(X, axis=0, weights=weights), np.zeros(X.shape[1]), # variance not supported X.shape[0] - non_zero, non_zero)) @@ -453,15 +447,31 @@ def nansum_sparse(x): return _apply_func(x, np.nansum, nansum_sparse, axis=axis) -def nanmean(x, axis=None): +def nanmean(x, axis=None, weights=None): """ Equivalent of np.nanmean that supports sparse or dense matrices. """ + if axis is None and weights is not None: + raise NotImplementedError("weights are only supported if axis is defined") + if not sp.issparse(x): - means = bn.nanmean(x, axis=axis) + if weights is None: + means = bn.nanmean(x, axis=axis) + else: + if axis == 0: + weights = weights.reshape(-1, 1) + elif axis == 1: + weights = weights.reshape(1, -1) + else: + raise NotImplementedError + nanw = ~np.isnan(x) * weights # do not divide by non-used weights + means = bn.nansum(x * weights, axis=axis) / np.sum(nanw, axis=axis) elif axis is None: means, _ = mean_variance_axis(x, axis=0) means = np.nanmean(means) else: - means, _ = mean_variance_axis(x, axis=axis) + # mean_variance_axis is picky regarding the input type + if weights is not None: + weights = weights.astype(float) + means, _ = mean_variance_axis(x, axis=axis, weights=weights) return means diff --git a/Orange/tests/test_statistics.py b/Orange/tests/test_statistics.py index e2db88b6fe6..f670e4ddeed 100644 --- a/Orange/tests/test_statistics.py +++ b/Orange/tests/test_statistics.py @@ -353,6 +353,16 @@ def setUp(self): self.random_state = check_random_state(42) self.x = self.random_state.uniform(size=(10, 5)) np.fill_diagonal(self.x, np.nan) + self.y = np.array([[0, 1, 5], + [3, 4, np.nan], + [2, np.nan, np.nan], + [np.nan, np.nan, np.nan]]) + self.r0 = [5/3, 5/2, 5/1] + self.r1 = [6/3, 7/2, 2/1, np.nan] + self.w0 = np.array([4, 3, 2, 1]) + self.w1 = np.array([1, 2, 3]) + self.r0w = [13/9, 16/7, 20/4] + self.r1w = [17/6, 11/3, 2/1, np.nan] @dense_sparse def test_axis_none(self, array): @@ -372,6 +382,41 @@ def test_axis_1(self, array): np.nanmean(self.x, axis=1), nanmean(array(self.x), axis=1) ) + @dense_sparse + def test_weights_axis_none(self, array): + with self.assertRaises(NotImplementedError): + nanmean(array(self.x), weights=1) + + @dense_sparse + def test_weights_axis_0(self, array): + np.testing.assert_almost_equal( + self.r0, nanmean(array(self.y), axis=0) + ) + np.testing.assert_almost_equal( + self.r1, nanmean(array(self.y.T), axis=0) + ) + np.testing.assert_almost_equal( + self.r0w, nanmean(array(self.y), axis=0, weights=self.w0) + ) + np.testing.assert_almost_equal( + self.r1w, nanmean(array(self.y.T), axis=0, weights=self.w1) + ) + + @dense_sparse + def test_weights_axis_1(self, array): + np.testing.assert_almost_equal( + self.r1, nanmean(array(self.y), axis=1) + ) + np.testing.assert_almost_equal( + self.r0, nanmean(array(self.y.T), axis=1) + ) + np.testing.assert_almost_equal( + self.r1w, nanmean(array(self.y), axis=1, weights=self.w1) + ) + np.testing.assert_almost_equal( + self.r0w, nanmean(array(self.y.T), axis=1, weights=self.w0) + ) + class TestDigitize(unittest.TestCase): def setUp(self):