Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Weighted mean computation in Orange.statistics.util.stats #6204

Merged
merged 2 commits into from
Nov 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 22 additions & 12 deletions Orange/statistics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,20 +342,14 @@ def stats(X, weights=None, compute_variance=False):
is_numeric = np.issubdtype(X.dtype, np.number)
is_sparse = sp.issparse(X)
weighted = weights is not None and X.dtype != object

def weighted_mean():
if is_sparse:
w_X = X.multiply(sp.csr_matrix(np.c_[weights] / sum(weights)))
return np.asarray(w_X.sum(axis=0)).ravel()
else:
return bn.nansum(X * np.c_[weights] / sum(weights), axis=0)
weights = weights if weighted else None

if X.size and is_numeric and not is_sparse:
nans = np.isnan(X).sum(axis=0)
return np.column_stack((
np.nanmin(X, axis=0),
np.nanmax(X, axis=0),
nanmean(X, axis=0) if not weighted else weighted_mean(),
nanmean(X, axis=0, weights=weights),
nanvar(X, axis=0) if compute_variance else \
np.zeros(X.shape[1] if X.ndim == 2 else 1),
nans,
Expand All @@ -369,7 +363,7 @@ def weighted_mean():
return np.column_stack((
nanmin(X, axis=0),
nanmax(X, axis=0),
nanmean(X, axis=0) if not weighted else weighted_mean(),
nanmean(X, axis=0, weights=weights),
np.zeros(X.shape[1]), # variance not supported
X.shape[0] - non_zero,
non_zero))
Expand Down Expand Up @@ -453,15 +447,31 @@ def nansum_sparse(x):
return _apply_func(x, np.nansum, nansum_sparse, axis=axis)


def nanmean(x, axis=None, weights=None):
    """Equivalent of np.nanmean that supports sparse or dense matrices.

    Parameters
    ----------
    x : np.ndarray or scipy sparse matrix
        Input data.
    axis : int, optional
        Axis along which the means are computed. With `weights` and a
        dense `x`, only 0 and 1 are supported.
    weights : array-like, optional
        One weight per element along `axis`: per row for ``axis=0``,
        per column for ``axis=1``.

    Returns
    -------
    means
        The (optionally weighted) mean(s) of `x` along `axis`.

    Raises
    ------
    NotImplementedError
        If `weights` are given with ``axis=None``, or with a dense `x`
        and an axis other than 0 or 1.
    """
    if axis is None and weights is not None:
        raise NotImplementedError("weights are only supported if axis is defined")

    if weights is not None:
        # Accept any array-like (list, tuple, ...), not only ndarrays.
        weights = np.asarray(weights)

    if not sp.issparse(x):
        if weights is None:
            means = bn.nanmean(x, axis=axis)
        else:
            # Shape the weights so they broadcast along the reduced axis.
            if axis == 0:
                weights = weights.reshape(-1, 1)
            elif axis == 1:
                weights = weights.reshape(1, -1)
            else:
                raise NotImplementedError(
                    "weights are only supported for axis 0 or 1")
            # Do not divide by non-used weights: a weight counts towards
            # the normaliser only where the corresponding value is not NaN.
            nanw = ~np.isnan(x) * weights
            means = bn.nansum(x * weights, axis=axis) / np.sum(nanw, axis=axis)
    elif axis is None:
        # Sparse input without an axis: average the per-column means.
        means, _ = mean_variance_axis(x, axis=0)
        means = np.nanmean(means)
    else:
        # mean_variance_axis is picky regarding the input type
        if weights is not None:
            weights = weights.astype(float)
        means, _ = mean_variance_axis(x, axis=axis, weights=weights)

    return means

Expand Down
56 changes: 56 additions & 0 deletions Orange/tests/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,17 @@ def test_stats_weights(self):
X = np.arange(4).reshape(2, 2).astype(object)
np.testing.assert_equal(stats(X, weights), stats(X))

def test_stats_nans_neutral_weights(self):
    """Unit weights must not change the stats of a dense matrix with a NaN."""
    data = np.arange(4).reshape(2, 2).astype(float)
    data[0, 0] = np.nan
    unit_weights = np.array([1, 1])
    weighted = stats(data, weights=unit_weights)
    unweighted = stats(data)
    np.testing.assert_equal(weighted, unweighted)

def test_stats_nans_neutral_weights_sparse(self):
    """Unit weights must not change the stats of a sparse matrix with a NaN."""
    X = np.arange(4).reshape(2, 2).astype(float)
    # Insert the NaN while the data is still dense: element (0, 0) is zero,
    # so assigning to it after the CSR conversion would change the sparsity
    # structure and emit scipy's SparseEfficiencyWarning.
    X[0, 0] = np.nan
    X = csr_matrix(X)
    np.testing.assert_equal(stats(X, weights=np.array([1, 1])), stats(X))

def test_stats_weights_sparse(self):
X = np.arange(4).reshape(2, 2).astype(float)
X = csr_matrix(X)
Expand Down Expand Up @@ -342,6 +353,16 @@ def setUp(self):
self.random_state = check_random_state(42)
self.x = self.random_state.uniform(size=(10, 5))
np.fill_diagonal(self.x, np.nan)
self.y = np.array([[0, 1, 5],
[3, 4, np.nan],
[2, np.nan, np.nan],
[np.nan, np.nan, np.nan]])
self.r0 = [5/3, 5/2, 5/1]
self.r1 = [6/3, 7/2, 2/1, np.nan]
self.w0 = np.array([4, 3, 2, 1])
self.w1 = np.array([1, 2, 3])
self.r0w = [13/9, 16/7, 20/4]
self.r1w = [17/6, 11/3, 2/1, np.nan]

@dense_sparse
def test_axis_none(self, array):
Expand All @@ -361,6 +382,41 @@ def test_axis_1(self, array):
np.nanmean(self.x, axis=1), nanmean(array(self.x), axis=1)
)

@dense_sparse
def test_weights_axis_none(self, array):
    """Passing weights without an axis must raise NotImplementedError."""
    with self.assertRaises(NotImplementedError):
        matrix = array(self.x)
        nanmean(matrix, weights=1)

@dense_sparse
def test_weights_axis_0(self, array):
    """Check axis-0 means of y and y.T, unweighted and weighted, against the fixtures."""
    # (expected result, input data, extra keyword arguments for nanmean)
    cases = (
        (self.r0, self.y, {}),
        (self.r1, self.y.T, {}),
        (self.r0w, self.y, {"weights": self.w0}),
        (self.r1w, self.y.T, {"weights": self.w1}),
    )
    for expected, data, extra in cases:
        np.testing.assert_almost_equal(
            expected, nanmean(array(data), axis=0, **extra)
        )

@dense_sparse
def test_weights_axis_1(self, array):
    """Check axis-1 means of y and y.T, unweighted and weighted, against the fixtures."""
    # (expected result, input data, extra keyword arguments for nanmean)
    cases = (
        (self.r1, self.y, {}),
        (self.r0, self.y.T, {}),
        (self.r1w, self.y, {"weights": self.w1}),
        (self.r0w, self.y.T, {"weights": self.w0}),
    )
    for expected, data, extra in cases:
        np.testing.assert_almost_equal(
            expected, nanmean(array(data), axis=1, **extra)
        )


class TestDigitize(unittest.TestCase):
def setUp(self):
Expand Down