Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Statistics.countnans/bincount: Fix NaN Counting, Consider Implicit Zeros #2698

Merged
merged 20 commits into from
Oct 21, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
b39db6e
Statistics.countnans: Fix sparse implementation and add axis support
pavlin-policar Sep 4, 2017
1d2bee0
Statistics.bincount: Fix sparse implementation
pavlin-policar Sep 8, 2017
ef2ba73
Statistics.tests: Implement dense_sparse decorator
pavlin-policar Sep 8, 2017
ee8634b
Statistics.countnans: Support 2d weights for sparse matrices
pavlin-policar Sep 8, 2017
941bd2b
Statistics.digitize: Move tests to own class and use dense_sparse dec…
pavlin-policar Sep 8, 2017
ea74b94
Statistics.bincount: Add weight support to sparse, add docstring
pavlin-policar Sep 8, 2017
ab5cc8b
Statistics: Implement sparse_count_zeros
pavlin-policar Sep 8, 2017
b4eb25a
Statistics.countnans: Add dtype param support to sparse
pavlin-policar Sep 8, 2017
ca4c80f
Table._compute_distributions: Fix parameter ordering to bincount call
pavlin-policar Sep 8, 2017
09ddc33
Statistics.sparse_has_zeros: Make public
pavlin-policar Sep 9, 2017
0057143
Table._compute_distributions: Correctly count zeros in sparse continu…
pavlin-policar Sep 9, 2017
a21af1a
DomainDistribution: Change tests to check for true zero counts
pavlin-policar Sep 9, 2017
d7d91c8
TestNormalize: Fix failing test due to previous handling of zeros in …
pavlin-policar Sep 9, 2017
afa3df8
Statistics.countnans: Fix copy=False param from coo.tocsr call
pavlin-policar Sep 9, 2017
6f12808
Pylint: Add pylint ignores to more human-friendly formatted matrices
pavlin-policar Sep 9, 2017
dd516a7
Statistics.countnans: Support csc_matrices
pavlin-policar Oct 20, 2017
e515f30
Statistics: Rename sparse_zeros to sparse_implicit_zeros
pavlin-policar Oct 20, 2017
e4206e2
Statistics.tests: Inject explicit zeros into dense_sparse decorator
pavlin-policar Oct 20, 2017
473f867
Statistics.countnans: Fix distributions test
pavlin-policar Oct 20, 2017
b7bc576
Statistics.bincount: Match x and weight axis to properly index weights
pavlin-policar Oct 21, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 39 additions & 22 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
)
from Orange.data.util import SharedComputeValue, vstack, hstack
from Orange.statistics.util import bincount, countnans, contingency, \
stats as fast_stats
stats as fast_stats, sparse_has_implicit_zeros, sparse_count_implicit_zeros, \
sparse_implicit_zero_weights
from Orange.util import flatten

__all__ = ["dataset_dirs", "get_sample_datasets_dir", "RowInstance", "Table"]
Expand Down Expand Up @@ -1384,42 +1385,58 @@ def _compute_distributions(self, columns=None):
columns = range(len(self.domain.variables))
else:
columns = [self.domain.index(var) for var in columns]

distributions = []
if sp.issparse(self.X):
self.X = self.X.tocsc()

W = self.W.ravel() if self.has_weights() else None

for col in columns:
var = self.domain[col]
variable = self.domain[col]

# Select the correct data column from X, Y or metas
if 0 <= col < self.X.shape[1]:
m = self.X[:, col]
x = self.X[:, col]
elif col < 0:
m = self.metas[:, col * (-1) - 1]
if np.issubdtype(m.dtype, np.dtype(object)):
m = m.astype(float)
x = self.metas[:, col * (-1) - 1]
if np.issubdtype(x.dtype, np.dtype(object)):
x = x.astype(float)
else:
m = self._Y[:, col - self.X.shape[1]]
if var.is_discrete:
dist, unknowns = bincount(m, len(var.values) - 1, W)
elif not m.shape[0]:
x = self._Y[:, col - self.X.shape[1]]

if variable.is_discrete:
dist, unknowns = bincount(x, weights=W, max_val=len(variable.values) - 1)
elif not x.shape[0]:
dist, unknowns = np.zeros((2, 0)), 0
else:
if W is not None:
unknowns = countnans(m, W)
if sp.issparse(m):
arg_sort = np.argsort(m.data)
ranks = m.indices[arg_sort]
vals = np.vstack((m.data[arg_sort], W[ranks]))
if sp.issparse(x):
arg_sort = np.argsort(x.data)
ranks = x.indices[arg_sort]
vals = np.vstack((x.data[arg_sort], W[ranks]))
else:
ranks = np.argsort(m)
vals = np.vstack((m[ranks], W[ranks]))
ranks = np.argsort(x)
vals = np.vstack((x[ranks], W[ranks]))
else:
unknowns = countnans(m.astype(float))
if sp.issparse(m):
m = m.data
vals = np.ones((2, m.shape[0]))
vals[0, :] = m
x_values = x.data if sp.issparse(x) else x
vals = np.ones((2, x_values.shape[0]))
vals[0, :] = x_values
vals[0, :].sort()

dist = np.array(_valuecount.valuecount(vals))
# If sparse, then 0s will not be counted with `valuecount`, so
# we have to add them to the result manually.
if sp.issparse(x) and sparse_has_implicit_zeros(x):
if W is not None:
zero_weights = sparse_implicit_zero_weights(x, W).sum()
else:
zero_weights = sparse_count_implicit_zeros(x)
zero_vec = [0, zero_weights]
dist = np.insert(dist, np.searchsorted(dist[0], 0), zero_vec, axis=1)
# Since `countnans` assumes vector shape to be (1, n) and `x`
# shape is (n, 1), we pass the transpose
unknowns = countnans(x.T, W)
distributions.append((dist, unknowns))

return distributions
Expand Down
4 changes: 2 additions & 2 deletions Orange/statistics/distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,8 @@ def mean(self):
return np.average(np.asarray(self[0]), weights=np.asarray(self[1]))

def variance(self):
avg = self.mean()
return sum([((x-avg)**2)*w for x, w in zip(self[0], self[1])])/sum(self[1])
mean = self.mean()
return sum(((x - mean) ** 2) * w for x, w in zip(self[0], self[1])) / sum(self[1])

def standard_deviation(self):
return math.sqrt(self.variance())
Expand Down
207 changes: 158 additions & 49 deletions Orange/statistics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,85 +5,199 @@
It also patches bottleneck to contain these functions.
"""
from warnings import warn
import numpy as np
import scipy.sparse as sp

import bottleneck as bn
import numpy as np
from scipy import sparse as sp


def _count_nans_per_row_sparse(X, weights):
def _count_nans_per_row_sparse(X, weights, dtype=None):
""" Count the number of nans (undefined) values per row. """
items_per_row = 1 if X.ndim == 1 else X.shape[1]
counts = np.ones(X.shape[0]) * items_per_row
nnz_per_row = np.bincount(X.indices, minlength=len(counts))
counts -= nnz_per_row
if weights is not None:
counts *= weights
return np.sum(counts)
X = X.tocoo(copy=False)
nonzero_mask = np.isnan(X.data)
nan_rows, nan_cols = X.row[nonzero_mask], X.col[nonzero_mask]

if weights.ndim == 1:
data_weights = weights[nan_rows]
else:
data_weights = weights[nan_rows, nan_cols]

w = sp.coo_matrix((data_weights, (nan_rows, nan_cols)), shape=X.shape)
w = w.tocsr()

return np.fromiter((np.sum(row.data) for row in w), dtype=dtype)

return np.fromiter((np.isnan(row.data).sum() for row in X), dtype=dtype)

def bincount(X, max_val=None, weights=None, minlength=None):

def sparse_count_implicit_zeros(x):
    """ Return the number of implicit (unstored) zeros in a sparse matrix. """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')
    # Every position not backed by a stored entry is an implicit zero.
    total_entries = np.prod(x.shape)
    return total_entries - x.nnz


def sparse_has_implicit_zeros(x):
    """ Return True if a sparse matrix stores fewer entries than its full size. """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')
    # Fewer stored entries than cells means at least one implicit zero exists.
    return x.nnz != np.prod(x.shape)


def sparse_implicit_zero_weights(x, weights):
    """ Extract the weight values of all implicit zeros in a sparse matrix.

    Parameters
    ----------
    x : sp.spmatrix
        The sparse matrix whose implicit (unstored) zeros are of interest.
        NOTE(review): the flat-index computation below only matches a 1d
        `weights` when `x` is vector-shaped (a single row or column) —
        confirm with callers that `x` is always a vector here.
    weights : np.ndarray
        1d array of weights aligned with the axis of `x` whose length it
        matches.

    Returns
    -------
    np.ndarray
        The entries of `weights` at the positions where `x` stores no value.

    Raises
    ------
    TypeError
        If `x` is not a sparse matrix.
    NotImplementedError
        If `weights` has more than one dimension.
    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')

    if weights.ndim == 1:
        # Match weights and x axis so `indices` will be set appropriately
        if x.shape[0] == weights.shape[0]:
            x = x.tocsc()
        elif x.shape[1] == weights.shape[0]:
            x = x.tocsr()
        n_items = np.prod(x.shape)
        # Positions without a stored entry are exactly the implicit zeros.
        zero_indices = np.setdiff1d(np.arange(n_items), x.indices, assume_unique=True)
        return weights[zero_indices]
    else:
        # Can easily be implemented using a coo_matrix
        raise NotImplementedError(
            'Computing zero weights on n-dimensional weight matrix is not implemented'
        )


def bincount(x, weights=None, max_val=None, minlength=None):
"""Return counts of values in array X.

Works kind of like np.bincount(), except that it also supports floating
arrays with nans.

Parameters
----------
x : array_like, 1 dimension, nonnegative ints
Input array.
weights : array_like, optional
Weights, array of the same shape as x.
max_val : int, optional
Indicates the maximum value we expect to find in X and sets the result
array size accordingly. E.g. if we set `max_val=2` yet the largest
value in X is 1, the result will contain a bin for the value 2, and
will be set to 0. See examples for usage.
minlength : int, optional
A minimum number of bins for the output array. See numpy docs for info.

Returns
-------
Tuple[np.ndarray, int]
Returns the bincounts and the number of NaN values.

Examples
--------
In case `max_val` is provided, the return shape includes bins for these
values as well, even if they do not appear in the data. However, this will
not truncate the bincount if values larger than `max_val` are found.
>>> bincount([0, 0, 1, 1, 2], max_val=4)
(array([ 2., 2., 1., 0., 0.]), 0.0)
>>> bincount([0, 1, 2, 3, 4], max_val=2)
(array([ 1., 1., 1., 1., 1.]), 0.0)

"""
if sp.issparse(X):
minlength = max_val + 1
bin_weights = weights[X.indices] if weights is not None else None
return (np.bincount(X.data.astype(int),
weights=bin_weights,
minlength=minlength, ),
_count_nans_per_row_sparse(X, weights))

X = np.asanyarray(X)
if X.dtype.kind == 'f' and bn.anynan(X):
nonnan = ~np.isnan(X)
X = X[nonnan]
# Store the original matrix before any manipulation to check for sparse
x_original = x
if sp.issparse(x):
if weights is not None:
# Match weights and x axis so `indices` will be set appropriately
if x.shape[0] == weights.shape[0]:
x = x.tocsc()
elif x.shape[1] == weights.shape[0]:
x = x.tocsr()

zero_weights = sparse_implicit_zero_weights(x, weights).sum()
weights = weights[x.indices]
else:
zero_weights = sparse_count_implicit_zeros(x)

x = x.data

x = np.asanyarray(x)
if x.dtype.kind == 'f' and bn.anynan(x):
nonnan = ~np.isnan(x)
x = x[nonnan]
if weights is not None:
nans = (~nonnan * weights).sum(axis=0)
weights = weights[nonnan]
else:
nans = (~nonnan).sum(axis=0)
else:
nans = 0. if X.ndim == 1 else np.zeros(X.shape[1], dtype=float)
nans = 0. if x.ndim == 1 else np.zeros(x.shape[1], dtype=float)

if minlength is None and max_val is not None:
minlength = max_val + 1
bc = np.array([]) if minlength is not None and minlength <= 0 else \
np.bincount(X.astype(np.int32, copy=False),
weights=weights, minlength=minlength).astype(float)

if minlength is not None and minlength <= 0:
bc = np.array([])
else:
bc = np.bincount(
x.astype(np.int32, copy=False), weights=weights, minlength=minlength
).astype(float)
# Since `csr_matrix.values` only contain non-zero values or explicit
# zeros, we must count implicit zeros separately and add them to the
# explicit ones found before
if sp.issparse(x_original):
bc[0] += zero_weights

return bc, nans


def countnans(X, weights=None, axis=None, dtype=None, keepdims=False):
def countnans(x, weights=None, axis=None, dtype=None, keepdims=False):
"""
Count the undefined elements in arr along given axis.
Count the undefined elements in an array along given axis.

Parameters
----------
X : array_like
weights : array_like
x : array_like
weights : array_like, optional
Weights to weight the nans with, before or after counting (depending
on the weights shape).
axis : int, optional
dtype : dtype, optional
The data type of the returned array.

Returns
-------
counts
Union[np.ndarray, float]

"""
if not sp.issparse(X):
X = np.asanyarray(X)
isnan = np.isnan(X)
if weights is not None and weights.shape == X.shape:
if not sp.issparse(x):
x = np.asanyarray(x)
isnan = np.isnan(x)
if weights is not None and weights.shape == x.shape:
isnan = isnan * weights

counts = isnan.sum(axis=axis, dtype=dtype, keepdims=keepdims)
if weights is not None and weights.shape != X.shape:
if weights is not None and weights.shape != x.shape:
counts = counts * weights
else:
if any(attr is not None for attr in [axis, dtype]) or \
keepdims is not False:
raise ValueError('Arguments axis, dtype and keepdims'
'are not yet supported on sparse data!')
assert axis in [None, 0, 1], 'Only axis 0 and 1 are currently supported'
# To have consistent behaviour with dense matrices, raise error when
# `axis=1` and the array is 1d (e.g. [[1 2 3]])
if x.shape[0] == 1 and axis == 1:
raise ValueError('Axis %d is out of bounds' % axis)

arr = x if axis == 1 else x.T

if weights is not None:
weights = weights if axis == 1 else weights.T

arr = arr.tocsr()
counts = _count_nans_per_row_sparse(arr, weights, dtype=dtype)

# We want a scalar value if `axis=None` or if the sparse matrix is
# actually a vector (e.g. [[1 2 3]]), but has `ndim=2` due to scipy
# implementation
if axis is None or x.shape[0] == 1:
counts = counts.sum(dtype=dtype)

counts = _count_nans_per_row_sparse(X, weights)
return counts


Expand Down Expand Up @@ -234,17 +348,12 @@ def weighted_mean():
X.shape[0] - nans))


def _sparse_has_zeros(x):
""" Check if sparse matrix contains any implicit zeros. """
return np.prod(x.shape) != x.nnz


def _nan_min_max(x, func, axis=0):
if not sp.issparse(x):
return func(x, axis=axis)
if axis is None:
extreme = func(x.data, axis=axis) if x.nnz else float('nan')
if _sparse_has_zeros(x):
if sparse_has_implicit_zeros(x):
extreme = func([0, extreme])
return extreme
if axis == 0:
Expand All @@ -257,7 +366,7 @@ def _nan_min_max(x, func, axis=0):
for row in x:
values = row.data
extreme = func(values) if values.size else float('nan')
if _sparse_has_zeros(row):
if sparse_has_implicit_zeros(row):
extreme = func([0, extreme])
r.append(extreme)
return np.array(r)
Expand Down Expand Up @@ -323,7 +432,7 @@ def unique(x, return_counts=False):
if not sp.issparse(x):
return np.unique(x, return_counts=return_counts)

implicit_zeros = np.prod(x.shape) - x.nnz
implicit_zeros = sparse_count_implicit_zeros(x)
explicit_zeros = not np.all(x.data)
r = np.unique(x.data, return_counts=return_counts)
if not implicit_zeros:
Expand Down
Loading