Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Distances: Optimize PearsonR/SpearmanR #2852

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 116 additions & 12 deletions Orange/distance/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,25 +418,128 @@ def __init__(self, absolute, axis=1, impute=False):
self.absolute = absolute

def compute_distances(self, x1, x2):
if x2 is None:
x2 = x1
rho = self.compute_correlation(x1, x2)
if self.absolute:
return (1. - np.abs(rho)) / 2.
else:
return (1. - rho) / 2.

def compute_correlation(self, x1, x2):
pass
raise NotImplementedError()


class SpearmanModel(CorrelationDistanceModel):
def compute_correlation(self, x1, x2):
rho = stats.spearmanr(x1, x2, axis=self.axis)[0]
if isinstance(rho, np.float):
return np.array([[rho]])
slc = x1.shape[1 - self.axis]
return rho[:slc, slc:]
if x2 is None:
n1 = x1.shape[1 - self.axis]
if n1 == 2:
# Special case to properly fill degenerate self correlations
# (nan, inf on the diagonals)
rho = stats.spearmanr(x1, x1, axis=self.axis)[0]
assert rho.shape == (4, 4)
rho = rho[:2, :2].copy()
else:
# scalar if n1 == 1
rho = stats.spearmanr(x1, axis=self.axis)[0]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these two cases (if, else) necessary? At first glance stats.spearmanr seems to (efficiently) handle the case of a missing second attribute.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Never mind.

return np.atleast_2d(rho)
else:
return _spearmanr2(x1, x2, axis=self.axis)


def _spearmanr2(a, b, axis=0):
"""
Compute all pairwise spearman rank moment correlations between rows
or columns of a and b

Parameters
----------
a : (N, M) numpy.ndarray
The input cases a.
b : (J, K) numpy.ndarray
The input cases b. In case of axis == 0: J must equal N;
otherwise if axis == 1 then K must equal M.
axis : int
If 0 the correlation are computed between a and b's columns.
Otherwise if 1 the correlations are computed between rows.

Returns
-------
cor : (N, J) or (M, K) nd.array
If axis == 0 then (N, J) matrix of correlations between a x b columns
else a (N, J) matrix of correlations between a x b rows.

See Also
--------
scipy.stats.spearmanr
"""
a, b = np.atleast_2d(a, b)
assert a.shape[axis] == b.shape[axis]
ar = np.apply_along_axis(stats.rankdata, axis, a)
br = np.apply_along_axis(stats.rankdata, axis, b)

return _corrcoef2(ar, br, axis=axis)


def _corrcoef2(a, b, axis=0):
"""
Compute all pairwise Pearson product-moment correlation coefficients
between rows or columns of a and b

Parameters
----------
a : (N, M) numpy.ndarray
The input cases a.
b : (J, K) numpy.ndarray
The input cases b. In case of axis == 0: J must equal N;
otherwise if axis == 1 then K must equal M.
axis : int
If 0 the correlation are computed between a and b's columns.
Otherwise if 1 the correlations are computed between rows.

Returns
-------
cor : (N, J) or (M, K) nd.array
If axis == 0 then (N, J) matrix of correlations between a x b columns
else a (N, J) matrix of correlations between a x b rows.

See Also
--------
numpy.corrcoef
"""
a, b = np.atleast_2d(a, b)
if not (axis == 0 or axis == 1):
raise ValueError("Invalid axis {} (only 0 or 1 accepted)".format(axis))

mean_a = np.mean(a, axis=axis, keepdims=True)
mean_b = np.mean(b, axis=axis, keepdims=True)
assert a.shape[axis] == b.shape[axis]

n = a.shape[1 - axis]
m = b.shape[1 - axis]

a = a - mean_a
b = b - mean_b

if axis == 0:
C = a.T.dot(b)
assert C.shape == (n, m)
elif axis == 1:
C = a.dot(b.T)
assert C.shape == (n, m)

ss_a = np.sum(a ** 2, axis=axis, keepdims=True)
ss_b = np.sum(b ** 2, axis=axis, keepdims=True)

if axis == 0:
ss_a = ss_a.T
else:
ss_b = ss_b.T

assert ss_a.shape == (n, 1)
assert ss_b.shape == (1, m)
C /= np.sqrt(ss_a)
C /= np.sqrt(ss_b)
return C


class CorrelationDistance(Distance):
Expand All @@ -455,10 +558,11 @@ def fit(self, _):

class PearsonModel(CorrelationDistanceModel):
def compute_correlation(self, x1, x2):
if self.axis == 0:
x1 = x1.T
x2 = x2.T
return np.array([[stats.pearsonr(i, j)[0] for j in x2] for i in x1])
if x2 is None:
c = np.corrcoef(x1, rowvar=self.axis == 1)
return np.atleast_2d(c)
else:
return _corrcoef2(x1, x2, axis=self.axis)


class PearsonR(CorrelationDistance):
Expand Down
53 changes: 53 additions & 0 deletions Orange/tests/test_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@

import numpy as np
import scipy
import scipy.spatial
import scipy.stats
from scipy.sparse import csr_matrix

from Orange.data import (Table, Domain, ContinuousVariable,
DiscreteVariable, StringVariable, Instance)
from Orange.distance import (Euclidean, SpearmanR, SpearmanRAbsolute,
PearsonR, PearsonRAbsolute, Manhattan, Cosine,
Jaccard, _preprocess, MahalanobisDistance)
from Orange.distance.distance import _spearmanr2, _corrcoef2
from Orange.misc import DistMatrix
from Orange.tests import named_file, test_filename
from Orange.util import OrangeDeprecationWarning
Expand Down Expand Up @@ -598,6 +601,30 @@ def test_spearmanr_distance_numpy(self):
[0.3833333],
[0.]]))

def test_spearmanr2(self):
# Test that _spearnmanr2 returns the same result that stats.spearmanr
# would
n, m = tuple(np.random.randint(2, 5, size=2))
mean = np.random.uniform(-1, 1, size=m)
cov = np.random.uniform(0, 1./m, size=(m, m))
cov = (cov + cov.T) / 2
cov.flat[::m + 1] = 1.0
X1 = np.random.multivariate_normal(mean, cov, size=n)
X2 = np.random.multivariate_normal(mean, cov, size=n)
expected = scipy.stats.spearmanr(X1, X2, axis=1)[0][:n, n:]
np.testing.assert_almost_equal(
_spearmanr2(X1, X2, axis=1),
expected,
decimal=9
)

expected = scipy.stats.spearmanr(X1, X2, axis=0)[0][:m, m:]
np.testing.assert_almost_equal(
_spearmanr2(X1, X2, axis=0),
expected,
decimal=9,
)


# noinspection PyTypeChecker
class TestSpearmanRAbsolute(TestCase):
Expand Down Expand Up @@ -752,6 +779,32 @@ def test_pearsonr_distance_numpy(self):
[0.32783865],
[0.]]))

def test_corrcoef2(self):
# Test that _corrcoef2 returns the same result that np.corrcoef would
n, m = tuple(np.random.randint(2, 5, size=2))
mean = np.random.uniform(-1, 1, size=m)
cov = np.random.uniform(0, 1./m, size=(m, m))
cov = (cov + cov.T) / 2
cov.flat[::m + 1] = 1.0
X1 = np.random.multivariate_normal(mean, cov, size=n)
X2 = np.random.multivariate_normal(mean, cov, size=n)
expected = np.corrcoef(X1, X2, rowvar=True)[:n, n:]
np.testing.assert_almost_equal(
_corrcoef2(X1, X2, axis=1),
expected,
decimal=9
)

expected = np.corrcoef(X1, X2, rowvar=False)[:m, m:]
np.testing.assert_almost_equal(
_corrcoef2(X1, X2, axis=0),
expected,
decimal=9,
)

with self.assertRaises(ValueError):
_corrcoef2(X1, X2, axis=10)


# noinspection PyTypeChecker
class TestPearsonRAbsolute(TestCase):
Expand Down