Skip to content

Commit

Permalink
Jaccard distance: Move from a fallback to its own class
Browse files Browse the repository at this point in the history
  • Loading branch information
janezd committed Mar 12, 2019
1 parent 551effc commit fd1fd19
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 81 deletions.
49 changes: 0 additions & 49 deletions Orange/distance/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import numpy as np
import sklearn.metrics as skl_metrics
from scipy.sparse import issparse, csr_matrix

from Orange.data import Table, Domain, Instance, RowInstance
from Orange.misc import DistMatrix
Expand Down Expand Up @@ -507,51 +506,3 @@ def __call__(self, e1, e2=None, axis=1, impute=False):
else:
dist_matrix = DistMatrix(dist)
return dist_matrix

class SparseJaccard:
"""
Fallback for `Jaccard` on sparse data or raw numpy arrays. If data is
sparse, data normalized with intersection/union. Sklearn's function can't
handle discrete or missing data and normalization.
"""

def __call__(self, e1, e2=None, axis=1, impute=False):
x1 = _orange_to_numpy(e1)
x2 = _orange_to_numpy(e2)
if axis == 0:
x1 = x1.T
if x2 is not None:
x2 = x2.T
if issparse(x1):
dist = self.sparse_jaccard(x1, x2)
else:
dist = skl_metrics.pairwise.pairwise_distances(x1,
x2,
metric="jaccard")
if impute and np.isnan(dist).any():
dist = np.nan_to_num(dist)
if isinstance(e1, (Table, RowInstance)):
dist_matrix = DistMatrix(dist, e1, e2, axis)
else:
dist_matrix = DistMatrix(dist)
return dist_matrix

def sparse_jaccard(self, x1, x2=None):
symmetric = x2 is None
if symmetric:
x2 = x1
x1 = csr_matrix(x1)
x1.eliminate_zeros()
x2 = csr_matrix(x2)
x2.eliminate_zeros()
n, m = x1.shape[0], x2.shape[0]
matrix = np.zeros((n, m))
for i in range(n):
xi_ind = set(x1[i].indices)
for j in range(i if symmetric else m):
jacc = 1 - len(xi_ind.intersection(x2[j].indices))\
/ len(set(x1[i].indices).union(x1[j].indices))
matrix[i, j] = jacc
if symmetric:
matrix[j, i] = jacc
return matrix
36 changes: 31 additions & 5 deletions Orange/distance/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
from scipy import stats
from scipy import sparse as sp
import sklearn.metrics as skl_metrics
from sklearn.utils.extmath import row_norms, safe_sparse_dot
from sklearn.metrics import pairwise_distances
Expand All @@ -11,7 +12,7 @@
from Orange.statistics import util

from .base import (Distance, DistanceModel, FittedDistance, FittedDistanceModel,
SklDistance, _orange_to_numpy, SparseJaccard)
SklDistance, _orange_to_numpy)

class EuclideanRowsModel(FittedDistanceModel):
"""
Expand Down Expand Up @@ -395,6 +396,9 @@ def compute_distances(self, x1, x2):
compute distances between rows without missing values, and a slower
loop for those with missing values.
"""
if sp.issparse(x1):
return self.sparse_jaccard(x1, x2)

nonzeros1 = np.not_equal(x1, 0).view(np.int8)
if self.axis == 1:
nans1 = _distance.any_nan_row(x1)
Expand All @@ -414,21 +418,43 @@ def compute_distances(self, x1, x2):
return _distance.jaccard_cols(
nonzeros1, x1, nans1, self.ps)

def sparse_jaccard(self, x1, x2=None):
symmetric = x2 is None
if symmetric:
x2 = x1
x1 = sp.csr_matrix(x1)
x1.eliminate_zeros()
x2 = sp.csr_matrix(x2)
x2.eliminate_zeros()
n, m = x1.shape[0], x2.shape[0]
matrix = np.zeros((n, m))
for i in range(n):
xi_ind = set(x1[i].indices)
for j in range(i if symmetric else m):
jacc = 1 - len(xi_ind.intersection(x2[j].indices))\
/ len(set(x1[i].indices).union(x1[j].indices))
matrix[i, j] = jacc
if symmetric:
matrix[j, i] = jacc
return matrix


class Jaccard(FittedDistance):
supports_sparse = True
supports_discrete = True
fallback = SparseJaccard()
ModelType = JaccardModel

def fit_rows(self, attributes, x, n_vals):
"""
Return a model for computation of Jaccard values. The model stores
frequencies of non-zero values per each column.
"""
ps = np.fromiter(
(_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
dtype=np.double, count=len(n_vals))
if sp.issparse(x):
ps = None # wrong!
else:
ps = np.fromiter(
(_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
dtype=np.double, count=len(n_vals))
return JaccardModel(attributes, self.axis, self.impute, ps)

fit_cols = fit_rows
Expand Down
12 changes: 7 additions & 5 deletions Orange/distance/tests/test_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,13 @@ def setUp(self):
self.mixed_data = self.data = Table.from_numpy(
self.domain, np.hstack((self.cont_data.X[:3], self.disc_data.X)))

self.dense_X = np.array([[1, 0, 2],
[-1, 5, 0],
[0, 1, 1],
[7, 0, 0]])
self.sparse_data = Table(csr_matrix(self.dense_X))
self.dense_X = Table.from_list(
self.cont_domain,
np.array([[1, 0, 2],
[-1, 5, 0],
[0, 1, 1],
[7, 0, 0]]))
self.sparse_data = Table(self.cont_domain, csr_matrix(self.dense_X))



Expand Down
42 changes: 20 additions & 22 deletions Orange/tests/test_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import scipy.spatial
import scipy.stats
from scipy.sparse import csr_matrix
from sklearn.exceptions import DataConversionWarning

from Orange.data import (Table, Domain, ContinuousVariable,
DiscreteVariable, StringVariable, Instance)
Expand Down Expand Up @@ -500,27 +499,26 @@ def test_jaccard_distance_many_examples(self):
[0., 0., 0.5]]))

def test_jaccard_distance_numpy(self):
with self.assertWarns(DataConversionWarning):
np.testing.assert_almost_equal(
self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
np.array([[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic.X),
np.array([[0., 0., 0.5, 0.5],
[0., 0., 0.5, 0.5],
[0.5, 0.5, 0., 0.],
[0.5, 0.5, 0., 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[2].x, self.titanic[:3].X),
np.array([[0.5, 0.5, 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[3].x),
np.array([[0.5],
[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[:3].X),
np.array([[0., 0., 0.5],
[0., 0., 0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
np.array([[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic.X),
np.array([[0., 0., 0.5, 0.5],
[0., 0., 0.5, 0.5],
[0.5, 0.5, 0., 0.],
[0.5, 0.5, 0., 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[2].x, self.titanic[:3].X),
np.array([[0.5, 0.5, 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[3].x),
np.array([[0.5],
[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[:3].X),
np.array([[0., 0., 0.5],
[0., 0., 0.5]]))


# noinspection PyTypeChecker
Expand Down

0 comments on commit fd1fd19

Please sign in to comment.