Skip to content

Commit

Permalink
distance: Implement Euclidean distances
Browse files Browse the repository at this point in the history
  • Loading branch information
janezd committed Jul 7, 2017
1 parent 942877a commit b53b509
Show file tree
Hide file tree
Showing 9 changed files with 27,688 additions and 62 deletions.
267 changes: 215 additions & 52 deletions Orange/distance/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
from Orange import data
from Orange.misc import DistMatrix
from Orange.preprocess import SklImpute
from Orange.distance import _distance
from Orange.statistics import util

__all__ = ['Euclidean', 'Manhattan', 'Cosine', 'Jaccard', 'SpearmanR',
# Public names exported by the module. Every entry must match the name of
# an object defined in this module.
__all__ = ['Euclidean', 'Manhattan', 'Cosine', 'Jaccard', 'SpearmanR',
           'SpearmanRAbsolute', 'PearsonR', 'PearsonRAbsolute', 'Mahalanobis',
           'MahalanobisDistance']

Expand All @@ -15,13 +17,7 @@ def _preprocess(table):
"""Remove categorical attributes and impute missing values."""
if not len(table):
return table
new_domain = data.Domain(
[a for a in table.domain.attributes if a.is_continuous],
table.domain.class_vars,
table.domain.metas)
new_data = table.transform(new_domain)
new_data = SklImpute()(new_data)
return new_data
return SklImpute()(table)


def _orange_to_numpy(x):
Expand All @@ -39,63 +35,230 @@ def _orange_to_numpy(x):


class Distance:
def __call__(self, e1, e2=None, axis=1, impute=False):
"""
:param e1: input data instances, we calculate distances between all
pairs
:type e1: :class:`Orange.data.Table` or
:class:`Orange.data.RowInstance` or :class:`numpy.ndarray`
:param e2: optional second argument for data instances if provided,
distances between each pair, where first item is from e1 and
second is from e2, are calculated
:type e2: :class:`Orange.data.Table` or
:class:`Orange.data.RowInstance` or :class:`numpy.ndarray`
:param axis: if axis=1 we calculate distances between rows, if axis=0
we calculate distances between columns
:type axis: int
:param impute: if impute=True all NaN values in matrix are replaced
with 0
:type impute: bool
:return: the matrix with distances between given examples
:rtype: :class:`Orange.misc.distmatrix.DistMatrix`
"""
raise NotImplementedError(
'Distance is an abstract class and should not be used directly.')
def __new__(cls, e1=None, e2=None, axis=1, **kwargs):
self = super().__new__(cls)
self.axis = axis
# Ugly, but needed for backwards compatibility hack below, to allow
# setting parameters like 'normalize'
self.__dict__.update(**kwargs)
if e1 is None:
return self

# Backwards compatibility with SKL-based instances
model = self.fit(e1)
return model(e1, e2)

def fit(self, e1):
pass


class DistanceModel:
def __init__(self, axis, impute=False):
self.axis = axis
self.impute = impute

class SklDistance(Distance):
"""Generic scikit-learn distance."""
def __init__(self, metric, name, supports_sparse):
def __call__(self, e1, e2=None):
"""
If e2 is omitted, calculate distances between all rows (axis=1) or
columns (axis=0) of e1. If e2 is present, calculate distances between
all pairs of rows from e1 and e2.
Args:
metric: The metric to be used for distance calculation
name (str): Name of the distance
supports_sparse (boolean): Whether this metric works on sparse data
or not.
e1 (Orange.data.Table or Orange.data.RowInstance or numpy.ndarray):
input data
e2 (Orange.data.Table or Orange.data.RowInstance or numpy.ndarray):
secondary data
Returns:
A distance matrix (Orange.misc.distmatrix.DistMatrix)
"""
self.metric = metric
self.name = name
self.supports_sparse = supports_sparse
if self.axis == 0 and e2 is not None:
raise ValueError("Two tables cannot be compared by columns")

def __call__(self, e1, e2=None, axis=1, impute=False):
x1 = _orange_to_numpy(e1)
x2 = _orange_to_numpy(e2)
if axis == 0:
x1 = x1.T
if x2 is not None:
x2 = x2.T
dist = skl_metrics.pairwise.pairwise_distances(
x1, x2, metric=self.metric)
dist = self.compute_distances(x1, x2)
if isinstance(e1, data.Table) or isinstance(e1, data.RowInstance):
dist = DistMatrix(dist, e1, e2, axis)
dist = DistMatrix(dist, e1, e2, self.axis)
else:
dist = DistMatrix(dist)
return dist

Euclidean = SklDistance('euclidean', 'Euclidean', True)
Manhattan = SklDistance('manhattan', 'Manhattan', True)
Cosine = SklDistance('cosine', 'Cosine', True)
Jaccard = SklDistance('jaccard', 'Jaccard', False)
def compute_distances(self, x1, x2):
pass


class FittedDistanceModel(DistanceModel):
    """Distance model tied to a fixed attribute set and fitted parameters.

    The class itself does not define `distance_by_cols` and
    `distance_by_rows`; `compute_distances` looks them up on the instance,
    so concrete subclasses have to provide them. Both are called with the
    data array(s) followed by `fit_params`.
    """

    def __init__(self, attributes, axis, impute=False, fit_params=None):
        """Remember the attributes the model was fitted on and its parameters.

        Args:
            attributes: attributes of the domain used for fitting
            axis: passed on to the base class
            impute: passed on to the base class
            fit_params: parameters handed to the distance functions
        """
        super().__init__(axis, impute)
        self.fit_params = fit_params
        self.attributes = attributes

    def __call__(self, e1, e2=None):
        """Verify that the inputs match the fitted attributes and delegate.

        Args:
            e1: primary data; must expose `domain.attributes`
            e2: optional secondary data; checked only when given

        Returns:
            Whatever the base class `__call__` returns.

        Raises:
            ValueError: if the attributes of `e1` (or of `e2`, when given)
                differ from the attributes stored in the model.
        """
        expected = self.attributes
        mismatch = e1.domain.attributes != expected
        if not mismatch and e2 is not None:
            mismatch = e2.domain.attributes != expected
        if mismatch:
            raise ValueError("mismatching domains")
        return super().__call__(e1, e2)

    def compute_distances(self, x1, x2=None):
        """Dispatch to the column- or row-wise distance function.

        With `axis == 0` only `x1` is used; otherwise `x1` is compared with
        `x2`, or with itself when `x2` is None.
        """
        if self.axis != 0:
            other = x1 if x2 is None else x2
            return self.distance_by_rows(x1, other, self.fit_params)
        return self.distance_by_cols(x1, self.fit_params)


class FittedDistance(Distance):
    """Distance that derives its model from statistics fitted on the data.

    Subclasses set `ModelType` and provide `fit_rows` and `fit_cols`; both
    are called with the data array and the per-attribute value counts.
    """
    ModelType = None  #: Option[FittedDistanceModel]

    def fit(self, data):
        """Fit the distance on `data` and return a `ModelType` instance.

        Args:
            data: a table exposing `domain.attributes`

        Returns:
            `ModelType` constructed with the attributes, `self.axis` and the
            parameters returned by `fit_cols` or `fit_rows`.
        """
        attributes = data.domain.attributes
        x = _orange_to_numpy(data)
        # Number of values for discrete attributes, 0 for all others
        value_counts = (
            len(attr.values) if attr.is_discrete else 0 for attr in attributes)
        n_vals = np.fromiter(
            value_counts, dtype=np.int32, count=len(attributes))
        # Indexed by axis: 0 selects the column-wise fit, 1 the row-wise fit
        fitters = [self.fit_cols, self.fit_rows]
        fit_params = fitters[self.axis](x, n_vals)
        # pylint: disable=not-callable
        return self.ModelType(attributes, axis=self.axis, fit_params=fit_params)


class EuclideanModel(FittedDistanceModel):
    """Fitted model that delegates to the Euclidean routines in `_distance`."""
    distance_by_rows = _distance.euclidean_rows
    distance_by_cols = _distance.euclidean_cols
    supports_sparse = False
    name = "Euclidean"


class Euclidean(FittedDistance):
    """Euclidean distance with optional normalization.

    The `normalize` flag is passed as a keyword argument to the constructor
    and defaults to False.
    """
    ModelType = EuclideanModel
    # Exposed on the distance class, like the other distances in this module
    name = "Euclidean"

    def __new__(cls, *args, **kwargs):
        """Create the instance, defaulting `normalize` to False."""
        kwargs.setdefault("normalize", False)
        return super().__new__(cls, *args, **kwargs)

    def fit_rows(self, x, n_vals):
        """Compute per-column statistics used for distances between rows.

        Args:
            x: 2d array of data; columns are indexed as `x[:, col]`
            n_vals: for each column, the number of values of a discrete
                attribute, or 0 for any other attribute

        Returns:
            A dictionary with the keys

            - `means`: mean of each numeric column; 0 for discrete columns
              and for columns without any known value
            - `vars`: variance of each numeric column; -1 marks a discrete
              column, -2 a column that is constant or has no known values
            - `dist_missing`: for discrete columns, one minus the relative
              frequency of each value (presumably the distance between that
              value and a missing one -- confirm against `_distance`)
            - `dist_missing2`: for discrete columns, one minus the sum of
              squared relative frequencies; for numeric columns, 1 if
              normalizing and twice the variance otherwise (presumably the
              distance between two missing values -- confirm against
              `_distance`)
            - `normalize`: the `normalize` flag as an int
        """
        n_cols = len(n_vals)
        # `default` keeps this working for data without any attributes
        n_bins = max(n_vals, default=0)
        means = np.zeros(n_cols, dtype=float)
        variances = np.empty(n_cols, dtype=float)
        dist_missing = np.zeros((n_cols, n_bins), dtype=float)
        dist_missing2 = np.zeros(n_cols, dtype=float)

        for col in range(n_cols):
            column = x[:, col]
            if n_vals[col]:
                variances[col] = -1
                dist_missing[col] = util.bincount(column, minlength=n_bins)[0]
                # max(1, ...) guards against division by zero when the
                # column has no known values
                dist_missing[col] /= max(1, sum(dist_missing[col]))
                dist_missing2[col] = 1 - np.sum(dist_missing[col] ** 2)
                dist_missing[col] = 1 - dist_missing[col]
            elif np.isnan(column).all():  # avoid warnings in nanmean and nanvar
                variances[col] = -2
            else:
                means[col] = util.nanmean(column)
                variances[col] = util.nanvar(column)
                # Keep the actual variance: the -2 marker stored below must
                # not leak into dist_missing2
                variance = variances[col]
                if variance == 0:
                    variances[col] = -2
                if self.normalize:
                    dist_missing2[col] = 1
                else:
                    dist_missing2[col] = 2 * variance
                    if np.isnan(dist_missing2[col]):
                        dist_missing2[col] = 0

        return dict(means=means, vars=variances,
                    dist_missing=dist_missing, dist_missing2=dist_missing2,
                    normalize=int(self.normalize))

    def fit_cols(self, x, n_vals):
        """Compute per-column statistics used for distances between columns.

        Args:
            x: 2d array of data
            n_vals: for each column, the number of values of a discrete
                attribute, or 0 for any other attribute

        Returns:
            A dictionary with the keys `means`, `vars` (per-column mean and
            variance, ignoring NaNs) and `normalize` (the flag as an int).

        Raises:
            ValueError: if any column is discrete, constant or has no known
                values.
        """
        if any(n_vals):
            raise ValueError(
                "columns with discrete values are not commensurate")
        means = np.nanmean(x, axis=0)
        variances = np.nanvar(x, axis=0)
        if np.isnan(variances).any() or not variances.all():
            raise ValueError("some columns are constant or have no values")
        return dict(means=means, vars=variances,
                    normalize=int(self.normalize))


class ManhattanModel(FittedDistanceModel):
    """Fitted model that delegates to the Manhattan routines in `_distance`."""
    distance_by_rows = _distance.manhattan_rows
    distance_by_cols = _distance.manhattan_cols
    supports_sparse = False


class Manhattan(FittedDistance):
    """Manhattan distance with optional normalization.

    The `normalize` flag is passed as a keyword argument to the constructor
    and defaults to False.
    """
    ModelType = ManhattanModel
    name = "Manhattan"

    def __new__(cls, *args, **kwargs):
        """Create the instance, defaulting `normalize` to False."""
        kwargs.setdefault("normalize", False)
        return super().__new__(cls, *args, **kwargs)

    def fit_rows(self, x, n_vals):
        """Compute per-column statistics used for distances between rows.

        Args:
            x: 2d array of data; columns are indexed as `x[:, col]`
            n_vals: for each column, the number of values of a discrete
                attribute, or 0 for any other attribute

        Returns:
            A dictionary with the keys

            - `medians`: median of each numeric column; 0 for discrete
              columns and for columns without any known value
            - `mads`: median absolute deviation from the median of each
              numeric column; -1 marks a discrete column, -2 a column with
              zero deviation or without known values
            - `dist_missing`: for discrete columns, one minus the relative
              frequency of each value (presumably the distance between that
              value and a missing one -- confirm against `_distance`)
            - `dist_missing2`: for discrete columns, one minus the sum of
              squared relative frequencies; for numeric columns, 1 if
              normalizing and twice the deviation otherwise (presumably the
              distance between two missing values -- confirm against
              `_distance`)
            - `normalize`: the `normalize` flag as an int
        """
        n_cols = len(n_vals)
        # `default` keeps this working for data without any attributes
        n_bins = max(n_vals, default=0)

        medians = np.zeros(n_cols)
        mads = np.zeros(n_cols)
        dist_missing = np.zeros((n_cols, n_bins))
        dist_missing2 = np.zeros(n_cols)
        for col in range(n_cols):
            column = x[:, col]
            if n_vals[col]:
                mads[col] = -1
                dist_missing[col] = util.bincount(column, minlength=n_bins)[0]
                # max(1, ...) guards against division by zero when the
                # column has no known values
                dist_missing[col] /= max(1, sum(dist_missing[col]))
                dist_missing2[col] = 1 - np.sum(dist_missing[col] ** 2)
                dist_missing[col] = 1 - dist_missing[col]
            elif np.isnan(column).all():  # avoid warnings in nanmedian
                mads[col] = -2
            else:
                medians[col] = np.nanmedian(column)
                mads[col] = np.nanmedian(np.abs(column - medians[col]))
                # Keep the actual deviation: the -2 marker stored below must
                # not leak into dist_missing2
                mad = mads[col]
                if mad == 0:
                    mads[col] = -2
                if self.normalize:
                    dist_missing2[col] = 1
                else:
                    dist_missing2[col] = 2 * mad
        return dict(medians=medians, mads=mads,
                    dist_missing=dist_missing, dist_missing2=dist_missing2,
                    normalize=int(self.normalize))

    def fit_cols(self, x, n_vals):
        """Compute per-column statistics used for distances between columns.

        Args:
            x: 2d array of data
            n_vals: for each column, the number of values of a discrete
                attribute, or 0 for any other attribute

        Returns:
            A dictionary with the keys `medians`, `mads` (per-column median
            and median absolute deviation, ignoring NaNs) and `normalize`
            (the flag as an int).

        Raises:
            ValueError: if any column is discrete, has zero absolute
                deviation from the median or has no known values.
        """
        if any(n_vals):
            raise ValueError(
                "columns with discrete values are not commensurate")
        medians = np.nanmedian(x, axis=0)
        mads = np.nanmedian(np.abs(x - medians), axis=0)
        if np.isnan(mads).any() or not mads.all():
            raise ValueError(
                "some columns have zero absolute distance from median, "
                "or no values")
        return dict(medians=medians, mads=mads, normalize=int(self.normalize))


class JaccardModel(FittedDistanceModel):
    """Fitted model that delegates to the Jaccard routines in `_distance`."""
    distance_by_rows = _distance.jaccard_rows
    distance_by_cols = _distance.jaccard_cols
    supports_sparse = False


class Jaccard(FittedDistance):
    """Jaccard distance; rows and columns are fitted by the same routine."""
    name = "Jaccard"
    ModelType = JaccardModel
    # One function from `_distance` serves both fitting directions
    fit_cols = _distance.fit_jaccard
    fit_rows = fit_cols


class CosineModel(EuclideanModel):
    """Model that derives its distances from the Euclidean model's output."""
    # Without this override the model would report the inherited "Euclidean"
    name = "Cosine"

    def compute_distances(self, x1, x2=None):
        """Transform the distances computed by the Euclidean model.

        Returns `1 - cos(1 - d)`, where `d` is the result of the inherited
        `compute_distances`.
        """
        # NOTE(review): this applies a cosine to (1 - Euclidean distance);
        # confirm that this is the intended definition of cosine distance.
        return 1 - np.cos(1 - super().compute_distances(x1, x2))


class Cosine(Euclidean):
    """Cosine distance; fitted like `Euclidean` but producing `CosineModel`."""
    name = "Cosine"
    ModelType = CosineModel


class SpearmanDistance(Distance):
Expand Down
Loading

0 comments on commit b53b509

Please sign in to comment.