Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Silhouette Plot: Add cosine distance #3176

Merged
merged 8 commits into from
Aug 6, 2018
Merged
5 changes: 3 additions & 2 deletions Orange/distance/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def fit_cols(self, attributes, x, n_vals):

class Cosine(FittedDistance):
supports_sparse = True # via fallback
supports_discrete = True
supports_discrete = False
fallback = SklDistance('cosine')

@staticmethod
Expand Down Expand Up @@ -348,7 +348,8 @@ def prepare_data(x):
dist = safe_sparse_dot(data1, data2.T)
np.clip(dist, 0, 1, out=dist)
if x2 is None:
dist.flat[::dist.shape[0] + 1] = 1.0
diag = np.diag_indices_from(dist)
dist[diag] = np.where(np.isnan(dist[diag]), np.nan, 1.0)
return 1 - dist


Expand Down
26 changes: 20 additions & 6 deletions Orange/widgets/visualize/owsilhouetteplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ class Outputs:
auto_commit = settings.Setting(True)

Distances = [("Euclidean", Orange.distance.Euclidean),
("Manhattan", Orange.distance.Manhattan)]
("Manhattan", Orange.distance.Manhattan),
("Cosine", Orange.distance.Cosine)]

graph_name = "scene"
buttons_area_orientation = Qt.Vertical
Expand All @@ -89,6 +90,8 @@ class Error(widget.OWWidget.Error):
class Warning(widget.OWWidget.Warning):
missing_cluster_assignment = Msg(
"{} instance{s} omitted (missing cluster assignment)")
nan_distances = Msg("{} instance{s} omitted (undefined distances)")
ignoring_categorical = Msg("Ignoring categorical features")

def __init__(self):
super().__init__()
Expand Down Expand Up @@ -251,8 +254,13 @@ def _update(self):

if self._matrix is None and self.data is not None:
_, metric = self.Distances[self.distance_idx]
data = self.data
if not metric.supports_discrete and any(
a.is_discrete for a in data.domain.attributes):
self.Warning.ignoring_categorical()
data = Orange.distance.remove_discrete_features(data)
try:
self._matrix = np.asarray(metric(self.data))
self._matrix = np.asarray(metric(data))
except MemoryError:
self.Error.memory_error()
return
Expand All @@ -271,13 +279,15 @@ def _reset_all(self):

def _clear_messages(self):
self.Error.clear()
self.Warning.missing_cluster_assignment.clear()
self.Warning.clear()

def _update_labels(self):
labelvar = self.cluster_var_model[self.cluster_var_idx]
labels, _ = self.data.get_column_view(labelvar)
labels = np.asarray(labels, dtype=float)
mask = np.isnan(labels)
cluster_mask = np.isnan(labels)
dist_mask = np.isnan(self._matrix).all(axis=0)
mask = cluster_mask | dist_mask
labels = labels.astype(int)
labels = labels[~mask]

Expand All @@ -296,11 +306,15 @@ def _update_labels(self):
self._labels = labels
self._silhouette = silhouette

if labels is not None:
count_missing = np.count_nonzero(mask)
if mask is not None:
count_missing = np.count_nonzero(cluster_mask)
if count_missing:
self.Warning.missing_cluster_assignment(
count_missing, s="s" if count_missing > 1 else "")
count_nandist = np.count_nonzero(dist_mask)
if count_nandist:
self.Warning.nan_distances(
count_nandist, s="s" if count_nandist > 1 else "")

def _set_bar_height(self):
visible = self.bar_size >= 5
Expand Down
56 changes: 42 additions & 14 deletions Orange/widgets/visualize/tests/test_owsilhouetteplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

import numpy as np

import Orange.data
from Orange.data import Table, Domain, ContinuousVariable, DiscreteVariable
from Orange.data import (
Table, Domain, ContinuousVariable, DiscreteVariable, StringVariable)
from Orange.widgets.utils.annotated_data import ANNOTATED_DATA_SIGNAL_NAME
from Orange.widgets.visualize.owsilhouetteplot import OWSilhouettePlot
from Orange.widgets.tests.base import WidgetTest, WidgetOutputsTestMixin
Expand All @@ -21,6 +21,7 @@ def setUpClass(cls):

cls.signal_name = "Data"
cls.signal_data = cls.data
cls.scorename = "Silhouette ({})".format(cls.data.domain.class_var.name)

def setUp(self):
self.widget = self.create_widget(OWSilhouettePlot,
Expand All @@ -36,11 +37,10 @@ def test_outputs_add_scores(self):
self.send_signal(self.widget.Inputs.data, self.data)
self.widget.controls.add_scores.setChecked(1)
selected_indices = self._select_data()
name = "Silhouette ({})".format(self.data.domain.class_var.name)
selected = self.get_output(self.widget.Outputs.selected_data)
annotated = self.get_output(self.widget.Outputs.annotated_data)
self.assertEqual(name, selected.domain.metas[0].name)
self.assertEqual(name, annotated.domain.metas[0].name)
self.assertEqual(self.scorename, selected.domain.metas[0].name)
self.assertEqual(self.scorename, annotated.domain.metas[0].name)
np.testing.assert_array_equal(selected.X, self.data.X[selected_indices])

def _select_data(self):
Expand All @@ -62,33 +62,61 @@ def test_insufficient_clusters(self):

def test_unknowns_in_labels(self):
self.widget.controls.add_scores.setChecked(1)
scorename = "Silhouette (iris)"
data = self.data[[0, 1, 2, 50, 51, 52, 100, 101, 102]]
data.Y[::3] = np.nan
valid = ~np.isnan(data.Y.flatten())
self.send_signal(self.widget.Inputs.data, data)
output = self.get_output(ANNOTATED_DATA_SIGNAL_NAME)
scores = output[:, scorename].metas.flatten()
scores = output[:, self.scorename].metas.flatten()
self.assertTrue(np.all(np.isnan(scores[::3])))
self.assertTrue(np.all(np.isfinite(scores[valid])))

# Run again on subset with known labels
data_1 = data[np.flatnonzero(valid)]
self.send_signal(self.widget.Inputs.data, data_1)
output_1 = self.get_output(ANNOTATED_DATA_SIGNAL_NAME)
scores_1 = output_1[:, scorename].metas.flatten()
scores_1 = output_1[:, self.scorename].metas.flatten()
self.assertTrue(np.all(np.isfinite(scores_1)))
# the scores must match
np.testing.assert_almost_equal(scores_1, scores[valid], decimal=12)

def test_nan_distances(self):
    """Check that rows with undefined cosine distances are flagged.

    Every third row of a small subset is zeroed out before the data is
    sent to the widget with the Cosine metric selected.  The test then
    expects NaN entries in the widget's distance matrix, the
    ``nan_distances`` warning to be shown, and NaN silhouette scores
    for exactly the zeroed rows.
    """
    widget = self.widget
    widget.controls.add_scores.setChecked(1)

    # Index 2 is expected to be the Cosine entry; verify it so the test
    # fails loudly if the order of ``Distances`` ever changes.
    widget.distance_idx = 2
    metric_name, _ = widget.Distances[widget.distance_idx]
    self.assertEqual(metric_name, 'Cosine')

    subset = self.data[[0, 1, 2, 50, 51, 52, 100, 101, 102]]
    subset.X[::3] = 0
    nonzero_rows = np.any(subset.X != 0, axis=1)

    # The warning must not be present before any data arrives.
    self.assertFalse(widget.Warning.nan_distances.is_shown())
    self.send_signal(widget.Inputs.data, subset)
    self.assertTrue(np.isnan(widget._matrix).any())
    self.assertTrue(widget.Warning.nan_distances.is_shown())

    annotated = self.get_output(ANNOTATED_DATA_SIGNAL_NAME)
    scores = annotated[:, self.scorename].metas.flatten()
    # Zeroed rows carry no score; all remaining rows must have one.
    self.assertTrue(np.all(np.isnan(scores[::3])))
    self.assertTrue(np.all(np.isfinite(scores[nonzero_rows])))

def test_ignore_categorical(self):
    """Check the ``ignoring_categorical`` warning for the Cosine metric.

    The warning is expected to appear once ``heart_disease`` is sent
    with Cosine selected (the test relies on that data set containing
    categorical features), the annotated output must keep the full
    input domain, and the warning must vanish after switching to the
    first metric in ``Distances`` and recomputing.
    """
    widget = self.widget
    heart = Table('heart_disease')

    # Index 2 is expected to be the Cosine entry of ``Distances``.
    widget.distance_idx = 2
    metric_name, _ = widget.Distances[widget.distance_idx]
    self.assertEqual(metric_name, 'Cosine')

    self.assertFalse(widget.Warning.ignoring_categorical.is_shown())
    self.send_signal(widget.Inputs.data, heart)
    self.assertTrue(widget.Warning.ignoring_categorical.is_shown())

    # The output domain must have the same length as the input domain.
    annotated = self.get_output(ANNOTATED_DATA_SIGNAL_NAME)
    self.assertEqual(len(annotated.domain), len(heart.domain))

    # Switch to the first metric and recompute; the warning must clear.
    widget.distance_idx = 0
    widget._update()
    self.assertFalse(widget.Warning.ignoring_categorical.is_shown())

def test_meta_object_dtype(self):
# gh-1875: Test on mixed string/discrete metas
data = self.data[::5]
domain = Orange.data.Domain(
data.domain.attributes, [],
[data.domain["iris"],
Orange.data.StringVariable("S")]
)
domain = Domain(data.domain.attributes,
[],
[data.domain["iris"], StringVariable("S")])
data = data.from_table(domain, data)
self.send_signal(self.widget.Inputs.data, data)

Expand All @@ -100,7 +128,7 @@ def test_memory_error(self):
GH-2521
"""
for i, side_effect in enumerate([MemoryError, ValueError]):
data = Orange.data.Table("iris")[::3]
data = Table("iris")[::3]
self.send_signal(self.widget.Inputs.data, data)
self.assertFalse(self.widget.Error.memory_error.is_shown())
self.assertFalse(self.widget.Error.value_error.is_shown())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ The **Silhouette Plot** widget offers a graphical representation of consistency

1. Choose the distance metric. You can choose between:

- `Euclidean <https://en.wikipedia.org/wiki/Euclidean_distance>`_ ("straight line", distance between two points)
- `Euclidean <https://en.wikipedia.org/wiki/Euclidean_distance>`_ ("straight line" distance between two points)
- `Manhattan <https://en.wiktionary.org/wiki/Manhattan_distance>`_ (the sum of absolute differences for all attributes)
- `Cosine <https://en.wikipedia.org/wiki/Cosine_similarity>`_ (1 - cosine of the angle between two vectors)

2. Select the cluster label. You can decide whether to group the instances by cluster or not.
3. Display options:
Expand Down