From c84a13d7c9336391ef10d854f8ba643ecfdf24e3 Mon Sep 17 00:00:00 2001 From: Ales Erjavec Date: Fri, 11 Nov 2016 11:47:51 +0100 Subject: [PATCH 1/2] owsilhouetteplot: Check number of labels --- Orange/widgets/visualize/owsilhouetteplot.py | 22 +++++++++++++------ .../visualize/tests/test_owsilhouetteplot.py | 12 ++++++++++ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/Orange/widgets/visualize/owsilhouetteplot.py b/Orange/widgets/visualize/owsilhouetteplot.py index 31b2c667856..a5327146642 100644 --- a/Orange/widgets/visualize/owsilhouetteplot.py +++ b/Orange/widgets/visualize/owsilhouetteplot.py @@ -73,6 +73,7 @@ class OWSilhouettePlot(widget.OWWidget): class Error(widget.OWWidget.Error): need_two_clusters = Msg("Need at least two non-empty clusters") + singleton_clusters_all = Msg("All clusters are singletons") def __init__(self): super().__init__() @@ -204,6 +205,7 @@ def clear(self): self.cluster_var_model[:] = [] self.annotation_var_model[:] = ["None"] self._clear_scene() + self.Error.clear() def _clear_scene(self): # Clear the graphics scene and associated objects @@ -239,15 +241,21 @@ def _update(self): labelvar = self.cluster_var_model[self.cluster_var_idx] labels, _ = self.data.get_column_view(labelvar) - labels = labels.astype(int) - _, counts = numpy.unique(labels, return_counts=True) - if numpy.count_nonzero(counts) >= 2: - self.Error.need_two_clusters.clear() - silhouette = sklearn.metrics.silhouette_samples( - self._matrix, labels, metric="precomputed") - else: + + labels_unq, counts = numpy.unique(labels, return_counts=True) + + self.Error.singleton_clusters_all.clear() + self.Error.need_two_clusters.clear() + + if len(labels_unq) < 2: self.Error.need_two_clusters() labels = silhouette = None + elif len(labels_unq) == len(labels): + self.Error.singleton_clusters_all() + labels = silhouette = None + else: + silhouette = sklearn.metrics.silhouette_samples( + self._matrix, labels, metric="precomputed") self._labels = labels 
self._silhouette = silhouette diff --git a/Orange/widgets/visualize/tests/test_owsilhouetteplot.py b/Orange/widgets/visualize/tests/test_owsilhouetteplot.py index 9fe774afc10..cdddce18666 100644 --- a/Orange/widgets/visualize/tests/test_owsilhouetteplot.py +++ b/Orange/widgets/visualize/tests/test_owsilhouetteplot.py @@ -21,6 +21,7 @@ def setUpClass(cls): def setUp(self): self.widget = self.create_widget(OWSilhouettePlot, stored_settings={"auto_commit": True}) + self.widget = self.widget # type: OWSilhouettePlot def test_outputs_add_scores(self): # check output when appending scores @@ -39,3 +40,14 @@ def _select_data(self): points = random.sample(range(0, len(self.data)), 20) self.widget._silplot.setSelection(points) return sorted(points) + + def test_insufficient_clusters(self): + iris = self.data + data_one_cluster = iris[:3] # three instances Iris-setosa only + self.send_signal("Data", data_one_cluster) + self.assertTrue(self.widget.Error.need_two_clusters.is_shown()) + + data_singletons = iris[[0, 50, 100]] + assert len(np.unique(data_singletons.Y)) == 3 # 3 instances 3 labels + self.send_signal("Data", data_singletons) + self.assertTrue(self.widget.Error.singleton_clusters_all.is_shown()) From c7efeed824c5cad74395cc01469550c792bd4821 Mon Sep 17 00:00:00 2001 From: Ales Erjavec Date: Fri, 11 Nov 2016 16:25:10 +0100 Subject: [PATCH 2/2] owsilhouetteplot: Handle missing label/cluster values --- Orange/widgets/visualize/owsilhouetteplot.py | 74 +++++++++++++++---- .../visualize/tests/test_owsilhouetteplot.py | 21 ++++++ 2 files changed, 80 insertions(+), 15 deletions(-) diff --git a/Orange/widgets/visualize/owsilhouetteplot.py b/Orange/widgets/visualize/owsilhouetteplot.py index a5327146642..aedb7668562 100644 --- a/Orange/widgets/visualize/owsilhouetteplot.py +++ b/Orange/widgets/visualize/owsilhouetteplot.py @@ -5,6 +5,9 @@ from xml.sax.saxutils import escape from types import SimpleNamespace as namespace +if sys.version_info > (3, 5): + from typing import 
Optional + import numpy import sklearn.metrics @@ -57,7 +60,7 @@ class OWSilhouettePlot(widget.OWWidget): cluster_var_idx = settings.ContextSetting(0) #: Annotation variable index annotation_var_idx = settings.ContextSetting(0) - #: Group the silhouettes by cluster + #: Group the (displayed) silhouettes by cluster group_by_cluster = settings.Setting(True) #: A fixed size for an instance bar bar_size = settings.Setting(3) @@ -75,15 +78,28 @@ class Error(widget.OWWidget.Error): need_two_clusters = Msg("Need at least two non-empty clusters") singleton_clusters_all = Msg("All clusters are singletons") + class Warning(widget.OWWidget.Warning): + missing_cluster_assignment = Msg( + "{} instance{s} omitted (missing cluster assignment)") + def __init__(self): super().__init__() - - self.data = None - self._effective_data = None - self._matrix = None - self._silhouette = None - self._labels = None - self._silplot = None + #: The input data + self.data = None # type: Optional[Orange.data.Table] + #: Data after any applied pre-processing step + self._effective_data = None # type: Optional[Orange.data.Table] + #: Distance matrix computed from _effective_data + self._matrix = None # type: Optional[Orange.misc.DistMatrix] + #: A boolean mask (size == len(data)) indicating missing group/cluster + #: assignments + self._mask = None # type: Optional[numpy.ndarray] + #: An array of cluster/group labels for instances with valid group + #: assignment + self._labels = None # type: Optional[numpy.ndarray] + #: An array of silhouette scores for instances with valid group + #: assignment + self._silhouette = None # type: Optional[numpy.ndarray] + self._silplot = None # type: Optional[SilhouettePlot] gui.comboBox( self.controlArea, self, "distance_idx", box="Distance", @@ -200,12 +216,14 @@ def clear(self): self.data = None self._effective_data = None self._matrix = None + self._mask = None self._silhouette = None self._labels = None self.cluster_var_model[:] = [] 
self.annotation_var_model[:] = ["None"] self._clear_scene() self.Error.clear() + self.Warning.clear() def _clear_scene(self): # Clear the graphics scene and associated objects @@ -220,7 +238,7 @@ def _invalidate_distances(self): def _invalidate_scores(self): # Invalidate and recompute the current silhouette scores. - self._labels = self._silhouette = None + self._labels = self._silhouette = self._mask = None self._update() self._replot() if self.data is not None: @@ -229,6 +247,7 @@ def _invalidate_scores(self): def _update(self): # Update/recompute the distances/scores as required if self.data is None: + self._mask = None self._silhouette = None self._labels = None self._matrix = None @@ -241,25 +260,35 @@ def _update(self): labelvar = self.cluster_var_model[self.cluster_var_idx] labels, _ = self.data.get_column_view(labelvar) + mask = numpy.isnan(labels) + labels = labels[~mask] + labels = labels.astype(int) labels_unq, counts = numpy.unique(labels, return_counts=True) self.Error.singleton_clusters_all.clear() self.Error.need_two_clusters.clear() + self.Warning.missing_cluster_assignment.clear() if len(labels_unq) < 2: self.Error.need_two_clusters() - labels = silhouette = None + labels = silhouette = mask = None elif len(labels_unq) == len(labels): self.Error.singleton_clusters_all() - labels = silhouette = None + labels = silhouette = mask = None else: silhouette = sklearn.metrics.silhouette_samples( - self._matrix, labels, metric="precomputed") - + self._matrix[~mask, :][:, ~mask], labels, metric="precomputed") + self._mask = mask self._labels = labels self._silhouette = silhouette + if labels is not None: + count_missing = numpy.count_nonzero(mask) + if count_missing: + self.Warning.missing_cluster_assignment( + count_missing, s="s" if count_missing > 1 else "") + def _set_bar_height(self): visible = self.bar_size >= 5 self._silplot.setBarHeight(self.bar_size) @@ -312,6 +341,9 @@ def _update_annotations(self): if self._silplot is not 
None: column, _ = self.data.get_column_view(annot_var) + if self._mask is not None: + assert column.shape == self._mask.shape + column = column[~self._mask] self._silplot.setRowNames( [annot_var.str_val(value) for value in column]) else: @@ -326,8 +358,18 @@ def commit(self): selectedmask = numpy.full(len(self.data), False, dtype=bool) if self._silplot is not None: indices = self._silplot.selection() + assert (numpy.diff(indices) > 0).all(), "strictly increasing" + if self._mask is not None: + indices = numpy.flatnonzero(~self._mask)[indices] selectedmask[indices] = True - scores = self._silhouette + + if self._mask is not None: + scores = numpy.full(shape=selectedmask.shape, + fill_value=numpy.nan) + scores[~self._mask] = self._silhouette + else: + scores = self._silhouette + silhouette_var = None if self.add_scores: var = self.cluster_var_model[self.cluster_var_idx] @@ -429,7 +471,9 @@ def setScores(self, scores, labels, values, rownames=None): raise ValueError("rownames must have the same size as scores") Ck = numpy.unique(labels) - assert Ck[0] >= 0 and Ck[-1] < len(values) + if not (Ck[0] >= 0 and Ck[-1] < len(values)): + raise ValueError( + "All indices in `labels` must be in `range(len(values))`") cluster_indices = [numpy.flatnonzero(labels == i) for i in range(len(values))] cluster_indices = [indices[numpy.argsort(scores[indices])[::-1]] diff --git a/Orange/widgets/visualize/tests/test_owsilhouetteplot.py b/Orange/widgets/visualize/tests/test_owsilhouetteplot.py index cdddce18666..34008a9484c 100644 --- a/Orange/widgets/visualize/tests/test_owsilhouetteplot.py +++ b/Orange/widgets/visualize/tests/test_owsilhouetteplot.py @@ -51,3 +51,24 @@ def test_insufficient_clusters(self): assert len(np.unique(data_singletons.Y)) == 3 # 3 instances 3 labels self.send_signal("Data", data_singletons) self.assertTrue(self.widget.Error.singleton_clusters_all.is_shown()) + + def test_unknowns_in_labels(self): + self.widget.controls.add_scores.setChecked(1) + scorename = 
"Silhouette (iris)" + data = self.data[[0, 1, 2, 50, 51, 52, 100, 101, 102]] + data.Y[::3] = np.nan + valid = ~np.isnan(data.Y.flatten()) + self.send_signal("Data", data) + output = self.get_output(ANNOTATED_DATA_SIGNAL_NAME) + scores = output[:, scorename].metas.flatten() + self.assertTrue(np.all(np.isnan(scores[::3]))) + self.assertTrue(np.all(np.isfinite(scores[valid]))) + + # Run again on subset with known labels + data_1 = data[np.flatnonzero(valid)] + self.send_signal("Data", data_1) + output_1 = self.get_output(ANNOTATED_DATA_SIGNAL_NAME) + scores_1 = output_1[:, scorename].metas.flatten() + self.assertTrue(np.all(np.isfinite(scores_1))) + # the scores must match + np.testing.assert_almost_equal(scores_1, scores[valid], decimal=12)