Merge pull request #3660 from VesnaT/correlations_heuristic
[ENH] Correlations: Enhancements and fixes
janezd authored Mar 8, 2019
2 parents 4835db7 + 48795a2 commit 6bd86e2
Showing 2 changed files with 92 additions and 46 deletions.
75 changes: 44 additions & 31 deletions Orange/widgets/data/owcorrelations.py
@@ -3,6 +3,7 @@
"""
from enum import IntEnum
from operator import attrgetter
from types import SimpleNamespace
from itertools import combinations, groupby, chain

import numpy as np
@@ -45,17 +46,21 @@ def items():
return ["Pearson correlation", "Spearman correlation"]


class Cluster(SimpleNamespace):
instances = None # type: Optional[List]
centroid = None # type: Optional[np.ndarray]


class KMeansCorrelationHeuristic:
"""
Heuristic to obtain the most promising attribute pairs, when there are to
Heuristic to obtain the most promising attribute pairs, when there are too
many attributes to calculate correlations for all possible pairs.
"""
n_clusters = 10

def __init__(self, data):
self.n_attributes = len(data.domain.attributes)
self.data = data
self.states = None
self.n_clusters = int(np.sqrt(self.n_attributes))

def get_clusters_of_attributes(self):
"""
@@ -67,22 +72,39 @@ def get_clusters_of_attributes(self):
data = Normalize()(self.data).X.T
kmeans = KMeans(n_clusters=self.n_clusters, random_state=0).fit(data)
labels_attrs = sorted([(l, i) for i, l in enumerate(kmeans.labels_)])
for _, group in groupby(labels_attrs, key=lambda x: x[0]):
group = list(group)
if len(group) > 1:
yield list(pair[1] for pair in group)
return [Cluster(instances=list(pair[1] for pair in group),
centroid=kmeans.cluster_centers_[l])
for l, group in groupby(labels_attrs, key=lambda x: x[0])]

def get_states(self, initial_state):
"""
Generates the most promising states (attribute pairs).
Generates states (attribute pairs) - the most promising first, i.e.
states within clusters, following by states among clusters.
:param initial_state: initial state; None if this is the first call
:return: generator of tuples of states
"""
if self.states is not None:
return chain([initial_state], self.states)
self.states = chain.from_iterable(combinations(inds, 2) for inds in
self.get_clusters_of_attributes())

clusters = self.get_clusters_of_attributes()

# combinations within clusters
self.states = chain.from_iterable(combinations(cluster.instances, 2)
for cluster in clusters)
if self.n_clusters == 1:
return self.states

# combinations among clusters - closest clusters first
centroids = [c.centroid for c in clusters]
centroids_combs = np.array(list(combinations(centroids, 2)))
distances = np.linalg.norm((centroids_combs[:, 0] -
centroids_combs[:, 1]), axis=1)
cluster_combs = list(combinations(range(len(clusters)), 2))
states = ((min((c1, c2)), max((c1, c2))) for i in np.argsort(distances)
for c1 in clusters[cluster_combs[i][0]].instances
for c2 in clusters[cluster_combs[i][1]].instances)
self.states = chain(self.states, states)
return self.states
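The two methods above are the heart of the change: attributes are clustered with k-means on the transposed data matrix (the widget normalizes the data first), pairs within a cluster are generated before pairs across clusters, and cross-cluster pairs are ordered by the distance between cluster centroids. Below is a minimal standalone sketch of that ordering, not the widget code itself; the function name promising_pairs, the random matrix X, and the fixed n_clusters are illustrative assumptions (the real heuristic derives n_clusters as int(sqrt(n_attributes)) and skips no normalization).

from itertools import chain, combinations

import numpy as np
from sklearn.cluster import KMeans


def promising_pairs(X, n_clusters):
    # Cluster columns (attributes) rather than rows (instances).
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(X.T)
    clusters = [np.flatnonzero(km.labels_ == lab) for lab in range(n_clusters)]

    # 1) Pairs of attributes that fell into the same cluster come first.
    within = chain.from_iterable(
        combinations(c.tolist(), 2) for c in clusters)

    # 2) Pairs from different clusters follow, closest centroids first.
    pairs = list(combinations(range(n_clusters), 2))
    dists = [np.linalg.norm(km.cluster_centers_[i] - km.cluster_centers_[j])
             for i, j in pairs]
    between = ((int(min(a, b)), int(max(a, b)))
               for k in np.argsort(dists)
               for a in clusters[pairs[k][0]]
               for b in clusters[pairs[k][1]])
    return chain(within, between)


rng = np.random.RandomState(0)
X = rng.rand(100, 6)                              # 100 instances, 6 attributes
print(list(promising_pairs(X, n_clusters=2)))     # all 15 pairs, reordered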


@@ -112,11 +134,8 @@ def initialize(self):
self.sel_feature_index = None
if data:
# use heuristic if data is too big
n_attrs = len(self.attrs)
use_heuristic = n_attrs > KMeansCorrelationHeuristic.n_clusters
self.use_heuristic = use_heuristic and \
len(data) * n_attrs ** 2 > SIZE_LIMIT and \
self.sel_feature_index is None
self.use_heuristic = len(data) * len(self.attrs) ** 2 > SIZE_LIMIT \
and self.sel_feature_index is None
if self.use_heuristic:
self.heuristic = KMeansCorrelationHeuristic(data)
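Note that the heuristic gate is now purely size-based: it is used whenever len(data) * n_attrs ** 2 exceeds SIZE_LIMIT and no single feature is selected. The old n_attrs > n_clusters precondition is gone because the number of clusters now adapts as int(sqrt(n_attributes)). As a worked example (with the SIZE_LIMIT of 2000 that the tests patch in), a hypothetical table of 150 rows and 4 attributes gives 150 * 4 ** 2 = 2400 > 2000, so it already takes the heuristic path.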

@@ -161,15 +180,8 @@ def iterate_states_by_feature(self):
yield self.sel_feature_index, j

def state_count(self):
if self.sel_feature_index is not None:
return len(self.attrs) - 1
elif self.use_heuristic:
n_clusters = KMeansCorrelationHeuristic.n_clusters
n_avg_attrs = len(self.attrs) / n_clusters
return n_clusters * n_avg_attrs * (n_avg_attrs - 1) / 2
else:
n_attrs = len(self.attrs)
return n_attrs * (n_attrs - 1) / 2
n = len(self.attrs)
return n * (n - 1) / 2 if self.sel_feature_index is None else n - 1

@staticmethod
def bar_length(score):
@@ -206,10 +218,12 @@ class Outputs:
correlation_type = Setting(0)

class Information(OWWidget.Information):
not_enough_vars = Msg("Need at least two continuous features.")
not_enough_inst = Msg("Need at least two instances.")
removed_cons_feat = Msg("Constant features have been removed.")

class Warning(OWWidget.Warning):
not_enough_vars = Msg("At least two continuous features are needed.")
not_enough_inst = Msg("At least two instances are needed.")

def __init__(self):
super().__init__()
self.data = None
@@ -223,9 +237,8 @@ def __init__(self):
)

self.feature_model = DomainModel(
separators=False, placeholder="(All combinations)",
valid_types=ContinuousVariable,
)
order=DomainModel.ATTRIBUTES, separators=False,
placeholder="(All combinations)", valid_types=ContinuousVariable)
gui.comboBox(
box, self, "feature", callback=self._feature_combo_changed,
model=self.feature_model
@@ -296,7 +309,7 @@ def set_data(self, data):
self.selection = ()
if data is not None:
if len(data) < 2:
self.Information.not_enough_inst()
self.Warning.not_enough_inst()
else:
domain = data.domain
cont_attrs = [a for a in domain.attributes if a.is_continuous]
@@ -307,7 +320,7 @@ def set_data(self, data):
if remover.attr_results["removed"]:
self.Information.removed_cons_feat()
if len(cont_data.domain.attributes) < 2:
self.Information.not_enough_vars()
self.Warning.not_enough_vars()
else:
self.cont_data = SklImpute()(cont_data)
self.set_feature_model()
63 changes: 48 additions & 15 deletions Orange/widgets/data/tests/test_owcorrelations.py
@@ -1,6 +1,7 @@
# Test methods with long descriptive names can omit docstrings
# pylint: disable=missing-docstring, protected-access
import time
import unittest
from unittest.mock import patch, Mock

import numpy as np
@@ -46,9 +47,9 @@ def test_input_data_cont(self):
def test_input_data_disc(self):
"""Check correlation table for dataset with discrete attributes"""
self.send_signal(self.widget.Inputs.data, self.data_disc)
self.assertTrue(self.widget.Information.not_enough_vars.is_shown())
self.assertTrue(self.widget.Warning.not_enough_vars.is_shown())
self.send_signal(self.widget.Inputs.data, None)
self.assertFalse(self.widget.Information.not_enough_vars.is_shown())
self.assertFalse(self.widget.Warning.not_enough_vars.is_shown())

def test_input_data_mixed(self):
"""Check correlation table for dataset with continuous and discrete
@@ -68,9 +69,9 @@ def test_input_data_one_feature(self):
time.sleep(0.1)
self.process_events()
self.assertEqual(self.widget.vizrank.rank_model.columnCount(), 0)
self.assertTrue(self.widget.Information.not_enough_vars.is_shown())
self.assertTrue(self.widget.Warning.not_enough_vars.is_shown())
self.send_signal(self.widget.Inputs.data, None)
self.assertFalse(self.widget.Information.not_enough_vars.is_shown())
self.assertFalse(self.widget.Warning.not_enough_vars.is_shown())

def test_input_data_one_instance(self):
"""Check correlation table for dataset with one instance"""
@@ -79,9 +80,9 @@ def test_input_data_one_instance(self):
self.process_events()
self.assertEqual(self.widget.vizrank.rank_model.columnCount(), 0)
self.assertFalse(self.widget.Information.removed_cons_feat.is_shown())
self.assertTrue(self.widget.Information.not_enough_inst.is_shown())
self.assertTrue(self.widget.Warning.not_enough_inst.is_shown())
self.send_signal(self.widget.Inputs.data, None)
self.assertFalse(self.widget.Information.not_enough_inst.is_shown())
self.assertFalse(self.widget.Warning.not_enough_inst.is_shown())

def test_input_data_with_constant_features(self):
"""Check correlation table for dataset with a constant columns"""
@@ -109,7 +110,7 @@ def test_input_data_with_constant_features(self):
time.sleep(0.1)
self.process_events()
self.assertEqual(self.widget.vizrank.rank_model.columnCount(), 0)
self.assertTrue(self.widget.Information.not_enough_vars.is_shown())
self.assertTrue(self.widget.Warning.not_enough_vars.is_shown())
self.assertTrue(self.widget.Information.removed_cons_feat.is_shown())

self.send_signal(self.widget.Inputs.data, None)
@@ -194,7 +195,7 @@ def test_heuristic(self):
heuristic = KMeansCorrelationHeuristic(self.data_cont)
heuristic.n_clusters = 2
self.assertListEqual(list(heuristic.get_states(None)),
[(0, 2), (0, 3), (2, 3)])
[(0, 2), (0, 3), (2, 3), (0, 1), (1, 2), (1, 3)])

def test_heuristic_get_states(self):
"""Check attribute pairs after the widget has been paused"""
@@ -203,7 +204,7 @@ def test_heuristic_get_states(self):
states = heuristic.get_states(None)
_ = next(states)
self.assertListEqual(list(heuristic.get_states(next(states))),
[(0, 3), (2, 3)])
[(0, 3), (2, 3), (0, 1), (1, 2), (1, 3)])

def test_correlation_type(self):
c_type = self.widget.controls.correlation_type
@@ -226,6 +227,9 @@ def test_feature_combo(self):
if attr.is_continuous]
self.assertEqual(len(feature_combo.model()), len(cont_attributes) + 1)

self.send_signal(self.widget.Inputs.data, Table("housing"))
self.assertEqual(len(feature_combo.model()), 14)

def test_select_feature(self):
"""Test feature selection"""
feature_combo = self.widget.controls.feature
@@ -254,18 +258,14 @@ def test_select_feature(self):
self.widget.Outputs.features)])

@patch("Orange.widgets.data.owcorrelations.SIZE_LIMIT", 2000)
@patch("Orange.widgets.data.owcorrelations."
"KMeansCorrelationHeuristic.n_clusters", 2)
def test_vizrank_use_heuristic(self):
self.send_signal(self.widget.Inputs.data, self.data_cont)
time.sleep(0.1)
self.process_events()
self.assertEqual(self.widget.vizrank.rank_model.rowCount(),
len(self.widget.cont_data.domain.attributes) - 1)
self.assertTrue(self.widget.vizrank.use_heuristic)
self.assertEqual(self.widget.vizrank.rank_model.rowCount(), 6)

@patch("Orange.widgets.data.owcorrelations.SIZE_LIMIT", 2000)
@patch("Orange.widgets.data.owcorrelations."
"KMeansCorrelationHeuristic.n_clusters", 1)
def test_select_feature_against_heuristic(self):
"""Never use heuristic if feature is selected"""
feature_combo = self.widget.controls.feature
@@ -312,3 +312,36 @@ def test_iterate_states_by_feature(self):
self.vizrank.sel_feature_index = 2
states = self.vizrank.iterate_states_by_feature()
self.assertListEqual([(2, 0), (2, 1), (2, 3)], list(states))

def test_state_count(self):
self.assertEqual(self.vizrank.state_count(), 6)
self.vizrank.sel_feature_index = 2
self.assertEqual(self.vizrank.state_count(), 3)


class TestKMeansCorrelationHeuristic(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.data = Table("wine")
cls.heuristic = KMeansCorrelationHeuristic(cls.data)

def test_n_clusters(self):
self.assertEqual(self.heuristic.n_clusters, 3)

def test_get_clusters_of_attributes(self):
clusters = self.heuristic.get_clusters_of_attributes()
self.assertListEqual([[5, 6, 8, 10, 11], [1, 2, 3, 7], [0, 4, 9, 12]],
[c.instances for c in clusters])

def test_get_states(self):
n_attrs = len(self.data.domain.attributes)
states = set(self.heuristic.get_states(None))
self.assertEqual(len(states), n_attrs * (n_attrs - 1) / 2)
self.assertSetEqual(set((min(i, j), max(i, j)) for i in
range(n_attrs) for j in range(i)), states)

def test_get_states_one_cluster(self):
heuristic = KMeansCorrelationHeuristic(Table("iris")[:, :2])
states = set(heuristic.get_states(None))
self.assertEqual(len(states), 1)
self.assertSetEqual(states, {(0, 1)})
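Taken together, the new TestKMeansCorrelationHeuristic class pins down the adaptive cluster count (int(sqrt(13)) = 3 for wine's 13 attributes), checks the resulting attribute clusters, and verifies that the state generator still enumerates every one of the n_attrs * (n_attrs - 1) / 2 pairs, including the single-cluster edge case: the heuristic only reorders the search, it never drops a pair.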
