Merge pull request #6542 from janezd/som-output-columns

[ENH] SOM: output columns with coordinates and errors
biolab · Sep 1, 2023 · 3778aee · 3778aee
2 parents 972405f + e3f2371
commit 3778aee
Show file tree

Hide file tree

Showing 6 changed files with 381 additions and 56 deletions.
diff --git a/Orange/projection/_som.pyx b/Orange/projection/_som.pyx
@@ -22,6 +22,8 @@ def get_winners(np.float64_t[:, :, :] weights, np.float64_t[:, :] X, int hex):
         np.float64_t[:] row
         np.ndarray[np.int16_t, ndim=2] winners = \
             np.empty((X.shape[0], 2), dtype=np.int16)
+        np.ndarray[np.float64_t, ndim=1] distances = \
+            np.empty((X.shape[0]), dtype=np.float64)
         int nrows = X.shape[0]
 
     with nogil:
@@ -40,8 +42,9 @@ def get_winners(np.float64_t[:, :, :] weights, np.float64_t[:, :] X, int hex):
                         min_diff = diff
             winners[rowi, 0] = win_x
             winners[rowi, 1] = win_y
+            distances[rowi] = min_diff
 
-    return winners
+    return winners, distances
 
 
 def update(np.float64_t[:, :, :] weights,
@@ -127,6 +130,8 @@ def get_winners_sparse(np.float64_t[:, :, :] weights,
         np.float64_t[:] row,
         np.ndarray[np.int16_t, ndim=2] winners = \
             np.empty((X.shape[0], 2), dtype=np.int16)
+        np.ndarray[np.float64_t, ndim=1] distances = \
+            np.empty((X.shape[0]), dtype=np.float64)
         int nrows = X.shape[0]
 
     with nogil:
@@ -149,7 +154,8 @@ def get_winners_sparse(np.float64_t[:, :, :] weights,
 
             winners[rowi, 0] = win_x
             winners[rowi, 1] = win_y
-    return winners
+            distances[rowi] = min_diff
+    return winners, distances
 
 
 def update_sparse(np.ndarray[np.float64_t, ndim=3] weights,

diff --git a/Orange/projection/som.py b/Orange/projection/som.py
@@ -1,3 +1,5 @@
+from typing import Union, Optional
+
 import numpy as np
 import scipy.sparse as sp
 
@@ -14,6 +16,39 @@ def __init__(self, dim_x, dim_y,
         self.pca_init = pca_init
         self.random_seed = random_seed
 
+    @staticmethod
+    def prepare_data(x: Union[np.ndarray, sp.spmatrix],
+                     offsets: Optional[np.ndarray] = None,
+                     scales: Optional[np.ndarray] = None) \
+            -> (Union[np.ndarray, sp.spmatrix],
+                np.ndarray,
+                Union[np.ndarray, None],
+                Union[np.ndarray, None]):
+        """
+        Select rows without non-finite values and normalize dense data.
+
+        Dense `x` is reduced to the rows in which all values are finite.
+        Each column is then shifted by `offsets` and divided by `scales`.
+        If `offsets` is `None`, column minima are used; if `scales` is
+        `None`, column maxima of the shifted data are used, with zeros
+        replaced by 1 to avoid division by zero.
+
+        Sparse `x` is converted to CSR and returned unscaled, together with
+        an all-true mask, unless `offsets` is given; in that case it is
+        converted to a dense array and handled like dense data.
+
+        Args:
+            x: two-dimensional data; filtering and the mask are row-wise
+            offsets: values subtracted from columns, or None to compute them
+            scales: values by which columns are divided, or None to compute
+
+        Returns:
+            A tuple `(cont_x, mask, offsets, scales)`: the prepared data,
+            a boolean mask of the rows of `x` that were kept, and the offsets
+            and scales that were used. If dense `x` has no usable rows, `x`
+            itself is returned together with the given `offsets` and `scales`.
+        """
+        if sp.issparse(x) and offsets is not None:
+            # This is used in compute_value, by any widget, hence there is no
+            # way to prevent it or report an error. We go dense...
+            # `toarray` is used instead of `todense`: the latter gives
+            # np.matrix for sparse matrices, which keeps the mask
+            # two-dimensional and breaks the boolean indexing below.
+            x = x.toarray()
+        if sp.issparse(x):
+            return x.tocsr(), np.ones(x.shape[0], bool), offsets, scales
+
+        mask = np.all(np.isfinite(x), axis=1)
+        useful = np.sum(mask)
+        if useful == 0:
+            return x, mask, offsets, scales
+        # Both branches produce a copy, so the in-place operations below
+        # do not modify the caller's data
+        cont_x = x.copy() if useful == len(mask) else x[mask]
+        if not np.issubdtype(cont_x.dtype, np.inexact):
+            # In-place subtraction and division cannot store float results
+            # in integer or boolean arrays
+            cont_x = cont_x.astype(float)
+        if offsets is None:
+            offsets = np.min(cont_x, axis=0)
+        cont_x -= offsets[None, :]
+        if scales is None:
+            scales = np.max(cont_x, axis=0)
+            scales[scales == 0] = 1
+        cont_x /= scales[None, :]
+        return cont_x, mask, offsets, scales
+
     def init_weights_random(self, x):
         random = (np.random if self.random_seed is None
                   else np.random.RandomState(self.random_seed))