Skip to content

Commit

Permalink
Merge pull request biolab#2699 from jerneju/spg-scaling-metas
Browse files: browse the repository at this point in the history
[ENH] Scatter Plot Graph and Scaling can handle metas
  • Loading branch information
lanzagar authored and nikicc committed Nov 17, 2017
1 parent a25ad6a commit b7545b3
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 136 deletions.
36 changes: 8 additions & 28 deletions Orange/widgets/unsupervised/owmds.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,6 @@ def update_data(self, attr_x, attr_y, reset_view=True):
self.plot_widget.hideAxis(axis)
self.plot_widget.setAspectLocked(True, 1)

def get_size_index(self):
if self.attr_size == "Stress":
return -2
return super().get_size_index()

def compute_sizes(self):
def scale(a):
dmin, dmax = np.nanmin(a), np.nanmax(a)
Expand All @@ -69,17 +64,16 @@ def scale(a):
return np.zeros_like(a)

self.master.Information.missing_size.clear()
size_index = self.get_size_index()
if size_index == -1:
if self.attr_size is None:
size_data = np.full((self.n_points,), self.point_width,
dtype=float)
elif size_index == -2:
elif self.attr_size == "Stress":
size_data = scale(stress(self.master.embedding, self.master.effective_matrix))
size_data = self.MinShapeSize + size_data * self.point_width
else:
size_data = \
self.MinShapeSize + \
self.scaled_data[size_index, self.valid_data] * \
self.scaled_data.get_column_view(self.attr_size)[0][self.valid_data] * \
self.point_width
nans = np.isnan(size_data)
if np.any(nans):
Expand Down Expand Up @@ -270,11 +264,6 @@ def update_regression_line(self):

def init_attr_values(self):
domain = self.data and len(self.data) and self.data.domain or None
if domain is not None:
domain = Domain(
attributes=domain.attributes,
class_vars=domain.class_vars,
metas=tuple(a for a in domain.metas if a.is_primitive()))
for model in self.models:
model.set_domain(domain)
self.graph.attr_color = self.data.domain.class_var if domain else None
Expand Down Expand Up @@ -653,21 +642,12 @@ def _setup_plot(self, new=False):
coords = np.vstack((emb_x, emb_y)).T

data = self.data

primitive_metas = tuple(a for a in data.domain.metas if a.is_primitive())
keys = [k for k, a in enumerate(data.domain.metas) if a.is_primitive()]
data_metas = data.metas[:, keys].astype(float)

attributes = self.data.domain.attributes + (self.variable_x, self.variable_y) + \
primitive_metas
attributes = data.domain.attributes + (self.variable_x, self.variable_y)
domain = Domain(attributes=attributes,
class_vars=self.data.domain.class_vars)
if data_metas is not None:
data_x = (self.data.X, coords, data_metas)
else:
data_x = (self.data.X, coords)
data = Table.from_numpy(domain, X=hstack(data_x),
Y=self.data.Y)
class_vars=data.domain.class_vars,
metas=data.domain.metas)
data = Table.from_numpy(domain, X=hstack((data.X, coords)),
Y=data.Y, metas=data.metas)
subset_data = data[self._subset_mask] if self._subset_mask is not None else None
self.graph.new_data(data, subset_data=subset_data, new=new)
self.graph.update_data(self.variable_x, self.variable_y, True)
Expand Down
122 changes: 70 additions & 52 deletions Orange/widgets/utils/scaling.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from itertools import chain

import numpy as np

from Orange.data import Domain
from Orange.statistics.basic_stats import DomainBasicStats
from Orange.widgets.settings import Setting
from Orange.widgets.utils import checksum
Expand All @@ -12,12 +15,11 @@ class ScaleData:

def _reset_data(self):
self.domain = None
self.data = None
self.original_data = None # as numpy array
self.data = None # as Orange Table
self.scaled_data = None # in [0, 1]
self.jittered_data = None
self.attr_values = {}
self.domain_data_stat = []
self.domain_data_stat = {}
self.valid_data_array = None
self.attribute_flip_info = {} # dictionary with attr: 0/1 if flipped
self.jitter_seed = 0
Expand All @@ -30,53 +32,58 @@ def rescale_data(self):

def _compute_domain_data_stat(self):
stt = self.domain_data_stat = \
getCached(self.data, DomainBasicStats, (self.data,))
for index in range(len(self.domain)):
attr = self.domain[index]
getCached(self.data, DomainBasicStats, (self.data, True))
domain = self.domain
for attr in chain(domain.variables, domain.metas):
if attr.is_discrete:
self.attr_values[attr] = [0, len(attr.values)]
elif attr.is_continuous:
self.attr_values[attr] = [stt[index].min, stt[index].max]
self.attr_values[attr] = [stt[attr].min, stt[attr].max]

def _compute_scaled_data(self):
data = self.data
# We cache scaled_data and valid_data_array to share them between widgets
cached = getCached(data, "visualizationData")
if cached:
self.original_data, self.scaled_data, self.valid_data_array = cached
self.data, self.scaled_data, self.valid_data_array = cached
return

Y = data.Y if data.Y.ndim == 2 else np.atleast_2d(data.Y).T
self.original_data = np.hstack((data.X, Y)).T
self.scaled_data = no_jit = self.original_data.copy()
self.valid_data_array = np.isfinite(no_jit)
for index in range(len(data.domain)):
attr = data.domain[index]
if np.any(data.metas):
all_data = (data.X, Y, data.metas)
else:
all_data = (data.X, Y)
all_data = np.hstack(all_data).T
self.scaled_data = self.data.copy()
self.valid_data_array = np.isfinite(all_data)
domain = self.domain
for attr in chain(domain.attributes, domain.class_vars, domain.metas):
c = self.scaled_data.get_column_view(attr)[0]
if attr.is_discrete:
no_jit[index] *= 2
no_jit[index] += 1
no_jit[index] /= 2 * len(attr.values)
c += 0.5
c /= len(attr.values)
else:
dstat = self.domain_data_stat[index]
no_jit[index] -= dstat.min
dstat = self.domain_data_stat[attr]
c -= dstat.min
if dstat.max != dstat.min:
no_jit[index] /= dstat.max - dstat.min
c /= dstat.max - dstat.min
setCached(data, "visualizationData",
(self.original_data, self.scaled_data, self.valid_data_array))
(self.data, self.scaled_data, self.valid_data_array))

def _compute_jittered_data(self):
data = self.data
self.jittered_data = self.scaled_data.copy()
random = np.random.RandomState(seed=self.jitter_seed)
for index, col in enumerate(self.jittered_data):
domain = self.domain
for attr in chain(domain.variables, domain.metas):
# Need to use a different seed for each feature
attr = data.domain[index]
if attr.is_discrete:
off = self.jitter_size / (25 * max(1, len(attr.values)))
elif attr.is_continuous and self.jitter_continuous:
off = self.jitter_size / 25
else:
continue
col = self.jittered_data.get_column_view(attr)[0]
col += random.uniform(-off, off, len(data))
# fix values outside [0, 1]
col = np.absolute(col)
Expand All @@ -92,8 +99,13 @@ def set_data(self, data, skip_if_same=False, no_data=False):
if data is None:
return

self.domain = data.domain
self.data = data
domain = data.domain
new_domain = Domain(attributes=domain.attributes,
class_vars=domain.class_vars,
metas=tuple(v for v in domain.metas if v.is_primitive()))
self.data = data.transform(new_domain)
self.data.metas = self.data.metas.astype(float)
self.domain = self.data.domain
self.attribute_flip_info = {}
if not no_data:
self._compute_domain_data_stat()
Expand All @@ -103,67 +115,73 @@ def set_data(self, data, skip_if_same=False, no_data=False):
def flip_attribute(self, attr):
if attr.is_discrete:
return 0
index = self.domain.index(attr)
self.attribute_flip_info[attr] = 1 - self.attribute_flip_info.get(attr, 0)
if attr.is_continuous:
self.attr_values[attr] = [-self.attr_values[attr][1],
-self.attr_values[attr][0]]

self.jittered_data[index] = 1 - self.jittered_data[index]
self.scaled_data[index] = 1 - self.scaled_data[index]
col = self.jittered_data.get_column_view(attr)[0]
col *= -1
col += 1
col = self.scaled_data.get_column_view(attr)[0]
col *= -1
col += 1
return 1

def get_valid_list(self, indices):
def get_valid_list(self, attrs):
"""
Get an array of 0s and 1s of length len(self.data). If an instance has a
missing value in any of the given attributes, return 0 for that instance.
"""
if self.valid_data_array is None or len(self.valid_data_array) == 0:
return np.array([], np.bool)
domain = self.domain
indices = []
for index, attr in enumerate(chain(domain.variables, domain.metas)):
if attr in attrs:
indices.append(index)
return np.all(self.valid_data_array[indices], axis=0)

def get_valid_indices(self, indices):
def get_valid_indices(self, attrs):
"""
Get an array with the indices of the instances that have valid (non-missing)
values for the given attributes.
"""
valid_list = self.get_valid_list(indices)
valid_list = self.get_valid_list(attrs)
return np.nonzero(valid_list)[0]


class ScaleScatterPlotData(ScaleData):
def get_xy_data_positions(self, xattr, yattr, filter_valid=False,
def get_xy_data_positions(self, attr_x, attr_y, filter_valid=False,
copy=True):
"""
Create an x-y projection of the two given attributes.
"""
xattr_index = self.domain.index(xattr)
yattr_index = self.domain.index(yattr)
jit = self.jittered_data
if filter_valid is True:
filter_valid = self.get_valid_list([xattr_index, yattr_index])
filter_valid = self.get_valid_list([attr_x, attr_y])
if isinstance(filter_valid, np.ndarray):
xdata = self.jittered_data[xattr_index, filter_valid]
ydata = self.jittered_data[yattr_index, filter_valid]
data_x = jit.get_column_view(attr_x)[0][filter_valid]
data_y = jit.get_column_view(attr_y)[0][filter_valid]
elif copy:
xdata = self.jittered_data[xattr_index].copy()
ydata = self.jittered_data[yattr_index].copy()
data_x = jit.get_column_view(attr_x)[0].copy()
data_y = jit.get_column_view(attr_y)[0].copy()
else:
xdata = self.jittered_data[xattr_index]
ydata = self.jittered_data[yattr_index]
data_x = jit.get_column_view(attr_x)[0]
data_y = jit.get_column_view(attr_y)[0]

if self.domain[xattr_index].is_discrete:
xdata *= len(self.domain[xattr_index].values)
xdata -= 0.5
if attr_x.is_discrete:
data_x *= len(attr_x.values)
data_x -= 0.5
else:
xdata *= self.attr_values[xattr][1] - self.attr_values[xattr][0]
xdata += float(self.attr_values[xattr][0])
if self.domain[yattr_index].is_discrete:
ydata *= len(self.domain[yattr_index].values)
ydata -= 0.5
data_x *= self.attr_values[attr_x][1] - self.attr_values[attr_x][0]
data_x += float(self.attr_values[attr_x][0])
if attr_y.is_discrete:
data_y *= len(attr_y.values)
data_y -= 0.5
else:
ydata *= self.attr_values[yattr][1] - self.attr_values[yattr][0]
ydata += float(self.attr_values[yattr][0])
return xdata, ydata
data_y *= self.attr_values[attr_y][1] - self.attr_values[attr_y][0]
data_y += float(self.attr_values[attr_y][0])
return data_x, data_y

getXYDataPositions = get_xy_data_positions
26 changes: 8 additions & 18 deletions Orange/widgets/visualize/owscatterplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,12 @@ def iterate_states(self, initial_state):

def compute_score(self, state):
graph = self.master.graph
ind12 = [graph.domain.index(self.attrs[x]) for x in state]
valid = graph.get_valid_list(ind12)
X = graph.jittered_data[ind12, :][:, valid].T
attrs = [self.attrs[x] for x in state]
valid = graph.get_valid_list(attrs)
cols = []
for var in attrs:
cols.append(graph.jittered_data.get_column_view(var)[0][valid])
X = np.column_stack(cols)
Y = self.master.data.Y[valid]
if X.shape[0] < self.minK:
return
Expand All @@ -66,7 +69,7 @@ def bar_length(self, score):
return max(0, -score)

def score_heuristic(self):
X = self.master.graph.jittered_data.T
X = self.master.graph.jittered_data.X
Y = self.master.data.Y
mdomain = self.master.data.domain
dom = Domain([ContinuousVariable(str(i)) for i in range(X.shape[1])],
Expand Down Expand Up @@ -139,7 +142,6 @@ def __init__(self):

self.data = None # Orange.data.Table
self.subset_data = None # Orange.data.Table
self.data_metas_X = None # self.data, where primitive metas are moved to X
self.sql_data = None # Orange.data.sql.table.SqlTable
self.attribute_selection_list = None # list of Orange.data.Variable
self.__timer = QTimer(self, interval=1200)
Expand Down Expand Up @@ -243,7 +245,6 @@ def set_data(self, data):
same_domain = (self.data and data and
data.domain.checksum() == self.data.domain.checksum())
self.data = data
self.data_metas_X = self.move_primitive_metas_to_X(data)

if not same_domain:
self.init_attr_values()
Expand Down Expand Up @@ -295,7 +296,6 @@ def add_data(self, time=0.4):
data_sample.download_data(2000, partial=True)
data = Table(data_sample)
self.data = Table.concatenate((self.data, data), axis=0)
self.data_metas_X = self.move_primitive_metas_to_X(self.data)
self.handleNewSignals()

def switch_sampling(self):
Expand All @@ -304,15 +304,6 @@ def switch_sampling(self):
self.add_data()
self.__timer.start()

def move_primitive_metas_to_X(self, data):
if data is not None:
new_attrs = [a for a in data.domain.attributes + data.domain.metas
if a.is_primitive()]
new_metas = [m for m in data.domain.metas if not m.is_primitive()]
new_domain = Domain(new_attrs, data.domain.class_vars, new_metas)
data = data.transform(new_domain)
return data

@Inputs.data_subset
def set_subset_data(self, subset_data):
self.warning()
Expand All @@ -322,12 +313,11 @@ def set_subset_data(self, subset_data):
else:
self.warning("Data subset does not support large Sql tables")
subset_data = None
self.subset_data = self.move_primitive_metas_to_X(subset_data)
self.controls.graph.alpha_value.setEnabled(subset_data is None)

# called when all signals are received, so the graph is updated only once
def handleNewSignals(self):
self.graph.new_data(self.data_metas_X, self.subset_data)
self.graph.new_data(self.data, self.subset_data)
if self.attribute_selection_list and self.graph.domain and \
all(attr in self.graph.domain
for attr in self.attribute_selection_list):
Expand Down
Loading

0 comments on commit b7545b3

Please sign in to comment.