Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[ENH] Scatter Plot Graph and Scaling can handle metas #2699

Merged
merged 5 commits into from
Nov 10, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 8 additions & 28 deletions Orange/widgets/unsupervised/owmds.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,6 @@ def update_data(self, attr_x, attr_y, reset_view=True):
self.plot_widget.hideAxis(axis)
self.plot_widget.setAspectLocked(True, 1)

def get_size_index(self):
if self.attr_size == "Stress":
return -2
return super().get_size_index()

def compute_sizes(self):
def scale(a):
dmin, dmax = np.nanmin(a), np.nanmax(a)
Expand All @@ -69,17 +64,16 @@ def scale(a):
return np.zeros_like(a)

self.master.Information.missing_size.clear()
size_index = self.get_size_index()
if size_index == -1:
if self.attr_size is None:
size_data = np.full((self.n_points,), self.point_width,
dtype=float)
elif size_index == -2:
elif self.attr_size == "Stress":
size_data = scale(stress(self.master.embedding, self.master.effective_matrix))
size_data = self.MinShapeSize + size_data * self.point_width
else:
size_data = \
self.MinShapeSize + \
self.scaled_data[size_index, self.valid_data] * \
self.scaled_data.get_column_view(self.attr_size)[0][self.valid_data] * \
self.point_width
nans = np.isnan(size_data)
if np.any(nans):
Expand Down Expand Up @@ -270,11 +264,6 @@ def update_regression_line(self):

def init_attr_values(self):
domain = self.data and len(self.data) and self.data.domain or None
if domain is not None:
domain = Domain(
attributes=domain.attributes,
class_vars=domain.class_vars,
metas=tuple(a for a in domain.metas if a.is_primitive()))
for model in self.models:
model.set_domain(domain)
self.graph.attr_color = self.data.domain.class_var if domain else None
Expand Down Expand Up @@ -653,21 +642,12 @@ def _setup_plot(self, new=False):
coords = np.vstack((emb_x, emb_y)).T

data = self.data

primitive_metas = tuple(a for a in data.domain.metas if a.is_primitive())
keys = [k for k, a in enumerate(data.domain.metas) if a.is_primitive()]
data_metas = data.metas[:, keys].astype(float)

attributes = self.data.domain.attributes + (self.variable_x, self.variable_y) + \
primitive_metas
attributes = data.domain.attributes + (self.variable_x, self.variable_y)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@BlazZupan, do we want the columns that contain x and y coordinates from the MDS to appear as ordinary attributes or as metas?

They used to be in metas, but now that visualization widgets can show metas as well, I would tend to store x and y as metas so as not to pollute the feature set.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually this line here is not related to that (it's for constructing data for plotting).
Output data has new coordinates in metas as we want, see commit function below (L671)

domain = Domain(attributes=attributes,
class_vars=self.data.domain.class_vars)
if data_metas is not None:
data_x = (self.data.X, coords, data_metas)
else:
data_x = (self.data.X, coords)
data = Table.from_numpy(domain, X=hstack(data_x),
Y=self.data.Y)
class_vars=data.domain.class_vars,
metas=data.domain.metas)
data = Table.from_numpy(domain, X=hstack((data.X, coords)),
Y=data.Y, metas=data.metas)
subset_data = data[self._subset_mask] if self._subset_mask is not None else None
self.graph.new_data(data, subset_data=subset_data, new=new)
self.graph.update_data(self.variable_x, self.variable_y, True)
Expand Down
122 changes: 70 additions & 52 deletions Orange/widgets/utils/scaling.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from itertools import chain

import numpy as np

from Orange.data import Domain
from Orange.statistics.basic_stats import DomainBasicStats
from Orange.widgets.settings import Setting
from Orange.widgets.utils import checksum
Expand All @@ -12,12 +15,11 @@ class ScaleData:

def _reset_data(self):
self.domain = None
self.data = None
self.original_data = None # as numpy array
self.data = None # as Orange Table
self.scaled_data = None # in [0, 1]
self.jittered_data = None
self.attr_values = {}
self.domain_data_stat = []
self.domain_data_stat = {}
self.valid_data_array = None
self.attribute_flip_info = {} # dictionary with attr: 0/1 if flipped
self.jitter_seed = 0
Expand All @@ -30,53 +32,58 @@ def rescale_data(self):

def _compute_domain_data_stat(self):
stt = self.domain_data_stat = \
getCached(self.data, DomainBasicStats, (self.data,))
for index in range(len(self.domain)):
attr = self.domain[index]
getCached(self.data, DomainBasicStats, (self.data, True))
domain = self.domain
for attr in chain(domain.variables, domain.metas):
if attr.is_discrete:
self.attr_values[attr] = [0, len(attr.values)]
elif attr.is_continuous:
self.attr_values[attr] = [stt[index].min, stt[index].max]
self.attr_values[attr] = [stt[attr].min, stt[attr].max]

def _compute_scaled_data(self):
data = self.data
# We cache scaled_data and validArray to share them between widgets
cached = getCached(data, "visualizationData")
if cached:
self.original_data, self.scaled_data, self.valid_data_array = cached
self.data, self.scaled_data, self.valid_data_array = cached
return

Y = data.Y if data.Y.ndim == 2 else np.atleast_2d(data.Y).T
self.original_data = np.hstack((data.X, Y)).T
self.scaled_data = no_jit = self.original_data.copy()
self.valid_data_array = np.isfinite(no_jit)
for index in range(len(data.domain)):
attr = data.domain[index]
if np.any(data.metas):
all_data = (data.X, Y, data.metas)
else:
all_data = (data.X, Y)
all_data = np.hstack(all_data).T
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can have dtype == object when stacking with metas, which will cause np.isfinite to fail two lines below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. Fixed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed here, but I get similar errors elsewhere (see the Sentry report).
Try fixing all metas dtype problems (converting all primitive metas to float might be enough; maybe something else is also needed). Then please test thoroughly! Use at least the workflow from the Sentry report.

Copy link
Contributor Author

@jerneju jerneju Nov 8, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found another error caused by Corpus (issue: biolab/orange3-text#324).

self.scaled_data = self.data.copy()
self.valid_data_array = np.isfinite(all_data)
domain = self.domain
for attr in chain(domain.attributes, domain.class_vars, domain.metas):
c = self.scaled_data.get_column_view(attr)[0]
if attr.is_discrete:
no_jit[index] *= 2
no_jit[index] += 1
no_jit[index] /= 2 * len(attr.values)
c += 0.5
c /= len(attr.values)
else:
dstat = self.domain_data_stat[index]
no_jit[index] -= dstat.min
dstat = self.domain_data_stat[attr]
c -= dstat.min
if dstat.max != dstat.min:
no_jit[index] /= dstat.max - dstat.min
c /= dstat.max - dstat.min
setCached(data, "visualizationData",
(self.original_data, self.scaled_data, self.valid_data_array))
(self.data, self.scaled_data, self.valid_data_array))

def _compute_jittered_data(self):
data = self.data
self.jittered_data = self.scaled_data.copy()
random = np.random.RandomState(seed=self.jitter_seed)
for index, col in enumerate(self.jittered_data):
domain = self.domain
for attr in chain(domain.variables, domain.metas):
# Need to use a different seed for each feature
attr = data.domain[index]
if attr.is_discrete:
off = self.jitter_size / (25 * max(1, len(attr.values)))
elif attr.is_continuous and self.jitter_continuous:
off = self.jitter_size / 25
else:
continue
col = self.jittered_data.get_column_view(attr)[0]
col += random.uniform(-off, off, len(data))
# fix values outside [0, 1]
col = np.absolute(col)
Expand All @@ -92,8 +99,13 @@ def set_data(self, data, skip_if_same=False, no_data=False):
if data is None:
return

self.domain = data.domain
self.data = data
domain = data.domain
new_domain = Domain(attributes=domain.attributes,
class_vars=domain.class_vars,
metas=tuple(v for v in domain.metas if v.is_primitive()))
self.data = data.transform(new_domain)
self.data.metas = self.data.metas.astype(float)
self.domain = self.data.domain
self.attribute_flip_info = {}
if not no_data:
self._compute_domain_data_stat()
Expand All @@ -103,67 +115,73 @@ def set_data(self, data, skip_if_same=False, no_data=False):
def flip_attribute(self, attr):
if attr.is_discrete:
return 0
index = self.domain.index(attr)
self.attribute_flip_info[attr] = 1 - self.attribute_flip_info.get(attr, 0)
if attr.is_continuous:
self.attr_values[attr] = [-self.attr_values[attr][1],
-self.attr_values[attr][0]]

self.jittered_data[index] = 1 - self.jittered_data[index]
self.scaled_data[index] = 1 - self.scaled_data[index]
col = self.jittered_data.get_column_view(attr)[0]
col *= -1
col += 1
col = self.scaled_data.get_column_view(attr)[0]
col *= -1
col += 1
return 1

def get_valid_list(self, indices):
def get_valid_list(self, attrs):
"""
Get array of 0 and 1 of len = len(self.data). If there is a missing
value at any attribute in indices return 0 for that instance.
"""
if self.valid_data_array is None or len(self.valid_data_array) == 0:
return np.array([], np.bool)
domain = self.domain
indices = []
for index, attr in enumerate(chain(domain.variables, domain.metas)):
if attr in attrs:
indices.append(index)
return np.all(self.valid_data_array[indices], axis=0)

def get_valid_indices(self, indices):
def get_valid_indices(self, attrs):
"""
Get array with numbers that represent the instance indices that have a
valid data value.
"""
valid_list = self.get_valid_list(indices)
valid_list = self.get_valid_list(attrs)
return np.nonzero(valid_list)[0]


class ScaleScatterPlotData(ScaleData):
def get_xy_data_positions(self, xattr, yattr, filter_valid=False,
def get_xy_data_positions(self, attr_x, attr_y, filter_valid=False,
copy=True):
"""
Create x-y projection of attributes in attrlist.

"""
xattr_index = self.domain.index(xattr)
yattr_index = self.domain.index(yattr)
jit = self.jittered_data
if filter_valid is True:
filter_valid = self.get_valid_list([xattr_index, yattr_index])
filter_valid = self.get_valid_list([attr_x, attr_y])
if isinstance(filter_valid, np.ndarray):
xdata = self.jittered_data[xattr_index, filter_valid]
ydata = self.jittered_data[yattr_index, filter_valid]
data_x = jit.get_column_view(attr_x)[0][filter_valid]
data_y = jit.get_column_view(attr_y)[0][filter_valid]
elif copy:
xdata = self.jittered_data[xattr_index].copy()
ydata = self.jittered_data[yattr_index].copy()
data_x = jit.get_column_view(attr_x)[0].copy()
data_y = jit.get_column_view(attr_y)[0].copy()
else:
xdata = self.jittered_data[xattr_index]
ydata = self.jittered_data[yattr_index]
data_x = jit.get_column_view(attr_x)[0]
data_y = jit.get_column_view(attr_y)[0]

if self.domain[xattr_index].is_discrete:
xdata *= len(self.domain[xattr_index].values)
xdata -= 0.5
if attr_x.is_discrete:
data_x *= len(attr_x.values)
data_x -= 0.5
else:
xdata *= self.attr_values[xattr][1] - self.attr_values[xattr][0]
xdata += float(self.attr_values[xattr][0])
if self.domain[yattr_index].is_discrete:
ydata *= len(self.domain[yattr_index].values)
ydata -= 0.5
data_x *= self.attr_values[attr_x][1] - self.attr_values[attr_x][0]
data_x += float(self.attr_values[attr_x][0])
if attr_y.is_discrete:
data_y *= len(attr_y.values)
data_y -= 0.5
else:
ydata *= self.attr_values[yattr][1] - self.attr_values[yattr][0]
ydata += float(self.attr_values[yattr][0])
return xdata, ydata
data_y *= self.attr_values[attr_y][1] - self.attr_values[attr_y][0]
data_y += float(self.attr_values[attr_y][0])
return data_x, data_y

getXYDataPositions = get_xy_data_positions
26 changes: 8 additions & 18 deletions Orange/widgets/visualize/owscatterplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,12 @@ def iterate_states(self, initial_state):

def compute_score(self, state):
graph = self.master.graph
ind12 = [graph.domain.index(self.attrs[x]) for x in state]
valid = graph.get_valid_list(ind12)
X = graph.jittered_data[ind12, :][:, valid].T
attrs = [self.attrs[x] for x in state]
valid = graph.get_valid_list(attrs)
cols = []
for var in attrs:
cols.append(graph.jittered_data.get_column_view(var)[0][valid])
X = np.column_stack(cols)
Y = self.master.data.Y[valid]
if X.shape[0] < self.minK:
return
Expand All @@ -66,7 +69,7 @@ def bar_length(self, score):
return max(0, -score)

def score_heuristic(self):
X = self.master.graph.jittered_data.T
X = self.master.graph.jittered_data.X
Y = self.master.data.Y
mdomain = self.master.data.domain
dom = Domain([ContinuousVariable(str(i)) for i in range(X.shape[1])],
Expand Down Expand Up @@ -139,7 +142,6 @@ def __init__(self):

self.data = None # Orange.data.Table
self.subset_data = None # Orange.data.Table
self.data_metas_X = None # self.data, where primitive metas are moved to X
self.sql_data = None # Orange.data.sql.table.SqlTable
self.attribute_selection_list = None # list of Orange.data.Variable
self.__timer = QTimer(self, interval=1200)
Expand Down Expand Up @@ -243,7 +245,6 @@ def set_data(self, data):
same_domain = (self.data and data and
data.domain.checksum() == self.data.domain.checksum())
self.data = data
self.data_metas_X = self.move_primitive_metas_to_X(data)

if not same_domain:
self.init_attr_values()
Expand Down Expand Up @@ -295,7 +296,6 @@ def add_data(self, time=0.4):
data_sample.download_data(2000, partial=True)
data = Table(data_sample)
self.data = Table.concatenate((self.data, data), axis=0)
self.data_metas_X = self.move_primitive_metas_to_X(self.data)
self.handleNewSignals()

def switch_sampling(self):
Expand All @@ -304,15 +304,6 @@ def switch_sampling(self):
self.add_data()
self.__timer.start()

def move_primitive_metas_to_X(self, data):
if data is not None:
new_attrs = [a for a in data.domain.attributes + data.domain.metas
if a.is_primitive()]
new_metas = [m for m in data.domain.metas if not m.is_primitive()]
new_domain = Domain(new_attrs, data.domain.class_vars, new_metas)
data = data.transform(new_domain)
return data

@Inputs.data_subset
def set_subset_data(self, subset_data):
self.warning()
Expand All @@ -322,12 +313,11 @@ def set_subset_data(self, subset_data):
else:
self.warning("Data subset does not support large Sql tables")
subset_data = None
self.subset_data = self.move_primitive_metas_to_X(subset_data)
self.controls.graph.alpha_value.setEnabled(subset_data is None)

# called when all signals are received, so the graph is updated only once
def handleNewSignals(self):
self.graph.new_data(self.data_metas_X, self.subset_data)
self.graph.new_data(self.data, self.subset_data)
if self.attribute_selection_list and self.graph.domain and \
all(attr in self.graph.domain
for attr in self.attribute_selection_list):
Expand Down
Loading