-
-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ENH] Scatter Plot Graph and Scaling can handle metas #2699
Changes from all commits
9b262f2
ca4db00
7738e5a
81247dd
0fdccda
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,8 @@ | ||
from itertools import chain | ||
|
||
import numpy as np | ||
|
||
from Orange.data import Domain | ||
from Orange.statistics.basic_stats import DomainBasicStats | ||
from Orange.widgets.settings import Setting | ||
from Orange.widgets.utils import checksum | ||
|
@@ -12,12 +15,11 @@ class ScaleData: | |
|
||
def _reset_data(self): | ||
self.domain = None | ||
self.data = None | ||
self.original_data = None # as numpy array | ||
self.data = None # as Orange Table | ||
self.scaled_data = None # in [0, 1] | ||
self.jittered_data = None | ||
self.attr_values = {} | ||
self.domain_data_stat = [] | ||
self.domain_data_stat = {} | ||
self.valid_data_array = None | ||
self.attribute_flip_info = {} # dictionary with attr: 0/1 if flipped | ||
self.jitter_seed = 0 | ||
|
@@ -30,53 +32,58 @@ def rescale_data(self): | |
|
||
def _compute_domain_data_stat(self): | ||
stt = self.domain_data_stat = \ | ||
getCached(self.data, DomainBasicStats, (self.data,)) | ||
for index in range(len(self.domain)): | ||
attr = self.domain[index] | ||
getCached(self.data, DomainBasicStats, (self.data, True)) | ||
domain = self.domain | ||
for attr in chain(domain.variables, domain.metas): | ||
if attr.is_discrete: | ||
self.attr_values[attr] = [0, len(attr.values)] | ||
elif attr.is_continuous: | ||
self.attr_values[attr] = [stt[index].min, stt[index].max] | ||
self.attr_values[attr] = [stt[attr].min, stt[attr].max] | ||
|
||
def _compute_scaled_data(self): | ||
data = self.data | ||
# We cache scaled_data and validArray to share them between widgets | ||
cached = getCached(data, "visualizationData") | ||
if cached: | ||
self.original_data, self.scaled_data, self.valid_data_array = cached | ||
self.data, self.scaled_data, self.valid_data_array = cached | ||
return | ||
|
||
Y = data.Y if data.Y.ndim == 2 else np.atleast_2d(data.Y).T | ||
self.original_data = np.hstack((data.X, Y)).T | ||
self.scaled_data = no_jit = self.original_data.copy() | ||
self.valid_data_array = np.isfinite(no_jit) | ||
for index in range(len(data.domain)): | ||
attr = data.domain[index] | ||
if np.any(data.metas): | ||
all_data = (data.X, Y, data.metas) | ||
else: | ||
all_data = (data.X, Y) | ||
all_data = np.hstack(all_data).T | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This can have dtype==object when stacking with metas, which will cause np.isfinite to fail 2 lines below. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks. Fixed. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed here, but I get similar errors elsewhere (see Sentry report). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I found another error caused by Corpus (issue: biolab/orange3-text#324) |
||
self.scaled_data = self.data.copy() | ||
self.valid_data_array = np.isfinite(all_data) | ||
domain = self.domain | ||
for attr in chain(domain.attributes, domain.class_vars, domain.metas): | ||
c = self.scaled_data.get_column_view(attr)[0] | ||
if attr.is_discrete: | ||
no_jit[index] *= 2 | ||
no_jit[index] += 1 | ||
no_jit[index] /= 2 * len(attr.values) | ||
c += 0.5 | ||
c /= len(attr.values) | ||
else: | ||
dstat = self.domain_data_stat[index] | ||
no_jit[index] -= dstat.min | ||
dstat = self.domain_data_stat[attr] | ||
c -= dstat.min | ||
if dstat.max != dstat.min: | ||
no_jit[index] /= dstat.max - dstat.min | ||
c /= dstat.max - dstat.min | ||
setCached(data, "visualizationData", | ||
(self.original_data, self.scaled_data, self.valid_data_array)) | ||
(self.data, self.scaled_data, self.valid_data_array)) | ||
|
||
def _compute_jittered_data(self): | ||
data = self.data | ||
self.jittered_data = self.scaled_data.copy() | ||
random = np.random.RandomState(seed=self.jitter_seed) | ||
for index, col in enumerate(self.jittered_data): | ||
domain = self.domain | ||
for attr in chain(domain.variables, domain.metas): | ||
# Need to use a different seed for each feature | ||
attr = data.domain[index] | ||
if attr.is_discrete: | ||
off = self.jitter_size / (25 * max(1, len(attr.values))) | ||
elif attr.is_continuous and self.jitter_continuous: | ||
off = self.jitter_size / 25 | ||
else: | ||
continue | ||
col = self.jittered_data.get_column_view(attr)[0] | ||
col += random.uniform(-off, off, len(data)) | ||
# fix values outside [0, 1] | ||
col = np.absolute(col) | ||
|
@@ -92,8 +99,13 @@ def set_data(self, data, skip_if_same=False, no_data=False): | |
if data is None: | ||
return | ||
|
||
self.domain = data.domain | ||
self.data = data | ||
domain = data.domain | ||
new_domain = Domain(attributes=domain.attributes, | ||
class_vars=domain.class_vars, | ||
metas=tuple(v for v in domain.metas if v.is_primitive())) | ||
self.data = data.transform(new_domain) | ||
self.data.metas = self.data.metas.astype(float) | ||
self.domain = self.data.domain | ||
self.attribute_flip_info = {} | ||
if not no_data: | ||
self._compute_domain_data_stat() | ||
|
@@ -103,67 +115,73 @@ def set_data(self, data, skip_if_same=False, no_data=False): | |
def flip_attribute(self, attr): | ||
if attr.is_discrete: | ||
return 0 | ||
index = self.domain.index(attr) | ||
self.attribute_flip_info[attr] = 1 - self.attribute_flip_info.get(attr, 0) | ||
if attr.is_continuous: | ||
self.attr_values[attr] = [-self.attr_values[attr][1], | ||
-self.attr_values[attr][0]] | ||
|
||
self.jittered_data[index] = 1 - self.jittered_data[index] | ||
self.scaled_data[index] = 1 - self.scaled_data[index] | ||
col = self.jittered_data.get_column_view(attr)[0] | ||
col *= -1 | ||
col += 1 | ||
col = self.scaled_data.get_column_view(attr)[0] | ||
col *= -1 | ||
col += 1 | ||
return 1 | ||
|
||
def get_valid_list(self, indices): | ||
def get_valid_list(self, attrs): | ||
""" | ||
Get array of 0 and 1 of len = len(self.data). If there is a missing | ||
value at any attribute in indices return 0 for that instance. | ||
""" | ||
if self.valid_data_array is None or len(self.valid_data_array) == 0: | ||
return np.array([], np.bool) | ||
domain = self.domain | ||
indices = [] | ||
for index, attr in enumerate(chain(domain.variables, domain.metas)): | ||
if attr in attrs: | ||
indices.append(index) | ||
return np.all(self.valid_data_array[indices], axis=0) | ||
|
||
def get_valid_indices(self, indices): | ||
def get_valid_indices(self, attrs): | ||
""" | ||
Get array with numbers that represent the instance indices that have a | ||
valid data value. | ||
""" | ||
valid_list = self.get_valid_list(indices) | ||
valid_list = self.get_valid_list(attrs) | ||
return np.nonzero(valid_list)[0] | ||
|
||
|
||
class ScaleScatterPlotData(ScaleData): | ||
def get_xy_data_positions(self, xattr, yattr, filter_valid=False, | ||
def get_xy_data_positions(self, attr_x, attr_y, filter_valid=False, | ||
copy=True): | ||
""" | ||
Create x-y projection of attributes in attrlist. | ||
|
||
""" | ||
xattr_index = self.domain.index(xattr) | ||
yattr_index = self.domain.index(yattr) | ||
jit = self.jittered_data | ||
if filter_valid is True: | ||
filter_valid = self.get_valid_list([xattr_index, yattr_index]) | ||
filter_valid = self.get_valid_list([attr_x, attr_y]) | ||
if isinstance(filter_valid, np.ndarray): | ||
xdata = self.jittered_data[xattr_index, filter_valid] | ||
ydata = self.jittered_data[yattr_index, filter_valid] | ||
data_x = jit.get_column_view(attr_x)[0][filter_valid] | ||
data_y = jit.get_column_view(attr_y)[0][filter_valid] | ||
elif copy: | ||
xdata = self.jittered_data[xattr_index].copy() | ||
ydata = self.jittered_data[yattr_index].copy() | ||
data_x = jit.get_column_view(attr_x)[0].copy() | ||
data_y = jit.get_column_view(attr_y)[0].copy() | ||
else: | ||
xdata = self.jittered_data[xattr_index] | ||
ydata = self.jittered_data[yattr_index] | ||
data_x = jit.get_column_view(attr_x)[0] | ||
data_y = jit.get_column_view(attr_y)[0] | ||
|
||
if self.domain[xattr_index].is_discrete: | ||
xdata *= len(self.domain[xattr_index].values) | ||
xdata -= 0.5 | ||
if attr_x.is_discrete: | ||
data_x *= len(attr_x.values) | ||
data_x -= 0.5 | ||
else: | ||
xdata *= self.attr_values[xattr][1] - self.attr_values[xattr][0] | ||
xdata += float(self.attr_values[xattr][0]) | ||
if self.domain[yattr_index].is_discrete: | ||
ydata *= len(self.domain[yattr_index].values) | ||
ydata -= 0.5 | ||
data_x *= self.attr_values[attr_x][1] - self.attr_values[attr_x][0] | ||
data_x += float(self.attr_values[attr_x][0]) | ||
if attr_y.is_discrete: | ||
data_y *= len(attr_y.values) | ||
data_y -= 0.5 | ||
else: | ||
ydata *= self.attr_values[yattr][1] - self.attr_values[yattr][0] | ||
ydata += float(self.attr_values[yattr][0]) | ||
return xdata, ydata | ||
data_y *= self.attr_values[attr_y][1] - self.attr_values[attr_y][0] | ||
data_y += float(self.attr_values[attr_y][0]) | ||
return data_x, data_y | ||
|
||
getXYDataPositions = get_xy_data_positions |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@BlazZupan, do we want the columns that contain x and y coordinates from the MDS to appear as ordinary attributes or as metas?
They used to be in metas, but now that visualization widgets can show metas as well, I would tend to store x and y as metas so as not to pollute the feature set.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, this line here is not related to that (it's for constructing data for plotting).
Output data has the new coordinates in metas, as we want; see the commit function below (L671).