Skip to content

Commit

Permalink
Merge pull request #2245 from nikicc/edit-domain-sparse
Browse files Browse the repository at this point in the history
[FIX] Support Sparse Data in Domain Editor
  • Loading branch information
jerneju authored Apr 24, 2017
2 parents 1d37f63 + bc60360 commit 5524ed9
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 21 deletions.
2 changes: 1 addition & 1 deletion Orange/statistics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def nanmean(x):
return np.nansum(x.data) / n_values


def unique(x, return_counts=True):
def unique(x, return_counts=False):
""" Equivalent of np.unique that supports sparse or dense matrices. """
if not sp.issparse(x):
return np.unique(x, return_counts=return_counts)
Expand Down
5 changes: 0 additions & 5 deletions Orange/widgets/data/owfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,11 +403,6 @@ def apply_domain_edit(self):
if self.data is not None:
domain, cols = self.domain_editor.get_domain(self.data.domain, self.data)
X, y, m = cols
X = np.array(X).T if len(X) else np.empty((len(self.data), 0))
y = np.array(y).T if len(y) else None
dtpe = object if any(isinstance(m, StringVariable)
for m in domain.metas) else float
m = np.array(m, dtype=dtpe).T if len(m) else None
table = Table.from_numpy(domain, X, y, m, self.data.W)
table.name = self.data.name
table.ids = np.array(self.data.ids)
Expand Down
20 changes: 20 additions & 0 deletions Orange/widgets/data/tests/test_owfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@
# pylint: disable=missing-docstring
from os import path, remove
from unittest.mock import Mock
import pickle
import tempfile


import numpy as np
import scipy.sparse as sp

from AnyQt.QtCore import QMimeData, QPoint, Qt, QUrl
from AnyQt.QtGui import QDragEnterEvent, QDropEvent
Expand Down Expand Up @@ -195,3 +199,19 @@ def test_check_datetime_disabled(self):
for i in range(4):
vartype_delegate.setEditorData(combo, idx(i))
self.assertEqual(combo.count(), counts[i])

def test_domain_edit_on_sparse_data(self):
iris = Table("iris")
iris.X = sp.csr_matrix(iris.X)

f = tempfile.NamedTemporaryFile(suffix='.pickle', delete=False)
pickle.dump(iris, f)
f.close()

self.widget.add_path(f.name)
self.widget.load_data()

output = self.get_output("Data")
self.assertIsInstance(output, Table)
self.assertEqual(iris.X.shape, output.X.shape)
self.assertTrue(sp.issparse(output.X))
76 changes: 61 additions & 15 deletions Orange/widgets/utils/domaineditor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from itertools import chain

import numpy as np
import scipy.sparse as sp

from AnyQt.QtCore import Qt, QAbstractTableModel
from AnyQt.QtGui import QColor
from AnyQt.QtWidgets import QComboBox, QTableView, QSizePolicy

from Orange.data import DiscreteVariable, ContinuousVariable, StringVariable, \
TimeVariable, Domain
from Orange.statistics.util import unique
from Orange.widgets import gui
from Orange.widgets.gui import HorizontalGridDelegate
from Orange.widgets.settings import ContextSetting
Expand Down Expand Up @@ -196,6 +198,37 @@ def __init__(self, widget):
self.place_delegate = PlaceDelegate(self, VarTableModel.places)
self.setItemDelegateForColumn(Column.place, self.place_delegate)

@staticmethod
def _is_missing(x):
return str(x) in ("nan", "")

@staticmethod
def _iter_vals(x):
"""Iterate over values of sparse or dense arrays."""
for i in range(x.shape[0]):
yield x[i, 0]

@staticmethod
def _to_column(x, to_sparse, dtype=None):
"""Transform list of values to sparse/dense column array."""
x = np.array(x, dtype=dtype).reshape(-1, 1)
if to_sparse:
x = sp.csc_matrix(x)
return x

@staticmethod
def _merge(cols, force_dense=False):
if len(cols) == 0:
return None

all_dense = not any(sp.issparse(c) for c in cols)
if all_dense:
return np.hstack(cols)
if force_dense:
return np.hstack([c.toarray() if sp.issparse(c) else c for c in cols])
sparse_cols = [c if sp.issparse(c) else sp.csc_matrix(c) for c in cols]
return sp.hstack(sparse_cols).tocsr()

def get_domain(self, domain, data):
"""Create domain (and dataset) from changes made in the widget.
Expand All @@ -212,44 +245,57 @@ def get_domain(self, domain, data):
places = [[], [], []] # attributes, class_vars, metas
cols = [[], [], []] # Xcols, Ycols, Mcols

def is_missing(x):
return str(x) in ("nan", "")

for (name, tpe, place, _, _), (orig_var, orig_plc) in \
zip(variables,
chain([(at, Place.feature) for at in domain.attributes],
[(cl, Place.class_var) for cl in domain.class_vars],
[(mt, Place.meta) for mt in domain.metas])):
if place == Place.skip:
continue
if orig_plc == Place.meta:
col_data = data[:, orig_var].metas
elif orig_plc == Place.class_var:
col_data = data[:, orig_var].Y
else:
col_data = data[:, orig_var].X
col_data = col_data.ravel()

col_data = self._get_column(data, orig_var, orig_plc)
is_sparse = sp.issparse(col_data)
if name == orig_var.name and tpe == type(orig_var):
var = orig_var
elif tpe == type(orig_var):
# change the name so that all_vars will get the correct name
orig_var.name = name
var = orig_var
elif tpe == DiscreteVariable:
values = list(str(i) for i in np.unique(col_data) if not is_missing(i))
values = list(str(i) for i in unique(col_data) if not self._is_missing(i))
var = tpe(name, values)
col_data = [np.nan if is_missing(x) else values.index(str(x))
for x in col_data]
col_data = [np.nan if self._is_missing(x) else values.index(str(x))
for x in self._iter_vals(col_data)]
col_data = self._to_column(col_data, is_sparse)
elif tpe == StringVariable and type(orig_var) == DiscreteVariable:
var = tpe(name)
col_data = [orig_var.repr_val(x) if not np.isnan(x) else ""
for x in col_data]
for x in self._iter_vals(col_data)]
# don't obey sparsity for StringVariable since they are
# in metas which are transformed to dense below
col_data = self._to_column(col_data, False, dtype=object)
else:
var = tpe(name)
places[place].append(var)
cols[place].append(col_data)

# merge columns for X, Y and metas
feats = cols[Place.feature]
X = self._merge(feats) if len(feats) else np.empty((len(data), 0))
Y = self._merge(cols[Place.class_var], force_dense=True)
m = self._merge(cols[Place.meta], force_dense=True)
domain = Domain(*places)
return domain, cols
return domain, [X, Y, m]

def _get_column(self, data, source_var, source_place):
""" Extract column from data and preserve sparsity. """
if source_place == Place.meta:
col_data = data[:, source_var].metas
elif source_place == Place.class_var:
col_data = data[:, source_var].Y.reshape(-1, 1)
else:
col_data = data[:, source_var].X
return col_data

def set_domain(self, domain):
self.variables = self.parse_domain(domain)
Expand Down

0 comments on commit 5524ed9

Please sign in to comment.