Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Merge: work with sparse #2305

Merged
merged 2 commits into from
May 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,15 @@ def __init__(self, table, row_index):
self.id = table.ids[row_index]
self._x = table.X[row_index]
if sp.issparse(self._x):
self.sparse_x = self._x
self.sparse_x = sp.csr_matrix(self._x)
self._x = np.asarray(self._x.todense())[0]
self._y = table._Y[row_index]
if sp.issparse(self._y):
self.sparse_y = self._y
self.sparse_y = sp.csr_matrix(self._y)
self._y = np.asarray(self._y.todense())[0]
self._metas = table.metas[row_index]
if sp.issparse(self._metas):
self.sparse_metas = self._metas
self.sparse_metas = sp.csr_matrix(self._metas)
self._metas = np.asarray(self._metas.todense())[0]

@property
Expand Down
41 changes: 26 additions & 15 deletions Orange/widgets/data/owmergedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
from AnyQt.QtWidgets import QApplication, QStyle, QSizePolicy

import numpy as np
import scipy.sparse as sp

import Orange
from Orange.data import StringVariable, ContinuousVariable
from Orange.data.util import hstack
from Orange.widgets import widget, gui, settings
from Orange.widgets.utils import itemmodels
from Orange.widgets.utils.sql import check_sql_input
Expand Down Expand Up @@ -362,20 +364,29 @@ def _join_table_by_indices(self, reduced_extra, indices):
def _join_array_by_indices(left, right, indices, string_cols=None):
    """Join (horizontally) two arrays, taking pairs of rows given in indices.

    Row ``k`` of the result is row ``indices[0][k]`` of `left` placed next
    to row ``indices[1][k]`` of `right`. An index of -1 marks a missing
    partner: that half of the row is filled with NaN, except for the
    columns listed in `string_cols`, which are filled with ``''``.

    `string_cols` holds column positions in the *joined* array (left
    columns first); it may be None or empty when there are no string
    columns.

    The halves are combined with `hstack` from `Orange.data.util`
    (presumably sparse-aware, so sparse inputs are not densified --
    confirm against that helper).
    """
    def prepare(arr, inds, str_cols):
        # Select the requested rows of one side and blank out the rows
        # that have no partner (index -1).
        try:
            newarr = arr[inds]
        except IndexError:
            # NOTE(review): this fallback has the shape of `arr`, not one
            # row per entry of `inds` -- confirm this is what callers
            # expect when indexing fails (e.g. `arr` has no rows).
            newarr = np.full_like(arr, np.nan)
        else:
            empty = np.full(arr.shape[1], np.nan)
            if str_cols:
                # Empty strings can only be stored in an object array.
                assert arr.dtype == object
                empty = empty.astype(object)
                empty[str_cols] = ''
            # Indexing with -1 picked the last row; overwrite those rows
            # with the "missing" filler.
            newarr[inds == -1] = empty
        return newarr

    string_cols = string_cols or ()
    left_width = left.shape[1]
    # Split the joined-array column positions into per-side positions.
    str_left = [i for i in string_cols if i < left_width]
    str_right = [i - left_width for i in string_cols if i >= left_width]
    return hstack((prepare(left, indices[0], str_left),
                   prepare(right, indices[1], str_right)))


def main():
app = QApplication([])
w = OWMergeData()
data = Orange.data.Table("tests/data-gender-region")
Expand All @@ -388,4 +399,4 @@ def test():


if __name__ == "__main__":
test()
main()
26 changes: 26 additions & 0 deletions Orange/widgets/data/tests/test_owmergedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from itertools import chain

import numpy as np
import scipy.sparse as sp

from Orange.data import Table, Domain, DiscreteVariable, StringVariable
from Orange.widgets.data.owmergedata import OWMergeData, INSTANCEID, INDEX
Expand Down Expand Up @@ -425,3 +426,28 @@ def test_best_match(self):
self.assertEqual(self.widget.attr_merge_extra, zoo_images.domain[-1])
self.assertEqual(self.widget.attr_combine_data, zoo.domain[-1])
self.assertEqual(self.widget.attr_combine_extra, zoo_images.domain[-1])

def test_sparse(self):
    """
    Merging must accept sparse "Extra Data" and give the same result
    as merging the equivalent dense table.

    GH-2295
    GH-2155
    """
    base = Table("iris")[::25]
    extra_dense = Table("titanic")[::300]
    extra_sparse = Table("titanic")[::300]
    extra_sparse.X = sp.csr_matrix(extra_sparse.X)
    self.send_signal("Data", base)

    # Dense extra data: the merged output is dense.
    self.send_signal("Extra Data", extra_dense)
    merged_dense = self.get_output("Data")
    self.assertFalse(sp.issparse(merged_dense.X))
    self.assertFalse(merged_dense.is_sparse())

    # Sparse extra data: the merged output is sparse.
    self.send_signal("Extra Data", extra_sparse)
    merged_sparse = self.get_output("Data")
    self.assertTrue(sp.issparse(merged_sparse.X))
    self.assertTrue(merged_sparse.is_sparse())

    # Once densified, the sparse result must equal the dense one.
    merged_sparse.X = merged_sparse.X.toarray()
    self.assertTablesEqual(merged_dense, merged_sparse)