[ENH] Correlations: fixes and enhancements #3591

Merged: 6 commits, Feb 22, 2019
59 changes: 59 additions & 0 deletions Orange/statistics/util.py
@@ -5,6 +5,7 @@
It also patches bottleneck to contain these functions.
"""
import warnings
import math

import numpy as np
import bottleneck as bn
@@ -604,3 +605,61 @@ def var(x, axis=None, ddof=0):
def std(x, axis=None, ddof=0):
""" Equivalent of np.std that supports sparse and dense matrices. """
return np.sqrt(var(x, axis=axis, ddof=ddof))


# To speed up FDR, precompute the partial harmonic sums
# H_m = sum(1/i for i in range(1, m + 1)) for m in [1, 99999].
# For larger m, use the approximation H_m ~ log(m) + 0.5772...
# (the Euler-Mascheroni constant); the error is at most
# 4.99999157277e-06.
c = [1.0]
for m in range(2, 100000):
c.append(c[-1] + 1.0/m)


def FDR(p_values, dependent=False, m=None, ordered=False):
""" `False Discovery Rate <http://en.wikipedia.org/wiki/False_discovery_rate>`_
correction on a list of p-values.

:param p_values: a list of p-values.
:param dependent: use correction for dependent hypotheses (default False).
:param m: number of hypotheses tested (default ``len(p_values)``).
:param ordered: prevent sorting of p-values if they are already sorted
(default False).
"""
def is_sorted(l):
return all(l[i] <= l[i + 1] for i in range(len(l) - 1))

if not ordered:
ordered = is_sorted(p_values)

if not ordered:
joined = [(v, i) for i, v in enumerate(p_values)]
joined.sort()
p_values = [p[0] for p in joined]
indices = [p[1] for p in joined]

if not m:
m = len(p_values)
if m <= 0 or not p_values:
return []

if dependent: # correct q for dependent tests
k = c[m-1] if m <= len(c) else math.log(m) + 0.57721566490153286060651209008240243104215933593992
m = m * k

tmp_fdrs = [p*m/(i+1.0) for (i, p) in enumerate(p_values)]
fdrs = []
cmin = tmp_fdrs[-1]
for f in reversed(tmp_fdrs):
cmin = min(f, cmin)
fdrs.append(cmin)
fdrs.reverse()

if not ordered:
new = [None] * len(fdrs)
for v, i in zip(fdrs, indices):
new[i] = v
fdrs = new

return fdrs
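
For orientation, and not part of the diff: FDR as implemented above is the Benjamini-Hochberg step-up correction; with dependent=True it additionally scales m by the m-th harmonic number (the Benjamini-Yekutieli variant), switching to log(m) plus the Euler-Mascheroni constant once m exceeds the precomputed table. A minimal usage sketch:

from Orange.statistics.util import FDR

p_values = [0.01, 0.02, 0.03, 0.5]
print(FDR(p_values))                   # -> [0.04, 0.04, 0.04, 0.5] (up to float rounding)
# dependent tests multiply m by H_4 = 1 + 1/2 + 1/3 + 1/4
print(FDR(p_values, dependent=True))   # -> roughly [0.0833, 0.0833, 0.0833, 1.0417]
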
8 changes: 7 additions & 1 deletion Orange/tests/test_statistics.py
@@ -9,7 +9,7 @@

from Orange.statistics.util import bincount, countnans, contingency, digitize, \
mean, nanmax, nanmean, nanmedian, nanmin, nansum, nanunique, stats, std, \
unique, var, nanstd, nanvar, nanmode, array_equal
unique, var, nanstd, nanvar, nanmode, array_equal, FDR
from sklearn.utils import check_random_state


@@ -263,6 +263,12 @@ def test_nanstd_with_ddof(self):
nanstd(csr_matrix(x), axis=axis, ddof=10),
)

def test_FDR(self):
p_values = [0.00001, 0.0001, 0.0002, 0.0003, 0.0004]
np.testing.assert_almost_equal(
np.array([0.00005, 0.00025, 0.00033, 0.00038, 0.0004]),
FDR(p_values), decimal=5)


class TestNanmean(unittest.TestCase):
def setUp(self):
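
A quick hand check of the expected values in test_FDR (illustrative only, not part of the PR):

p = [0.00001, 0.0001, 0.0002, 0.0003, 0.0004]
tmp = [pi * 5 / (i + 1) for i, pi in enumerate(p)]
# -> [0.00005, 0.00025, 0.0003333..., 0.000375, 0.0004]
# tmp is already non-decreasing, so the running minimum taken from the
# right leaves it unchanged and FDR(p) matches the asserted array to
# five decimal places.
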
144 changes: 108 additions & 36 deletions Orange/widgets/data/owcorrelations.py
@@ -9,14 +9,19 @@
from scipy.stats import spearmanr, pearsonr
from sklearn.cluster import KMeans

from AnyQt.QtCore import Qt, QItemSelectionModel, QItemSelection, QSize
from AnyQt.QtGui import QStandardItem, QColor
from AnyQt.QtCore import Qt, QItemSelectionModel, QItemSelection, \
QSize, pyqtSignal as Signal
from AnyQt.QtGui import QStandardItem
from AnyQt.QtWidgets import QHeaderView

from Orange.data import Table, Domain, ContinuousVariable, StringVariable
from Orange.preprocess import SklImpute, Normalize
from Orange.statistics.util import FDR
from Orange.widgets import gui
from Orange.widgets.settings import Setting, ContextSetting, \
DomainContextHandler
from Orange.widgets.utils import vartype
from Orange.widgets.utils.itemmodels import DomainModel
from Orange.widgets.utils.signals import Input, Output
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.visualize.utils import VizRankDialogAttrPair
@@ -85,62 +90,80 @@ class CorrelationRank(VizRankDialogAttrPair):
"""
Correlations rank widget.
"""
NEGATIVE_COLOR = QColor(70, 190, 250)
POSITIVE_COLOR = QColor(170, 242, 43)
threadStopped = Signal()
PValRole = next(gui.OrangeUserRole)

def __init__(self, *args):
super().__init__(*args)
self.heuristic = None
self.use_heuristic = False
self.sel_feature_index = None

def initialize(self):
super().initialize()
data = self.master.cont_data
self.attrs = data and data.domain.attributes
self.model_proxy.setFilterKeyColumn(-1)
self.rank_table.horizontalHeader().setStretchLastSection(False)
self.heuristic = None
self.use_heuristic = False
if self.master.feature is not None:
self.sel_feature_index = data.domain.index(self.master.feature)
else:
self.sel_feature_index = None
if data:
# use heuristic if data is too big
n_attrs = len(self.attrs)
use_heuristic = n_attrs > KMeansCorrelationHeuristic.n_clusters
self.use_heuristic = use_heuristic and \
len(data) * n_attrs ** 2 > SIZE_LIMIT
len(data) * n_attrs ** 2 > SIZE_LIMIT and \
self.sel_feature_index is None
if self.use_heuristic:
self.heuristic = KMeansCorrelationHeuristic(data)

def compute_score(self, state):
(attr1, attr2), corr_type = state, self.master.correlation_type
data = self.master.cont_data.X
corr = pearsonr if corr_type == CorrelationType.PEARSON else spearmanr
result = corr(data[:, attr1], data[:, attr2])[0]
return -abs(result) if not np.isnan(result) else NAN, result
r, p_value = corr(data[:, attr1], data[:, attr2])
return -abs(r) if not np.isnan(r) else NAN, r, p_value

def row_for_state(self, score, state):
attrs = sorted((self.attrs[x] for x in state), key=attrgetter("name"))
attrs_item = QStandardItem(
"{}, {}".format(attrs[0].name, attrs[1].name))
attrs_item.setData(attrs, self._AttrRole)
attrs_item.setData(Qt.AlignLeft + Qt.AlignTop, Qt.TextAlignmentRole)
attr_items = []
for attr in attrs:
item = QStandardItem(attr.name)
item.setData(attrs, self._AttrRole)
item.setData(Qt.AlignLeft + Qt.AlignTop, Qt.TextAlignmentRole)
item.setToolTip(attr.name)
attr_items.append(item)
correlation_item = QStandardItem("{:+.3f}".format(score[1]))
correlation_item.setData(score[2], self.PValRole)
correlation_item.setData(attrs, self._AttrRole)
correlation_item.setData(
self.NEGATIVE_COLOR if score[1] < 0 else self.POSITIVE_COLOR,
gui.TableBarItem.BarColorRole)
return [correlation_item, attrs_item]
return [correlation_item] + attr_items

def check_preconditions(self):
return self.master.cont_data is not None

def iterate_states(self, initial_state):
if self.use_heuristic:
if self.sel_feature_index is not None:
return self.iterate_states_by_feature()
elif self.use_heuristic:
return self.heuristic.get_states(initial_state)
else:
return super().iterate_states(initial_state)

def iterate_states_by_feature(self):
for j in range(len(self.attrs)):
if j != self.sel_feature_index:
yield self.sel_feature_index, j

def state_count(self):
if self.use_heuristic:
if self.sel_feature_index is not None:
return len(self.attrs) - 1
elif self.use_heuristic:
n_clusters = KMeansCorrelationHeuristic.n_clusters
n_avg_attrs = len(self.attrs) / n_clusters
return n_clusters * n_avg_attrs * (n_avg_attrs - 1) / 2
@@ -152,6 +175,11 @@ def state_count(self):
def bar_length(score):
return abs(score[1])

def stopped(self):
self.threadStopped.emit()
header = self.rank_table.horizontalHeader()
header.setSectionResizeMode(1, QHeaderView.Stretch)


class OWCorrelations(OWWidget):
name = "Correlations"
@@ -169,8 +197,10 @@ class Outputs:

want_control_area = False

settings_version = 2
settingsHandler = DomainContextHandler()
selection = ContextSetting(())
feature = ContextSetting(None)
correlation_type = Setting(0)

class Information(OWWidget.Information):
@@ -186,12 +216,23 @@ def __init__(self):
box = gui.vBox(self.mainArea)
self.correlation_combo = gui.comboBox(
box, self, "correlation_type", items=CorrelationType.items(),
orientation=Qt.Horizontal, callback=self._correlation_combo_changed)
orientation=Qt.Horizontal, callback=self._correlation_combo_changed
)

self.feature_model = DomainModel(
separators=False, placeholder="(All combinations)",
valid_types=ContinuousVariable,
)
gui.comboBox(
box, self, "feature", callback=self._feature_combo_changed,
model=self.feature_model
)

self.vizrank, _ = CorrelationRank.add_vizrank(
None, self, None, self._vizrank_selection_changed)
self.vizrank.progressBar = self.progressBar
self.vizrank.button.setEnabled(False)
self.vizrank.threadStopped.connect(self._vizrank_stopped)

gui.separator(box)
box.layout().addWidget(self.vizrank.filter)
@@ -206,22 +247,41 @@ def sizeHint(self):
def _correlation_combo_changed(self):
self.apply()

def _feature_combo_changed(self):
self.apply()

def _vizrank_selection_changed(self, *args):
self.selection = args
self.selection = [(var.name, vartype(var)) for var in args]
self.commit()

def _vizrank_stopped(self):
self._vizrank_select()

def _vizrank_select(self):
model = self.vizrank.rank_table.model()
if not model.rowCount():
return
selection = QItemSelection()
names = sorted(x.name for x in self.selection)
for i in range(model.rowCount()):
# pylint: disable=protected-access
if sorted(x.name for x in model.data(
model.index(i, 0), CorrelationRank._AttrRole)) == names:
selection.select(model.index(i, 0), model.index(i, 1))
self.vizrank.rank_table.selectionModel().select(
selection, QItemSelectionModel.ClearAndSelect)
break

# This flag is needed because the data in the model may be filtered
# by a feature, so the stored selection might not be present in the model
selection_in_model = False
if self.selection:
sel_names = sorted(name for name, _ in self.selection)
for i in range(model.rowCount()):
# pylint: disable=protected-access
names = sorted(x.name for x in model.data(
model.index(i, 0), CorrelationRank._AttrRole))
if names == sel_names:
selection.select(model.index(i, 0),
model.index(i, model.columnCount() - 1))
selection_in_model = True
break
if not selection_in_model:
selection.select(model.index(0, 0),
model.index(0, model.columnCount() - 1))
self.vizrank.rank_table.selectionModel().select(
selection, QItemSelectionModel.ClearAndSelect)

@Inputs.data
def set_data(self, data):
@@ -240,18 +300,20 @@ def set_data(self, data):
domain = data.domain
cont_dom = Domain(cont_attrs, domain.class_vars, domain.metas)
self.cont_data = SklImpute()(Table.from_table(cont_dom, data))
self.set_feature_model()
self.openContext(self.cont_data)
self.apply()
self.openContext(self.data)
self._vizrank_select()
self.vizrank.button.setEnabled(self.data is not None)
self.vizrank.button.setEnabled(self.cont_data is not None)

def set_feature_model(self):
self.feature_model.set_domain(self.cont_data and self.cont_data.domain)
self.feature = None

def apply(self):
self.vizrank.initialize()
if self.cont_data is not None:
# this triggers self.commit() by changing vizrank selection
self.vizrank.toggle()
header = self.vizrank.rank_table.horizontalHeader()
header.setStretchLastSection(True)
else:
self.commit()

@@ -262,11 +324,14 @@ def commit(self):
self.Outputs.correlations.send(None)
return

attrs = [ContinuousVariable("Correlation"), ContinuousVariable("FDR")]
metas = [StringVariable("Feature 1"), StringVariable("Feature 2")]
domain = Domain([ContinuousVariable("Correlation")], metas=metas)
domain = Domain(attrs, metas=metas)
model = self.vizrank.rank_model
x = np.array([[float(model.data(model.index(row, 0)))] for row
in range(model.rowCount())])
x = np.array([[float(model.data(model.index(row, 0), role))
for role in (Qt.DisplayRole, CorrelationRank.PValRole)]
for row in range(model.rowCount())])
x[:, 1] = FDR(list(x[:, 1]))
# pylint: disable=protected-access
m = np.array([[a.name for a in model.data(model.index(row, 0),
CorrelationRank._AttrRole)]
@@ -276,14 +341,21 @@

self.Outputs.data.send(self.data)
# data has been imputed; send original attributes
self.Outputs.features.send(AttributeList([attr.compute_value.variable
for attr in self.selection]))
self.Outputs.features.send(AttributeList(
[self.data.domain[name] for name, _ in self.selection]))
self.Outputs.correlations.send(corr_table)

def send_report(self):
self.report_table(CorrelationType.items()[self.correlation_type],
self.vizrank.rank_table)

@classmethod
def migrate_context(cls, context, version):
if version < 2:
sel = context.values["selection"]
context.values["selection"] = ([(var.name, vartype(var))
for var in sel[0]], sel[1])


if __name__ == "__main__": # pragma: no cover
WidgetPreview(OWCorrelations).run(Table("iris"))
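
For clarity, and not part of the diff: migrate_context rewrites the selection stored in saved workflow contexts from Variable objects to (name, vartype) pairs, leaving the accompanying type tag untouched. A rough before/after sketch with placeholder variables:

# hypothetical context saved under settings_version 1
context.values["selection"] = ([sepal_length_var, sepal_width_var], tag)
# after migrate_context(context, version=1)
context.values["selection"] = ([("sepal length", vartype(sepal_length_var)),
                                ("sepal width", vartype(sepal_width_var))], tag)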