diff --git a/Orange/evaluation/scoring.py b/Orange/evaluation/scoring.py
index a73253555bc..f6f962362ec 100644
--- a/Orange/evaluation/scoring.py
+++ b/Orange/evaluation/scoring.py
@@ -84,25 +84,9 @@ def compute_score(self, results):
         return self.from_predicted(results, skl_metrics.accuracy_score)
 
 
-class Precision(Score):
-    __wraps__ = skl_metrics.precision_score
-
-    def compute_score(self, results):
-        return self.from_predicted(results, skl_metrics.precision_score,
-                                   average="weighted")
-
-
-class Recall(Score):
-    __wraps__ = skl_metrics.recall_score
-
-    def compute_score(self, results):
-        return self.from_predicted(results, skl_metrics.recall_score,
-                                   average="weighted")
-
-
-class F1(Score):
+class TargetScore(Score):
     """
-    ${sklpar}
+    Base class for scorers that need a target value (a "positive" class).
 
     Parameters
     ----------
@@ -110,27 +94,43 @@ class F1(Score):
         Stored predictions and actual data in model testing.
 
     target : int, optional (default=None)
-        Value of class to report.
+        Target class value.
+        When None:
+          - if averaging is specified, use all classes and average results
+          - if average is 'binary' and class variable has exactly 2 values,
+            use the value '1' as the positive class
 
-    Examples
-    --------
-    >>> Orange.evaluation.F1(results)
-    array([ 0.9...])
+    average : str, optional (default='binary')
+        Method of averaging: 'weighted', 'macro', 'micro' or None.
+        The default, 'binary', requires either a binary class variable
+        or an explicitly set target.
     """
-    __wraps__ = skl_metrics.f1_score
+    __wraps__ = None  # Subclasses should set the scoring function
 
-    def compute_score(self, results, target=None):
-        if target is None:
-            if len(results.domain.class_var.values) <= 2:
-                return self.from_predicted(results, skl_metrics.f1_score, average='binary')
-            else:
-                return self.from_predicted(results, skl_metrics.f1_score, average='weighted')
-        else:
-            return np.fromiter(
-                (skl_metrics.f1_score(results.actual, predicted, average=None)[target]
-                 for predicted in results.predicted),
-                dtype=np.float64, count=len(results.predicted))
+    def compute_score(self, results, target=None, average='binary'):
+        if average == 'binary':
+            if target is None:
+                if len(results.domain.class_var.values) > 2:
+                    raise ValueError(
+                        "Multiclass data: specify target class or select "
+                        "averaging ('weighted', 'macro', 'micro')")
+                target = 1  # Default: use 1 as "positive" class
+            average = None
+        labels = None if target is None else [target]
+        return self.from_predicted(
+            results, type(self).__wraps__, labels=labels, average=average)
+
+
+class Precision(TargetScore):
+    __wraps__ = skl_metrics.precision_score
+
+
+class Recall(TargetScore):
+    __wraps__ = skl_metrics.recall_score
+
+
+class F1(TargetScore):
+    __wraps__ = skl_metrics.f1_score
 
 
 class PrecisionRecallFSupport(Score):
@@ -293,8 +293,8 @@ def compute_CD(avranks, N, alpha="0.05", test="nemenyi"):
     return cd
 
 
-def graph_ranks(avranks, names, cd=None, cdmethod=None, lowv=None, highv=None, width=6, textspace=1,
-                reverse=False, filename=None, **kwargs):
+def graph_ranks(avranks, names, cd=None, cdmethod=None, lowv=None, highv=None,
+                width=6, textspace=1, reverse=False, filename=None, **kwargs):
     """
     Draws a CD graph, which is used to display the differences in methods' performance.
     See Janez Demsar, Statistical Comparisons of Classifiers over Multiple Data Sets, 7(Jan):1--30, 2006.
@@ -324,7 +324,8 @@ def graph_ranks(avranks, names, cd=None, cdmethod=None, lowv=None, highv=None, w
         import matplotlib.pyplot as plt
         from matplotlib.backends.backend_agg import FigureCanvasAgg
     except ImportError:
-        print("Function requires matplotlib. Please install it.", file=sys.stderr)
+        print("Function requires matplotlib. Please install it.",
+              file=sys.stderr)
         return
 
     width = float(width)
@@ -414,7 +415,8 @@ def get_lines(sums, hsd):
         lsums = len(sums)
         allpairs = [(i, j) for i, j in mxrange([[lsums], [lsums]]) if j > i]
         # remove not significant
-        notSig = [(i, j) for i, j in allpairs if abs(sums[i] - sums[j]) <= hsd]
+        notSig = [(i, j) for i, j in allpairs
+                  if abs(sums[i] - sums[j]) <= hsd]
         # keep only longest
 
         def no_longer(ij_tuple, notSig):
@@ -478,23 +480,27 @@ def text(x, y, s, *args, **kwargs):
         tick = smalltick
         if a == int(a):
             tick = bigtick
-        line([(rankpos(a), cline - tick / 2), (rankpos(a), cline)], linewidth=0.7)
+        line([(rankpos(a), cline - tick / 2), (rankpos(a), cline)],
+             linewidth=0.7)
 
     for a in range(lowv, highv + 1):
-        text(rankpos(a), cline - tick / 2 - 0.05, str(a), ha="center", va="bottom")
+        text(rankpos(a), cline - tick / 2 - 0.05, str(a),
+             ha="center", va="bottom")
 
     k = len(ssums)
 
     for i in range(math.ceil(k / 2)):
         chei = cline + minnotsignificant + i * 0.2
-        line([(rankpos(ssums[i]), cline), (rankpos(ssums[i]), chei), (textspace - 0.1, chei)], linewidth=0.7)
+        line([(rankpos(ssums[i]), cline), (rankpos(ssums[i]), chei),
+              (textspace - 0.1, chei)], linewidth=0.7)
         text(textspace - 0.2, chei, nnames[i], ha="right", va="center")
 
     for i in range(math.ceil(k / 2), k):
         chei = cline + minnotsignificant + (k - i - 1) * 0.2
-        line([(rankpos(ssums[i]), cline), (rankpos(ssums[i]), chei), (textspace + scalewidth + 0.1, chei)],
-             linewidth=0.7)
-        text(textspace + scalewidth + 0.2, chei, nnames[i], ha="left", va="center")
+        line([(rankpos(ssums[i]), cline), (rankpos(ssums[i]), chei),
+              (textspace + scalewidth + 0.1, chei)], linewidth=0.7)
+        text(textspace + scalewidth + 0.2, chei, nnames[i],
+             ha="left", va="center")
 
     if cd and cdmethod is None:
         # upper scale
@@ -504,15 +510,19 @@ def text(x, y, s, *args, **kwargs):
         begin, end = rankpos(highv), rankpos(highv - cd)
 
         line([(begin, distanceh), (end, distanceh)], linewidth=0.7)
-        line([(begin, distanceh + bigtick / 2), (begin, distanceh - bigtick / 2)], linewidth=0.7)
-        line([(end, distanceh + bigtick / 2), (end, distanceh - bigtick / 2)], linewidth=0.7)
-        text((begin + end) / 2, distanceh - 0.05, "CD", ha="center", va="bottom")
+        line([(begin, distanceh + bigtick / 2),
+              (begin, distanceh - bigtick / 2)], linewidth=0.7)
+        line([(end, distanceh + bigtick / 2),
+              (end, distanceh - bigtick / 2)], linewidth=0.7)
+        text((begin + end) / 2, distanceh - 0.05, "CD",
+             ha="center", va="bottom")
 
         # non significance lines
         def draw_lines(lines, side=0.05, height=0.1):
             start = cline + 0.2
             for l, r in lines:
-                line([(rankpos(ssums[l]) - side, start), (rankpos(ssums[r]) + side, start)], linewidth=2.5)
+                line([(rankpos(ssums[l]) - side, start),
+                      (rankpos(ssums[r]) + side, start)], linewidth=2.5)
                 start += height
 
         draw_lines(lines)
@@ -521,8 +531,10 @@ def draw_lines(lines, side=0.05, height=0.1):
         begin = rankpos(avranks[cdmethod] - cd)
         end = rankpos(avranks[cdmethod] + cd)
         line([(begin, cline), (end, cline)], linewidth=2.5)
-        line([(begin, cline + bigtick / 2), (begin, cline - bigtick / 2)], linewidth=2.5)
-        line([(end, cline + bigtick / 2), (end, cline - bigtick / 2)], linewidth=2.5)
+        line([(begin, cline + bigtick / 2), (begin, cline - bigtick / 2)],
+             linewidth=2.5)
+        line([(end, cline + bigtick / 2), (end, cline - bigtick / 2)],
+             linewidth=2.5)
 
     if filename:
         print_figure(fig, filename, **kwargs)
diff --git a/Orange/tests/test_evaluation_scoring.py b/Orange/tests/test_evaluation_scoring.py
index 57c29582bc2..7275c1bca36 100644
--- a/Orange/tests/test_evaluation_scoring.py
+++ b/Orange/tests/test_evaluation_scoring.py
@@ -13,26 +13,177 @@
 from Orange.preprocess import discretize, Discretize
 
 
+class TestPrecision(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.iris = Table('iris')
+        cls.score = Precision()
+
+    def test_precision_iris(self):
+        learner = LogisticRegressionLearner(preprocessors=[])
+        res = TestOnTrainingData(self.iris, [learner])
+        self.assertAlmostEqual(self.score(res, average='weighted')[0],
+                               0.96189, 5)
+        self.assertAlmostEqual(self.score(res, target=1)[0], 0.97826, 5)
+        self.assertAlmostEqual(self.score(res, target=1, average=None)[0],
+                               0.97826, 5)
+        self.assertAlmostEqual(self.score(res, target=1, average='weighted')[0],
+                               0.97826, 5)
+        self.assertAlmostEqual(self.score(res, target=0, average=None)[0], 1, 5)
+        self.assertAlmostEqual(self.score(res, target=2, average=None)[0],
+                               0.90741, 5)
+
+    def test_precision_multiclass(self):
+        results = Results(
+            domain=Domain([], DiscreteVariable(name="y", values="01234")),
+            actual=[0, 4, 4, 1, 2, 0, 1, 2, 3, 2])
+        results.predicted = np.array([[0, 4, 4, 1, 2, 0, 1, 2, 3, 2],
+                                      [0, 1, 4, 1, 1, 0, 0, 2, 3, 1]])
+        res = self.score(results, average='weighted')
+        self.assertEqual(res[0], 1.)
+        self.assertAlmostEqual(res[1], 0.78333, 5)
+
+        for target, prob in ((0, 2 / 3),
+                             (1, 1 / 4),
+                             (2, 1 / 1),
+                             (3, 1 / 1),
+                             (4, 1 / 1)):
+            res = self.score(results, target=target, average=None)
+            self.assertEqual(res[0], 1.)
+            self.assertEqual(res[1], prob)
+
+    def test_precision_binary(self):
+        results = Results(
+            domain=Domain([], DiscreteVariable(name="y", values="01")),
+            actual=[0, 1, 1, 1, 0, 0, 1, 0, 0, 1])
+        results.predicted = np.array([[0, 1, 1, 1, 0, 0, 1, 0, 0, 1],
+                                      [0, 1, 1, 1, 0, 0, 1, 1, 1, 0]])
+        res = self.score(results)
+        self.assertEqual(res[0], 1.)
+        self.assertAlmostEqual(res[1], 4 / 6)
+        res_target = self.score(results, target=1)
+        self.assertEqual(res[0], res_target[0])
+        self.assertEqual(res[1], res_target[1])
+        res_target = self.score(results, target=0)
+        self.assertEqual(res_target[0], 1.)
+        self.assertAlmostEqual(res_target[1], 3 / 4)
+        res_target = self.score(results, average='macro')
+        self.assertEqual(res_target[0], 1.)
+        self.assertAlmostEqual(res_target[1], (4 / 6 + 3 / 4) / 2)
+
+
 class TestRecall(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.data = Table('iris')
+        cls.iris = Table('iris')
+        cls.score = Recall()
 
-    def test_recall(self):
+    def test_recall_iris(self):
         learner = LogisticRegressionLearner(preprocessors=[])
-        results = TestOnTrainingData(self.data, [learner])
-        self.assertAlmostEqual(Recall(results)[0], 0.960, 3)
+        res = TestOnTrainingData(self.iris, [learner])
+        self.assertAlmostEqual(self.score(res, average='weighted')[0], 0.96, 5)
+        self.assertAlmostEqual(self.score(res, target=1)[0], 0.9, 5)
+        self.assertAlmostEqual(self.score(res, target=1, average=None)[0],
+                               0.9, 5)
+        self.assertAlmostEqual(self.score(res, target=1, average='weighted')[0],
+                               0.9, 5)
+        self.assertAlmostEqual(self.score(res, target=0, average=None)[0], 1, 5)
+        self.assertAlmostEqual(self.score(res, target=2, average=None)[0],
+                               0.98, 5)
+
+    def test_recall_multiclass(self):
+        results = Results(
+            domain=Domain([], DiscreteVariable(name="y", values="01234")),
+            actual=[0, 4, 4, 1, 2, 0, 1, 2, 3, 2])
+        results.predicted = np.array([[0, 4, 4, 1, 2, 0, 1, 2, 3, 2],
+                                      [0, 1, 4, 1, 1, 0, 0, 2, 3, 1]])
+        res = self.score(results, average='weighted')
+        self.assertEqual(res[0], 1.)
+        self.assertAlmostEqual(res[1], 0.6)
+
+        for target, prob in ((0, 2 / 2),
+                             (1, 1 / 2),
+                             (2, 1 / 3),
+                             (3, 1 / 1),
+                             (4, 1 / 2)):
+            res = self.score(results, target=target)
+            self.assertEqual(res[0], 1.)
+            self.assertEqual(res[1], prob)
+
+    def test_recall_binary(self):
+        results = Results(
+            domain=Domain([], DiscreteVariable(name="y", values="01")),
+            actual=[0, 1, 1, 1, 0, 0, 1, 0, 0, 1])
+        results.predicted = np.array([[0, 1, 1, 1, 0, 0, 1, 0, 0, 1],
+                                      [0, 1, 1, 1, 0, 0, 1, 1, 1, 0]])
+        res = self.score(results)
+        self.assertEqual(res[0], 1.)
+        self.assertAlmostEqual(res[1], 4 / 5)
+        res_target = self.score(results, target=1)
+        self.assertEqual(res[0], res_target[0])
+        self.assertEqual(res[1], res_target[1])
+        res_target = self.score(results, target=0)
+        self.assertEqual(res_target[0], 1.)
+        self.assertAlmostEqual(res_target[1], 3 / 5)
+        res_target = self.score(results, average='macro')
+        self.assertEqual(res_target[0], 1.)
+        self.assertAlmostEqual(res_target[1], (4 / 5 + 3 / 5) / 2)
 
 
-class TestPrecision(unittest.TestCase):
+class TestF1(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.data = Table('iris')
+        cls.iris = Table('iris')
+        cls.score = F1()
 
-    def test_precision(self):
+    def test_F1_iris(self):
         learner = LogisticRegressionLearner(preprocessors=[])
-        results = TestOnTrainingData(self.data, [learner])
-        self.assertAlmostEqual(Precision(results)[0], 0.962, 3)
+        res = TestOnTrainingData(self.iris, [learner])
+        self.assertAlmostEqual(self.score(res, average='weighted')[0],
+                               0.959935, 5)
+        self.assertAlmostEqual(self.score(res, target=1)[0], 0.9375, 5)
+        self.assertAlmostEqual(self.score(res, target=1, average=None)[0],
+                               0.9375, 5)
+        self.assertAlmostEqual(self.score(res, target=1, average='weighted')[0],
+                               0.9375, 5)
+        self.assertAlmostEqual(self.score(res, target=0, average=None)[0], 1, 5)
+        self.assertAlmostEqual(self.score(res, target=2, average=None)[0],
+                               0.942307, 5)
+
+    def test_F1_multiclass(self):
+        results = Results(
+            domain=Domain([], DiscreteVariable(name="y", values="01234")),
+            actual=[0, 4, 4, 1, 2, 0, 1, 2, 3, 2])
+        results.predicted = np.array([[0, 4, 4, 1, 2, 0, 1, 2, 3, 2],
+                                      [0, 1, 4, 1, 1, 0, 0, 2, 3, 1]])
+        res = self.score(results, average='weighted')
+        self.assertEqual(res[0], 1.)
+        self.assertAlmostEqual(res[1], 0.61)
+
+        for target, prob in ((0, 4 / 5),
+                             (1, 1 / 3),
+                             (2, 1 / 2),
+                             (3, 1.),
+                             (4, 2 / 3)):
+            res = self.score(results, target=target)
+            self.assertEqual(res[0], 1.)
+            self.assertEqual(res[1], prob)
+
+    def test_F1_binary(self):
+        results = Results(
+            domain=Domain([], DiscreteVariable(name="y", values="01")),
+            actual=[0, 1, 1, 1, 0, 0, 1, 0, 0, 1])
+        results.predicted = np.array([[0, 1, 1, 1, 0, 0, 1, 0, 0, 1],
+                                      [0, 1, 1, 1, 0, 0, 1, 1, 1, 1]])
+        res = self.score(results)
+        self.assertEqual(res[0], 1.)
+        self.assertAlmostEqual(res[1], 5 / 6)
+        res_target = self.score(results, target=1)
+        self.assertEqual(res[0], res_target[0])
+        self.assertEqual(res[1], res_target[1])
+        res_target = self.score(results, target=0)
+        self.assertEqual(res_target[0], 1.)
+        self.assertAlmostEqual(res_target[1], 3 / 4)
 
 
 class TestCA(unittest.TestCase):
@@ -177,48 +328,5 @@ def test_log_loss_calc(self):
         self.assertAlmostEqual(ll_calc, ll_orange[0])
 
 
-class TestF1(unittest.TestCase):
-    def test_F1_multiclass(self):
-        results = Results(
-            domain=Domain([], DiscreteVariable(name="y", values="01234")),
-            actual=[0, 4, 4, 1, 2, 0, 1, 2, 3, 2])
-        results.predicted = np.array([[0, 1, 4, 1, 1, 0, 0, 2, 3, 1],
-                                      [0, 4, 4, 1, 2, 0, 1, 2, 3, 2]])
-        res = F1(results)
-        self.assertAlmostEqual(res[0], 0.61)
-        self.assertEqual(res[1], 1.)
-
-    def test_F1_target(self):
-        results = Results(
-            domain=Domain([], DiscreteVariable(name="y", values="01234")),
-            actual=[0, 4, 4, 1, 2, 0, 1, 2, 3, 2])
-        results.predicted = np.array([[0, 1, 4, 1, 1, 0, 0, 2, 3, 1],
-                                      [0, 4, 4, 1, 2, 0, 1, 2, 3, 2]])
-
-        for target, prob in ((0, 4 / 5),
-                             (1, 1 / 3),
-                             (2, 1 / 2),
-                             (3, 1.),
-                             (4, 2 / 3)):
-            res = F1(results, target=target)
-            self.assertEqual(res[0], prob)
-            self.assertEqual(res[1], 1.)
-
-    def test_F1_binary(self):
-        results = Results(
-            domain=Domain([], DiscreteVariable(name="y", values="01")),
-            actual=[0, 1, 1, 1, 0, 0, 1, 0, 0, 1])
-        results.predicted = np.array([[0, 1, 1, 1, 0, 0, 1, 0, 0, 1],
-                                      [0, 1, 1, 1, 0, 0, 1, 1, 1, 1]])
-        res = F1(results)
-        self.assertEqual(res[0], 1.)
-        self.assertAlmostEqual(res[1], 5 / 6)
-        res_target = F1(results, target=1)
-        self.assertEqual(res[0], res_target[0])
-        self.assertEqual(res[1], res_target[1])
-        res_target = F1(results, target=0)
-        self.assertEqual(res_target[0], 1.)
-        self.assertAlmostEqual(res_target[1], 3 / 4)
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/Orange/widgets/evaluate/owtestlearners.py b/Orange/widgets/evaluate/owtestlearners.py
index 49806f5fcec..ea41fcf4a00 100644
--- a/Orange/widgets/evaluate/owtestlearners.py
+++ b/Orange/widgets/evaluate/owtestlearners.py
@@ -56,10 +56,13 @@ def classification_stats(results):
 
 classification_stats.headers, classification_stats.scores = zip(*(
     ("AUC", scoring.AUC),
-    ("CA", scoring.CA),
-    ("F1", scoring.F1),
-    ("Precision", scoring.Precision),
-    ("Recall", scoring.Recall),
+    ("CA", lambda res, *args, **kwargs: scoring.CA(res)),
+    ("F1", (lambda res, target=None:
+            scoring.F1(res, target=target, average='weighted'))),
+    ("Precision", (lambda res, target=None:
+                   scoring.Precision(res, target=target, average='weighted'))),
+    ("Recall", (lambda res, target=None:
+                scoring.Recall(res, target=target, average='weighted'))),
 ))
 
 
@@ -559,7 +562,7 @@ def _update_stats_model(self):
                 ovr_results = results_one_vs_rest(
                     slot.results.value, target_index)
 
-                stats = [Try(lambda: score(ovr_results))
+                stats = [Try(lambda: score(ovr_results, target=1))
                          for score in classification_stats.scores]
             else:
                 stats = None
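
A minimal usage sketch of the reworked scorers (not part of the patch). It mirrors the calling conventions exercised by the new tests above: pass an averaging mode or a target class on multiclass data. The import paths are assumptions based on Orange 3's public modules (Orange.data, Orange.classification, Orange.evaluation); only the target/average behaviour is taken from the patch itself.

    from Orange.data import Table
    from Orange.classification import LogisticRegressionLearner
    from Orange.evaluation import TestOnTrainingData
    from Orange.evaluation.scoring import Precision, Recall, F1

    # Evaluate a single learner on its own training data, as the tests do.
    res = TestOnTrainingData(Table('iris'), [LogisticRegressionLearner()])

    # Multiclass data: either average the per-class scores ...
    print(Precision(res, average='weighted'))
    # ... or score one class against the rest by passing its index.
    print(Recall(res, target=1))
    print(F1(res, target=2, average=None))

    # Calling a scorer with no arguments keeps average='binary'; on the
    # three-class iris data this raises the ValueError added in TargetScore.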