From 35a6f4bbe76c822177a1932a902ca3f286413413 Mon Sep 17 00:00:00 2001 From: janezd Date: Thu, 13 Jun 2019 15:16:21 +0200 Subject: [PATCH 01/21] Calibration plot: Add plots of ca, sens/spec, prec/recall, ppv/npv --- Orange/widgets/evaluate/owcalibrationplot.py | 243 ++++++++++++------- Orange/widgets/evaluate/utils.py | 2 +- 2 files changed, 161 insertions(+), 84 deletions(-) diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py index c757932adea..e7fb3c502e8 100644 --- a/Orange/widgets/evaluate/owcalibrationplot.py +++ b/Orange/widgets/evaluate/owcalibrationplot.py @@ -1,12 +1,8 @@ -""" -Calibration Plot Widget ------------------------ - -""" -from collections import namedtuple +from functools import partial import numpy as np +from AnyQt.QtCore import Qt from AnyQt.QtWidgets import QListWidget import pyqtgraph as pg @@ -21,19 +17,6 @@ from Orange.widgets import report -Curve = namedtuple( - "Curve", - ["x", "y"] -) - -PlotCurve = namedtuple( - "PlotCurve", - ["curve", - "curve_item", - "rug_item"] -) - - class OWCalibrationPlot(widget.OWWidget): name = "Calibration Plot" description = "Calibration plot based on evaluation of classifiers." @@ -50,6 +33,8 @@ class Warning(widget.OWWidget.Warning): target_index = settings.Setting(0) selected_classifiers = settings.Setting([]) + score = settings.Setting(0) + fold_curves = settings.Setting(False) display_rug = settings.Setting(True) graph_name = "plot" @@ -60,41 +45,43 @@ def __init__(self): self.results = None self.classifier_names = [] self.colors = [] - self._curve_data = {} - - box = gui.vBox(self.controlArea, "Plot") - tbox = gui.vBox(box, "Target Class") - tbox.setFlat(True) + box = gui.vBox(self.controlArea, "Target Class") self.target_cb = gui.comboBox( - tbox, self, "target_index", callback=self._replot, - contentsLength=8) + box, self, "target_index", callback=self._replot, contentsLength=8) + gui.checkBox(box, self, "display_rug", "Show rug", + callback=self._on_display_rug_changed) - cbox = gui.vBox(box, "Classifier") - cbox.setFlat(True) + box = gui.vBox(self.controlArea, "Metrics") + combo = gui.comboBox( + box, self, "score", items=(x[0] for x in self.Metrics), + callback=self.score_changed) + gui.checkBox( + box, self, "fold_curves", "Curves for individual folds", + callback=self._replot) + + self.explanation = gui.widgetLabel( + box, wordWrap=True, fixedWidth=combo.sizeHint().width()) + self.explanation.setContentsMargins(8, 8, 0, 0) + font = self.explanation.font() + font.setPointSizeF(0.85 * font.pointSizeF()) + self.explanation.setFont(font) self.classifiers_list_box = gui.listBox( - box, self, "selected_classifiers", "classifier_names", - selectionMode=QListWidget.MultiSelection, + self.controlArea, self, "selected_classifiers", "classifier_names", + box="Classifier", selectionMode=QListWidget.ExtendedSelection, callback=self._replot) - gui.checkBox(box, self, "display_rug", "Show rug", - callback=self._on_display_rug_changed) - self.plotview = pg.GraphicsView(background="w") self.plot = pg.PlotItem(enableMenu=False) self.plot.setMouseEnabled(False, False) self.plot.hideButtons() - axis = self.plot.getAxis("bottom") - axis.setLabel("Predicted Probability") - - axis = self.plot.getAxis("left") - axis.setLabel("Observed Average") - self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0), padding=0.05) self.plotview.setCentralItem(self.plot) + self.mainArea.layout().addWidget(self.plotview) + self._set_explanation() @Inputs.evaluation_results def set_results(self, results): 
@@ -117,7 +104,25 @@ def clear(self): self.target_cb.clear() self.target_index = 0 self.colors = [] - self._curve_data = {} + + def score_changed(self): + self._set_explanation() + self._replot() + + def _set_explanation(self): + explanation = self.Metrics[self.score][2] + if explanation: + self.explanation.setText(explanation) + self.explanation.show() + else: + self.explanation.hide() + + axis = self.plot.getAxis("bottom") + axis.setLabel("Predicted probability" if self.score == 0 + else "Threshold probability to classify as positive") + + axis = self.plot.getAxis("left") + axis.setLabel(self.Metrics[self.score][0]) def _initialize(self, results): N = len(results.predicted) @@ -138,35 +143,16 @@ def _initialize(self, results): self.selected_classifiers = list(range(N)) self.target_cb.addItems(results.data.domain.class_var.values) - def plot_curve(self, clf_idx, target): - if (clf_idx, target) in self._curve_data: - return self._curve_data[clf_idx, target] - - ytrue = self.results.actual == target - probs = self.results.probabilities[clf_idx, :, target] + @staticmethod + def plot_metrics(ytrue, probs, metrics, pen_args): sortind = np.argsort(probs) probs = probs[sortind] ytrue = ytrue[sortind] - if probs.size: - xmin, xmax = probs.min(), probs.max() - x = np.linspace(xmin, xmax, 100) - if xmax != xmin: - f = gaussian_smoother(probs, ytrue, sigma=0.15 * (xmax - xmin)) - observed = f(x) - else: - observed = np.full(100, xmax) - else: - x = np.array([]) - observed = np.array([]) - - curve = Curve(x, observed) - curve_item = pg.PlotDataItem( - x, observed, pen=pg.mkPen(self.colors[clf_idx], width=1), - shadowPen=pg.mkPen(self.colors[clf_idx].lighter(160), width=2), - symbol="+", symbolSize=4, - antialias=True - ) + fn = np.cumsum(ytrue) + metrics(ytrue, probs, fn, pen_args) + def _rug(self, ytrue, probs, _fn, pen_args): + color = pen_args["pen"].color() rh = 0.025 rug_x = np.c_[probs, probs] rug_x_true = rug_x[ytrue].ravel() @@ -177,29 +163,103 @@ def plot_curve(self, clf_idx, target): rug_y_false = np.zeros_like(rug_x_false) rug_y_false[1::2] = rh - rug1 = pg.PlotDataItem( - rug_x_false, rug_y_false, pen=self.colors[clf_idx], - connect="pairs", antialias=True - ) - rug2 = pg.PlotDataItem( - rug_x_true, rug_y_true, pen=self.colors[clf_idx], - connect="pairs", antialias=True - ) - self._curve_data[clf_idx, target] = PlotCurve(curve, curve_item, (rug1, rug2)) - return self._curve_data[clf_idx, target] + self.plot.plot( + rug_x_false, rug_y_false, + pen=color, connect="pairs", antialias=True) + self.plot.plot( + rug_x_true, rug_y_true, + pen=color, connect="pairs", antialias=True) + + def _prob_curve(self, ytrue, probs, _fn, pen_args): + if not probs.size: + return + + xmin, xmax = probs.min(), probs.max() + x = np.linspace(xmin, xmax, 100) + if xmax != xmin: + f = gaussian_smoother(probs, ytrue, sigma=0.15 * (xmax - xmin)) + y = f(x) + else: + y = np.full(100, xmax) + + self.plot.plot(x, y, symbol="+", symbolSize=4, **pen_args) + self.plot.plot([0, 1], [0, 1], antialias=True) + + # For the following methods, at point x=i, we will have i negatives, + # fn[i] is the number of false negatives at that point, hence + # tn = i - fn[i] + # tp = real_pos - fn[i] + # fp = real_neg + tn = real_neg - (i - fn[i]) + + def _ca_curve(self, ytrue, probs, fn, pen_args): + # CA = (tn + tp) / n = ((i - fn[i]) + (real_pos - fn[i])) / n + n = len(probs) + real_pos = np.sum(ytrue) + ca = (real_pos + np.arange(n) - 2 * fn) / n + self.plot.plot(probs, ca, **pen_args) + + def _sens_spec_curve(self, ytrue, probs, fn, 
pen_args): + # sens = tp / p = (real_pos - fn[i]) / real_pos + # spec = tn / n = (i - fn[i]) / real_neg + n = len(probs) + real_pos = np.sum(ytrue) + real_neg = n - real_pos + sens = 1 - fn / real_pos + spec = (np.arange(1, n + 1) - fn) / real_neg + self.plot.plot(probs, sens, **pen_args) + self.plot.plot(probs, spec, **pen_args) + + def _pr_curve(self, ytrue, probs, fn, pen_args): + # precision = tp / pred_pos = (real_pos - fn[i]) / (n - i) + # recall = tp / p = (real_pos - fn[i]) / real_pos + n = len(probs) + real_pos = np.sum(ytrue) + fn = fn[:-1] # prevent falling to zero at the end + prec = (real_pos - fn) / (np.arange(n, 1, -1)) + recall = 1 - fn / real_pos + self.plot.plot(probs[:-1], prec, **pen_args) + self.plot.plot(probs[:-1], recall, **pen_args) + + def _ppv_npv_curve(self, ytrue, probs, fn, pen_args): + # ppv = tp / pred_pos = (real_pos - fn[i]) / (n - i) + # npv = tn / pred_neg = (i - fn[i]) / i + n = len(probs) + real_pos = np.sum(ytrue) + fn = fn[:-1] # prevent falling to zero at the end + ppv = (real_pos - fn) / (np.arange(n, 1, -1)) + npv = 1 - fn / np.arange(1, n) + self.plot.plot(probs[:-1], ppv, **pen_args) + self.plot.plot(probs[:-1], npv, **pen_args) def _setup_plot(self): target = self.target_index - selected = self.selected_classifiers - curves = [self.plot_curve(i, target) for i in selected] + results = self.results + metrics = partial(self.Metrics[self.score][1], self) + plot_folds = self.fold_curves and results.folds is not None + + ytrue = results.actual == target + for clsf in self.selected_classifiers: + probs = results.probabilities[clsf, :, target] + color = self.colors[clsf] + pen_args = dict( + pen=pg.mkPen(color, width=1), + shadowPen=pg.mkPen(color.lighter(160), + width=3 + 5 * plot_folds), + antiAlias=True) + self.plot_metrics(ytrue, probs, metrics, pen_args) - for curve in curves: - self.plot.addItem(curve.curve_item) if self.display_rug: - self.plot.addItem(curve.rug_item[0]) - self.plot.addItem(curve.rug_item[1]) - - self.plot.plot([0, 1], [0, 1], antialias=True) + self.plot_metrics(ytrue, probs, self._rug, pen_args) + + if plot_folds: + pen_args = dict( + pen=pg.mkPen(color, width=1, style=Qt.DashLine), + antiAlias=True) + for fold in range(len(results.folds)): + fold_results = results.get_fold(fold) + fold_ytrue = fold_results.actual == target + fold_probs = fold_results.probabilities[clsf, :, target] + self.plot_metrics(fold_ytrue, fold_probs, metrics, pen_args) def _replot(self): self.plot.clear() @@ -218,6 +278,23 @@ def send_report(self): self.report_plot() self.report_caption(caption) + Metrics = [ + ("Actual probability", _prob_curve, ""), + ("Classification accuracy", _ca_curve, ""), + ("Sensitivity & Specificity", _sens_spec_curve, + "Sensitivity (falling) is the proportion of correctly detected " + "positive instances (TP / P), and specificity (rising) is the " + "proportion of detected negative instances (TP / N)."), + ("Precision & Recall", _pr_curve, + "Precision (rising) is the fraction of retrieved instances " + "that are relevant, TP / (TP + FP), and recall (falling) is " + "the proportion of discovered relevant instances, TP / P."), + ("Pos & Neg predictive value", _ppv_npv_curve, + "Positive predictive value (rising) is the proportion of " + "correct positives, TP / (TP + FP), and negative predictive " + "value the proportion of correct negatives, TN / (TN + FN)."), + ] + def gaussian_smoother(x, y, sigma=1.0): x = np.asarray(x) diff --git a/Orange/widgets/evaluate/utils.py b/Orange/widgets/evaluate/utils.py index 
9e2f579dfae..ebe06032777 100644 --- a/Orange/widgets/evaluate/utils.py +++ b/Orange/widgets/evaluate/utils.py @@ -47,7 +47,7 @@ def results_for_preview(data_name=""): from Orange.classification import \ LogisticRegressionLearner, SVMLearner, NuSVMLearner - data = Table(data_name or "ionosphere") + data = Table(data_name or "heart_disease") results = CrossValidation( data, [LogisticRegressionLearner(penalty="l2"), From 2fa175022b0b23f28dfacea9fa90b6a71aabeba0 Mon Sep 17 00:00:00 2001 From: janezd Date: Thu, 13 Jun 2019 20:10:09 +0200 Subject: [PATCH 02/21] Calibration plot: Add threshold line --- Orange/widgets/evaluate/owcalibrationplot.py | 141 ++++++++++++++----- Orange/widgets/gui.py | 3 + 2 files changed, 109 insertions(+), 35 deletions(-) diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py index e7fb3c502e8..8e5e1e1e96d 100644 --- a/Orange/widgets/evaluate/owcalibrationplot.py +++ b/Orange/widgets/evaluate/owcalibrationplot.py @@ -1,9 +1,10 @@ +from collections import namedtuple from functools import partial import numpy as np -from AnyQt.QtCore import Qt -from AnyQt.QtWidgets import QListWidget +from AnyQt.QtCore import Qt, QSize +from AnyQt.QtWidgets import QListWidget, QSizePolicy import pyqtgraph as pg @@ -16,6 +17,10 @@ from Orange.widgets.widget import Input from Orange.widgets import report +metric_definition = namedtuple( + "metric_definition", + ("name", "function", "short_names", "explanation")) + class OWCalibrationPlot(widget.OWWidget): name = "Calibration Plot" @@ -36,6 +41,7 @@ class Warning(widget.OWWidget.Warning): score = settings.Setting(0) fold_curves = settings.Setting(False) display_rug = settings.Setting(True) + threshold = settings.Setting(0.5) graph_name = "plot" @@ -43,22 +49,32 @@ def __init__(self): super().__init__() self.results = None + self.scores = None self.classifier_names = [] self.colors = [] - box = gui.vBox(self.controlArea, "Target Class") + box = gui.vBox(self.controlArea, box="Settings") self.target_cb = gui.comboBox( - box, self, "target_index", callback=self._replot, contentsLength=8) - gui.checkBox(box, self, "display_rug", "Show rug", - callback=self._on_display_rug_changed) + box, self, "target_index", label="Target:", + orientation=Qt.Horizontal, callback=self._replot, contentsLength=8) + gui.checkBox( + box, self, "display_rug", "Show rug", + callback=self._on_display_rug_changed) + gui.checkBox( + box, self, "fold_curves", "Curves for individual folds", + callback=self._replot) + + self.classifiers_list_box = gui.listBox( + self.controlArea, self, "selected_classifiers", "classifier_names", + box="Classifier", selectionMode=QListWidget.ExtendedSelection, + sizePolicy=(QSizePolicy.Preferred, QSizePolicy.Preferred), + sizeHint=QSize(150, 40), + callback=self._replot) box = gui.vBox(self.controlArea, "Metrics") combo = gui.comboBox( - box, self, "score", items=(x[0] for x in self.Metrics), + box, self, "score", items=(metric.name for metric in self.Metrics), callback=self.score_changed) - gui.checkBox( - box, self, "fold_curves", "Curves for individual folds", - callback=self._replot) self.explanation = gui.widgetLabel( box, wordWrap=True, fixedWidth=combo.sizeHint().width()) @@ -67,16 +83,18 @@ def __init__(self): font.setPointSizeF(0.85 * font.pointSizeF()) self.explanation.setFont(font) - self.classifiers_list_box = gui.listBox( - self.controlArea, self, "selected_classifiers", "classifier_names", - box="Classifier", selectionMode=QListWidget.ExtendedSelection, - 
callback=self._replot) + box = gui.widgetBox(self.controlArea, "Info") + self.info_label = gui.widgetLabel(box) self.plotview = pg.GraphicsView(background="w") self.plot = pg.PlotItem(enableMenu=False) self.plot.setMouseEnabled(False, False) self.plot.hideButtons() + for axis_name in ("bottom", "left"): + axis = self.plot.getAxis(axis_name) + axis.setPen(pg.mkPen(color=0.0)) + self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0), padding=0.05) self.plotview.setCentralItem(self.plot) @@ -110,7 +128,7 @@ def score_changed(self): self._replot() def _set_explanation(self): - explanation = self.Metrics[self.score][2] + explanation = self.Metrics[self.score].explanation if explanation: self.explanation.setText(explanation) self.explanation.show() @@ -122,7 +140,7 @@ def _set_explanation(self): else "Threshold probability to classify as positive") axis = self.plot.getAxis("left") - axis.setLabel(self.Metrics[self.score][0]) + axis.setLabel(self.Metrics[self.score].name) def _initialize(self, results): N = len(results.predicted) @@ -149,7 +167,7 @@ def plot_metrics(ytrue, probs, metrics, pen_args): probs = probs[sortind] ytrue = ytrue[sortind] fn = np.cumsum(ytrue) - metrics(ytrue, probs, fn, pen_args) + return probs, metrics(ytrue, probs, fn, pen_args) def _rug(self, ytrue, probs, _fn, pen_args): color = pen_args["pen"].color() @@ -208,6 +226,7 @@ def _sens_spec_curve(self, ytrue, probs, fn, pen_args): spec = (np.arange(1, n + 1) - fn) / real_neg self.plot.plot(probs, sens, **pen_args) self.plot.plot(probs, spec, **pen_args) + return sens, spec def _pr_curve(self, ytrue, probs, fn, pen_args): # precision = tp / pred_pos = (real_pos - fn[i]) / (n - i) @@ -219,6 +238,7 @@ def _pr_curve(self, ytrue, probs, fn, pen_args): recall = 1 - fn / real_pos self.plot.plot(probs[:-1], prec, **pen_args) self.plot.plot(probs[:-1], recall, **pen_args) + return prec, recall def _ppv_npv_curve(self, ytrue, probs, fn, pen_args): # ppv = tp / pred_pos = (real_pos - fn[i]) / (n - i) @@ -230,12 +250,14 @@ def _ppv_npv_curve(self, ytrue, probs, fn, pen_args): npv = 1 - fn / np.arange(1, n) self.plot.plot(probs[:-1], ppv, **pen_args) self.plot.plot(probs[:-1], npv, **pen_args) + return ppv, npv def _setup_plot(self): target = self.target_index results = self.results - metrics = partial(self.Metrics[self.score][1], self) + metrics = partial(self.Metrics[self.score].function, self) plot_folds = self.fold_curves and results.folds is not None + self.scores = [] ytrue = results.actual == target for clsf in self.selected_classifiers: @@ -246,7 +268,9 @@ def _setup_plot(self): shadowPen=pg.mkPen(color.lighter(160), width=3 + 5 * plot_folds), antiAlias=True) - self.plot_metrics(ytrue, probs, metrics, pen_args) + self.scores.append( + (self.classifier_names[clsf], + self.plot_metrics(ytrue, probs, metrics, pen_args))) if self.display_rug: self.plot_metrics(ytrue, probs, self._rug, pen_args) @@ -265,10 +289,54 @@ def _replot(self): self.plot.clear() if self.results is not None: self._setup_plot() + self.line = pg.InfiniteLine( + pos=self.threshold, movable=True, + pen=pg.mkPen(color="k", style=Qt.DashLine, width=2), + hoverPen=pg.mkPen(color="k", style=Qt.DashLine, width=3), + bounds=(0, 1), + ) + self.line.sigPositionChanged.connect(self.threshold_change) + self.line.sigPositionChangeFinished.connect(self.threshold_change_done) + self.plot.addItem(self.line) + self._update_info() + def _on_display_rug_changed(self): self._replot() + def threshold_change(self): + self.threshold = round(self.line.pos().x(), 2) + 
self.line.setPos(self.threshold) + self._update_info() + + def _update_info(self): + + text = f""" + + + + """ + if self.scores is not None: + short_names = self.Metrics[self.score].short_names + if short_names: + text += f""" + + {"".join(f"" + for n in short_names)} + """ + for name, (probs, curves) in self.scores: + ind = min(np.searchsorted(probs, self.threshold), + len(probs) - 1) + text += f"" + text += "".join(f'' + for curve in curves) + text += "" + text += "
Threshold: p={self.threshold:.2f}
{n}
{name}:/{curve[ind]:.3f}
" + self.info_label.setText(text) + + def threshold_change_done(self): + ... + def send_report(self): if self.results is None: return @@ -278,22 +346,25 @@ def send_report(self): self.report_plot() self.report_caption(caption) - Metrics = [ - ("Actual probability", _prob_curve, ""), - ("Classification accuracy", _ca_curve, ""), - ("Sensitivity & Specificity", _sens_spec_curve, - "Sensitivity (falling) is the proportion of correctly detected " - "positive instances (TP / P), and specificity (rising) is the " - "proportion of detected negative instances (TP / N)."), - ("Precision & Recall", _pr_curve, - "Precision (rising) is the fraction of retrieved instances " - "that are relevant, TP / (TP + FP), and recall (falling) is " - "the proportion of discovered relevant instances, TP / P."), - ("Pos & Neg predictive value", _ppv_npv_curve, - "Positive predictive value (rising) is the proportion of " - "correct positives, TP / (TP + FP), and negative predictive " - "value the proportion of correct negatives, TN / (TN + FN)."), - ] + Metrics = [metric_definition(*args) for args in ( + ("Actual probability", _prob_curve, (), ""), + ("Classification accuracy", _ca_curve, (), ""), + ("Sensitivity and specificity", _sens_spec_curve, ("sens", "spec"), + "
<p>Sensitivity (falling) is the proportion of correctly " + "detected positive instances (TP / P).</p>" + "<p>Specificity (rising) is the proportion of detected " + "negative instances (TP / N).</p>
"), + ("Precision and recall", _pr_curve, ("prec", "recall"), + "
<p>Precision (rising) is the fraction of retrieved instances " + "that are relevant, TP / (TP + FP).</p>" + "<p>Recall (falling) is the proportion of discovered relevant " + "instances, TP / P.</p>
"), + ("Pos and neg predictive value", _ppv_npv_curve, ("PPV", "TPV"), + "
<p>Positive predictive value (rising) is the proportion of " + "correct positives, TP / (TP + FP).</p>" + "<p>Negative predictive value is the proportion of correct " + "negatives, TN / (TN + FN).</p>
"), + )] def gaussian_smoother(x, y, sigma=1.0): diff --git a/Orange/widgets/gui.py b/Orange/widgets/gui.py index 683b8be2f73..b6a8d84552b 100644 --- a/Orange/widgets/gui.py +++ b/Orange/widgets/gui.py @@ -1783,6 +1783,9 @@ def __init__(self, master, enableDragDrop=False, dragDropCallback=None, def sizeHint(self): return self.size_hint + def minimumSizeHint(self): + return self.size_hint + def dragEnterEvent(self, event): super().dragEnterEvent(event) if self.valid_data_callback: From d47b68b2fb55ef1d0743b4027fa7e79cdfba510f Mon Sep 17 00:00:00 2001 From: janezd Date: Thu, 13 Jun 2019 21:51:43 +0200 Subject: [PATCH 03/21] Calibration plot: Refactor computation of metrics --- Orange/widgets/evaluate/owcalibrationplot.py | 196 ++++++++++--------- 1 file changed, 99 insertions(+), 97 deletions(-) diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py index 8e5e1e1e96d..c0d1f10d9cd 100644 --- a/Orange/widgets/evaluate/owcalibrationplot.py +++ b/Orange/widgets/evaluate/owcalibrationplot.py @@ -1,5 +1,4 @@ from collections import namedtuple -from functools import partial import numpy as np @@ -8,7 +7,7 @@ import pyqtgraph as pg -import Orange +from Orange.evaluation import Results from Orange.widgets import widget, gui, settings from Orange.widgets.evaluate.utils import \ check_results_adequacy, results_for_preview @@ -17,10 +16,77 @@ from Orange.widgets.widget import Input from Orange.widgets import report -metric_definition = namedtuple( + +class Data: + def __init__(self, ytrue, probs): + sortind = np.argsort(probs) + self.probs = probs[sortind] + self.ytrue = ytrue[sortind] + self.fn = np.cumsum(self.ytrue) + self.tot = len(probs) + self.p = self.fn[-1] + self.n = self.tot - self.p + + @property + def tn(self): + return np.arange(self.tot) - self.fn + + @property + def tp(self): + return self.p - self.fn + + @property + def fp(self): + return self.n - self.tn + + +MetricDefinition = namedtuple( "metric_definition", ("name", "function", "short_names", "explanation")) +Metrics = [MetricDefinition(*args) for args in ( + ("Actual probability", + None, + (), + ""), + ("Classification accuracy", + lambda d: (d.probs, ((d.tp + d.tn) / d.tot,)), + (), + ""), + ("F1", + lambda d: (d.probs, (2 * d.tp / (2 * d.tp + d.fp + d.fn),)), + (), + ""), + ("Sensitivity and specificity", + lambda d: (d.probs, (d.tp / d.p, d.tn / d.n)), + ("sens", "spec"), + "
<p>Sensitivity (falling) is the proportion of correctly " + "detected positive instances (TP / P).</p>" + "<p>Specificity (rising) is the proportion of detected " + "negative instances (TP / N).</p>
"), + ("Precision and recall", + lambda d: (d.probs[:-1], (d.tp[:-1] / np.arange(d.tot, 1, -1), + d.tp[:-1] / d.p)), + ("prec", "recall"), + "
<p>Precision (rising) is the fraction of retrieved instances " + "that are relevant, TP / (TP + FP).</p>" + "<p>Recall (falling) is the proportion of discovered relevant " + "instances, TP / P.</p>
"), + ("Pos and neg predictive value", + lambda d: (d.probs[:-1], (d.tp[:-1] / np.arange(d.tot, 1, -1), + d.tn[:-1] / np.arange(1, d.tot))), + ("PPV", "TPV"), + "
<p>Positive predictive value (rising) is the proportion of " + "correct positives, TP / (TP + FP).</p>" + "<p>Negative predictive value is the proportion of correct " + "negatives, TN / (TN + FN).</p>
"), + ("True and false positive rate", + lambda d: (d.probs, (d.tp / d.p, d.fp / d.n)), + ("TPR", "FPR"), + "
<p>True and false positive rate are proportions of detected " + "and omitted positive instances</p>
"), +)] + class OWCalibrationPlot(widget.OWWidget): name = "Calibration Plot" @@ -30,7 +96,7 @@ class OWCalibrationPlot(widget.OWWidget): keywords = [] class Inputs: - evaluation_results = Input("Evaluation Results", Orange.evaluation.Results) + evaluation_results = Input("Evaluation Results", Results) class Warning(widget.OWWidget.Warning): empty_input = widget.Msg( @@ -52,6 +118,7 @@ def __init__(self): self.scores = None self.classifier_names = [] self.colors = [] + self.line = None box = gui.vBox(self.controlArea, box="Settings") self.target_cb = gui.comboBox( @@ -73,7 +140,7 @@ def __init__(self): box = gui.vBox(self.controlArea, "Metrics") combo = gui.comboBox( - box, self, "score", items=(metric.name for metric in self.Metrics), + box, self, "score", items=(metric.name for metric in Metrics), callback=self.score_changed) self.explanation = gui.widgetLabel( @@ -94,6 +161,8 @@ def __init__(self): for axis_name in ("bottom", "left"): axis = self.plot.getAxis(axis_name) axis.setPen(pg.mkPen(color=0.0)) + if axis_name != "bottom": # remove if when pyqtgraph is fixed + axis.setStyle(stopAxisAtTick=(True, True)) self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0), padding=0.05) self.plotview.setCentralItem(self.plot) @@ -128,7 +197,7 @@ def score_changed(self): self._replot() def _set_explanation(self): - explanation = self.Metrics[self.score].explanation + explanation = Metrics[self.score].explanation if explanation: self.explanation.setText(explanation) self.explanation.show() @@ -140,7 +209,7 @@ def _set_explanation(self): else "Threshold probability to classify as positive") axis = self.plot.getAxis("left") - axis.setLabel(self.Metrics[self.score].name) + axis.setLabel(Metrics[self.score].name) def _initialize(self, results): N = len(results.predicted) @@ -161,20 +230,12 @@ def _initialize(self, results): self.selected_classifiers = list(range(N)) self.target_cb.addItems(results.data.domain.class_var.values) - @staticmethod - def plot_metrics(ytrue, probs, metrics, pen_args): - sortind = np.argsort(probs) - probs = probs[sortind] - ytrue = ytrue[sortind] - fn = np.cumsum(ytrue) - return probs, metrics(ytrue, probs, fn, pen_args) - - def _rug(self, ytrue, probs, _fn, pen_args): + def _rug(self, data, pen_args): color = pen_args["pen"].color() rh = 0.025 - rug_x = np.c_[probs, probs] - rug_x_true = rug_x[ytrue].ravel() - rug_x_false = rug_x[~ytrue].ravel() + rug_x = np.c_[data.probs, data.probs] + rug_x_true = rug_x[data.ytrue].ravel() + rug_x_false = rug_x[~data.ytrue].ravel() rug_y_true = np.ones_like(rug_x_true) rug_y_true[1::2] = 1 - rh @@ -188,9 +249,17 @@ def _rug(self, ytrue, probs, _fn, pen_args): rug_x_true, rug_y_true, pen=color, connect="pairs", antialias=True) - def _prob_curve(self, ytrue, probs, _fn, pen_args): + def plot_metrics(self, data, metrics, pen_args): + if metrics is None: + return self._prob_curve(data.ytrue, data.probs, pen_args) + x, ys = metrics(data) + for y in ys: + self.plot.plot(x, y, **pen_args) + return x, ys + + def _prob_curve(self, ytrue, probs, pen_args): if not probs.size: - return + return None xmin, xmax = probs.min(), probs.max() x = np.linspace(xmin, xmax, 100) @@ -202,60 +271,12 @@ def _prob_curve(self, ytrue, probs, _fn, pen_args): self.plot.plot(x, y, symbol="+", symbolSize=4, **pen_args) self.plot.plot([0, 1], [0, 1], antialias=True) - - # For the following methods, at point x=i, we will have i negatives, - # fn[i] is the number of false negatives at that point, hence - # tn = i - fn[i] - # tp = real_pos - fn[i] - # fp = real_neg + tn = 
real_neg - (i - fn[i]) - - def _ca_curve(self, ytrue, probs, fn, pen_args): - # CA = (tn + tp) / n = ((i - fn[i]) + (real_pos - fn[i])) / n - n = len(probs) - real_pos = np.sum(ytrue) - ca = (real_pos + np.arange(n) - 2 * fn) / n - self.plot.plot(probs, ca, **pen_args) - - def _sens_spec_curve(self, ytrue, probs, fn, pen_args): - # sens = tp / p = (real_pos - fn[i]) / real_pos - # spec = tn / n = (i - fn[i]) / real_neg - n = len(probs) - real_pos = np.sum(ytrue) - real_neg = n - real_pos - sens = 1 - fn / real_pos - spec = (np.arange(1, n + 1) - fn) / real_neg - self.plot.plot(probs, sens, **pen_args) - self.plot.plot(probs, spec, **pen_args) - return sens, spec - - def _pr_curve(self, ytrue, probs, fn, pen_args): - # precision = tp / pred_pos = (real_pos - fn[i]) / (n - i) - # recall = tp / p = (real_pos - fn[i]) / real_pos - n = len(probs) - real_pos = np.sum(ytrue) - fn = fn[:-1] # prevent falling to zero at the end - prec = (real_pos - fn) / (np.arange(n, 1, -1)) - recall = 1 - fn / real_pos - self.plot.plot(probs[:-1], prec, **pen_args) - self.plot.plot(probs[:-1], recall, **pen_args) - return prec, recall - - def _ppv_npv_curve(self, ytrue, probs, fn, pen_args): - # ppv = tp / pred_pos = (real_pos - fn[i]) / (n - i) - # npv = tn / pred_neg = (i - fn[i]) / i - n = len(probs) - real_pos = np.sum(ytrue) - fn = fn[:-1] # prevent falling to zero at the end - ppv = (real_pos - fn) / (np.arange(n, 1, -1)) - npv = 1 - fn / np.arange(1, n) - self.plot.plot(probs[:-1], ppv, **pen_args) - self.plot.plot(probs[:-1], npv, **pen_args) - return ppv, npv + return x, (y, ) def _setup_plot(self): target = self.target_index results = self.results - metrics = partial(self.Metrics[self.score].function, self) + metrics = Metrics[self.score].function plot_folds = self.fold_curves and results.folds is not None self.scores = [] @@ -266,14 +287,15 @@ def _setup_plot(self): pen_args = dict( pen=pg.mkPen(color, width=1), shadowPen=pg.mkPen(color.lighter(160), - width=3 + 5 * plot_folds), + width=4 + 4 * plot_folds), antiAlias=True) + data = Data(ytrue, probs) self.scores.append( (self.classifier_names[clsf], - self.plot_metrics(ytrue, probs, metrics, pen_args))) + self.plot_metrics(data, metrics, pen_args))) if self.display_rug: - self.plot_metrics(ytrue, probs, self._rug, pen_args) + self._rug(data, pen_args) if plot_folds: pen_args = dict( @@ -283,7 +305,8 @@ def _setup_plot(self): fold_results = results.get_fold(fold) fold_ytrue = fold_results.actual == target fold_probs = fold_results.probabilities[clsf, :, target] - self.plot_metrics(fold_ytrue, fold_probs, metrics, pen_args) + self.plot_metrics(Data(fold_ytrue, fold_probs), + metrics, pen_args) def _replot(self): self.plot.clear() @@ -300,7 +323,6 @@ def _replot(self): self.plot.addItem(self.line) self._update_info() - def _on_display_rug_changed(self): self._replot() @@ -317,7 +339,7 @@ def _update_info(self):
""" if self.scores is not None: - short_names = self.Metrics[self.score].short_names + short_names = Metrics[self.score].short_names if short_names: text += f""" @@ -346,26 +368,6 @@ def send_report(self): self.report_plot() self.report_caption(caption) - Metrics = [metric_definition(*args) for args in ( - ("Actual probability", _prob_curve, (), ""), - ("Classification accuracy", _ca_curve, (), ""), - ("Sensitivity and specificity", _sens_spec_curve, ("sens", "spec"), - "
<p>Sensitivity (falling) is the proportion of correctly " - "detected positive instances (TP / P).</p>" - "<p>Specificity (rising) is the proportion of detected " - "negative instances (TP / N).</p>
"), - ("Precision and recall", _pr_curve, ("prec", "recall"), - "
<p>Precision (rising) is the fraction of retrieved instances " - "that are relevant, TP / (TP + FP).</p>" - "<p>Recall (falling) is the proportion of discovered relevant " - "instances, TP / P.</p>
"), - ("Pos and neg predictive value", _ppv_npv_curve, ("PPV", "TPV"), - "
<p>Positive predictive value (rising) is the proportion of " - "correct positives, TP / (TP + FP).</p>" - "<p>Negative predictive value is the proportion of correct " - "negatives, TN / (TN + FN).</p>
"), - )] - def gaussian_smoother(x, y, sigma=1.0): x = np.asarray(x) From 585feb2077e4ea5a3be79bda64665c03b4e07def Mon Sep 17 00:00:00 2001 From: janezd Date: Thu, 13 Jun 2019 23:34:48 +0200 Subject: [PATCH 04/21] Testing: Keep 2d array of models when splitting Results by models --- Orange/evaluation/testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Orange/evaluation/testing.py b/Orange/evaluation/testing.py index 92c68d1c13f..9e6d0ca071c 100644 --- a/Orange/evaluation/testing.py +++ b/Orange/evaluation/testing.py @@ -317,7 +317,7 @@ def split_by_model(self): res.probabilities = self.probabilities[(i,), :, :] if self.models is not None: - res.models = self.models[:, i] + res.models = self.models[:, i:i+1] res.failed = [self.failed[i]] yield res From 7b876e64fab994f707f1c761f86dddeb4bae7281 Mon Sep 17 00:00:00 2001 From: janezd Date: Thu, 13 Jun 2019 23:35:49 +0200 Subject: [PATCH 05/21] Test Learners: Store models when there is just one; properly stack them --- Orange/widgets/evaluate/owtestlearners.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Orange/widgets/evaluate/owtestlearners.py b/Orange/widgets/evaluate/owtestlearners.py index 0577b448950..c3da72af0f6 100644 --- a/Orange/widgets/evaluate/owtestlearners.py +++ b/Orange/widgets/evaluate/owtestlearners.py @@ -735,7 +735,8 @@ def __update(self): if self.resampling == OWTestLearners.TestOnTest: test_f = partial( - Orange.evaluation.TestOnTestData(store_data=True), + Orange.evaluation.TestOnTestData( + store_data=True, store_models=True), self.data, self.test_data, learners_c, self.preprocessor ) else: @@ -756,7 +757,8 @@ def __update(self): stratified=self.shuffle_stratified, random_state=rstate) elif self.resampling == OWTestLearners.TestOnTrain: - sampler = Orange.evaluation.TestOnTrainingData() + sampler = Orange.evaluation.TestOnTrainingData( + store_models=True) else: assert False, "self.resampling %s" % self.resampling @@ -916,7 +918,7 @@ def is_empty(res): res.probabilities = np.vstack((x.probabilities, y.probabilities)) if x.models is not None: - res.models = [xm + ym for xm, ym in zip(x.models, y.models)] + res.models = np.hstack((x.models, y.models)) return res From 93b7a72ee89f7843ed89a5d4ab7e91234fd2fa4c Mon Sep 17 00:00:00 2001 From: janezd Date: Thu, 13 Jun 2019 23:36:32 +0200 Subject: [PATCH 06/21] classification: Add ModelWithThreshold --- Orange/classification/__init__.py | 1 + Orange/classification/calibration.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 Orange/classification/calibration.py diff --git a/Orange/classification/__init__.py b/Orange/classification/__init__.py index f0489b4cb74..842518fca31 100644 --- a/Orange/classification/__init__.py +++ b/Orange/classification/__init__.py @@ -19,3 +19,4 @@ from .rules import * from .sgd import * from .neural_network import * +from .calibration import * diff --git a/Orange/classification/calibration.py b/Orange/classification/calibration.py new file mode 100644 index 00000000000..891b8db81b6 --- /dev/null +++ b/Orange/classification/calibration.py @@ -0,0 +1,22 @@ +from Orange.classification import Model + +__all__ = ["ModelWithThreshold"] + + +class ModelWithThreshold(Model): + def __init__(self, wrapped_model, threshold, target_class=1): + super().__init__(wrapped_model.domain, wrapped_model.original_domain) + self.name = f"{wrapped_model.name}, thresh={threshold:.2f}" + self.wrapped_model = wrapped_model + self.threshold = threshold + self.target_class = target_class + + 
def __call__(self, data, ret=Model.Value): + probs = self.wrapped_model(data, ret=Model.Probs) + if ret == Model.Probs: + return probs + vals = probs[:, self.target_class].flatten() > self.threshold + if ret == Model.Value: + return vals + else: + return vals, probs From ff67b4920be230566f7853719e70ebc2b9175adf Mon Sep 17 00:00:00 2001 From: janezd Date: Thu, 13 Jun 2019 23:37:32 +0200 Subject: [PATCH 07/21] Calibration plot: Output selected model --- Orange/widgets/evaluate/owcalibrationplot.py | 50 ++++++++++++++++++-- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py index c0d1f10d9cd..3d782030467 100644 --- a/Orange/widgets/evaluate/owcalibrationplot.py +++ b/Orange/widgets/evaluate/owcalibrationplot.py @@ -7,13 +7,14 @@ import pyqtgraph as pg +from Orange.classification import ModelWithThreshold from Orange.evaluation import Results from Orange.widgets import widget, gui, settings from Orange.widgets.evaluate.utils import \ check_results_adequacy, results_for_preview from Orange.widgets.utils import colorpalette, colorbrewer from Orange.widgets.utils.widgetpreview import WidgetPreview -from Orange.widgets.widget import Input +from Orange.widgets.widget import Input, Output, Msg from Orange.widgets import report @@ -98,16 +99,31 @@ class OWCalibrationPlot(widget.OWWidget): class Inputs: evaluation_results = Input("Evaluation Results", Results) + class Outputs: + calibrated_model = Output("Calibrated Model", ModelWithThreshold) + class Warning(widget.OWWidget.Warning): empty_input = widget.Msg( "Empty result on input. Nothing to display.") + class Information(widget.OWWidget.Information): + no_out = "Can't output a model: " + no_output_multiple_folds = Msg( + no_out + "every training data sample produced a different model") + no_output_no_models = Msg( + no_out + "test results do not contain stored models;\n" + "try testing on separate data or on training data") + no_output_multiple_selected = Msg( + no_out + "select a single model - the widget can output only one") + + target_index = settings.Setting(0) selected_classifiers = settings.Setting([]) score = settings.Setting(0) fold_curves = settings.Setting(False) display_rug = settings.Setting(True) threshold = settings.Setting(0.5) + auto_commit = settings.Setting(True) graph_name = "plot" @@ -136,7 +152,7 @@ def __init__(self): box="Classifier", selectionMode=QListWidget.ExtendedSelection, sizePolicy=(QSizePolicy.Preferred, QSizePolicy.Preferred), sizeHint=QSize(150, 40), - callback=self._replot) + callback=self._on_selection_changed) box = gui.vBox(self.controlArea, "Metrics") combo = gui.comboBox( @@ -153,6 +169,9 @@ def __init__(self): box = gui.widgetBox(self.controlArea, "Info") self.info_label = gui.widgetLabel(box) + gui.auto_commit( + self.controlArea, self, "auto_commit", "Apply", commit=self.apply) + self.plotview = pg.GraphicsView(background="w") self.plot = pg.PlotItem(enableMenu=False) self.plot.setMouseEnabled(False, False) @@ -182,6 +201,7 @@ def set_results(self, results): if self.results is not None: self._initialize(results) self._replot() + self.apply() def clear(self): self.plot.clear() @@ -326,13 +346,16 @@ def _replot(self): def _on_display_rug_changed(self): self._replot() + def _on_selection_changed(self): + self._replot() + self.apply() + def threshold_change(self): self.threshold = round(self.line.pos().x(), 2) self.line.setPos(self.threshold) self._update_info() def _update_info(self): - text = f"""
{self.threshold:.2f}
@@ -357,7 +380,26 @@ def _update_info(self): self.info_label.setText(text) def threshold_change_done(self): - ... + self.apply() + + def apply(self): + info = self.Information + wrapped = None + problems = {} + if self.results is not None: + problems = { + info.no_output_multiple_folds: len(self.results.folds) > 1, + info.no_output_no_models: self.results.models is None, + info.no_output_multiple_selected: + len(self.selected_classifiers) != 1} + if not any(problems.values()): + model = self.results.models[0][self.selected_classifiers[0]] + wrapped = ModelWithThreshold(model, self.threshold) + + self.Outputs.calibrated_model.send(wrapped) + for info, shown in problems.items(): + if info.is_shown() != shown: + info(shown=shown) def send_report(self): if self.results is None: From a4424fbb0f264a42e69ee64e2cb20db0cf4ebd9a Mon Sep 17 00:00:00 2001 From: janezd Date: Sun, 16 Jun 2019 22:33:27 +0200 Subject: [PATCH 08/21] Orange.evaluation.performance_curves: Add module for computation of performance curves --- Orange/evaluation/performance_curves.py | 150 ++++++++++++++++++ Orange/evaluation/tests/__init__.py | 0 .../tests/test_performance_curves.py | 125 +++++++++++++++ .../evaluation.performance_curves.rst | 8 + .../source/reference/evaluation.rst | 1 + 5 files changed, 284 insertions(+) create mode 100644 Orange/evaluation/performance_curves.py create mode 100644 Orange/evaluation/tests/__init__.py create mode 100644 Orange/evaluation/tests/test_performance_curves.py create mode 100644 doc/data-mining-library/source/reference/evaluation.performance_curves.rst diff --git a/Orange/evaluation/performance_curves.py b/Orange/evaluation/performance_curves.py new file mode 100644 index 00000000000..c7dee568e53 --- /dev/null +++ b/Orange/evaluation/performance_curves.py @@ -0,0 +1,150 @@ +import numpy as np + + +class Curves: + # names of scores are standard acronyms, pylint: disable=invalid-name + """ + Computation of performance curves (ca, f1, precision, recall and the rest + of the zoo) from test results. + + The class works with binary classes. Attribute `probs` contains ordered + probabilities and all curves represent performance statistics if an + instance is classified as positive if it equals or exceeds the threshold + in `probs`, that is, `sensitivity[i]` is the sensitivity of the classifier + that classifies an instances as positive if the probability of being + positive is at least `probs[i]`. + + Class can be constructed by giving `probs` and `ytrue`, or from test + results (see :obj:`Curves.from_results`). The latter removes instances + with missing class values or predicted probabilities. + + The class treats all results as obtained from a single run instead of + computing separate curves and fancy averaging. 
+ + Arguments: + probs (np.ndarray): vector of predicted probabilities + ytrue (np.ndarray): corresponding true classes + + Attributes: + probs (np.ndarray): ordered vector of predicted probabilities + ytrue (np.ndarray): corresponding true classes + tot (int): total number of data instances + p (int): number of real positive instances + n (int): number of real negative instances + tp (np.ndarray): number of true positives (property computed from `tn`) + fp (np.ndarray): number of false positives (property computed from `tn`) + tn (np.ndarray): number of true negatives (property computed from `tn`) + fn (np.ndarray): number of false negatives (precomputed, not a property) + """ + def __init__(self, ytrue, probs): + sortind = np.argsort(probs) + self.probs = np.hstack((probs[sortind], [1])) + self.ytrue = ytrue[sortind] + self.fn = np.hstack(([0], np.cumsum(self.ytrue))) + self.tot = len(probs) + self.p = self.fn[-1] + self.n = self.tot - self.p + + @classmethod + def from_results(cls, results, target_class=None, model_index=None): + """ + Construct an instance of `Curves` from test results. + + Args: + results (:obj:`Orange.evaluation.testing.Results`): test results + target_class (int): target class index; if the class is binary, + this defaults to `1`, otherwise it must be given + model_index (int): model index; if there is only one model, this + argument can be omitted + + Returns: + curves (:obj:`Curves`) + """ + if model_index is None: + if results.probabilities.shape[0] != 1: + raise ValueError("Argument 'model_index' is required when " + "there are multiple models") + model_index = 0 + if target_class is None: + if results.probabilities.shape[2] != 2: + raise ValueError("Argument 'target_class' is required when the " + "class is not binary") + target_class = 1 + actual = results.actual + probs = results.probabilities[model_index, :, target_class] + nans = np.isnan(actual) + np.isnan(probs) + if nans.any(): + actual = actual[~nans] + probs = probs[~nans] + return cls(actual == target_class, probs) + + @property + def tn(self): + return np.arange(self.tot + 1) - self.fn + + @property + def tp(self): + return self.p - self.fn + + @property + def fp(self): + return self.n - self.tn + + def ca(self): + """Classification accuracy curve""" + return (self.tp + self.tn) / self.tot + + def f1(self): + """F1 curve""" + return 2 * self.tp / (2 * self.tp + self.fp + self.fn) + + def sensitivity(self): + """Sensitivity curve""" + return self.tp / self.p + + def specificity(self): + """Specificity curve""" + return self.tn / self.n + + def precision(self): + """ + Precision curve + + The last element represents precision at threshold 1. Unless such + a probability appears in the data, the precision at this point is + undefined. To avoid this, we copy the previous value to the last. + """ + tp_fp = np.arange(self.tot, -1, -1) + tp_fp[-1] = 1 # avoid division by zero + prec = self.tp / tp_fp + prec[-1] = prec[-2] + return prec + + def recall(self): + """Recall curve""" + return self.sensitivity() + + def ppv(self): + """PPV curve; see the comment at :obj:`precision`""" + return self.precision() + + def npv(self): + """ + NPV curve + + The first value is undefined (no negative instances). To avoid this, + we copy the second value into the first. 
+ """ + tn_fn = np.arange(self.tot + 1) + tn_fn[0] = 1 # avoid division by zero + npv = self.tn / tn_fn + npv[0] = npv[1] + return npv + + def fpr(self): + """FPR curve""" + return self.fp / self.n + + def tpr(self): + """TPR curve""" + return self.sensitivity() diff --git a/Orange/evaluation/tests/__init__.py b/Orange/evaluation/tests/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/Orange/evaluation/tests/test_performance_curves.py b/Orange/evaluation/tests/test_performance_curves.py new file mode 100644 index 00000000000..a73d7165557 --- /dev/null +++ b/Orange/evaluation/tests/test_performance_curves.py @@ -0,0 +1,125 @@ +import unittest +from unittest.mock import patch + +import numpy as np + +from Orange.evaluation.testing import Results +from Orange.evaluation.performance_curves import Curves + + +# Test data and sensitivity/specificity are taken from +# Tom Fawcett: An introduction to ROC analysis, with one true positive instance +# removed, so that the number of positive and negative does not match + +class TestCurves(unittest.TestCase): + def setUp(self): + n, p = (0, 1) + self.data = np.array([ + (p, .8), (n, .7), (p, .6), (p, .55), (p, .54), (n, .53), + (n, .52), (p, .51), (n, .505), (p, .4), (n, .39), (p, .38), + (n, .37), (n, .36), (n, .35), (p, .34), (n, .33), (p, .30), (n, .1) + ]) + + def test_curves(self): + np.random.shuffle(self.data) + ytrue, probs = self.data.T + curves = Curves(ytrue, probs) + + tn = np.array( + [0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 9, 9, 10, 10]) + np.testing.assert_equal(curves.tn, tn) + np.testing.assert_equal(curves.fp, 10 - tn) + np.testing.assert_almost_equal(curves.specificity(), tn / 10) + + tp = np.array( + [9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 2, 1, 1, 0]) + np.testing.assert_equal(curves.tp, tp) + np.testing.assert_equal(curves.fn, 9 - tp) + np.testing.assert_almost_equal(curves.sensitivity(), tp / 9) + + np.testing.assert_almost_equal( + curves.ca(), + np.array([9, 10, 9, 10, 9, 10, 11, 12, 11, 12, 11, 12, 11, 12, + 13, 12, 11, 10, 11, 10]) / 19) + + precision = np.array( + [9 / 19, 9 / 18, 8 / 17, 8 / 16, 7 / 15, 7 / 14, 7 / 13, + 7 / 12, 6 / 11, 6 / 10, 5 / 9, 5 / 8, 4 / 7, 4 / 6, + 4 / 5, 3 / 4, 2 / 3, 1 / 2, 1 / 1, 1]) + np.testing.assert_almost_equal(curves.precision(), precision) + np.testing.assert_almost_equal(curves.recall(), tp / 9) + + np.testing.assert_almost_equal(curves.ppv(), precision) + np.testing.assert_almost_equal( + curves.npv(), + np.array([1, 1 / 1, 1 / 2, 2 / 3, 2 / 4, 3 / 5, 4 / 6, 5 / 7, + 5 / 8, 6 / 9, 6 / 10, 7 / 11, 7 / 12, 8 / 13, 9 / 14, + 9 / 15, 9 / 16, 9 / 17, 10 / 18, 10 / 19])) + + np.testing.assert_almost_equal(curves.tpr(), tp / 9) + np.testing.assert_almost_equal(curves.fpr(), (10 - tn) / 10) + + @patch("Orange.evaluation.performance_curves.Curves.__init__", + return_value=None) + def test_curves_from_results(self, init): + res = Results() + ytrue, probs = self.data.T + res.actual = ytrue.astype(float) + res.probabilities = np.vstack((1 - probs, probs)).T.reshape(1, -1, 2) + Curves.from_results(res) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, ytrue) + np.testing.assert_equal(cprobs, probs) + + Curves.from_results(res, target_class=0) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, 1 - ytrue) + np.testing.assert_equal(cprobs, 1 - probs) + + res.actual = ytrue.astype(float) + res.probabilities = np.random.random((2, 19, 2)) + res.probabilities[1] = np.vstack((1 - probs, probs)).T + + 
Curves.from_results(res, model_index=1) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, ytrue) + np.testing.assert_equal(cprobs, probs) + + self.assertRaises(ValueError, Curves.from_results, res) + + ytrue[ytrue == 0] = 2 * (np.arange(10) % 2) + res.actual = ytrue.astype(float) + res.probabilities = np.random.random((2, 19, 3)) + res.probabilities[1] = np.vstack( + ((1 - probs) / 3, probs, (1 - probs) * 2 / 3)).T + + Curves.from_results(res, model_index=1, target_class=1) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, ytrue == 1) + np.testing.assert_equal(cprobs, probs) + + Curves.from_results(res, model_index=1, target_class=0) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, ytrue == 0) + np.testing.assert_equal(cprobs, (1 - probs) / 3) + + Curves.from_results(res, model_index=1, target_class=2) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, ytrue == 2) + np.testing.assert_equal(cprobs, (1 - probs) * 2 / 3) + + self.assertRaises(ValueError, Curves.from_results, res, model_index=1) + + @patch("Orange.evaluation.performance_curves.Curves.__init__", + return_value=None) + def test_curves_from_results_nans(self, init): + res = Results() + ytrue, probs = self.data.T + ytrue[0] = np.nan + probs[-1] = np.nan + res.actual = ytrue.astype(float) + res.probabilities = np.vstack((1 - probs, probs)).T.reshape(1, -1, 2) + Curves.from_results(res) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, ytrue[1:-1]) + np.testing.assert_equal(cprobs, probs[1:-1]) diff --git a/doc/data-mining-library/source/reference/evaluation.performance_curves.rst b/doc/data-mining-library/source/reference/evaluation.performance_curves.rst new file mode 100644 index 00000000000..d9eaa515c0f --- /dev/null +++ b/doc/data-mining-library/source/reference/evaluation.performance_curves.rst @@ -0,0 +1,8 @@ +.. py:currentmodule:: Orange.evaluation.performance_curves + +################## +Performance curves +################## + +.. 
autoclass:: Orange.evaluation.performance_curves.Curves + :members: diff --git a/doc/data-mining-library/source/reference/evaluation.rst b/doc/data-mining-library/source/reference/evaluation.rst index 422371a41eb..a07c99ae44f 100644 --- a/doc/data-mining-library/source/reference/evaluation.rst +++ b/doc/data-mining-library/source/reference/evaluation.rst @@ -9,3 +9,4 @@ Evaluation (``evaluation``) evaluation.testing evaluation.cd + evaluation.performance_curves From 60248970fc5d98a387caa5803fd6b2f24581b988 Mon Sep 17 00:00:00 2001 From: janezd Date: Sun, 16 Jun 2019 22:47:36 +0200 Subject: [PATCH 09/21] Calibration plot: Use Orange.evaluation.testing.performance_curves to compute curves --- Orange/widgets/evaluate/owcalibrationplot.py | 65 +++++--------------- 1 file changed, 16 insertions(+), 49 deletions(-) diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py index 3d782030467..3e316b990e8 100644 --- a/Orange/widgets/evaluate/owcalibrationplot.py +++ b/Orange/widgets/evaluate/owcalibrationplot.py @@ -9,6 +9,7 @@ from Orange.classification import ModelWithThreshold from Orange.evaluation import Results +from Orange.evaluation.performance_curves import Curves from Orange.widgets import widget, gui, settings from Orange.widgets.evaluate.utils import \ check_results_adequacy, results_for_preview @@ -18,71 +19,37 @@ from Orange.widgets import report -class Data: - def __init__(self, ytrue, probs): - sortind = np.argsort(probs) - self.probs = probs[sortind] - self.ytrue = ytrue[sortind] - self.fn = np.cumsum(self.ytrue) - self.tot = len(probs) - self.p = self.fn[-1] - self.n = self.tot - self.p - - @property - def tn(self): - return np.arange(self.tot) - self.fn - - @property - def tp(self): - return self.p - self.fn - - @property - def fp(self): - return self.n - self.tn - - MetricDefinition = namedtuple( "metric_definition", - ("name", "function", "short_names", "explanation")) + ("name", "functions", "short_names", "explanation")) Metrics = [MetricDefinition(*args) for args in ( - ("Actual probability", - None, - (), - ""), - ("Classification accuracy", - lambda d: (d.probs, ((d.tp + d.tn) / d.tot,)), - (), - ""), - ("F1", - lambda d: (d.probs, (2 * d.tp / (2 * d.tp + d.fp + d.fn),)), - (), - ""), + ("Calibration curve", None, (), ""), + ("Classification accuracy", (Curves.ca, ), (), ""), + ("F1", (Curves.f1, ), (), ""), ("Sensitivity and specificity", - lambda d: (d.probs, (d.tp / d.p, d.tn / d.n)), + (Curves.sensitivity, Curves.specificity), ("sens", "spec"), "
<p>Sensitivity (falling) is the proportion of correctly " "detected positive instances (TP / P).</p>" "<p>Specificity (rising) is the proportion of detected " "negative instances (TP / N).</p>
"), ("Precision and recall", - lambda d: (d.probs[:-1], (d.tp[:-1] / np.arange(d.tot, 1, -1), - d.tp[:-1] / d.p)), + (Curves.precision, Curves.recall), ("prec", "recall"), "
<p>Precision (rising) is the fraction of retrieved instances " "that are relevant, TP / (TP + FP).</p>" "<p>Recall (falling) is the proportion of discovered relevant " "instances, TP / P.</p>
"), ("Pos and neg predictive value", - lambda d: (d.probs[:-1], (d.tp[:-1] / np.arange(d.tot, 1, -1), - d.tn[:-1] / np.arange(1, d.tot))), + (Curves.ppv, Curves.npv), ("PPV", "TPV"), "
<p>Positive predictive value (rising) is the proportion of " "correct positives, TP / (TP + FP).</p>" "<p>Negative predictive value is the proportion of correct " "negatives, TN / (TN + FN).</p>
"), ("True and false positive rate", - lambda d: (d.probs, (d.tp / d.p, d.fp / d.n)), + (Curves.tpr, Curves.fpr), ("TPR", "FPR"), "
<p>True and false positive rate are proportions of detected " "and omitted positive instances</p>
"), @@ -253,7 +220,7 @@ def _initialize(self, results): def _rug(self, data, pen_args): color = pen_args["pen"].color() rh = 0.025 - rug_x = np.c_[data.probs, data.probs] + rug_x = np.c_[data.probs[:-1], data.probs[:-1]] rug_x_true = rug_x[data.ytrue].ravel() rug_x_false = rug_x[~data.ytrue].ravel() @@ -271,11 +238,11 @@ def _rug(self, data, pen_args): def plot_metrics(self, data, metrics, pen_args): if metrics is None: - return self._prob_curve(data.ytrue, data.probs, pen_args) - x, ys = metrics(data) + return self._prob_curve(data.ytrue, data.probs[:-1], pen_args) + ys = [metric(data) for metric in metrics] for y in ys: - self.plot.plot(x, y, **pen_args) - return x, ys + self.plot.plot(data.probs, y, **pen_args) + return data.probs, ys def _prob_curve(self, ytrue, probs, pen_args): if not probs.size: @@ -296,7 +263,7 @@ def _prob_curve(self, ytrue, probs, pen_args): def _setup_plot(self): target = self.target_index results = self.results - metrics = Metrics[self.score].function + metrics = Metrics[self.score].functions plot_folds = self.fold_curves and results.folds is not None self.scores = [] @@ -309,7 +276,7 @@ def _setup_plot(self): shadowPen=pg.mkPen(color.lighter(160), width=4 + 4 * plot_folds), antiAlias=True) - data = Data(ytrue, probs) + data = Curves(ytrue, probs) self.scores.append( (self.classifier_names[clsf], self.plot_metrics(data, metrics, pen_args))) From 1cfbeece2ec7c853683b9286643257f3ce9a55a0 Mon Sep 17 00:00:00 2001 From: janezd Date: Mon, 17 Jun 2019 17:51:57 +0200 Subject: [PATCH 10/21] Calibration plot: Fix selected model output --- Orange/classification/calibration.py | 174 ++++++++++++++- .../classification/tests/test_calibration.py | 203 ++++++++++++++++++ Orange/widgets/evaluate/owcalibrationplot.py | 90 +++++--- .../source/reference/classification.rst | 18 ++ 4 files changed, 451 insertions(+), 34 deletions(-) create mode 100644 Orange/classification/tests/test_calibration.py diff --git a/Orange/classification/calibration.py b/Orange/classification/calibration.py index 891b8db81b6..46bf2e8f242 100644 --- a/Orange/classification/calibration.py +++ b/Orange/classification/calibration.py @@ -1,22 +1,176 @@ -from Orange.classification import Model +import numpy as np +from sklearn.isotonic import IsotonicRegression +from sklearn.calibration import _SigmoidCalibration -__all__ = ["ModelWithThreshold"] +from Orange.classification import Model, Learner +from Orange.evaluation import TestOnTrainingData +from Orange.evaluation.performance_curves import Curves +__all__ = ["ThresholdClassifier", "ThresholdLearner", + "CalibratedLearner", "CalibratedClassifier"] -class ModelWithThreshold(Model): - def __init__(self, wrapped_model, threshold, target_class=1): - super().__init__(wrapped_model.domain, wrapped_model.original_domain) - self.name = f"{wrapped_model.name}, thresh={threshold:.2f}" - self.wrapped_model = wrapped_model + +class ThresholdClassifier(Model): + """ + A model that wraps a binary model and sets a different threshold. + + The target class is the class with index 1. 
A data instance is classified
+    to class 1 if the probability of this class equals or exceeds the
+    threshold.
+
+    Attributes:
+        base_model (Orange.classification.Model): base model
+        threshold (float): decision threshold
+    """
+    def __init__(self, base_model, threshold):
+        if not base_model.domain.class_var.is_discrete \
+                or len(base_model.domain.class_var.values) != 2:
+            raise ValueError("ThresholdClassifier requires a binary class")
+
+        super().__init__(base_model.domain, base_model.original_domain)
+        self.name = f"{base_model.name}, thresh={threshold:.2f}"
+        self.base_model = base_model
         self.threshold = threshold
-        self.target_class = target_class

     def __call__(self, data, ret=Model.Value):
-        probs = self.wrapped_model(data, ret=Model.Probs)
+        probs = self.base_model(data, ret=Model.Probs)
         if ret == Model.Probs:
             return probs
-        vals = probs[:, self.target_class].flatten() > self.threshold
+        class_probs = probs[:, 1].ravel()
+        with np.errstate(invalid="ignore"):  # we fix NaNs below
+            vals = (class_probs >= self.threshold).astype(float)
+        vals[np.isnan(class_probs)] = np.nan
         if ret == Model.Value:
             return vals
         else:
             return vals, probs
+
+
+class ThresholdLearner(Learner):
+    """
+    A learner that runs another learner and then finds the optimal threshold
+    for CA or F1 on the training data.
+
+    Attributes:
+        base_learner (Learner): base learner
+        threshold_criterion (int):
+            `ThresholdLearner.OptimizeCA` or `ThresholdLearner.OptimizeF1`
+    """
+    __returns__ = ThresholdClassifier
+
+    OptimizeCA, OptimizeF1 = range(2)
+
+    def __init__(self, base_learner, threshold_criterion=OptimizeCA):
+        super().__init__()
+        self.base_learner = base_learner
+        self.threshold_criterion = threshold_criterion
+
+    def fit_storage(self, data):
+        """
+        Induce a model using the provided `base_learner`, compute probabilities
+        on the training data and then find the optimal decision threshold.
+        In case of ties, select the threshold that is closest to 0.5.
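+
+        A hypothetical usage sketch (illustration only, not code from this
+        patch; `NaiveBayesLearner` and the binary "heart_disease" data are
+        just examples)::
+
+            from Orange.data import Table
+            from Orange.classification import NaiveBayesLearner
+
+            data = Table("heart_disease")
+            learner = ThresholdLearner(NaiveBayesLearner(),
+                                       ThresholdLearner.OptimizeF1)
+            model = learner(data)  # ThresholdClassifier with a tuned threshold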
+        """
+        if not data.domain.class_var.is_discrete \
+                or len(data.domain.class_var.values) != 2:
+            raise ValueError("ThresholdLearner requires a binary class")
+
+        res = TestOnTrainingData(data, [self.base_learner], store_models=True)
+        model = res.models[0, 0]
+        curves = Curves.from_results(res)
+        curve = [curves.ca, curves.f1][self.threshold_criterion]()
+        # In case of ties, we want the optimal threshold that is closest to 0.5
+        best_threshs = curves.probs[curve == np.max(curve)]
+        threshold = best_threshs[min(np.searchsorted(best_threshs, 0.5),
+                                     len(best_threshs) - 1)]
+        return ThresholdClassifier(model, threshold)
+
+
+class CalibratedClassifier(Model):
+    """
+    A model that wraps another model and recalibrates probabilities.
+
+    Attributes:
+        base_model (Model): base model
+        calibrators (list of callable):
+            list of functions that get a vector of probabilities and return
+            calibrated probabilities
+    """
+    def __init__(self, base_model, calibrators):
+        if not base_model.domain.class_var.is_discrete:
+            raise ValueError("CalibratedClassifier requires a discrete target")
+
+        super().__init__(base_model.domain, base_model.original_domain)
+        self.base_model = base_model
+        self.calibrators = calibrators
+        self.name = f"{base_model.name}, calibrated"
+
+    def __call__(self, data, ret=Model.Value):
+        probs = self.base_model(data, Model.Probs)
+        cal_probs = self.calibrated_probs(probs)
+        if ret == Model.Probs:
+            return cal_probs
+        vals = np.argmax(cal_probs, axis=1)
+        if ret == Model.Value:
+            return vals
+        else:
+            return vals, cal_probs
+
+    def calibrated_probs(self, probs):
+        if self.calibrators:
+            ps = np.hstack(
+                tuple(
+                    calibr.predict(cls_probs).reshape(-1, 1)
+                    for calibr, cls_probs in zip(self.calibrators, probs.T)))
+        else:
+            ps = probs.copy()
+        sums = np.sum(ps, axis=1)
+        zero_sums = sums == 0
+        with np.errstate(invalid="ignore"):  # handled below
+            ps /= sums[:, None]
+        if zero_sums.any():
+            ps[zero_sums] = 1 / ps.shape[1]
+        return ps
+
+
+class CalibratedLearner(Learner):
+    """
+    Probability calibration for learning algorithms
+
+    This learner wraps another learner so that, after training, it predicts
+    the probabilities on the training data and calibrates them using sigmoid
+    or isotonic calibration. It then returns a :obj:`CalibratedClassifier`.
+
+    Attributes:
+        base_learner (Learner): base learner
+        calibration_method (int):
+            `CalibratedLearner.Sigmoid` or `CalibratedLearner.Isotonic`
+    """
+    __returns__ = CalibratedClassifier
+
+    Sigmoid, Isotonic = range(2)
+
+    def __init__(self, base_learner, calibration_method=Sigmoid):
+        super().__init__()
+        self.base_learner = base_learner
+        self.calibration_method = calibration_method
+
+    def fit_storage(self, data):
+        """
+        Induce a model using the provided `base_learner`, compute probabilities
+        on the training data and use sklearn's `_SigmoidCalibration` or
+        `IsotonicRegression` to prepare calibrators.
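+
+        A hypothetical usage sketch (illustration only, not code from this
+        patch; the learner and data are just examples)::
+
+            from Orange.data import Table
+            from Orange.classification import Model, NaiveBayesLearner
+
+            data = Table("heart_disease")
+            learner = CalibratedLearner(NaiveBayesLearner(),
+                                        CalibratedLearner.Isotonic)
+            model = learner(data)                 # a CalibratedClassifier
+            probs = model(data, ret=Model.Probs)  # calibrated probabilities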
+ """ + res = TestOnTrainingData(data, [self.base_learner], store_models=True) + model = res.models[0, 0] + probabilities = res.probabilities[0] + return self.get_model(model, res.actual, probabilities) + + def get_model(self, model, ytrue, probabilities): + if self.calibration_method == CalibratedLearner.Sigmoid: + fitter = _SigmoidCalibration() + else: + fitter = IsotonicRegression(out_of_bounds='clip') + probabilities[np.isinf(probabilities)] = 1 + calibrators = [fitter.fit(cls_probs, ytrue) + for cls_idx, cls_probs in enumerate(probabilities.T)] + return CalibratedClassifier(model, calibrators) diff --git a/Orange/classification/tests/test_calibration.py b/Orange/classification/tests/test_calibration.py new file mode 100644 index 00000000000..a538a3b1870 --- /dev/null +++ b/Orange/classification/tests/test_calibration.py @@ -0,0 +1,203 @@ +import unittest +from unittest.mock import Mock, patch + +import numpy as np + +from Orange.base import Model +from Orange.classification.calibration import \ + ThresholdLearner, ThresholdClassifier, \ + CalibratedLearner, CalibratedClassifier +from Orange.data import Table + + +class TestThresholdClassifier(unittest.TestCase): + def setUp(self): + probs1 = np.array([0.3, 0.5, 0.2, 0.8, 0.9, 0]).reshape(-1, 1) + self.probs = np.hstack((1 - probs1, probs1)) + base_model = Mock(return_value=self.probs) + base_model.domain.class_var.is_discrete = True + base_model.domain.class_var.values = ["a", "b"] + self.model = ThresholdClassifier(base_model, 0.5) + self.data = Mock() + + def test_threshold(self): + vals = self.model(self.data) + np.testing.assert_equal(vals, [0, 1, 0, 1, 1, 0]) + + self.model.threshold = 0.8 + vals = self.model(self.data) + np.testing.assert_equal(vals, [0, 0, 0, 1, 1, 0]) + + self.model.threshold = 0 + vals = self.model(self.data) + np.testing.assert_equal(vals, [1] * 6) + + def test_return_types(self): + vals = self.model(self.data, ret=Model.Value) + np.testing.assert_equal(vals, [0, 1, 0, 1, 1, 0]) + + vals = self.model(self.data) + np.testing.assert_equal(vals, [0, 1, 0, 1, 1, 0]) + + probs = self.model(self.data, ret=Model.Probs) + np.testing.assert_equal(probs, self.probs) + + vals, probs = self.model(self.data, ret=Model.ValueProbs) + np.testing.assert_equal(vals, [0, 1, 0, 1, 1, 0]) + np.testing.assert_equal(probs, self.probs) + + def test_nans(self): + self.probs[1, :] = np.nan + vals, probs = self.model(self.data, ret=Model.ValueProbs) + np.testing.assert_equal(vals, [0, np.nan, 0, 1, 1, 0]) + np.testing.assert_equal(probs, self.probs) + + def test_non_binary_base(self): + base_model = Mock() + base_model.domain.class_var.is_discrete = True + base_model.domain.class_var.values = ["a"] + self.assertRaises(ValueError, ThresholdClassifier, base_model, 0.5) + + base_model.domain.class_var.values = ["a", "b", "c"] + self.assertRaises(ValueError, ThresholdClassifier, base_model, 0.5) + + base_model.domain.class_var = Mock() + base_model.domain.class_var.is_discrete = False + self.assertRaises(ValueError, ThresholdClassifier, base_model, 0.5) + + +class TestThresholdLearner(unittest.TestCase): + @patch("Orange.evaluation.performance_curves.Curves.from_results") + @patch("Orange.classification.calibration.TestOnTrainingData") + def test_fit_storage(self, test_on_training, curves_from_results): + curves_from_results.return_value = curves = Mock() + curves.probs = np.array([0.1, 0.15, 0.3, 0.45, 0.6, 0.8]) + curves.ca = lambda: np.array([0.1, 0.7, 0.4, 0.4, 0.3, 0.1]) + curves.f1 = lambda: np.array([0.1, 0.2, 0.4, 0.4, 0.3, 0.1]) 
+ model = Mock() + model.domain.class_var.is_discrete = True + model.domain.class_var.values = ("a", "b") + data = Table("heart_disease") + learner = Mock() + test_on_training.return_value = res = Mock() + res.models = np.array([[model]]) + test_on_training.return_value = res + + thresh_learner = ThresholdLearner( + base_learner=learner, + threshold_criterion=ThresholdLearner.OptimizeCA) + thresh_model = thresh_learner(data) + self.assertEqual(thresh_model.threshold, 0.15) + args, kwargs = test_on_training.call_args + self.assertEqual(len(args), 2) + self.assertIs(args[0], data) + self.assertIs(args[1][0], learner) + self.assertEqual(len(args[1]), 1) + self.assertEqual(kwargs, {"store_models": 1}) + + thresh_learner = ThresholdLearner( + base_learner=learner, + threshold_criterion=ThresholdLearner.OptimizeF1) + thresh_model = thresh_learner(data) + self.assertEqual(thresh_model.threshold, 0.45) + + def test_non_binary_class(self): + thresh_learner = ThresholdLearner( + base_learner=Mock(), + threshold_criterion=ThresholdLearner.OptimizeF1) + + data = Mock() + data.domain.class_var.is_discrete = True + data.domain.class_var.values = ["a"] + self.assertRaises(ValueError, thresh_learner.fit_storage, data) + + data.domain.class_var.values = ["a", "b", "c"] + self.assertRaises(ValueError, thresh_learner.fit_storage, data) + + data.domain.class_var = Mock() + data.domain.class_var.is_discrete = False + self.assertRaises(ValueError, thresh_learner.fit_storage, data) + + +class TestCalibratedClassifier(unittest.TestCase): + def setUp(self): + probs1 = np.array([0.3, 0.5, 0.2, 0.8, 0.9, 0]).reshape(-1, 1) + self.probs = np.hstack((1 - probs1, probs1)) + base_model = Mock(return_value=self.probs) + base_model.domain.class_var.is_discrete = True + base_model.domain.class_var.values = ["a", "b"] + self.model = CalibratedClassifier(base_model, None) + self.data = Mock() + + def test_call(self): + calprobs = np.arange(self.probs.size).reshape(self.probs.shape) + calprobs = calprobs / np.sum(calprobs, axis=1)[:, None] + calprobs[-1] = [0.7, 0.3] + self.model.calibrated_probs = Mock(return_value=calprobs) + + probs = self.model(self.data, ret=Model.Probs) + self.model.calibrated_probs.assert_called_with(self.probs) + np.testing.assert_almost_equal(probs, calprobs) + + vals = self.model(self.data, ret=Model.Value) + np.testing.assert_almost_equal(vals, [1, 1, 1, 1, 1, 0]) + + vals, probs = self.model(self.data, ret=Model.ValueProbs) + np.testing.assert_almost_equal(probs, calprobs) + np.testing.assert_almost_equal(vals, [1, 1, 1, 1, 1, 0]) + + def test_calibrated_probs(self): + self.model.calibrators = None + calprobs = self.model.calibrated_probs(self.probs) + np.testing.assert_equal(calprobs, self.probs) + self.assertIsNot(calprobs, self.probs) + + calibrator = Mock() + calibrator.predict = lambda x: x**2 + self.model.calibrators = [calibrator] * 2 + calprobs = self.model.calibrated_probs(self.probs) + expprobs = self.probs ** 2 / np.sum(self.probs ** 2, axis=1)[:, None] + np.testing.assert_almost_equal(calprobs, expprobs) + + self.probs[1] = 0 + self.probs[2] = np.nan + expprobs[1] = 0.5 + expprobs[2] = np.nan + calprobs = self.model.calibrated_probs(self.probs) + np.testing.assert_almost_equal(calprobs, expprobs) + + +class TestCalibratedLearner(unittest.TestCase): + @patch("Orange.classification.calibration._SigmoidCalibration.fit") + @patch("Orange.classification.calibration.TestOnTrainingData") + def test_fit_storage(self, test_on_training, sigmoid_fit): + data = Table("heart_disease") + learner = 
Mock() + + model = Mock() + model.domain.class_var.is_discrete = True + model.domain.class_var.values = ("a", "b") + + test_on_training.return_value = res = Mock() + res.models = np.array([[model]]) + res.probabilities = np.arange(20, dtype=float).reshape(1, 5, 4) + test_on_training.return_value = res + + sigmoid_fit.return_value = Mock() + + cal_learner = CalibratedLearner( + base_learner=learner, calibration_method=CalibratedLearner.Sigmoid) + cal_model = cal_learner(data) + + self.assertIs(cal_model.base_model, model) + self.assertEqual(cal_model.calibrators, [sigmoid_fit.return_value] * 4) + args, kwargs = test_on_training.call_args + self.assertEqual(len(args), 2) + self.assertIs(args[0], data) + self.assertIs(args[1][0], learner) + self.assertEqual(len(args[1]), 1) + self.assertEqual(kwargs, {"store_models": 1}) + + for call, cls_probs in zip(sigmoid_fit.call_args_list, + res.probabilities[0].T): + np.testing.assert_equal(call[0][0], cls_probs) diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py index 3e316b990e8..63c1b0190a2 100644 --- a/Orange/widgets/evaluate/owcalibrationplot.py +++ b/Orange/widgets/evaluate/owcalibrationplot.py @@ -7,7 +7,8 @@ import pyqtgraph as pg -from Orange.classification import ModelWithThreshold +from Orange.base import Model +from Orange.classification import ThresholdClassifier, CalibratedLearner from Orange.evaluation import Results from Orange.evaluation.performance_curves import Curves from Orange.widgets import widget, gui, settings @@ -67,26 +68,27 @@ class Inputs: evaluation_results = Input("Evaluation Results", Results) class Outputs: - calibrated_model = Output("Calibrated Model", ModelWithThreshold) + calibrated_model = Output("Calibrated Model", Model) class Warning(widget.OWWidget.Warning): - empty_input = widget.Msg( - "Empty result on input. Nothing to display.") + empty_input = widget.Msg("Empty result on input. 
Nothing to display.") class Information(widget.OWWidget.Information): no_out = "Can't output a model: " no_output_multiple_folds = Msg( - no_out + "every training data sample produced a different model") + no_out + "each training data sample produces a different model") no_output_no_models = Msg( no_out + "test results do not contain stored models;\n" "try testing on separate data or on training data") no_output_multiple_selected = Msg( no_out + "select a single model - the widget can output only one") + non_binary_class = Msg(no_out + "cannot calibrate non-binary classes") target_index = settings.Setting(0) selected_classifiers = settings.Setting([]) score = settings.Setting(0) + output_calibration = settings.Setting(0) fold_curves = settings.Setting(False) display_rug = settings.Setting(True) threshold = settings.Setting(0.5) @@ -103,10 +105,13 @@ def __init__(self): self.colors = [] self.line = None + self._last_score_value = -1 + box = gui.vBox(self.controlArea, box="Settings") self.target_cb = gui.comboBox( box, self, "target_index", label="Target:", - orientation=Qt.Horizontal, callback=self._replot, contentsLength=8) + orientation=Qt.Horizontal, callback=self.target_index_changed, + contentsLength=8) gui.checkBox( box, self, "display_rug", "Show rug", callback=self._on_display_rug_changed) @@ -133,6 +138,11 @@ def __init__(self): font.setPointSizeF(0.85 * font.pointSizeF()) self.explanation.setFont(font) + gui.radioButtons( + box, self, value="output_calibration", + btnLabels=("Sigmoid calibration", "Isotonic calibration"), + label="Output model calibration", callback=self.apply) + box = gui.widgetBox(self.controlArea, "Info") self.info_label = gui.widgetLabel(box) @@ -159,7 +169,7 @@ def __init__(self): @Inputs.evaluation_results def set_results(self, results): self.clear() - results = check_results_adequacy(results, self.Error) + results = check_results_adequacy(results, self.Error, check_nan=False) if results is not None and not results.actual.size: self.Warning.empty_input() else: @@ -179,9 +189,19 @@ def clear(self): self.target_index = 0 self.colors = [] + def target_index_changed(self): + if len(self.results.domain.class_var.values) == 2: + self.threshold = 1 - self.threshold + self._set_explanation() + self._replot() + self.apply() + def score_changed(self): self._set_explanation() self._replot() + if self._last_score_value != self.score: + self.apply() + self._last_score_value = self.score def _set_explanation(self): explanation = Metrics[self.score].explanation @@ -191,6 +211,11 @@ def _set_explanation(self): else: self.explanation.hide() + if self.score == 0: + self.controls.output_calibration.show() + else: + self.controls.output_calibration.hide() + axis = self.plot.getAxis("bottom") axis.setLabel("Predicted probability" if self.score == 0 else "Threshold probability to classify as positive") @@ -292,22 +317,23 @@ def _setup_plot(self): fold_results = results.get_fold(fold) fold_ytrue = fold_results.actual == target fold_probs = fold_results.probabilities[clsf, :, target] - self.plot_metrics(Data(fold_ytrue, fold_probs), + self.plot_metrics(Curves(fold_ytrue, fold_probs), metrics, pen_args) def _replot(self): self.plot.clear() if self.results is not None: self._setup_plot() - self.line = pg.InfiniteLine( - pos=self.threshold, movable=True, - pen=pg.mkPen(color="k", style=Qt.DashLine, width=2), - hoverPen=pg.mkPen(color="k", style=Qt.DashLine, width=3), - bounds=(0, 1), - ) - self.line.sigPositionChanged.connect(self.threshold_change) - 
self.line.sigPositionChangeFinished.connect(self.threshold_change_done) - self.plot.addItem(self.line) + if self.score != 0: + self.line = pg.InfiniteLine( + pos=self.threshold, movable=True, + pen=pg.mkPen(color="k", style=Qt.DashLine, width=2), + hoverPen=pg.mkPen(color="k", style=Qt.DashLine, width=3), + bounds=(0, 1), + ) + self.line.sigPositionChanged.connect(self.threshold_change) + self.line.sigPositionChangeFinished.connect(self.threshold_change_done) + self.plot.addItem(self.line) self._update_info() def _on_display_rug_changed(self): @@ -336,7 +362,10 @@ def _update_info(self): {"
".join(f"" for n in short_names)} """ - for name, (probs, curves) in self.scores: + for name, probs_curves in self.scores: + if probs_curves is None: + continue + probs, curves = probs_curves ind = min(np.searchsorted(probs, self.threshold), len(probs) - 1) text += f"" @@ -353,15 +382,28 @@ def apply(self): info = self.Information wrapped = None problems = {} - if self.results is not None: + results = self.results + if results is not None: problems = { - info.no_output_multiple_folds: len(self.results.folds) > 1, - info.no_output_no_models: self.results.models is None, + info.no_output_multiple_folds: len(results.folds) > 1, + info.no_output_no_models: results.models is None, info.no_output_multiple_selected: - len(self.selected_classifiers) != 1} + len(self.selected_classifiers) != 1, + info.non_binary_class: + self.score != 0 + and len(results.domain.class_var.values) != 2} if not any(problems.values()): - model = self.results.models[0][self.selected_classifiers[0]] - wrapped = ModelWithThreshold(model, self.threshold) + clsf_idx = self.selected_classifiers[0] + model = results.models[0, clsf_idx] + if self.score == 0: + cal_learner = CalibratedLearner( + None, self.output_calibration) + wrapped = cal_learner.get_model( + model, results.actual, results.probabilities[clsf_idx]) + else: + threshold = [1 - self.threshold, + self.threshold][self.target_index] + wrapped = ThresholdClassifier(model, threshold) self.Outputs.calibrated_model.send(wrapped) for info, shown in problems.items(): diff --git a/doc/data-mining-library/source/reference/classification.rst b/doc/data-mining-library/source/reference/classification.rst index 5095e147f2a..55792fa340f 100644 --- a/doc/data-mining-library/source/reference/classification.rst +++ b/doc/data-mining-library/source/reference/classification.rst @@ -196,3 +196,21 @@ CN2 Rule Induction .. autoclass:: CN2SDUnorderedLearner :members: + + +Calibration and threshold optimization +-------------------------------------- + +.. automodule:: Orange.classification.calibration + +.. autoclass:: ThresholdClassifier + :members: + +.. autoclass:: ThresholdLearner + :members: + +.. autoclass:: CalibratedClassifier + :members: + +.. autoclass:: CalibratedLearner + :members: From f742ff919d4c2985a38cdee5790cb7230962c398 Mon Sep 17 00:00:00 2001 From: janezd Date: Mon, 17 Jun 2019 18:11:10 +0200 Subject: [PATCH 11/21] OWLearnerWidget: Let default name appear as placeholder. This allows derived widget to change the default name without interferring with user-changed settings. 
--- Orange/widgets/tests/base.py | 3 ++- Orange/widgets/utils/owlearnerwidget.py | 13 ++++++------- Orange/widgets/utils/tests/test_owlearnerwidget.py | 1 - 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Orange/widgets/tests/base.py b/Orange/widgets/tests/base.py index 635dd2e5fd8..1204e1c6ed5 100644 --- a/Orange/widgets/tests/base.py +++ b/Orange/widgets/tests/base.py @@ -672,7 +672,8 @@ def test_output_learner_name(self): new_name = "Learner Name" self.widget.apply_button.button.click() self.assertEqual(self.widget.learner.name, - self.widget.name_line_edit.text()) + self.widget.name_line_edit.text() + or self.widget.name_line_edit.placeholderText()) self.widget.name_line_edit.setText(new_name) self.widget.apply_button.button.click() self.wait_until_stop_blocking() diff --git a/Orange/widgets/utils/owlearnerwidget.py b/Orange/widgets/utils/owlearnerwidget.py index 3c6ee6ea65f..63b2795c78e 100644 --- a/Orange/widgets/utils/owlearnerwidget.py +++ b/Orange/widgets/utils/owlearnerwidget.py @@ -65,7 +65,7 @@ class OWBaseLearner(OWWidget, metaclass=OWBaseLearnerMeta, openclass=True): LEARNER = None supports_sparse = True - learner_name = Setting(None, schema_only=True) + learner_name = Setting("", schema_only=True) want_main_area = False resizing_enabled = False auto_apply = Setting(True) @@ -95,8 +95,6 @@ def __init__(self): self.data = None self.valid_data = False self.learner = None - if self.learner_name is None: - self.learner_name = self.name self.model = None self.preprocessors = None self.outdated_settings = False @@ -149,7 +147,7 @@ def update_learner(self): if self.learner and issubclass(self.LEARNER, Fitter): self.learner.use_default_preprocessors = True if self.learner is not None: - self.learner.name = self.learner_name + self.learner.name = self.learner_name or self.name self.Outputs.learner.send(self.learner) self.outdated_settings = False self.Warning.outdated_learner.clear() @@ -168,7 +166,7 @@ def update_model(self): except BaseException as exc: self.show_fitting_failed(exc) else: - self.model.name = self.learner_name + self.model.name = self.learner_name or self.name self.model.instances = self.data self.Outputs.model.send(self.model) @@ -198,7 +196,7 @@ def settings_changed(self, *args, **kwargs): def _change_name(self, instance, output): if instance: - instance.name = self.learner_name + instance.name = self.learner_name or self.name if self.auto_apply: output.send(instance) @@ -207,7 +205,7 @@ def learner_name_changed(self): self._change_name(self.model, self.Outputs.model) def send_report(self): - self.report_items((("Name", self.learner_name),)) + self.report_items((("Name", self.learner_name or self.name),)) model_parameters = self.get_learner_parameters() if model_parameters: @@ -264,6 +262,7 @@ def add_regression_layout(self, box): def add_learner_name_widget(self): self.name_line_edit = gui.lineEdit( self.controlArea, self, 'learner_name', box='Name', + placeholderText=self.name, tooltip='The name will identify this model in other widgets', orientation=Qt.Horizontal, callback=self.learner_name_changed) diff --git a/Orange/widgets/utils/tests/test_owlearnerwidget.py b/Orange/widgets/utils/tests/test_owlearnerwidget.py index 99f792196b6..9a43365a473 100644 --- a/Orange/widgets/utils/tests/test_owlearnerwidget.py +++ b/Orange/widgets/utils/tests/test_owlearnerwidget.py @@ -105,7 +105,6 @@ class WidgetA(OWBaseLearner): LEARNER = KNNLearner w1 = self.create_widget(WidgetA) - self.assertEqual(w1.learner_name, "A") w1.learner_name = "MyWidget" settings = 
w1.settingsHandler.pack_data(w1) From c5d070df60bddecb81f3c17d8f70a1d6949fbb70 Mon Sep 17 00:00:00 2001 From: janezd Date: Mon, 17 Jun 2019 18:17:15 +0200 Subject: [PATCH 12/21] evaluations.testing: Minor fixes in unit tests --- Orange/evaluation/testing.py | 2 +- Orange/tests/test_evaluation_testing.py | 2 +- .../evaluate/tests/test_owcalibrationplot.py | 13 ------------- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/Orange/evaluation/testing.py b/Orange/evaluation/testing.py index 9e6d0ca071c..22a917283e0 100644 --- a/Orange/evaluation/testing.py +++ b/Orange/evaluation/testing.py @@ -317,7 +317,7 @@ def split_by_model(self): res.probabilities = self.probabilities[(i,), :, :] if self.models is not None: - res.models = self.models[:, i:i+1] + res.models = self.models[:, i:i + 1] res.failed = [self.failed[i]] yield res diff --git a/Orange/tests/test_evaluation_testing.py b/Orange/tests/test_evaluation_testing.py index a57910eb971..3bc21d3f2e8 100644 --- a/Orange/tests/test_evaluation_testing.py +++ b/Orange/tests/test_evaluation_testing.py @@ -233,7 +233,7 @@ def test_split_by_model(self): self.assertTrue((result.predicted == res.predicted[i]).all()) self.assertTrue((result.probabilities == res.probabilities[i]).all()) self.assertEqual(len(result.models), 5) - for model in result.models: + for model in result.models[0]: self.assertIsInstance(model, learners[i].__returns__) self.assertSequenceEqual(result.learners, [res.learners[i]]) diff --git a/Orange/widgets/evaluate/tests/test_owcalibrationplot.py b/Orange/widgets/evaluate/tests/test_owcalibrationplot.py index 0575e03e8d1..ac07e8a2fff 100644 --- a/Orange/widgets/evaluate/tests/test_owcalibrationplot.py +++ b/Orange/widgets/evaluate/tests/test_owcalibrationplot.py @@ -1,7 +1,6 @@ import copy import warnings -import numpy as np from sklearn.exceptions import ConvergenceWarning from Orange.data import Table @@ -42,15 +41,3 @@ def test_empty(self): res.predicted = res.predicted[:, 0] res.probabilities = res.probabilities[:, :0, :] self.send_signal(self.widget.Inputs.evaluation_results, res) - - def test_nan_input(self): - res = copy.copy(self.res) - res.actual = res.actual.copy() - res.probabilities = res.probabilities.copy() - - res.actual[0] = np.nan - res.probabilities[:, [0, 3], :] = np.nan - self.send_signal(self.widget.Inputs.evaluation_results, res) - self.assertTrue(self.widget.Error.invalid_results.is_shown()) - self.send_signal(self.widget.Inputs.evaluation_results, None) - self.assertFalse(self.widget.Error.invalid_results.is_shown()) From 557fa2e78c91f339d415278941623ee77865f3eb Mon Sep 17 00:00:00 2001 From: janezd Date: Mon, 17 Jun 2019 18:26:20 +0200 Subject: [PATCH 13/21] OWTestLearners: Skip inactive signals (e.g. 
learner widget outputs None) --- Orange/widgets/evaluate/owtestlearners.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Orange/widgets/evaluate/owtestlearners.py b/Orange/widgets/evaluate/owtestlearners.py index c3da72af0f6..d534bbe6a32 100644 --- a/Orange/widgets/evaluate/owtestlearners.py +++ b/Orange/widgets/evaluate/owtestlearners.py @@ -315,7 +315,7 @@ def set_learner(self, learner, key): # Removed self._invalidate([key]) del self.learners[key] - else: + elif learner is not None: self.learners[key] = InputLearner(learner, None, None) self._invalidate([key]) From 1a8b013561486c7e7c87c814a3ed9f8ceac5d36f Mon Sep 17 00:00:00 2001 From: janezd Date: Mon, 17 Jun 2019 21:43:23 +0200 Subject: [PATCH 14/21] Calibrated Learner: Add widget --- Orange/widgets/model/owcalibratedlearner.py | 111 ++++++++++++ .../model/tests/test_owcalibratedlearner.py | 158 ++++++++++++++++++ 2 files changed, 269 insertions(+) create mode 100644 Orange/widgets/model/owcalibratedlearner.py create mode 100644 Orange/widgets/model/tests/test_owcalibratedlearner.py diff --git a/Orange/widgets/model/owcalibratedlearner.py b/Orange/widgets/model/owcalibratedlearner.py new file mode 100644 index 00000000000..558ac331539 --- /dev/null +++ b/Orange/widgets/model/owcalibratedlearner.py @@ -0,0 +1,111 @@ +from Orange.classification import CalibratedLearner, ThresholdLearner, \ + NaiveBayesLearner +from Orange.data import Table +from Orange.modelling import Learner +from Orange.widgets import gui +from Orange.widgets.widget import Input +from Orange.widgets.settings import Setting +from Orange.widgets.utils.owlearnerwidget import OWBaseLearner +from Orange.widgets.utils.widgetpreview import WidgetPreview + + +class OWCalibratedLearner(OWBaseLearner): + name = "Calibrated Learner" + description = "Wraps another learner with probability calibration and " \ + "decision threshold optimization" + icon = "icons/CalibratedLearner.svg" + priority = 20 + keywords = ["calibration", "threshold"] + + LEARNER = CalibratedLearner + + SigmoidCalibration, IsotonicCalibration, NoCalibration = range(3) + CalibrationOptions = ("Sigmoid calibration", + "Isotonic calibration", + "No calibration") + CalibrationShort = ("Sigmoid", "Isotonic", "") + CalibrationMap = { + SigmoidCalibration: CalibratedLearner.Sigmoid, + IsotonicCalibration: CalibratedLearner.Isotonic} + + OptimizeCA, OptimizeF1, NoThresholdOptimization = range(3) + ThresholdOptions = ("Optimize classification accuracy", + "Optimize F1 score", + "No threshold optimization") + ThresholdShort = ("CA", "F1", "") + ThresholdMap = { + OptimizeCA: ThresholdLearner.OptimizeCA, + OptimizeF1: ThresholdLearner.OptimizeF1} + + learner_name = Setting("", schema_only=True) + calibration = Setting(SigmoidCalibration) + threshold = Setting(OptimizeCA) + + class Inputs(OWBaseLearner.Inputs): + base_learner = Input("Base Learner", Learner) + + def __init__(self): + super().__init__() + self.base_learner = None + + def add_main_layout(self): + gui.radioButtons( + self.controlArea, self, "calibration", self.CalibrationOptions, + box="Probability calibration", + callback=self.calibration_options_changed) + gui.radioButtons( + self.controlArea, self, "threshold", self.ThresholdOptions, + box="Decision threshold optimization", + callback=self.calibration_options_changed) + + @Inputs.base_learner + def set_learner(self, learner): + self.base_learner = learner + self._set_default_name() + self.unconditional_apply() + + def _set_default_name(self): + if self.base_learner is None: + 
self.name = "Calibrated learner" + else: + self.name = " + ".join(part for part in ( + self.base_learner.name.title(), + self.CalibrationShort[self.calibration], + self.ThresholdShort[self.threshold]) if part) + self.controls.learner_name.setPlaceholderText(self.name) + + def calibration_options_changed(self): + self._set_default_name() + self.apply() + + def create_learner(self): + class IdentityWrapper(Learner): + def fit_storage(self, data): + return self.base_learner.fit_storage(data) + + if self.base_learner is None: + return None + learner = self.base_learner + if self.calibration != self.NoCalibration: + learner = CalibratedLearner(learner, + self.CalibrationMap[self.calibration]) + if self.threshold != self.NoThresholdOptimization: + learner = ThresholdLearner(learner, + self.ThresholdMap[self.threshold]) + if self.preprocessors: + if learner is self.base_learner: + learner = IdentityWrapper() + learner.preprocessors = (self.preprocessors, ) + return learner + + def get_learner_parameters(self): + return (("Calibrate probabilities", + self.CalibrationOptions[self.calibrate]), + ("Threshold optimization", + self.ThresholdOptions[self.threshold])) + + +if __name__ == "__main__": # pragma: no cover + WidgetPreview(OWCalibratedLearner).run( + Table("heart_disease"), + set_learner=NaiveBayesLearner()) diff --git a/Orange/widgets/model/tests/test_owcalibratedlearner.py b/Orange/widgets/model/tests/test_owcalibratedlearner.py new file mode 100644 index 00000000000..400d483a592 --- /dev/null +++ b/Orange/widgets/model/tests/test_owcalibratedlearner.py @@ -0,0 +1,158 @@ +from unittest.mock import Mock + +from Orange.classification import ThresholdLearner, CalibratedLearner, \ + NaiveBayesLearner, ThresholdClassifier, CalibratedClassifier +from Orange.classification.base_classification import ModelClassification, \ + LearnerClassification +from Orange.classification.naive_bayes import NaiveBayesModel +from Orange.data import Table +from Orange.widgets.model.owcalibratedlearner import OWCalibratedLearner +from Orange.widgets.tests.base import WidgetTest, WidgetLearnerTestMixin, \ + datasets + + +class TestOWCalibratedLearner(WidgetTest, WidgetLearnerTestMixin): + def setUp(self): + self.widget = self.create_widget( + OWCalibratedLearner, stored_settings={"auto_apply": False}) + self.send_signal(self.widget.Inputs.base_learner, NaiveBayesLearner()) + + self.data = Table("heart_disease") + self.valid_datasets = (self.data,) + self.inadequate_dataset = (Table(datasets.path("testing_dataset_reg")),) + self.learner_class = LearnerClassification + self.model_class = ModelClassification + self.model_name = 'Calibrated classifier' + self.parameters = [] + + def test_output_learner(self): + """Check if learner is on output after apply""" + # Overridden to change the output type in the last test + initial = self.get_output("Learner") + self.assertIsNotNone(initial, "Does not initialize the learner output") + self.widget.apply_button.button.click() + newlearner = self.get_output("Learner") + self.assertIsNot(initial, newlearner, + "Does not send a new learner instance on `Apply`.") + self.assertIsNotNone(newlearner) + self.assertIsInstance( + newlearner, + (CalibratedLearner, ThresholdLearner, NaiveBayesLearner)) + + def test_output_model(self): + """Check if model is on output after sending data and apply""" + # Overridden to change the output type in the last two test + self.assertIsNone(self.get_output(self.widget.Outputs.model)) + self.widget.apply_button.button.click() + 
self.assertIsNone(self.get_output(self.widget.Outputs.model)) + self.send_signal('Data', self.data) + self.widget.apply_button.button.click() + self.wait_until_stop_blocking() + model = self.get_output(self.widget.Outputs.model) + self.assertIsNotNone(model) + self.assertIsInstance( + model, (CalibratedClassifier, ThresholdClassifier, NaiveBayesModel)) + + def test_create_learner(self): + widget = self.widget #: OWCalibratedLearner + self.widget.base_learner = Mock() + + widget.calibration = widget.SigmoidCalibration + widget.threshold = widget.OptimizeF1 + learner = self.widget.create_learner() + self.assertIsInstance(learner, ThresholdLearner) + self.assertEqual(learner.threshold_criterion, learner.OptimizeF1) + cal_learner = learner.base_learner + self.assertIsInstance(cal_learner, CalibratedLearner) + self.assertEqual(cal_learner.calibration_method, cal_learner.Sigmoid) + self.assertIs(cal_learner.base_learner, self.widget.base_learner) + + widget.calibration = widget.IsotonicCalibration + widget.threshold = widget.OptimizeCA + learner = self.widget.create_learner() + self.assertIsInstance(learner, ThresholdLearner) + self.assertEqual(learner.threshold_criterion, learner.OptimizeCA) + cal_learner = learner.base_learner + self.assertIsInstance(cal_learner, CalibratedLearner) + self.assertEqual(cal_learner.calibration_method, cal_learner.Isotonic) + self.assertIs(cal_learner.base_learner, self.widget.base_learner) + + widget.calibration = widget.NoCalibration + widget.threshold = widget.OptimizeCA + learner = self.widget.create_learner() + self.assertIsInstance(learner, ThresholdLearner) + self.assertEqual(learner.threshold_criterion, learner.OptimizeCA) + self.assertIs(learner.base_learner, self.widget.base_learner) + + widget.calibration = widget.IsotonicCalibration + widget.threshold = widget.NoThresholdOptimization + learner = self.widget.create_learner() + self.assertIsInstance(learner, CalibratedLearner) + self.assertEqual(learner.calibration_method, cal_learner.Isotonic) + self.assertIs(learner.base_learner, self.widget.base_learner) + + widget.calibration = widget.NoCalibration + widget.threshold = widget.NoThresholdOptimization + learner = self.widget.create_learner() + self.assertIs(learner, self.widget.base_learner) + + widget.calibration = widget.SigmoidCalibration + widget.threshold = widget.OptimizeF1 + widget.base_learner = None + learner = self.widget.create_learner() + self.assertIsNone(learner) + + def test_preprocessors(self): + widget = self.widget #: OWCalibratedLearner + self.widget.base_learner = Mock() + self.widget.base_learner.preprocessors = () + + widget.calibration = widget.SigmoidCalibration + widget.threshold = widget.OptimizeF1 + widget.preprocessors = Mock() + learner = self.widget.create_learner() + self.assertEqual(learner.preprocessors, (widget.preprocessors, )) + self.assertEqual(learner.base_learner.preprocessors, ()) + self.assertEqual(learner.base_learner.base_learner.preprocessors, ()) + + widget.calibration = widget.NoCalibration + widget.threshold = widget.NoThresholdOptimization + learner = self.widget.create_learner() + self.assertIsNot(learner, self.widget.base_learner) + self.assertFalse( + isinstance(learner, (CalibratedLearner, ThresholdLearner))) + self.assertEqual(learner.preprocessors, (widget.preprocessors, )) + + def test_set_learner_calls_unconditional_apply(self): + widget = self.widget + self.assertIsNotNone(self.get_output(widget.Outputs.learner)) + + widget.auto_apply = False + self.send_signal(widget.Inputs.base_learner, None) + 
self.assertIsNone(self.get_output(widget.Outputs.learner)) + + def test_name_changes(self): + widget = self.widget + widget.auto_apply = True + learner = NaiveBayesLearner() + learner.name = "foo" + self.send_signal(widget.Inputs.base_learner, learner) + + widget.calibration = widget.IsotonicCalibration + widget.threshold = widget.OptimizeCA + widget.controls.calibration.group.buttonClicked[int].emit( + widget.IsotonicCalibration) + + learner = self.get_output(widget.Outputs.learner) + self.assertEqual(learner.name, "Foo + Isotonic + CA") + + widget.calibration = widget.NoCalibration + widget.threshold = widget.OptimizeCA + widget.controls.calibration.group.buttonClicked[int].emit( + widget.NoCalibration) + learner = self.get_output(widget.Outputs.learner) + self.assertEqual(learner.name, "Foo + CA") + + self.send_signal(widget.Inputs.base_learner, None) + self.assertEqual(widget.controls.learner_name.placeholderText(), + "Calibrated learner") From 6ac1db1995e011f08328706779e40ba9c9ca326c Mon Sep 17 00:00:00 2001 From: janezd Date: Mon, 17 Jun 2019 22:19:54 +0200 Subject: [PATCH 15/21] Calibration plot: Add context settings --- Orange/widgets/evaluate/contexthandlers.py | 63 +++++++------------- Orange/widgets/evaluate/owcalibrationplot.py | 12 +++- 2 files changed, 32 insertions(+), 43 deletions(-) diff --git a/Orange/widgets/evaluate/contexthandlers.py b/Orange/widgets/evaluate/contexthandlers.py index d79def2ca60..3ad2796698d 100644 --- a/Orange/widgets/evaluate/contexthandlers.py +++ b/Orange/widgets/evaluate/contexthandlers.py @@ -1,47 +1,30 @@ +from Orange.data import Variable from Orange.widgets import settings -from Orange.widgets.utils import getdeepattr class EvaluationResultsContextHandler(settings.ContextHandler): - def __init__(self, targetAttr, selectedAttr): - super().__init__() - self.targetAttr, self.selectedAttr = targetAttr, selectedAttr + """Context handler for evaluation results""" - #noinspection PyMethodOverriding - def match(self, context, cnames, cvalues): - return (cnames, cvalues) == ( - context.classifierNames, context.classValues) and 2 + def open_context(self, widget, classes, classifier_names): + if isinstance(classes, Variable): + if classes.is_discrete: + classes = classes.values + else: + classes = None + super().open_context(widget, classes, classifier_names) - def fast_save(self, widget, name, value): - context = widget.current_context - if name == self.targetAttr: - context.targetClass = value - elif name == self.selectedAttr: - context.selectedClassifiers = list(value) + def new_context(self, classes, classifier_names): + context = super().new_context() + context.classes = classes + context.classifier_names = classifier_names + return context - def settings_from_widget(self, widget, *args): - super().settings_from_widget(widget, *args) - context = widget.current_context - context.targetClass = getdeepattr(widget, self.targetAttr) - context.selectedClassifiers = list(getdeepattr(self.selectedAttr)) - - def settings_to_widget(self, widget, *args): - super().settings_to_widget(widget, *args) - context = widget.current_context - if context.targetClass is not None: - setattr(widget, self.targetAttr, context.targetClass) - if context.selectedClassifiers is not None: - setattr(widget, self.selectedAttr, context.selectedClassifiers) - - #noinspection PyMethodOverriding - def find_or_create_context(self, widget, results): - cnames = [c.name for c in results.classifiers] - cvalues = results.classValues - context, isNew = super().find_or_create_context( - widget, 
results.classifierNames, results.classValues) - if isNew: - context.classifierNames = results.classifierNames - context.classValues = results.classValues - context.selectedClassifiers = None - context.targetClass = None - return context, isNew + def match(self, context, classes, classifier_names): + if classifier_names != context.classifier_names: + return self.NO_MATCH + elif isinstance(classes, Variable) and classes.is_continuous: + return (self.PERFECT_MATCH if context.classes is None + else self.NO_MATCH) + else: + return (self.PERFECT_MATCH if context.classes == classes + else self.NO_MATCH) diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py index 63c1b0190a2..a38ecd30ed0 100644 --- a/Orange/widgets/evaluate/owcalibrationplot.py +++ b/Orange/widgets/evaluate/owcalibrationplot.py @@ -12,6 +12,8 @@ from Orange.evaluation import Results from Orange.evaluation.performance_curves import Curves from Orange.widgets import widget, gui, settings +from Orange.widgets.evaluate.contexthandlers import \ + EvaluationResultsContextHandler from Orange.widgets.evaluate.utils import \ check_results_adequacy, results_for_preview from Orange.widgets.utils import colorpalette, colorbrewer @@ -84,9 +86,9 @@ class Information(widget.OWWidget.Information): no_out + "select a single model - the widget can output only one") non_binary_class = Msg(no_out + "cannot calibrate non-binary classes") - - target_index = settings.Setting(0) - selected_classifiers = settings.Setting([]) + settingsHandler = EvaluationResultsContextHandler() + target_index = settings.ContextSetting(0) + selected_classifiers = settings.ContextSetting([]) score = settings.Setting(0) output_calibration = settings.Setting(0) fold_curves = settings.Setting(False) @@ -168,6 +170,7 @@ def __init__(self): @Inputs.evaluation_results def set_results(self, results): + self.closeContext() self.clear() results = check_results_adequacy(results, self.Error, check_nan=False) if results is not None and not results.actual.size: @@ -177,6 +180,9 @@ def set_results(self, results): self.results = results if self.results is not None: self._initialize(results) + class_var = self.results.domain.class_var + self.target_index = int(len(class_var.values) == 2) + self.openContext(class_var, self.classifier_names) self._replot() self.apply() From 2edcb391aa185de53efd6da1d6f5f991c56e8f97 Mon Sep 17 00:00:00 2001 From: janezd Date: Tue, 18 Jun 2019 22:52:32 +0200 Subject: [PATCH 16/21] OWCalibration Plot: Unit tests and some fixes --- Orange/evaluation/testing.py | 4 +- Orange/tests/test_evaluation_testing.py | 2 +- Orange/widgets/evaluate/owcalibrationplot.py | 62 +- Orange/widgets/evaluate/tests/base.py | 2 +- .../evaluate/tests/test_owcalibrationplot.py | 543 +++++++++++++++++- 5 files changed, 575 insertions(+), 38 deletions(-) diff --git a/Orange/evaluation/testing.py b/Orange/evaluation/testing.py index 22a917283e0..93c0d563238 100644 --- a/Orange/evaluation/testing.py +++ b/Orange/evaluation/testing.py @@ -171,7 +171,7 @@ def set_or_raise(value, exp_values, msg): "mismatching number of class values") nmethods = set_or_raise( nmethods, [learners is not None and len(learners), - models is not None and len(models), + models is not None and models.shape[1], failed is not None and len(failed), predicted is not None and predicted.shape[0], probabilities is not None and probabilities.shape[0]], @@ -365,7 +365,7 @@ def __new__(cls, "and train_data are omitted") return self - warn("calling Validation's constructor 
with data and learners" + warn("calling Validation's constructor with data and learners " "is deprecated;\nconstruct an instance and call it", DeprecationWarning, stacklevel=2) diff --git a/Orange/tests/test_evaluation_testing.py b/Orange/tests/test_evaluation_testing.py index 3bc21d3f2e8..a5f78cb2972 100644 --- a/Orange/tests/test_evaluation_testing.py +++ b/Orange/tests/test_evaluation_testing.py @@ -756,7 +756,7 @@ def setUp(self): self.row_indices = np.arange(100) self.folds = (range(50), range(10, 60)), (range(50, 100), range(50)) self.learners = [MajorityLearner(), MajorityLearner()] - self.models = [Mock(), Mock()] + self.models = np.array([[Mock(), Mock()]]) self.predicted = np.zeros((2, 100)) self.probabilities = np.zeros((2, 100, 3)) self.failed = [False, True] diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py index a38ecd30ed0..e3b828fd2e2 100644 --- a/Orange/widgets/evaluate/owcalibrationplot.py +++ b/Orange/widgets/evaluate/owcalibrationplot.py @@ -14,8 +14,7 @@ from Orange.widgets import widget, gui, settings from Orange.widgets.evaluate.contexthandlers import \ EvaluationResultsContextHandler -from Orange.widgets.evaluate.utils import \ - check_results_adequacy, results_for_preview +from Orange.widgets.evaluate.utils import results_for_preview from Orange.widgets.utils import colorpalette, colorbrewer from Orange.widgets.utils.widgetpreview import WidgetPreview from Orange.widgets.widget import Input, Output, Msg @@ -72,7 +71,8 @@ class Inputs: class Outputs: calibrated_model = Output("Calibrated Model", Model) - class Warning(widget.OWWidget.Warning): + class Error(widget.OWWidget.Error): + non_discrete_target = Msg("Calibration plot requires a discrete target") empty_input = widget.Msg("Empty result on input. 
Nothing to display.") class Information(widget.OWWidget.Information): @@ -84,7 +84,8 @@ class Information(widget.OWWidget.Information): "try testing on separate data or on training data") no_output_multiple_selected = Msg( no_out + "select a single model - the widget can output only one") - non_binary_class = Msg(no_out + "cannot calibrate non-binary classes") + no_output_non_binary_class = Msg( + no_out + "cannot calibrate non-binary classes") settingsHandler = EvaluationResultsContextHandler() target_index = settings.ContextSetting(0) @@ -145,8 +146,8 @@ def __init__(self): btnLabels=("Sigmoid calibration", "Isotonic calibration"), label="Output model calibration", callback=self.apply) - box = gui.widgetBox(self.controlArea, "Info") - self.info_label = gui.widgetLabel(box) + self.info_box = gui.widgetBox(self.controlArea, "Info") + self.info_label = gui.widgetLabel(self.info_box) gui.auto_commit( self.controlArea, self, "auto_commit", "Apply", commit=self.apply) @@ -159,6 +160,10 @@ def __init__(self): for axis_name in ("bottom", "left"): axis = self.plot.getAxis(axis_name) axis.setPen(pg.mkPen(color=0.0)) + # Remove the condition (that is, allow setting this for bottom + # axis) when pyqtgraph is fixed + # Issue: https://github.com/pyqtgraph/pyqtgraph/issues/930 + # Pull request: https://github.com/pyqtgraph/pyqtgraph/pull/932 if axis_name != "bottom": # remove if when pyqtgraph is fixed axis.setStyle(stopAxisAtTick=(True, True)) @@ -172,11 +177,14 @@ def __init__(self): def set_results(self, results): self.closeContext() self.clear() - results = check_results_adequacy(results, self.Error, check_nan=False) + self.Error.clear() + self.Information.clear() + if results is not None and not results.domain.has_discrete_class: + self.Error.non_discrete_target() + results = None if results is not None and not results.actual.size: - self.Warning.empty_input() - else: - self.Warning.empty_input.clear() + self.Error.empty_input() + results = None self.results = results if self.results is not None: self._initialize(results) @@ -219,8 +227,10 @@ def _set_explanation(self): if self.score == 0: self.controls.output_calibration.show() + self.info_box.hide() else: self.controls.output_calibration.hide() + self.info_box.show() axis = self.plot.getAxis("bottom") axis.setLabel("Predicted probability" if self.score == 0 @@ -230,23 +240,23 @@ def _set_explanation(self): axis.setLabel(Metrics[self.score].name) def _initialize(self, results): - N = len(results.predicted) + n = len(results.predicted) names = getattr(results, "learner_names", None) if names is None: - names = ["#{}".format(i + 1) for i in range(N)] + names = ["#{}".format(i + 1) for i in range(n)] self.classifier_names = names scheme = colorbrewer.colorSchemes["qualitative"]["Dark2"] - if N > len(scheme): + if n > len(scheme): scheme = colorpalette.DefaultRGBColors - self.colors = colorpalette.ColorPaletteGenerator(N, scheme) + self.colors = colorpalette.ColorPaletteGenerator(n, scheme) - for i in range(N): + for i in range(n): item = self.classifiers_list_box.item(i) item.setIcon(colorpalette.ColorPixmap(self.colors[i])) - self.selected_classifiers = list(range(N)) - self.target_cb.addItems(results.data.domain.class_var.values) + self.selected_classifiers = list(range(n)) + self.target_cb.addItems(results.domain.class_var.values) def _rug(self, data, pen_args): color = pen_args["pen"].color() @@ -288,7 +298,6 @@ def _prob_curve(self, ytrue, probs, pen_args): y = np.full(100, xmax) self.plot.plot(x, y, symbol="+", symbolSize=4, **pen_args) - 
self.plot.plot([0, 1], [0, 1], antialias=True) return x, (y, ) def _setup_plot(self): @@ -326,6 +335,9 @@ def _setup_plot(self): self.plot_metrics(Curves(fold_ytrue, fold_probs), metrics, pen_args) + if self.score == 0: + self.plot.plot([0, 1], [0, 1], antialias=True) + def _replot(self): self.plot.clear() if self.results is not None: @@ -379,7 +391,7 @@ def _update_info(self): for curve in curves) text += "" text += "
Threshold: p={n}
{name}:
" - self.info_label.setText(text) + self.info_label.setText(text) def threshold_change_done(self): self.apply() @@ -395,7 +407,7 @@ def apply(self): info.no_output_no_models: results.models is None, info.no_output_multiple_selected: len(self.selected_classifiers) != 1, - info.non_binary_class: + info.no_output_non_binary_class: self.score != 0 and len(results.domain.class_var.values) != 2} if not any(problems.values()): @@ -419,11 +431,19 @@ def apply(self): def send_report(self): if self.results is None: return + self.report_items(( + ("Target class", self.target_cb.currentText()), + ("Output model calibration", + self.score == 0 and self.controls.score.currentText()), + )) caption = report.list_legend(self.classifiers_list_box, self.selected_classifiers) - self.report_items((("Target class", self.target_cb.currentText()),)) self.report_plot() self.report_caption(caption) + self.report_caption(self.controls.score.currentText()) + + if self.score != 0: + self.report_raw(self.info_label.text()) def gaussian_smoother(x, y, sigma=1.0): diff --git a/Orange/widgets/evaluate/tests/base.py b/Orange/widgets/evaluate/tests/base.py index 3100f1e1905..93fafea1e51 100644 --- a/Orange/widgets/evaluate/tests/base.py +++ b/Orange/widgets/evaluate/tests/base.py @@ -17,6 +17,6 @@ def test_many_evaluation_results(self): classification.NaiveBayesLearner(), classification.SGDClassificationLearner() ] - res = evaluation.CrossValidation(data, learners, k=2, store_data=True) + res = evaluation.CrossValidation(k=2, store_data=True)(data, learners) # this is a mixin; pylint: disable=no-member self.send_signal("Evaluation Results", res) diff --git a/Orange/widgets/evaluate/tests/test_owcalibrationplot.py b/Orange/widgets/evaluate/tests/test_owcalibrationplot.py index ac07e8a2fff..21cc067e50e 100644 --- a/Orange/widgets/evaluate/tests/test_owcalibrationplot.py +++ b/Orange/widgets/evaluate/tests/test_owcalibrationplot.py @@ -1,11 +1,18 @@ import copy import warnings +from unittest.mock import Mock, patch + +import numpy as np +from AnyQt.QtCore import QItemSelection +from pyqtgraph import InfiniteLine from sklearn.exceptions import ConvergenceWarning -from Orange.data import Table +from Orange.data import Table, DiscreteVariable, Domain, ContinuousVariable import Orange.evaluation import Orange.classification +from Orange.evaluation import Results +from Orange.evaluation.performance_curves import Curves from Orange.widgets.evaluate.tests.base import EvaluateTest from Orange.widgets.evaluate.owcalibrationplot import OWCalibrationPlot @@ -18,26 +25,536 @@ class TestOWCalibrationPlot(WidgetTest, EvaluateTest): def setUpClass(cls): super().setUpClass() cls.lenses = data = Table(test_filename("datasets/lenses.tab")) - cls.res = Orange.evaluation.TestOnTestData( - train_data=data[::2], test_data=data[1::2], - learners=[Orange.classification.MajorityLearner(), - Orange.classification.KNNLearner()], - store_data=True, - ) + majority = Orange.classification.MajorityLearner() + majority.name = "majority" + knn3 = Orange.classification.KNNLearner(n_neighbors=3) + knn3.name = "knn-3" + knn1 = Orange.classification.KNNLearner(n_neighbors=1) + knn1.name = "knn-1" + cls.lenses_results = Orange.evaluation.TestOnTestData( + store_data=True, store_models=True)( + data=data[::2], test_data=data[1::2], + learners=[majority, knn3, knn1]) + cls.lenses_results.learner_names = ["majority", "knn-3", "knn-1"] def setUp(self): super().setUp() + + n, p = (0, 1) + actual, probs = np.array([ + (p, .8), (n, .7), (p, .6), (p, .55), (p, .54), (n, 
.53), (n, .52), + (p, .51), (n, .505), (p, .4), (n, .39), (p, .38), (n, .37), + (n, .36), (n, .35), (p, .34), (n, .33), (p, .30), (n, .1)]).T + self.curves = Curves(actual, probs) + probs2 = (probs + 0.5) / 2 + 1 + self.curves2 = Curves(actual, probs2) + pred = probs > 0.5 + pred2 = probs2 > 0.5 + probs = np.vstack((1 - probs, probs)).T + probs2 = np.vstack((1 - probs2, probs2)).T + domain = Domain([], DiscreteVariable("y", values=("a", "b"))) + self.results = Results( + domain=domain, + actual=actual, + folds=(Ellipsis, ), + models=np.array([[Mock(), Mock()]]), + row_indices=np.arange(19), + predicted=np.array((pred, pred2)), + probabilities=np.array([probs, probs2])) + self.widget = self.create_widget(OWCalibrationPlot) # type: OWCalibrationPlot warnings.filterwarnings("ignore", ".*", ConvergenceWarning) - def test_basic(self): - self.send_signal(self.widget.Inputs.evaluation_results, self.res) - self.widget.controls.display_rug.click() + def test_initialization(self): + """Test initialization of lists and combos""" + def check_clsfr_names(names): + self.assertEqual(widget.classifier_names, names) + clsf_list = widget.controls.selected_classifiers + self.assertEqual( + [clsf_list.item(i).text() for i in range(clsf_list.count())], + names) + + widget = self.widget + tcomb = widget.controls.target_index + + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + check_clsfr_names(["majority", "knn-3", "knn-1"]) + self.assertEqual(widget.selected_classifiers, [0, 1, 2]) + self.assertEqual( + [tcomb.itemText(i) for i in range(tcomb.count())], + self.lenses.domain.class_var.values) + self.assertEqual(widget.target_index, 0) + + self.send_signal(widget.Inputs.evaluation_results, self.results) + check_clsfr_names(["#1", "#2"]) + self.assertEqual(widget.selected_classifiers, [0, 1]) + self.assertEqual( + [tcomb.itemText(i) for i in range(tcomb.count())], ["a", "b"]) + self.assertEqual(widget.target_index, 1) - def test_empty(self): - res = copy.copy(self.res) + self.send_signal(widget.Inputs.evaluation_results, None) + check_clsfr_names([]) + self.assertEqual(widget.selected_classifiers, []) + self.assertEqual(widget.controls.target_index.count(), 0) + + def test_empty_input_error(self): + """Show an error when data is present but empty""" + widget = self.widget + + res = copy.copy(self.results) res.row_indices = res.row_indices[:0] res.actual = res.actual[:0] res.predicted = res.predicted[:, 0] res.probabilities = res.probabilities[:, :0, :] - self.send_signal(self.widget.Inputs.evaluation_results, res) + self.send_signal(widget.Inputs.evaluation_results, self.results) + self.assertFalse(widget.Error.empty_input.is_shown()) + self.assertTrue(bool(widget.plot.items)) + + self.send_signal(widget.Inputs.evaluation_results, res) + self.assertTrue(widget.Error.empty_input.is_shown()) + self.assertIsNone(widget.results) + self.assertFalse(bool(widget.plot.items)) + + self.send_signal(widget.Inputs.evaluation_results, self.results) + self.assertFalse(widget.Error.empty_input.is_shown()) + self.assertTrue(bool(widget.plot.items)) + + def test_regression_input_error(self): + """Show an error for regression data""" + widget = self.widget + + res = copy.copy(self.results) + res.domain = Domain([], ContinuousVariable("y")) + res.row_indices = res.row_indices[:0] + res.actual = res.actual[:0] + res.predicted = res.predicted[:, 0] + res.probabilities = res.probabilities[:, :0, :] + self.send_signal(widget.Inputs.evaluation_results, self.results) + 
self.assertFalse(widget.Error.non_discrete_target.is_shown()) + self.assertTrue(bool(widget.plot.items)) + + self.send_signal(widget.Inputs.evaluation_results, res) + self.assertTrue(widget.Error.non_discrete_target.is_shown()) + self.assertIsNone(widget.results) + self.assertFalse(bool(widget.plot.items)) + + self.send_signal(widget.Inputs.evaluation_results, self.results) + self.assertFalse(widget.Error.non_discrete_target.is_shown()) + self.assertTrue(bool(widget.plot.items)) + + @staticmethod + def _set_combo(combo, val): + combo.setCurrentIndex(val) + combo.activated[int].emit(val) + combo.activated[str].emit(combo.currentText()) + + @staticmethod + def _set_radio_buttons(radios, val): + radios.buttons[val].click() + + @staticmethod + def _set_list_selection(listview, selection): + model = listview.model() + selectionmodel = listview.selectionModel() + itemselection = QItemSelection() + for item in selection: + itemselection.select(model.index(item, 0), model.index(item, 0)) + selectionmodel.select(itemselection, selectionmodel.ClearAndSelect) + + def _set_threshold(self, pos, done): + _, line = self._get_curves() + line.setPos(pos) + if done: + line.sigPositionChangeFinished.emit(line) + else: + line.sigPositionChanged.emit(line) + + def _get_curves(self): + plot_items = self.widget.plot.items[:] + for i, item in enumerate(plot_items): + if isinstance(item, InfiniteLine): + del plot_items[i] + return plot_items, item + return plot_items, None + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_plotting_curves(self, *_): + """Curve coordinates match those computed by `Curves`""" + widget = self.widget + widget.display_rug = False + self.send_signal(widget.Inputs.evaluation_results, self.results) + widget.selected_classifiers = [0] + combo = widget.controls.score + + c = self.curves + combinations = ([c.ca()], + [c.f1()], + [c.sensitivity(), c.specificity()], + [c.precision(), c.recall()], + [c.ppv(), c.npv()], + [c.tpr(), c.fpr()]) + for idx, curves_data in enumerate(combinations, start=1): + self._set_combo(combo, idx) + curves, line = self._get_curves() + self.assertEqual(len(curves), len(curves_data)) + self.assertIsNotNone(line) + for curve in curves: + x, y = curve.getData() + np.testing.assert_almost_equal(x, self.curves.probs) + for i, curve_data in enumerate(curves_data): + if np.max(curve_data - y) < 1e-6: + del curves_data[i] + break + else: + self.fail(f"invalid curve for {combo.currentText()}") + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_multiple_fold_curves(self, *_): + widget = self.widget + widget.display_rug = False + widget.fold_curves = False + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_list_selection(widget.controls.selected_classifiers, [0]) + self._set_combo(widget.controls.score, 1) # CA + + self.results.folds = [slice(1, 5), slice(5, 19)] + self.results.models = np.array([[Mock(), Mock()]] * 2) + curves, _ = self._get_curves() + self.assertEqual(len(curves), 1) + + widget.controls.fold_curves.click() + curves, _ = self._get_curves() + self.assertEqual(len(curves), 3) + + widget.controls.fold_curves.click() + curves, _ = self._get_curves() + self.assertEqual(len(curves), 1) + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + 
@patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_change_target_class(self, *_): + """Changing target combo changes the curves""" + widget = self.widget + widget.display_rug = False + self.send_signal(widget.Inputs.evaluation_results, self.results) + widget.selected_classifiers = [0] + score_combo = widget.controls.score + target_combo = widget.controls.target_index + + self._set_combo(score_combo, 1) # ca + self._set_combo(target_combo, 1) + (ca, ), _ = self._get_curves() + np.testing.assert_almost_equal(ca.getData()[1], self.curves.ca()) + + self._set_combo(target_combo, 0) + (ca, ), _ = self._get_curves() + curves = Curves(1 - self.curves.ytrue, 1 - self.curves.probs[:-1]) + np.testing.assert_almost_equal(ca.getData()[1], curves.ca()) + + def test_changing_score_explanation(self): + """Changing score hides/shows explanation and options for calibration""" + widget = self.widget + score_combo = widget.controls.score + explanation = widget.explanation + calibrations = widget.controls.output_calibration + + self._set_combo(score_combo, 1) # ca + self.assertTrue(explanation.isHidden()) + self.assertTrue(calibrations.isHidden()) + + self._set_combo(score_combo, 0) # calibration + self.assertTrue(explanation.isHidden()) + self.assertFalse(calibrations.isHidden()) + + self._set_combo(score_combo, 3) # sens/spec + self.assertFalse(explanation.isHidden()) + self.assertTrue(calibrations.isHidden()) + + def test_rug(self): + """Test rug appearance and positions""" + def get_rugs(): + rugs = [None, None] + for item in widget.plot.items: + if item.curve.opts.get("connect", "") == "pairs": + x, y = item.getData() + np.testing.assert_almost_equal(x[::2], x[1::2]) + rugs[int(y[0] == 1)] = x[::2] + return rugs + + widget = self.widget + widget.display_rug = True + model_list = widget.controls.selected_classifiers + self.send_signal(widget.Inputs.evaluation_results, self.results) + + self._set_list_selection(model_list, [0]) + probs = self.curves.probs[:-1] + truex = probs[self.curves.ytrue == 1] + falsex = probs[self.curves.ytrue == 0] + bottom, top = get_rugs() + np.testing.assert_almost_equal(bottom, falsex) + np.testing.assert_almost_equal(top, truex) + + # Switching targets should switch rugs and takes other probabilities + self._set_combo(widget.controls.target_index, 0) + bottom, top = get_rugs() + np.testing.assert_almost_equal(bottom, (1 - truex)[::-1]) + np.testing.assert_almost_equal(top, (1 - falsex)[::-1]) + self._set_combo(widget.controls.target_index, 1) + + # Changing models gives a different rug + self._set_list_selection(model_list, [1]) + probs2 = self.curves2.probs[:-1] + truex2 = probs2[self.curves2.ytrue == 1] + falsex2 = probs2[self.curves2.ytrue == 0] + bottom, top = get_rugs() + np.testing.assert_almost_equal(bottom, falsex2) + np.testing.assert_almost_equal(top, truex2) + + # Two models - two rugs - four rug items + self._set_list_selection(model_list, [0, 1]) + self.assertEqual(sum(item.curve.opts.get("connect", "") == "pairs" + for item in widget.plot.items), 4) + + # No models - no rugs + self._set_list_selection(model_list, []) + self.assertEqual(get_rugs(), [None, None]) + + # Bring the rug back + self._set_list_selection(model_list, [1]) + self.assertIsNotNone(get_rugs()[0]) + + # Disable it with checkbox + widget.controls.display_rug.click() + self.assertEqual(get_rugs(), [None, None]) + + def test_calibration_curve(self): + """Test the correct number of calibration curves""" + widget = self.widget + model_list = 
widget.controls.selected_classifiers + widget.display_rug = False + + self.send_signal(widget.Inputs.evaluation_results, self.results) + self.assertEqual(len(widget.plot.items), 3) # 2 + diagonal + + self._set_list_selection(model_list, [1]) + self.assertEqual(len(widget.plot.items), 2) + + self._set_list_selection(model_list, []) + self.assertEqual(len(widget.plot.items), 1) + + def test_threshold_change_updates_info(self): + """Changing the threshold updates info label""" + widget = self.widget + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_combo(widget.controls.score, 1) + + original_text = widget.info_label.text() + self._set_threshold(0.3, False) + self.assertNotEqual(widget.info_label.text(), original_text) + + def test_threshold_rounding(self): + """Threshold is rounded to two decimals""" + widget = self.widget + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_combo(widget.controls.score, 1) + self._set_threshold(0.367, False) + self.assertAlmostEqual(widget.threshold, 0.37) + + def test_threshold_flips_on_two_classes(self): + """Threshold changes to 1 - threshold if *binary* class is switched""" + widget = self.widget + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_combo(widget.controls.target_index, 0) + self._set_combo(widget.controls.score, 1) # CA + self._set_threshold(0.25, False) + self.assertEqual(widget.threshold, 0.25) + self._set_combo(widget.controls.target_index, 1) + self.assertEqual(widget.threshold, 0.75) + + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + self._set_combo(widget.controls.target_index, 0) + self._set_combo(widget.controls.score, 1) # CA + self._set_threshold(0.25, False) + self.assertEqual(widget.threshold, 0.25) + self._set_combo(widget.controls.target_index, 1) + self.assertEqual(widget.threshold, 0.25) + + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_apply_no_output(self, *_): + """Test no output warnings""" + widget = self.widget + model_list = widget.controls.selected_classifiers + + info = widget.Information + infos = (info.no_output_multiple_folds, + info.no_output_no_models, + info.no_output_multiple_selected, + info.no_output_non_binary_class) + multiple_folds, no_models, multiple_selected, non_binary_class = infos + + def test_shown(shown): + for info in infos: + self.assertEqual( + info.is_shown(), info in shown, + f"{info} is unexpectedly " + f"{'' if info.is_shown() else 'not'} shown") + output = self.get_output(widget.Outputs.calibrated_model) + if shown: + self.assertIsNone(output) + else: + self.assertIsNotNone(output) + + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_combo(widget.controls.score, 1) # CA + test_shown({multiple_selected}) + + self._set_list_selection(model_list, [0]) + test_shown(()) + self._set_list_selection(model_list, [0, 1]) + + self.results.models = None + self.send_signal(widget.Inputs.evaluation_results, self.results) + test_shown({multiple_selected, no_models}) + + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + test_shown({multiple_selected, non_binary_class}) + + self._set_list_selection(model_list, [0]) + test_shown({non_binary_class}) + + self.results.folds = [slice(0, 5), slice(5, 10), slice(10, 19)] + self.results.models = np.array([[Mock(), Mock()]] * 3) + + self.send_signal(widget.Inputs.evaluation_results, self.results) 
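+        # Three folds mean three models per learner in results.models, so
+        # the widget cannot pick a single model to send downstream; expect
+        # the multiple-folds reason, plus multiple-selected while both
+        # classifiers are still selected.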
+ test_shown({multiple_selected, multiple_folds}) + + self._set_list_selection(model_list, [0]) + test_shown({multiple_folds}) + + self._set_combo(widget.controls.score, 0) # calibration + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + self._set_list_selection(model_list, [0, 1]) + test_shown({multiple_selected}) + self._set_list_selection(model_list, [0]) + test_shown(()) + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + def test_output_threshold_classifier(self, threshold_classifier): + """Test threshold classifier on output""" + widget = self.widget + model_list = widget.controls.selected_classifiers + models = self.results.models.ravel() + target_combo = widget.controls.target_index + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_list_selection(model_list, [0]) + widget.target_index = 1 + + widget.threshold = 0.3 + self._set_combo(widget.controls.score, 1) # CA + model = self.get_output(widget.Outputs.calibrated_model) + threshold_classifier.assert_called_with(models[0], 0.3) + self.assertIs(model, threshold_classifier.return_value) + threshold_classifier.reset_mock() + + widget.auto_commit = True + self._set_threshold(0.4, False) + threshold_classifier.assert_not_called() + + widget.auto_commit = False + self._set_threshold(0.35, True) + threshold_classifier.assert_not_called() + + widget.auto_commit = True + self._set_threshold(0.4, True) + threshold_classifier.assert_called_with(models[0], 0.4) + self.assertIs(model, threshold_classifier.return_value) + threshold_classifier.reset_mock() + + self._set_combo(target_combo, 0) + threshold_classifier.assert_called_with(models[0], 0.4) + self.assertIs(model, threshold_classifier.return_value) + threshold_classifier.reset_mock() + + self._set_combo(target_combo, 1) + threshold_classifier.assert_called_with(models[0], 0.4) + self.assertIs(model, threshold_classifier.return_value) + threshold_classifier.reset_mock() + + self._set_list_selection(model_list, [1]) + threshold_classifier.assert_called_with(models[1], 0.4) + self.assertIs(model, threshold_classifier.return_value) + threshold_classifier.reset_mock() + + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_output_calibrated_classifier(self, calibrated_learner): + """Test calibrated classifier on output""" + calibrated_instance = calibrated_learner.return_value + get_model = calibrated_instance.get_model + + widget = self.widget + model_list = widget.controls.selected_classifiers + models = self.lenses_results.models.ravel() + results = self.lenses_results + self.send_signal(widget.Inputs.evaluation_results, results) + self._set_combo(widget.controls.score, 0) + + self._set_list_selection(model_list, [1]) + + self._set_radio_buttons(widget.controls.output_calibration, 0) + calibrated_learner.assert_called_with(None, 0) + model, actual, probabilities = get_model.call_args[0] + self.assertIs(model, models[1]) + np.testing.assert_equal(actual, results.actual) + np.testing.assert_equal(probabilities, results.probabilities[1]) + self.assertIs(self.get_output(widget.Outputs.calibrated_model), + get_model.return_value) + calibrated_learner.reset_mock() + get_model.reset_mock() + + self._set_radio_buttons(widget.controls.output_calibration, 1) + calibrated_learner.assert_called_with(None, 1) + model, actual, probabilities = get_model.call_args[0] + self.assertIs(model, models[1]) + np.testing.assert_equal(actual, results.actual) + np.testing.assert_equal(probabilities, 
results.probabilities[1]) + self.assertIs(self.get_output(widget.Outputs.calibrated_model), + get_model.return_value) + calibrated_learner.reset_mock() + get_model.reset_mock() + + self._set_list_selection(model_list, [0]) + self._set_radio_buttons(widget.controls.output_calibration, 1) + calibrated_learner.assert_called_with(None, 1) + model, actual, probabilities = get_model.call_args[0] + self.assertIs(model, models[0]) + np.testing.assert_equal(actual, results.actual) + np.testing.assert_equal(probabilities, results.probabilities[0]) + self.assertIs(self.get_output(widget.Outputs.calibrated_model), + get_model.return_value) + calibrated_learner.reset_mock() + get_model.reset_mock() + + def test_contexts(self): + """Test storing and retrieving context settings""" + widget = self.widget + model_list = widget.controls.selected_classifiers + target_combo = widget.controls.target_index + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + self._set_list_selection(model_list, [0, 2]) + self._set_combo(target_combo, 2) + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_list_selection(model_list, [0]) + self._set_combo(target_combo, 0) + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + self.assertEqual(widget.selected_classifiers, [0, 2]) + self.assertEqual(widget.target_index, 2) + + def test_report(self): + """Test that report does not crash""" + widget = self.widget + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + widget.send_report() From 2049afae83b0fcf46a3733ff407572e6583f2b60 Mon Sep 17 00:00:00 2001 From: janezd Date: Wed, 19 Jun 2019 13:19:03 +0200 Subject: [PATCH 17/21] Calibration plot: Test missing probabilities and single classes --- Orange/widgets/evaluate/owcalibrationplot.py | 159 +++++++++++------- .../evaluate/tests/test_owcalibrationplot.py | 142 ++++++++++++---- 2 files changed, 208 insertions(+), 93 deletions(-) diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py index e3b828fd2e2..637db82aa9b 100644 --- a/Orange/widgets/evaluate/owcalibrationplot.py +++ b/Orange/widgets/evaluate/owcalibrationplot.py @@ -74,18 +74,23 @@ class Outputs: class Error(widget.OWWidget.Error): non_discrete_target = Msg("Calibration plot requires a discrete target") empty_input = widget.Msg("Empty result on input. 
Nothing to display.") + nan_classes = \ + widget.Msg("Remove test data instances with unknown classes") + all_target_class = widget.Msg( + "All data instances belong to target class") + no_target_class = widget.Msg( + "No data instances belong to target class") + + class Warning(widget.OWWidget.Warning): + omitted_folds = widget.Msg( + "Test folds where all data belongs to (non)-target are not shown") + omitted_nan_prob_points = widget.Msg( + "Instance for which the model couldn't compute probabilities are" + "skipped") + no_valid_data = widget.Msg("No valid data for model(s) {}") class Information(widget.OWWidget.Information): - no_out = "Can't output a model: " - no_output_multiple_folds = Msg( - no_out + "each training data sample produces a different model") - no_output_no_models = Msg( - no_out + "test results do not contain stored models;\n" - "try testing on separate data or on training data") - no_output_multiple_selected = Msg( - no_out + "select a single model - the widget can output only one") - no_output_non_binary_class = Msg( - no_out + "cannot calibrate non-binary classes") + no_output = Msg("Can't output a model: {}") settingsHandler = EvaluationResultsContextHandler() target_index = settings.ContextSetting(0) @@ -179,19 +184,23 @@ def set_results(self, results): self.clear() self.Error.clear() self.Information.clear() - if results is not None and not results.domain.has_discrete_class: - self.Error.non_discrete_target() - results = None - if results is not None and not results.actual.size: - self.Error.empty_input() - results = None - self.results = results - if self.results is not None: - self._initialize(results) - class_var = self.results.domain.class_var - self.target_index = int(len(class_var.values) == 2) - self.openContext(class_var, self.classifier_names) - self._replot() + + self.results = None + if results is not None: + if not results.domain.has_discrete_class: + self.Error.non_discrete_target() + elif not results.actual.size: + self.Error.empty_input() + elif np.any(np.isnan(results.actual)): + self.Error.nan_classes() + else: + self.results = results + self._initialize(results) + class_var = self.results.domain.class_var + self.target_index = int(len(class_var.values) == 2) + self.openContext(class_var, self.classifier_names) + self._replot() + self.apply() def clear(self): @@ -286,9 +295,6 @@ def plot_metrics(self, data, metrics, pen_args): return data.probs, ys def _prob_curve(self, ytrue, probs, pen_args): - if not probs.size: - return None - xmin, xmax = probs.min(), probs.max() x = np.linspace(xmin, xmax, 100) if xmax != xmin: @@ -307,16 +313,25 @@ def _setup_plot(self): plot_folds = self.fold_curves and results.folds is not None self.scores = [] - ytrue = results.actual == target + if not self._check_class_presence(results.actual == target): + return + + self.Warning.omitted_folds.clear() + self.Warning.omitted_nan_prob_points.clear() + no_valid_models = [] + shadow_width = 4 + 4 * plot_folds for clsf in self.selected_classifiers: - probs = results.probabilities[clsf, :, target] + data = Curves.from_results(results, target, clsf) + if data.tot == 0: # all probabilities are nan + no_valid_models.append(clsf) + continue + if data.tot != results.probabilities.shape[1]: # some are nan + self.Warning.omitted_nan_prob_points() + color = self.colors[clsf] pen_args = dict( - pen=pg.mkPen(color, width=1), - shadowPen=pg.mkPen(color.lighter(160), - width=4 + 4 * plot_folds), - antiAlias=True) - data = Curves(ytrue, probs) + pen=pg.mkPen(color, width=1), 
antiAlias=True, + shadowPen=pg.mkPen(color.lighter(160), width=shadow_width)) self.scores.append( (self.classifier_names[clsf], self.plot_metrics(data, metrics, pen_args))) @@ -330,19 +345,20 @@ def _setup_plot(self): antiAlias=True) for fold in range(len(results.folds)): fold_results = results.get_fold(fold) - fold_ytrue = fold_results.actual == target - fold_probs = fold_results.probabilities[clsf, :, target] - self.plot_metrics(Curves(fold_ytrue, fold_probs), - metrics, pen_args) + fold_curve = Curves.from_results(fold_results, target, clsf) + # Can't check this before: p and n can be 0 because of + # nan probabilities + if fold_curve.p * fold_curve.n == 0: + self.Warning.omitted_folds() + self.plot_metrics(fold_curve, metrics, pen_args) + + if no_valid_models: + self.Warning.no_valid_data( + ", ".join(self.classifier_names[i] for i in no_valid_models)) if self.score == 0: self.plot.plot([0, 1], [0, 1], antialias=True) - - def _replot(self): - self.plot.clear() - if self.results is not None: - self._setup_plot() - if self.score != 0: + else: self.line = pg.InfiniteLine( pos=self.threshold, movable=True, pen=pg.mkPen(color="k", style=Qt.DashLine, width=2), @@ -350,8 +366,25 @@ def _replot(self): bounds=(0, 1), ) self.line.sigPositionChanged.connect(self.threshold_change) - self.line.sigPositionChangeFinished.connect(self.threshold_change_done) + self.line.sigPositionChangeFinished.connect( + self.threshold_change_done) self.plot.addItem(self.line) + + def _check_class_presence(self, ytrue): + self.Error.all_target_class.clear() + self.Error.no_target_class.clear() + if np.max(ytrue) == 0: + self.Error.no_target_class() + return False + if np.min(ytrue) == 1: + self.Error.all_target_class() + return False + return True + + def _replot(self): + self.plot.clear() + if self.results is not None: + self._setup_plot() self._update_info() def _on_display_rug_changed(self): @@ -380,10 +413,7 @@ def _update_info(self): {"".join(f"" for n in short_names)} """ - for name, probs_curves in self.scores: - if probs_curves is None: - continue - probs, curves = probs_curves + for name, (probs, curves) in self.scores: ind = min(np.searchsorted(probs, self.threshold), len(probs) - 1) text += f"" @@ -397,20 +427,28 @@ def threshold_change_done(self): self.apply() def apply(self): - info = self.Information + self.Information.no_output.clear() wrapped = None - problems = {} results = self.results if results is not None: - problems = { - info.no_output_multiple_folds: len(results.folds) > 1, - info.no_output_no_models: results.models is None, - info.no_output_multiple_selected: - len(self.selected_classifiers) != 1, - info.no_output_non_binary_class: - self.score != 0 - and len(results.domain.class_var.values) != 2} - if not any(problems.values()): + problems = [ + msg for condition, msg in ( + (len(results.folds) > 1, + "each training data sample produces a different model"), + (results.models is None, + "test results do not contain stored models - try testing on" + "separate data or on training data"), + (len(self.selected_classifiers) != 1, + "select a single model - the widget can output only one"), + (self.score != 0 and len(results.domain.class_var.values) != 2, + "cannot calibrate non-binary classes")) + if condition] + if len(problems) == 1: + self.Information.no_output(problems[0]) + elif problems: + self.Information.no_output( + "".join(f"\n - {problem}" for problem in problems)) + else: clsf_idx = self.selected_classifiers[0] model = results.models[0, clsf_idx] if self.score == 0: @@ -424,9 +462,6 @@ 
def apply(self): wrapped = ThresholdClassifier(model, threshold) self.Outputs.calibrated_model.send(wrapped) - for info, shown in problems.items(): - if info.is_shown() != shown: - info(shown=shown) def send_report(self): if self.results is None: diff --git a/Orange/widgets/evaluate/tests/test_owcalibrationplot.py b/Orange/widgets/evaluate/tests/test_owcalibrationplot.py index 21cc067e50e..2d28c050fa2 100644 --- a/Orange/widgets/evaluate/tests/test_owcalibrationplot.py +++ b/Orange/widgets/evaluate/tests/test_owcalibrationplot.py @@ -21,22 +21,6 @@ class TestOWCalibrationPlot(WidgetTest, EvaluateTest): - @classmethod - def setUpClass(cls): - super().setUpClass() - cls.lenses = data = Table(test_filename("datasets/lenses.tab")) - majority = Orange.classification.MajorityLearner() - majority.name = "majority" - knn3 = Orange.classification.KNNLearner(n_neighbors=3) - knn3.name = "knn-3" - knn1 = Orange.classification.KNNLearner(n_neighbors=1) - knn1.name = "knn-1" - cls.lenses_results = Orange.evaluation.TestOnTestData( - store_data=True, store_models=True)( - data=data[::2], test_data=data[1::2], - learners=[majority, knn3, knn1]) - cls.lenses_results.learner_names = ["majority", "knn-3", "knn-1"] - def setUp(self): super().setUp() @@ -56,12 +40,25 @@ def setUp(self): self.results = Results( domain=domain, actual=actual, - folds=(Ellipsis, ), + folds=np.array([Ellipsis]), models=np.array([[Mock(), Mock()]]), row_indices=np.arange(19), predicted=np.array((pred, pred2)), probabilities=np.array([probs, probs2])) + self.lenses = data = Table(test_filename("datasets/lenses.tab")) + majority = Orange.classification.MajorityLearner() + majority.name = "majority" + knn3 = Orange.classification.KNNLearner(n_neighbors=3) + knn3.name = "knn-3" + knn1 = Orange.classification.KNNLearner(n_neighbors=1) + knn1.name = "knn-1" + self.lenses_results = Orange.evaluation.TestOnTestData( + store_data=True, store_models=True)( + data=data[::2], test_data=data[1::2], + learners=[majority, knn3, knn1]) + self.lenses_results.learner_names = ["majority", "knn-3", "knn-1"] + self.widget = self.create_widget(OWCalibrationPlot) # type: OWCalibrationPlot warnings.filterwarnings("ignore", ".*", ConvergenceWarning) @@ -389,24 +386,31 @@ def test_apply_no_output(self, *_): widget = self.widget model_list = widget.controls.selected_classifiers - info = widget.Information - infos = (info.no_output_multiple_folds, - info.no_output_no_models, - info.no_output_multiple_selected, - info.no_output_non_binary_class) - multiple_folds, no_models, multiple_selected, non_binary_class = infos + multiple_folds, multiple_selected, no_models, non_binary_class = "abcd" + messages = { + multiple_folds: + "each training data sample produces a different model", + no_models: + "test results do not contain stored models - try testing on" + "separate data or on training data", + multiple_selected: + "select a single model - the widget can output only one", + non_binary_class: + "cannot calibrate non-binary classes"} def test_shown(shown): - for info in infos: - self.assertEqual( - info.is_shown(), info in shown, - f"{info} is unexpectedly " - f"{'' if info.is_shown() else 'not'} shown") + widget_msg = widget.Information.no_output output = self.get_output(widget.Outputs.calibrated_model) - if shown: - self.assertIsNone(output) - else: + if not shown: + self.assertFalse(widget_msg.is_shown()) self.assertIsNotNone(output) + else: + self.assertTrue(widget_msg.is_shown()) + self.assertIsNone(output) + for msg_id in shown: + msg = messages[msg_id] + 
self.assertIn(msg, widget_msg.formatted, + f"{msg} not included in the message") self.send_signal(widget.Inputs.evaluation_results, self.results) self._set_combo(widget.controls.score, 1) # CA @@ -558,3 +562,79 @@ def test_report(self): widget = self.widget self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) widget.send_report() + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_single_class(self, *_): + """Curves are not plotted if all data belongs to (non)-target""" + def check_error(shown): + for error in (errors.no_target_class, errors.all_target_class, + errors.nan_classes): + self.assertEqual(error.is_shown(), error is shown, + f"{error} is unexpectedly" + f"{'' if error.is_shown() else ' not'} shown") + if shown is not None: + self.assertEqual(len(widget.plot.items), 0) + else: + self.assertGreater(len(widget.plot.items), 0) + + widget = self.widget + errors = widget.Error + widget.display_rug = True + combo = widget.controls.score + + original_actual = self.results.actual.copy() + self.send_signal(widget.Inputs.evaluation_results, self.results) + widget.selected_classifiers = [0] + for idx in range(combo.count()): + self._set_combo(combo, idx) + self.results.actual[:] = 0 + self.send_signal(widget.Inputs.evaluation_results, self.results) + check_error(errors.no_target_class) + + self.results.actual[:] = 1 + self.send_signal(widget.Inputs.evaluation_results, self.results) + check_error(errors.all_target_class) + + self.results.actual[:] = original_actual + self.results.actual[3] = np.nan + self.send_signal(widget.Inputs.evaluation_results, self.results) + check_error(errors.nan_classes) + + self.results.actual[:] = original_actual + self.send_signal(widget.Inputs.evaluation_results, self.results) + check_error(None) + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_single_class_folds(self, *_): + """Curves for single-class folds are not plotted""" + widget = self.widget + widget.display_rug = False + widget.fold_curves = False + + results = self.lenses_results + results.folds = [slice(0, 5), slice(5, 19)] + results.models = results.models.repeat(2, axis=0) + results.actual[:3] = 0 + results.probabilities[1, 3:5] = np.nan + # after this, model 1 has just negative instances in fold 0 + self.send_signal(widget.Inputs.evaluation_results, results) + self._set_combo(widget.controls.score, 1) # CA + self.assertFalse(widget.Warning.omitted_folds.is_shown()) + widget.controls.fold_curves.click() + self.assertTrue(widget.Warning.omitted_folds.is_shown()) + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_warn_nan_probabilities(self, *_): + """Warn about omitted points with nan probabiities""" + widget = self.widget + widget.display_rug = False + widget.fold_curves = False + + self.results.probabilities[1, 3] = np.nan + self.send_signal(widget.Inputs.evaluation_results, self.results) + self.assertTrue(widget.Warning.omitted_nan_prob_points.is_shown()) + self._set_list_selection(widget.controls.selected_classifiers, [0, 2]) + self.assertFalse(widget.Warning.omitted_folds.is_shown()) From 04d05f447f14ec69b534b6117caaa9c1b4ce2f98 Mon Sep 17 00:00:00 2001 From: janezd Date: Mon, 24 Jun 2019 21:50:58 +0200 Subject: [PATCH 18/21] Calibration plot: Minor fixes 
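Two of these fixes are easy to miss in the diff: adjacent string literals are
concatenated with no separator, so the "no stored models" message was missing
a space between "on" and "separate", and the info label now shortens long
classifier names with a local elided() helper. A minimal illustration in plain
Python, separate from the widget code:

    msg = ("test results do not contain stored models - try testing on"
           "separate data or on training data")
    assert "testing onseparate" in msg  # the missing space restored below

    def elided(s):
        return s[:17] + "..." if len(s) > 20 else s

    assert elided("knn-3") == "knn-3"  # short names pass through unchanged
    assert elided("a very long classifier name") == "a very long class..."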
--- Orange/widgets/evaluate/owcalibrationplot.py | 9 ++++++--- Orange/widgets/evaluate/tests/test_owcalibrationplot.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py index 637db82aa9b..55b1f57c2a9 100644 --- a/Orange/widgets/evaluate/owcalibrationplot.py +++ b/Orange/widgets/evaluate/owcalibrationplot.py @@ -400,6 +400,9 @@ def threshold_change(self): self._update_info() def _update_info(self): + def elided(s): + return s[:17] + "..." if len(s) > 20 else s + text = f"""
{n}
{name}:
@@ -416,7 +419,7 @@ def _update_info(self): for name, (probs, curves) in self.scores: ind = min(np.searchsorted(probs, self.threshold), len(probs) - 1) - text += f"" + text += f"" text += "".join(f'' for curve in curves) text += "" @@ -436,8 +439,8 @@ def apply(self): (len(results.folds) > 1, "each training data sample produces a different model"), (results.models is None, - "test results do not contain stored models - try testing on" - "separate data or on training data"), + "test results do not contain stored models - try testing " + "on separate data or on training data"), (len(self.selected_classifiers) != 1, "select a single model - the widget can output only one"), (self.score != 0 and len(results.domain.class_var.values) != 2, diff --git a/Orange/widgets/evaluate/tests/test_owcalibrationplot.py b/Orange/widgets/evaluate/tests/test_owcalibrationplot.py index 2d28c050fa2..e4f18231686 100644 --- a/Orange/widgets/evaluate/tests/test_owcalibrationplot.py +++ b/Orange/widgets/evaluate/tests/test_owcalibrationplot.py @@ -391,7 +391,7 @@ def test_apply_no_output(self, *_): multiple_folds: "each training data sample produces a different model", no_models: - "test results do not contain stored models - try testing on" + "test results do not contain stored models - try testing on " "separate data or on training data", multiple_selected: "select a single model - the widget can output only one", From 6695ee942204296ee11c1fff150de6148b626414 Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 28 Jun 2019 13:40:07 +0200 Subject: [PATCH 19/21] Calibrated Learner: Fix report --- Orange/widgets/model/owcalibratedlearner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Orange/widgets/model/owcalibratedlearner.py b/Orange/widgets/model/owcalibratedlearner.py index 558ac331539..0edf3184797 100644 --- a/Orange/widgets/model/owcalibratedlearner.py +++ b/Orange/widgets/model/owcalibratedlearner.py @@ -100,7 +100,7 @@ def fit_storage(self, data): def get_learner_parameters(self): return (("Calibrate probabilities", - self.CalibrationOptions[self.calibrate]), + self.CalibrationOptions[self.calibration]), ("Threshold optimization", self.ThresholdOptions[self.threshold])) From 65c69e2b890ff123f78dedde1089be9e4f81e9de Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 28 Jun 2019 14:06:37 +0200 Subject: [PATCH 20/21] Calibrated Learner: Add icon --- .../widgets/model/icons/CalibratedLearner.svg | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 Orange/widgets/model/icons/CalibratedLearner.svg diff --git a/Orange/widgets/model/icons/CalibratedLearner.svg b/Orange/widgets/model/icons/CalibratedLearner.svg new file mode 100644 index 00000000000..360a0d188ba --- /dev/null +++ b/Orange/widgets/model/icons/CalibratedLearner.svg @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + From 864d7b59fafc7f2322088929a23af0491fe1b2c7 Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 28 Jun 2019 14:24:06 +0200 Subject: [PATCH 21/21] Calibration plot: Nicer report --- Orange/widgets/evaluate/owcalibrationplot.py | 43 ++++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py index 55b1f57c2a9..562c3d5aa01 100644 --- a/Orange/widgets/evaluate/owcalibrationplot.py +++ b/Orange/widgets/evaluate/owcalibrationplot.py @@ -399,15 +399,29 @@ def threshold_change(self): self.line.setPos(self.threshold) self._update_info() - def _update_info(self): - def elided(s): - return s[:17] + 
"..." if len(s) > 20 else s - - text = f"""
Threshold: p=
{name}:
{elided(name)}:/{curve[ind]:.3f}
- - - - """ + def get_info_text(self, short): + if short: + def elided(s): + return s[:17] + "..." if len(s) > 20 else s + + text = f"""
Threshold: p={self.threshold:.2f}
+ + + + """ + + else: + def elided(s): + return s + + text = f"""
Threshold: p={self.threshold:.2f}
+ + + + + """ + if self.scores is not None: short_names = Metrics[self.score].short_names if short_names: @@ -424,7 +438,10 @@ def elided(s): for curve in curves) text += "" text += "
Threshold:p = {self.threshold:.2f}
+
" - self.info_label.setText(text) + return text + + def _update_info(self): + self.info_label.setText(self.get_info_text(short=True)) def threshold_change_done(self): self.apply() @@ -472,7 +489,9 @@ def send_report(self): self.report_items(( ("Target class", self.target_cb.currentText()), ("Output model calibration", - self.score == 0 and self.controls.score.currentText()), + self.score == 0 + and ("Sigmoid calibration", + "Isotonic calibration")[self.output_calibration]) )) caption = report.list_legend(self.classifiers_list_box, self.selected_classifiers) @@ -481,7 +500,7 @@ def send_report(self): self.report_caption(self.controls.score.currentText()) if self.score != 0: - self.report_raw(self.info_label.text()) + self.report_raw(self.get_info_text(short=False)) def gaussian_smoother(x, y, sigma=1.0):