From baeece37df57b5052687cd67fc3e4738b55ff3d0 Mon Sep 17 00:00:00 2001
From: Nick Wardle <nckw@cern.ch>
Date: Thu, 13 Jun 2024 19:13:03 +0200
Subject: [PATCH 1/2] Adding option to remove outliers

Added option in `HybridNew` to remove outliers from test-stat distributions before calculating p-values/limits.

This option removes test-statistic values from the test-statistic distributions that are further than $X\sigma$ from the mean of the distribution, where $\sigma$ is the standard deviation of the distribution and $X$ is the value passed to the option, before calculating $p-$values and limits. Setting this to some reasonably large number (eg 50) should remove the spurious values. By default, no removal of outliers is performed.

Added text to documentation about this option.
---
 docs/part3/commonstatsmethods.md |  2 +
 interface/HybridNew.h            |  3 +-
 src/HybridNew.cc                 | 85 ++++++++++++++++++++++----------
 3 files changed, 63 insertions(+), 27 deletions(-)
diff --git a/docs/part3/commonstatsmethods.md b/docs/part3/commonstatsmethods.md
index 78946e2ac70..3eb89187787 100644
--- a/docs/part3/commonstatsmethods.md
+++ b/docs/part3/commonstatsmethods.md
@@ -533,6 +533,8 @@ Failed to delete temporary file roostats-Sprxsw.root: No such file or directory
 
 The result stored in the **limit** branch of the output tree will be the upper limit (and its error, stored in **limitErr**). The default behaviour will be, as above, to search for the upper limit on **r**. However, the values of $p_{\mu}, p_{b}$ and CL<sub>s</sub> can be calculated for a particular value **r=X** by specifying the option `--singlePoint=X`. In this case, the value stored in the branch **limit** will be the value of CL<sub>s</sub> (or $p_{\mu}$) (see the [FAQ](http://cms-analysis.github.io/HiggsAnalysis-CombinedLimit/part4/usefullinks/#faq) section).
 
+In more complicated models, some of the toy datasets may lead to poor fits which results in spurious values of the test-statistic. In general, it is advised that diagnostic tools on the model/fits provided are used to avoid these poor fits, however if the fraction of these results is small, the option `--removeOutliersForPValue X` can be used. This option removes test-statistic values from the test-statistic distributions that are further than $X\sigma$ from the mean of the distribution, where $\sigma$ is the standard deviation of the distribution and $X$ is the value passed to the option, before calculating $p-$values and limits. Setting this to some reasonably large number (eg 50) should remove the spurious values. By default, no removal of outliers is performed. 
+
 #### Expected Limits
 
 For simple models, we can run interactively 5 times to compute the median expected and the 68% and 95% central interval boundaries. For this, we can use the `HybridNew` method with the same options as for the observed limit, but adding a `--expectedFromGrid=<quantile>`. Here, the quantile should be set to 0.5 for the median, 0.84 for the +ve side of the 68% band, 0.16 for the -ve side of the 68% band, 0.975 for the +ve side of the 95% band, and 0.025 for the -ve side of the 95% band.
diff --git a/interface/HybridNew.h b/interface/HybridNew.h
index b08b341ddec..e5bbc28cbf5 100644
--- a/interface/HybridNew.h
+++ b/interface/HybridNew.h
@@ -76,6 +76,7 @@ class HybridNew : public LimitAlgo {
   static float adaptiveToys_;
 
   static double EPS;
+  static float removeOulierForPvalThreshold_;
   // graph, used to compute the limit, not just for plotting!
   std::unique_ptr<TGraphErrors> limitPlot_;
  
@@ -137,7 +138,7 @@ class HybridNew : public LimitAlgo {
   void useGrid();
 
   bool doFC_;
-  
+  void removeOutliers(std::vector<Double_t> *sd, std::vector<Double_t> *sw);
 };
 
 #endif
diff --git a/src/HybridNew.cc b/src/HybridNew.cc
index 5345dc09d7d..fa1dcc458d9 100644
--- a/src/HybridNew.cc
+++ b/src/HybridNew.cc
@@ -98,6 +98,7 @@ float HybridNew::confidenceToleranceForToyScaling_ = 0.2;
 float HybridNew::maxProbability_ = 0.999;
 double HybridNew::EPS = 1e-4;
 std::string HybridNew::mode_ = "";
+float HybridNew::removeOulierForPvalThreshold_ = -1.0; 
 
 HybridNew::HybridNew() :
 LimitAlgo("HybridNew specific options") {
@@ -146,6 +147,7 @@ LimitAlgo("HybridNew specific options") {
         ("importantContours",boost::program_options::value<std::string>(&scaleAndConfidenceSelection_)->default_value(scaleAndConfidenceSelection_), "Throw fewer toys far from interesting contours , format : CL_1,CL_2,..CL_N (--toysH scaled down when probability is far from any of CL_i) ")
         ("maxProbability", boost::program_options::value<float>(&maxProbability_)->default_value(maxProbability_),  "when point is >  maxProbability countour, don't bother throwing toys")
         ("confidenceTolerance", boost::program_options::value<float>(&confidenceToleranceForToyScaling_)->default_value(confidenceToleranceForToyScaling_),  "Determine what 'far' means for adatptiveToys. (relative in terms of (1-cl))")
+        ("removeOutliersForPValues", boost::program_options::value<float>(&removeOulierForPvalThreshold_)->default_value(removeOulierForPvalThreshold_),  "When calculating p-values (pmu, 1-pb), remove outliers from the distribution of toys. Outliers are defined as those further from the mean than this threshold times the standard deviation")
         ("LHCmode", boost::program_options::value<std::string>(&mode_)->default_value(mode_),  "Shortcuts for LHC style running modes. --LHCmode LHC-significance: --generateNuisances=0 --generateExternalMeasurements=1 --fitNuisances=1 --testStat=LHC (Q_LHC, modified for discovery) --significance, --LHCmode LHC-limits: --generateNuisances=0 --generateExternalMeasurements=1 --fitNuisances=1 --testStat=LHC (Q_LHC, modified for upper limits) --rule CLs, --LHCmode LHC-feldman-cousins: --generateNuisances=0 --generateExternalMeasurements=1 --fitNuisances=1 --testStat=PL (Q_Profile, includes boundaries) --rule Pmu")
 
     ;
@@ -1189,16 +1191,17 @@ std::pair<double,double> HybridNew::eval(const RooStats::HypoTestResult &hcres,
 
     double minimizerTolerance_  = ROOT::Math::MinimizerOptions::DefaultTolerance();
 
+    RooStats::SamplingDistribution * bDistribution = hcres.GetNullDistribution(), * sDistribution = hcres.GetAltDistribution();
+    const std::vector<Double_t> & bdist   = bDistribution->GetSamplingDistribution();
+    const std::vector<Double_t> & bweight = bDistribution->GetSampleWeights();
+    const std::vector<Double_t> & sdist   = sDistribution->GetSamplingDistribution();
+    const std::vector<Double_t> & sweight = sDistribution->GetSampleWeights();
+    Double_t data =  hcres.GetTestStatisticData();
+    std::vector<Double_t> absbdist(bdist.size()), abssdist(sdist.size());
+    std::vector<Double_t> absbweight(bweight), abssweight(sweight);
+    Double_t absdata = data;
+
     if (testStat_ == "LHCFC") {
-        RooStats::SamplingDistribution * bDistribution = hcres.GetNullDistribution(), * sDistribution = hcres.GetAltDistribution();
-        const std::vector<Double_t> & bdist   = bDistribution->GetSamplingDistribution();
-        const std::vector<Double_t> & bweight = bDistribution->GetSampleWeights();
-        const std::vector<Double_t> & sdist   = sDistribution->GetSamplingDistribution();
-        const std::vector<Double_t> & sweight = sDistribution->GetSampleWeights();
-        Double_t data =  hcres.GetTestStatisticData();
-        std::vector<Double_t> absbdist(bdist.size()), abssdist(sdist.size());
-        std::vector<Double_t> absbweight(bweight), abssweight(sweight);
-        Double_t absdata;
         if (rule_ == "FC") {
             for (int i = 0, n = absbdist.size(); i < n; ++i) absbdist[i] = fabs(bdist[i]);
             for (int i = 0, n = abssdist.size(); i < n; ++i) abssdist[i] = fabs(sdist[i]);
@@ -1214,24 +1217,28 @@ std::pair<double,double> HybridNew::eval(const RooStats::HypoTestResult &hcres,
             abssweight.reserve(absbweight.size() + abssweight.size());
             abssweight.insert(abssweight.end(), absbweight.begin(), absbweight.end());
         }
-        RooStats::HypoTestResult result;
-        RooStats::SamplingDistribution *abssDist = new RooStats::SamplingDistribution("s","s",abssdist,abssweight);
-        RooStats::SamplingDistribution *absbDist = new RooStats::SamplingDistribution("b","b",absbdist,absbweight);
-        result.SetNullDistribution(absbDist);
-        result.SetAltDistribution(abssDist);
-        result.SetTestStatisticData(absdata);
-        result.SetPValueIsRightTail(!result.GetPValueIsRightTail());
-        if (CLs_) {
-            return std::pair<double,double>(result.CLs(), result.CLsError());
-        } else {
-            return std::pair<double,double>(result.CLsplusb(), result.CLsplusbError());
-        }
     } else {
-        if (CLs_) {
-            return std::pair<double,double>(hcres.CLs(), hcres.CLsError());
-        } else {
-            return std::pair<double,double>(hcres.CLsplusb(), hcres.CLsplusbError());
-        }
+        // In this case we could probably just set the new vectors to the old ones (copy)
+        for (int i = 0, n = absbdist.size(); i < n; ++i) absbdist[i] = bdist[i];
+        for (int i = 0, n = abssdist.size(); i < n; ++i) abssdist[i] = sdist[i]; 
+    }
+
+    // should remove outliers from the distribution (will return immediately if no threshold set) 
+    removeOutliers(&abssdist,&abssweight);
+    removeOutliers(&absbdist,&absbweight);
+
+    RooStats::HypoTestResult result;
+    RooStats::SamplingDistribution *abssDist = new RooStats::SamplingDistribution("s","s",abssdist,abssweight);
+    RooStats::SamplingDistribution *absbDist = new RooStats::SamplingDistribution("b","b",absbdist,absbweight);
+    result.SetNullDistribution(absbDist);
+    result.SetAltDistribution(abssDist);
+    result.SetTestStatisticData(absdata);
+    if (testStat_ == "LHCFC") result.SetPValueIsRightTail(!result.GetPValueIsRightTail());
+
+    if (CLs_) {
+        return std::pair<double,double>(result.CLs(), result.CLsError());
+    } else {
+        return std::pair<double,double>(result.CLsplusb(), result.CLsplusbError());
     }
 }
 
@@ -1763,3 +1770,29 @@ std::vector<std::pair<double,double> > HybridNew::findIntervalsFromSplines(TGrap
 	std::sort(points.begin(),points.end());
 	return points;
 }
+
+void HybridNew::removeOutliers(std::vector<Double_t> *sd, std::vector<Double_t> *sw){
+
+    if ( removeOulierForPvalThreshold_ <= 0 ) return;
+    // calculate mean and stddev of distriution
+    double mean = std::accumulate(sd->begin(), sd->end(), 0.0) / sd->size();
+    double accum = 0.0;
+    std::for_each (std::begin(*sd), std::end(*sd), [&](const double d) {
+        accum += (d - mean) * (d - mean);
+    });
+    double stdev = sqrt(accum / (sd->size()));
+
+    // filter outliers based on threshold
+    std::vector<int> indices;
+    for(unsigned int i = 0; i < sd->size(); i++) {
+        if(std::abs((*sd)[i] - mean) > removeOulierForPvalThreshold_*stdev) {
+            indices.push_back(i);
+        }
+    }
+
+    // Remove elements from sd and sw
+    for(auto it = indices.rbegin(); it != indices.rend(); ++it) {
+        sd->erase(sd->begin() + *it);
+        sw->erase(sw->begin() + *it);
+    } 
+}
\ No newline at end of file

From 59c875524c7eff63354b5f4aed61f11555a1c0a7 Mon Sep 17 00:00:00 2001
From: Nick Wardle <nckw@cern.ch>
Date: Mon, 17 Jun 2024 11:53:35 +0200
Subject: [PATCH 2/2]  mean -> median

---
 src/HybridNew.cc | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/HybridNew.cc b/src/HybridNew.cc
index fa1dcc458d9..9564f1cf0a2 100644
--- a/src/HybridNew.cc
+++ b/src/HybridNew.cc
@@ -1223,7 +1223,6 @@ std::pair<double,double> HybridNew::eval(const RooStats::HypoTestResult &hcres,
         for (int i = 0, n = abssdist.size(); i < n; ++i) abssdist[i] = sdist[i]; 
     }
 
-    // should remove outliers from the distribution (will return immediately if no threshold set) 
     removeOutliers(&abssdist,&abssweight);
     removeOutliers(&absbdist,&absbweight);
 
@@ -1235,6 +1234,21 @@ std::pair<double,double> HybridNew::eval(const RooStats::HypoTestResult &hcres,
     result.SetTestStatisticData(absdata);
     if (testStat_ == "LHCFC") result.SetPValueIsRightTail(!result.GetPValueIsRightTail());
 
+    // print comparison of Results before and after cleaning //
+    CombineLogger::instance().log("HybridNew.cc",__LINE__,std::string(Form("Original: r = %g, CLs = %g +/- %g\n\t1-Pb = %g +/- %g\n\tPmu = %g +/- %g"
+    	,rVal
+	,hcres.CLs(), hcres.CLsError()
+	,hcres.CLb(), hcres.CLbError()
+	,hcres.CLsplusb(), hcres.CLsplusbError()))
+	,__func__);
+    CombineLogger::instance().log("HybridNew.cc",__LINE__,std::string(Form("Update: r = %g, CLs = %g +/- %g\n\t1-Pb = %g +/- %g\n\tPmu = %g +/- %g"
+    	,rVal
+	,result.CLs(), result.CLsError()
+	,result.CLb(), result.CLbError()
+	,result.CLsplusb(), result.CLsplusbError()))
+	,__func__);
+
+
     if (CLs_) {
         return std::pair<double,double>(result.CLs(), result.CLsError());
     } else {
@@ -1774,18 +1788,22 @@ std::vector<std::pair<double,double> > HybridNew::findIntervalsFromSplines(TGrap
 void HybridNew::removeOutliers(std::vector<Double_t> *sd, std::vector<Double_t> *sw){
 
     if ( removeOulierForPvalThreshold_ <= 0 ) return;
-    // calculate mean and stddev of distriution
-    double mean = std::accumulate(sd->begin(), sd->end(), 0.0) / sd->size();
+    // calculate median and stddev of distriution
+    std::vector<Double_t> sorted_sd(sd->size());
+    std::partial_sort_copy(std::begin(*sd), std::end(*sd), std::begin(sorted_sd), std::end(sorted_sd));
+    
+    double median = sorted_sd[(int)0.5*sorted_sd.size()];
+
     double accum = 0.0;
     std::for_each (std::begin(*sd), std::end(*sd), [&](const double d) {
-        accum += (d - mean) * (d - mean);
+        accum += (d - median) * (d - median);
     });
     double stdev = sqrt(accum / (sd->size()));
-
+    
     // filter outliers based on threshold
     std::vector<int> indices;
     for(unsigned int i = 0; i < sd->size(); i++) {
-        if(std::abs((*sd)[i] - mean) > removeOulierForPvalThreshold_*stdev) {
+        if(std::abs((*sd)[i] - median) > removeOulierForPvalThreshold_*stdev) {
             indices.push_back(i);
         }
     }
@@ -1795,4 +1813,5 @@ void HybridNew::removeOutliers(std::vector<Double_t> *sd, std::vector<Double_t>
         sd->erase(sd->begin() + *it);
         sw->erase(sw->begin() + *it);
     } 
-}
\ No newline at end of file
+}
+