Adding option to remove outliers #973

Open · wants to merge 2 commits into base: main
2 changes: 2 additions & 0 deletions docs/part3/commonstatsmethods.md
@@ -533,6 +533,8 @@ Failed to delete temporary file roostats-Sprxsw.root: No such file or directory

The result stored in the **limit** branch of the output tree will be the upper limit (and its error, stored in **limitErr**). The default behaviour will be, as above, to search for the upper limit on **r**. However, the values of $p_{\mu}, p_{b}$ and CL<sub>s</sub> can be calculated for a particular value **r=X** by specifying the option `--singlePoint=X`. In this case, the value stored in the branch **limit** will be the value of CL<sub>s</sub> (or $p_{\mu}$) (see the [FAQ](http://cms-analysis.github.io/HiggsAnalysis-CombinedLimit/part4/usefullinks/#faq) section).

In more complicated models, some of the toy datasets may lead to poor fits, which result in spurious values of the test statistic. In general, it is advisable to use the diagnostic tools provided for the model and fits to avoid these poor fits. However, if the fraction of such results is small, the option `--removeOutliersForPValues X` can be used. Before the $p$-values and limits are calculated, this option removes from the test-statistic distributions any values that are further than $X\sigma$ from the median of the distribution, where $\sigma$ is the standard deviation of the distribution and $X$ is the value passed to the option. Setting $X$ to some reasonably large number (e.g. 50) should remove the spurious values. By default, no removal of outliers is performed.
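The cut applied by this option can be sketched as follows. This is a minimal standalone illustration of the $X\sigma$ filter, not the exact Combine implementation; the function and variable names here are illustrative only:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Remove entries further than threshold*sigma from the median, where sigma is
// the standard deviation of the values about the median. Each weight is
// dropped together with its value so the two vectors stay aligned.
void filterOutliers(std::vector<double>& values, std::vector<double>& weights,
                    double threshold) {
    if (threshold <= 0 || values.empty()) return;

    // median of the distribution
    std::vector<double> sorted(values);
    std::sort(sorted.begin(), sorted.end());
    double median = sorted[sorted.size() / 2];

    // standard deviation about the median
    double accum = 0.0;
    for (double v : values) accum += (v - median) * (v - median);
    double sigma = std::sqrt(accum / values.size());

    // keep only entries within threshold*sigma of the median
    std::vector<double> keptValues, keptWeights;
    for (std::size_t i = 0; i < values.size(); ++i) {
        if (std::fabs(values[i] - median) <= threshold * sigma) {
            keptValues.push_back(values[i]);
            keptWeights.push_back(weights[i]);
        }
    }
    values.swap(keptValues);
    weights.swap(keptWeights);
}
```

A single far-away value inflates $\sigma$ itself, which is why a fairly large threshold (such as 50) still catches genuinely spurious test-statistic values while leaving the bulk of the distribution untouched.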

#### Expected Limits

For simple models, we can run `HybridNew` interactively 5 times to compute the median expected limit and the 68% and 95% central interval boundaries. For this, we use the same options as for the observed limit, but add `--expectedFromGrid=<quantile>`, where the quantile is set to 0.5 for the median, 0.84 for the +ve side of the 68% band, 0.16 for the -ve side of the 68% band, 0.975 for the +ve side of the 95% band, and 0.025 for the -ve side of the 95% band.
3 changes: 2 additions & 1 deletion interface/HybridNew.h
@@ -76,6 +76,7 @@ class HybridNew : public LimitAlgo {
static float adaptiveToys_;

static double EPS;
static float removeOulierForPvalThreshold_;
// graph, used to compute the limit, not just for plotting!
std::unique_ptr<TGraphErrors> limitPlot_;

@@ -137,7 +138,7 @@ class HybridNew : public LimitAlgo {
void useGrid();

bool doFC_;

void removeOutliers(std::vector<Double_t> *sd, std::vector<Double_t> *sw);
};

#endif
104 changes: 78 additions & 26 deletions src/HybridNew.cc
@@ -98,6 +98,7 @@ float HybridNew::confidenceToleranceForToyScaling_ = 0.2;
float HybridNew::maxProbability_ = 0.999;
double HybridNew::EPS = 1e-4;
std::string HybridNew::mode_ = "";
float HybridNew::removeOulierForPvalThreshold_ = -1.0;

HybridNew::HybridNew() :
LimitAlgo("HybridNew specific options") {
@@ -146,6 +147,7 @@ LimitAlgo("HybridNew specific options") {
("importantContours",boost::program_options::value<std::string>(&scaleAndConfidenceSelection_)->default_value(scaleAndConfidenceSelection_), "Throw fewer toys far from interesting contours , format : CL_1,CL_2,..CL_N (--toysH scaled down when probability is far from any of CL_i) ")
("maxProbability", boost::program_options::value<float>(&maxProbability_)->default_value(maxProbability_), "when point is > maxProbability countour, don't bother throwing toys")
("confidenceTolerance", boost::program_options::value<float>(&confidenceToleranceForToyScaling_)->default_value(confidenceToleranceForToyScaling_), "Determine what 'far' means for adatptiveToys. (relative in terms of (1-cl))")
("removeOutliersForPValues", boost::program_options::value<float>(&removeOulierForPvalThreshold_)->default_value(removeOulierForPvalThreshold_), "When calculating p-values (pmu, 1-pb), remove outliers from the distributions of toys. Outliers are defined as values further from the median than this threshold times the standard deviation")
("LHCmode", boost::program_options::value<std::string>(&mode_)->default_value(mode_), "Shortcuts for LHC style running modes. --LHCmode LHC-significance: --generateNuisances=0 --generateExternalMeasurements=1 --fitNuisances=1 --testStat=LHC (Q_LHC, modified for discovery) --significance, --LHCmode LHC-limits: --generateNuisances=0 --generateExternalMeasurements=1 --fitNuisances=1 --testStat=LHC (Q_LHC, modified for upper limits) --rule CLs, --LHCmode LHC-feldman-cousins: --generateNuisances=0 --generateExternalMeasurements=1 --fitNuisances=1 --testStat=PL (Q_Profile, includes boundaries) --rule Pmu")

;
@@ -1189,16 +1191,17 @@ std::pair<double,double> HybridNew::eval(const RooStats::HypoTestResult &hcres,

double minimizerTolerance_ = ROOT::Math::MinimizerOptions::DefaultTolerance();

RooStats::SamplingDistribution * bDistribution = hcres.GetNullDistribution(), * sDistribution = hcres.GetAltDistribution();
const std::vector<Double_t> & bdist = bDistribution->GetSamplingDistribution();
const std::vector<Double_t> & bweight = bDistribution->GetSampleWeights();
const std::vector<Double_t> & sdist = sDistribution->GetSamplingDistribution();
const std::vector<Double_t> & sweight = sDistribution->GetSampleWeights();
Double_t data = hcres.GetTestStatisticData();
std::vector<Double_t> absbdist(bdist.size()), abssdist(sdist.size());
std::vector<Double_t> absbweight(bweight), abssweight(sweight);
Double_t absdata = data;

if (testStat_ == "LHCFC") {
RooStats::SamplingDistribution * bDistribution = hcres.GetNullDistribution(), * sDistribution = hcres.GetAltDistribution();
const std::vector<Double_t> & bdist = bDistribution->GetSamplingDistribution();
const std::vector<Double_t> & bweight = bDistribution->GetSampleWeights();
const std::vector<Double_t> & sdist = sDistribution->GetSamplingDistribution();
const std::vector<Double_t> & sweight = sDistribution->GetSampleWeights();
Double_t data = hcres.GetTestStatisticData();
std::vector<Double_t> absbdist(bdist.size()), abssdist(sdist.size());
std::vector<Double_t> absbweight(bweight), abssweight(sweight);
Double_t absdata;
if (rule_ == "FC") {
for (int i = 0, n = absbdist.size(); i < n; ++i) absbdist[i] = fabs(bdist[i]);
for (int i = 0, n = abssdist.size(); i < n; ++i) abssdist[i] = fabs(sdist[i]);
@@ -1214,24 +1217,42 @@
abssweight.reserve(absbweight.size() + abssweight.size());
abssweight.insert(abssweight.end(), absbweight.begin(), absbweight.end());
}
RooStats::HypoTestResult result;
RooStats::SamplingDistribution *abssDist = new RooStats::SamplingDistribution("s","s",abssdist,abssweight);
RooStats::SamplingDistribution *absbDist = new RooStats::SamplingDistribution("b","b",absbdist,absbweight);
result.SetNullDistribution(absbDist);
result.SetAltDistribution(abssDist);
result.SetTestStatisticData(absdata);
result.SetPValueIsRightTail(!result.GetPValueIsRightTail());
if (CLs_) {
return std::pair<double,double>(result.CLs(), result.CLsError());
} else {
return std::pair<double,double>(result.CLsplusb(), result.CLsplusbError());
}
} else {
if (CLs_) {
return std::pair<double,double>(hcres.CLs(), hcres.CLsError());
} else {
return std::pair<double,double>(hcres.CLsplusb(), hcres.CLsplusbError());
}
// In this case we could probably just set the new vectors to the old ones (copy)
for (int i = 0, n = absbdist.size(); i < n; ++i) absbdist[i] = bdist[i];
for (int i = 0, n = abssdist.size(); i < n; ++i) abssdist[i] = sdist[i];
}

removeOutliers(&abssdist,&abssweight);
removeOutliers(&absbdist,&absbweight);

RooStats::HypoTestResult result;
RooStats::SamplingDistribution *abssDist = new RooStats::SamplingDistribution("s","s",abssdist,abssweight);
RooStats::SamplingDistribution *absbDist = new RooStats::SamplingDistribution("b","b",absbdist,absbweight);
result.SetNullDistribution(absbDist);
result.SetAltDistribution(abssDist);
result.SetTestStatisticData(absdata);
if (testStat_ == "LHCFC") result.SetPValueIsRightTail(!result.GetPValueIsRightTail());

// print comparison of Results before and after cleaning //
CombineLogger::instance().log("HybridNew.cc",__LINE__,std::string(Form("Original: r = %g, CLs = %g +/- %g\n\t1-Pb = %g +/- %g\n\tPmu = %g +/- %g"
,rVal
,hcres.CLs(), hcres.CLsError()
,hcres.CLb(), hcres.CLbError()
,hcres.CLsplusb(), hcres.CLsplusbError()))
,__func__);
CombineLogger::instance().log("HybridNew.cc",__LINE__,std::string(Form("Update: r = %g, CLs = %g +/- %g\n\t1-Pb = %g +/- %g\n\tPmu = %g +/- %g"
,rVal
,result.CLs(), result.CLsError()
,result.CLb(), result.CLbError()
,result.CLsplusb(), result.CLsplusbError()))
,__func__);


if (CLs_) {
return std::pair<double,double>(result.CLs(), result.CLsError());
} else {
return std::pair<double,double>(result.CLsplusb(), result.CLsplusbError());
}
}

@@ -1763,3 +1784,34 @@ std::vector<std::pair<double,double> > HybridNew::findIntervalsFromSplines(TGrap
std::sort(points.begin(),points.end());
return points;
}

void HybridNew::removeOutliers(std::vector<Double_t> *sd, std::vector<Double_t> *sw){

    if ( removeOulierForPvalThreshold_ <= 0 || sd->empty() ) return;
    // calculate the median and standard deviation of the distribution
    std::vector<Double_t> sorted_sd(sd->size());
    std::partial_sort_copy(std::begin(*sd), std::end(*sd), std::begin(sorted_sd), std::end(sorted_sd));

    // note: (int)0.5*size() would evaluate to 0; index with size()/2 instead
    double median = sorted_sd[sorted_sd.size()/2];

    double accum = 0.0;
    std::for_each (std::begin(*sd), std::end(*sd), [&](const double d) {
        accum += (d - median) * (d - median);
    });
    double stdev = sqrt(accum / (sd->size()));

// filter outliers based on threshold
std::vector<int> indices;
for(unsigned int i = 0; i < sd->size(); i++) {
if(std::abs((*sd)[i] - median) > removeOulierForPvalThreshold_*stdev) {
indices.push_back(i);
}
}

// Remove elements from sd and sw
for(auto it = indices.rbegin(); it != indices.rend(); ++it) {
sd->erase(sd->begin() + *it);
sw->erase(sw->begin() + *it);
}
}
