Merge pull request #376 from Shopify/rwstauner/regression-detection
Remove unused code (mostly around perf tripwires)
maximecb authored Jan 14, 2025
2 parents fed94c3 + aa876ec commit 0d8bc9b
Showing 6 changed files with 7 additions and 326 deletions.
273 changes: 0 additions & 273 deletions continuous_reporting/benchmark_and_update.rb
@@ -9,34 +9,6 @@
require "optparse"
require 'rbconfig'

# TODO: should the benchmark-run and perf-check parts of this script be separated? Probably.

# This is intended to be the top-level script for running benchmarks, reporting on them
# and checking performance.

# The tolerances below are for detecting big drops in benchmark performance. So far I haven't
# had good luck with that -- it's either too sensitive and files spurious bugs a lot, or it's
# too insensitive and doesn't notice real bugs. Right now, auto-filing bugs is turned off.

# STDDEV_TOLERANCE is the multiple of the standard deviation that a benchmark is allowed
# to drop by on a given run. That effectively sets the false-positive rate, since we're
# comparing samples from a Gaussian-ish distribution.
NORMAL_STDDEV_TOLERANCE = 1.5
# DROP_TOLERANCE is the fraction of the mean (e.g. 0.05 means a 5% drop) that is
# assumed to be okay. For each metric we take the most permissive of the stddev-based
# and mean-based tolerances across both runs; a drop must exceed all four candidates
# for us to count a failure.
NORMAL_DROP_TOLERANCE = 0.07
# A microbenchmark will normally have a very small stddev from run to run. That means
# it's actually *less* tolerant of noise on the host, since "twice the stddev" is a
# significantly smaller absolute value.
MICRO_STDDEV_TOLERANCE = 2.0
# A microbenchmark will routinely show persistent speed changes of much larger magnitude
# than a larger, more varied benchmark. For example, setivar is surprisingly prone to
# large swings in time taken depending on tiny changes to memory layout. So the drop
# tolerance needs to be significantly larger to avoid frequent false positives.
MICRO_DROP_TOLERANCE = 0.20
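# Illustrative sketch (not part of this commit): how the four tolerance candidates
# combine. The numbers below are hypothetical: a non-micro benchmark whose mean
# time rises from 1.00s to 1.10s, with a 2% relative stddev on each run.
#
#   current_mean, compared_mean     = 1.10, 1.00
#   current_stddev, compared_stddev = 0.022, 0.020
#   tolerance = [ current_stddev * NORMAL_STDDEV_TOLERANCE,   # 0.033
#                 compared_stddev * NORMAL_STDDEV_TOLERANCE,  # 0.030
#                 current_mean * NORMAL_DROP_TOLERANCE,       # 0.077
#                 compared_mean * NORMAL_DROP_TOLERANCE ].max # 0.070 -> max is 0.077
#   drop = current_mean - compared_mean                       # 0.10 > 0.077, so flagged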

BUILT_REPORTS_ROOT = YJITMetrics::ContinuousReporting::BUILT_REPORTS_ROOT
RAW_BENCHMARK_ROOT = YJITMetrics::ContinuousReporting::RAW_BENCHMARK_ROOT

@@ -91,11 +63,6 @@ def args_for_bench_type(bench_type_arg)

benchmark_args = nil
bench_params_file = nil
should_file_gh_issue = true
no_perf_tripwires = false
all_perf_tripwires = false
single_perf_tripwire = nil
is_verbose = false
data_dir = nil
full_rebuild = nil

@@ -110,33 +77,6 @@ def args_for_bench_type(bench_type_arg)
benchmark_args = args_for_bench_type(btype)
end

# NOTE: GH issue filing is currently disabled
opts.on("-g", "--no-gh-issue", "Do not file an actual GitHub issue, only print failures to console") do
should_file_gh_issue = false
end

opts.on("-a", "--all-perf-tripwires", "Check performance tripwires on all pairs of benchmarks (implies --no-gh-issue)") do
if no_perf_tripwires || single_perf_tripwire
raise "Error! Please do not specify --all-perf-tripwires along with --no-perf-tripwires or --single-perf-tripwire!"
end
all_perf_tripwires = true
should_file_gh_issue = false
end

opts.on("-t TS", "--perf-timestamp TIMESTAMP", "Check performance tripwire at this specific timestamp") do |ts|
if all_perf_tripwires || no_perf_tripwires
raise "Error! Please do not specify a specific perf tripwire along with --no-perf-tripwires or --all-perf-tripwires!"
end
single_perf_tripwire = ts.strip
end

opts.on("-np", "--no-perf-tripwires", "Do not check performance tripwires") do
if all_perf_tripwires || single_perf_tripwire
raise "Error! Please do not specify --no-perf-tripwires along with --single-perf-tripwire or --all-perf-tripwires!"
end
no_perf_tripwires = true
end

opts.on("-fr YN", "--full-rebuild YN") do |fr|
if fr.nil? || fr.strip == ""
full_rebuild = true
@@ -154,51 +94,17 @@
raise("No such directory: #{ddir.inspect}!") unless File.directory?(ddir)
data_dir = ddir
end

opts.on("-v", "--verbose", "Print verbose output about tripwire checks") do
is_verbose = true
end
end.parse!

bench_params_data = bench_params_file ? JSON.parse(File.read bench_params_file) : {}

BENCHMARK_ARGS = benchmark_args || (bench_params_data["bench_type"] ? args_for_bench_type(bench_params_data["bench_type"]) : BENCH_TYPES["default"])
FULL_REBUILD = !full_rebuild.nil? ? full_rebuild : (bench_params_data["full_rebuild"] || false)
FILE_GH_ISSUE = should_file_gh_issue
ALL_PERF_TRIPWIRES = all_perf_tripwires
SINGLE_PERF_TRIPWIRE = single_perf_tripwire
VERBOSE = is_verbose
BENCH_PARAMS_FILE = bench_params_file
DATA_DIR = File.expand_path(data_dir || bench_params_data["data_directory"] || "continuous_reporting/data")

PIDFILE = "/home/ubuntu/benchmark_ci.pid"

GITHUB_TOKEN=ENV["BENCHMARK_CI_GITHUB_TOKEN"]
if FILE_GH_ISSUE && !GITHUB_TOKEN
raise "Set BENCHMARK_CI_GITHUB_TOKEN to an appropriate GitHub token to allow auto-filing issues! (or set --no-gh-issue)"
end

def ghapi_post(api_uri, params, verb: :post)
uri = URI("https://api.github.com" + api_uri)

req = Net::HTTP::Post.new(uri)
#req.basic_auth GITHUB_USER, GITHUB_TOKEN
req['Accept'] = "application/vnd.github.v3+json"
req['Content-Type'] = "application/json"
req['Authorization'] = "Bearer #{GITHUB_TOKEN}"
req.body = JSON.dump(params)
result = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { |http| http.request(req) }

unless result.is_a?(Net::HTTPSuccess)
$stderr.puts "Error in HTTP #{verb.upcase}: #{result.inspect}"
$stderr.puts result.body
$stderr.puts "------"
raise "HTTP error when posting to #{api_uri}!"
end

JSON.load(result.body)
end

class BenchmarkDetails
def initialize(timestamp)
@timestamp = timestamp
@@ -227,32 +133,6 @@ def escape_markdown(s)
s.gsub(/(\*|\_|\`)/) { '\\' + $1 }.gsub("<", "&lt;")
end

def file_gh_issue(title, message)
host = `uname -a`.chomp
issue_body = <<~ISSUE
Error running benchmark CI job on #{host}:
#{message}
ISSUE

unless FILE_GH_ISSUE
print "We would file a GitHub issue, but we were asked not to. Details:\n\n"

print "==============================\n"
print "Title: CI Benchmarking: #{title}\n"
puts issue_body
print "==============================\n"
return
end

# Note: if you're set up as the GitHub user, it's not gonna email you since it thinks you filed it yourself.
ghapi_post "/repos/Shopify/yjit-metrics/issues",
{
"title" => "CI Benchmarking: #{title}",
"body" => issue_body
}
end

if File.exist?(PIDFILE)
pid = File.read(PIDFILE).to_i
if pid && pid > 0
@@ -280,10 +160,6 @@ def run_benchmarks
end
end

def ts_from_tripwire_filename(filename)
filename.split("blog_speed_details_")[1].split(".")[0]
end

def timestr_from_ts(ts)
if ts =~ /\A(\d{4}-\d{2}-\d{2})-(\d{6})\Z/
year_month_day = $1
@@ -297,158 +173,9 @@ def timestr_from_ts(ts)
end
end

# If benchmark results drop noticeably, file a GitHub issue
def check_perf_tripwires
Dir.chdir(YM_REPORT_DIR) do
# Grab only the non-platform-specific tripwire files: the ones with no platform
# name, which instead end in a six-digit timestamp.
tripwire_files = Dir["*.tripwires.json"].to_a.select {|f| f =~ /\d{6}\.tripwires\.json\Z/}.sort

if ALL_PERF_TRIPWIRES
(tripwire_files.size - 1).times do |index|
check_one_perf_tripwire(tripwire_files[index + 1], tripwire_files[index])
end
elsif SINGLE_PERF_TRIPWIRE
specified_file = tripwire_files.detect { |f| f.include?(SINGLE_PERF_TRIPWIRE) }
raise "Couldn't find perf tripwire report containing #{SINGLE_PERF_TRIPWIRE.inspect}!" unless specified_file

specified_index = tripwire_files.index(specified_file)
raise "Can't check perf on the very first report!" if specified_index == 0

check_one_perf_tripwire(tripwire_files[specified_index], tripwire_files[specified_index - 1])
else
check_one_perf_tripwire(tripwire_files[-1], tripwire_files[-2])
end
end
end

def check_one_perf_tripwire(current_filename, compared_filename, verbose: VERBOSE)
YJITMetrics::PLATFORMS.each do |platform|
platform_current_filename = current_filename.gsub(".tripwires.json", ".#{platform}.tripwires.json")
platform_compared_filename = compared_filename.gsub(".tripwires.json", ".#{platform}.tripwires.json")

# Check each platform-specific pair of files (assumed from context: only when both exist).
if File.exist?(platform_current_filename) && File.exist?(platform_compared_filename)
check_one_perf_tripwire_file(platform_current_filename, platform_compared_filename, verbose: verbose)
end
end
end

def check_one_perf_tripwire_file(current_filename, compared_filename, verbose: VERBOSE)
current_data = JSON.parse File.read(current_filename)
compared_data = JSON.parse File.read(compared_filename)

check_failures = []

compared_data.each do |bench_name, values|
# Only compare if both sets of data have the benchmark
next unless current_data[bench_name]

current_mean = current_data[bench_name]["mean"]
current_rsd_pct = current_data[bench_name]["rsd_pct"]
compared_mean = values["mean"]
compared_rsd_pct = values["rsd_pct"]

current_stddev = (current_rsd_pct.to_f / 100.0) * current_mean
compared_stddev = (compared_rsd_pct.to_f / 100.0) * compared_mean

# Normally is_micro should be the same in all cases for any one specific benchmark.
# So we just assume the most recent data has the correct value.
is_micro = current_data[bench_name]["micro"]

if is_micro
bench_stddev_tolerance = MICRO_STDDEV_TOLERANCE
bench_drop_tolerance = MICRO_DROP_TOLERANCE
else
bench_stddev_tolerance = NORMAL_STDDEV_TOLERANCE
bench_drop_tolerance = NORMAL_DROP_TOLERANCE
end

# Occasionally stddev can change pretty wildly from run to run. Take the most tolerant of multiple-of-recent-stddev,
# or a percentage of the larger mean runtime. Basically, a drop must be unusual enough (stddev) and large enough (mean)
# for us to flag it.
tolerance = [ current_stddev * bench_stddev_tolerance, compared_stddev * bench_stddev_tolerance,
current_mean * bench_drop_tolerance, compared_mean * bench_drop_tolerance ].max

drop = current_mean - compared_mean

if verbose
puts "#{is_micro ? "Microbenchmark" : "Benchmark"} #{bench_name}, tolerance is #{ "%.2f" % tolerance }, latest mean is #{ "%.2f" % current_mean } (stddev #{"%.2f" % current_stddev}), " +
"next-latest mean is #{ "%.2f" % compared_mean } (stddev #{ "%.2f" % compared_stddev}), drop is #{ "%.2f" % drop }..."
end

if drop > tolerance
puts "Benchmark #{bench_name} marked as failure!" if verbose
check_failures.push({
benchmark: bench_name,
current_mean: current_mean,
second_current_mean: compared_mean,
current_stddev: current_stddev,
current_rsd_pct: current_rsd_pct,
second_current_stddev: compared_stddev,
second_current_rsd_pct: compared_rsd_pct,
})
end
end

if check_failures.empty?
puts "No benchmarks failing performance tripwire (#{current_filename})"
return
end

puts "Failing benchmarks (#{current_filename}): #{check_failures.map { |h| h[:benchmark] }}"
file_perf_bug(current_filename, compared_filename, check_failures)
end
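# Illustrative usage (hypothetical filenames, not from this commit): compare one
# platform's two most recent tripwire reports, printing per-benchmark detail.
#   check_one_perf_tripwire_file("blog_speed_details_2024-01-02-123456.x86_64.tripwires.json",
#                                "blog_speed_details_2024-01-01-123456.x86_64.tripwires.json",
#                                verbose: true)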

def file_perf_bug(current_filename, compared_filename, check_failures)
ts_latest = ts_from_tripwire_filename(current_filename)
ts_penultimate = ts_from_tripwire_filename(compared_filename)

# This expects to be called from the _includes/reports directory
latest_yjit_results = BenchmarkDetails.new(ts_latest)
latest_yjit_result_file = latest_yjit_results.yjit_test_result
penultimate_yjit_results = BenchmarkDetails.new(ts_penultimate)
penultimate_yjit_result_file = penultimate_yjit_results.yjit_test_result
latest_yjit_data = JSON.parse File.read(latest_yjit_result_file)
penultimate_yjit_data = JSON.parse File.read(penultimate_yjit_result_file)
latest_yjit_ruby_desc = latest_yjit_data["ruby_metadata"]["RUBY_DESCRIPTION"]
penultimate_yjit_ruby_desc = penultimate_yjit_data["ruby_metadata"]["RUBY_DESCRIPTION"]

failing_benchmarks = check_failures.map { |h| h[:benchmark] }

puts "Filing Github issue - slower benchmark(s) found."
body = <<~BODY_TOP
Latest failing benchmark:
* Time: #{timestr_from_ts(ts_latest)}
* Ruby: #{escape_markdown latest_yjit_ruby_desc}
* [Raw YJIT prod data](#{latest_yjit_results.yjit_permalink})
Compared to previous benchmark:
* Time: #{timestr_from_ts(ts_penultimate)}
* Ruby: #{escape_markdown penultimate_yjit_ruby_desc}
* [Raw YJIT prod data](#{penultimate_yjit_results.yjit_permalink})
Slower benchmarks: #{failing_benchmarks.join(", ")}
[Timeline Graph](https://speed.yjit.org/timeline-deep##{failing_benchmarks.join("+")})
Slowdown details:
BODY_TOP

check_failures.each do |bench_hash|
# Indentation with here-docs is hard - use the old-style with extremely literal whitespace handling.
body += <<ONE_BENCH_REPORT
* #{bench_hash[:benchmark]}:
* Speed before: #{"%.2f" % bench_hash[:second_current_mean]} +/- #{"%.1f" % bench_hash[:second_current_rsd_pct]}%
* Speed after: #{"%.2f" % bench_hash[:current_mean]} +/- #{"%.1f" % bench_hash[:current_rsd_pct]}%
ONE_BENCH_REPORT
end

file_gh_issue("Benchmark at #{ts_latest} is significantly slower than the one before (#{ts_penultimate})!", body)
end

begin
run_benchmarks
rescue
host = `uname -a`.chomp
puts $!.full_message
raise "Exception in CI benchmarks: #{$!.message}!"
end
20 changes: 2 additions & 18 deletions continuous_reporting/generate_and_upload_reports.rb
@@ -36,11 +36,11 @@
},
"blog_speed_details" => {
report_type: :basic_report,
extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "tripwires.json", "csv" ],
extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "csv" ],
},
"blog_memory_details" => {
report_type: :basic_report,
extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "tripwires.json", "csv" ],
extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "csv" ],
},
"blog_yjit_stats" => {
report_type: :basic_report,
@@ -168,7 +168,6 @@ def ruby_version_from_metadata(metadata)
report_name_in_file = $1
raise "Non-matching report name with filename, #{report_name.inspect} vs #{report_name_in_file.inspect}!" unless report_name == report_name_in_file
ts = $2
-ext = $3

report_timestamps[ts] ||= {}
report_timestamps[ts][report_name] ||= []
@@ -333,21 +332,6 @@ def ruby_version_from_metadata(metadata)

puts "Static site seems to build correctly. That means that GHPages should do the right thing on push."

# Benchmark raw data was already committed. Built reports are now locally cached, not pushed.
#dirs_to_commit = [ "_benchmarks", "_includes", "reports" ]
### Commit built reports if there is something to commit
#diffs = (YJITMetrics.check_output "git status --porcelain #{dirs_to_commit.join(" ")}").chomp
#if diffs == ""
# puts "No changes found. Not committing or pushing."
#elsif no_push
# puts "Changes found, but --no-push was specified. Not committing or pushing."
#else
# puts "Changes found. Committing and pushing."
# YJITMetrics.check_call "git add #{dirs_to_commit.join(" ")}"
# YJITMetrics.check_call 'git commit -m "Update reports via continuous_reporting.rb script"'
# YJITMetrics.check_call "git push"
#end

# Copy built _site directory into raw pages repo as a new single commit, to branch new_pages
Dir.chdir GHPAGES_REPO
puts "Switched to #{Dir.pwd}"
2 changes: 1 addition & 1 deletion continuous_reporting/gh_tasks/run_benchmarks.sh
@@ -6,6 +6,6 @@ cd ~/ym/yjit-metrics

bundle

-ruby continuous_reporting/benchmark_and_update.rb --no-gh-issue --no-perf-tripwires --bench-params=$BENCH_PARAMS
+ruby continuous_reporting/benchmark_and_update.rb --no-gh-issue --bench-params=$BENCH_PARAMS

echo "Completed benchmarking successfully."