From bf890cb9b7783b724035edafb5d65708bf661231 Mon Sep 17 00:00:00 2001
From: Randy Stauner
Date: Mon, 13 Jan 2025 09:47:35 -0700
Subject: [PATCH 1/4] Remove old, unused perf tripwire code

---
 continuous_reporting/benchmark_and_update.rb | 272 ------------------
 .../generate_and_upload_reports.rb           |   4 +-
 .../gh_tasks/run_benchmarks.sh               |   2 +-
 .../reports/memory_details_report.rb         |   7 +-
 .../reports/speed_details_report.rb          |  19 +-
 5 files changed, 5 insertions(+), 299 deletions(-)

diff --git a/continuous_reporting/benchmark_and_update.rb b/continuous_reporting/benchmark_and_update.rb
index 1df0058d9..d17b7a70a 100755
--- a/continuous_reporting/benchmark_and_update.rb
+++ b/continuous_reporting/benchmark_and_update.rb
@@ -9,34 +9,6 @@
 require "optparse"
 require 'rbconfig'
 
-# TODO: should the benchmark-run and perf-check parts of this script be separated? Probably.
-
-# This is intended to be the top-level script for running benchmarks, reporting on them
-# and checking performance.
-
-# The tolerances below are for detecting big drops in benchmark performance. So far I haven't
-# had good luck with that -- it's either too sensitive and files spurious bugs a lot, or it's
-# too insensitive and doesn't notice real bugs. Right now, auto-filing bugs is turned off.
-
-# The STDDEV_TOLERANCE is what multiple of the standard deviation it's okay to drop on
-# a given run. That effectively determines the false-positive rate since we're comparing samples
-# from a Gaussian-ish distribution.
-NORMAL_STDDEV_TOLERANCE = 1.5
-# The DROP_TOLERANCE is what absolute multiple-of-the-mean drop (e.g. 0.05 means 5%) is
-# assumed to be okay. For each metric we use the more permissive of the two tolerances
-# on the more permissive of the two mean values. All four must be outside of tolerance
-# for us to count a failure.
-NORMAL_DROP_TOLERANCE = 0.07
-# A microbenchmark will normally have a very small stddev from run to run. That means
-# it's actually *less* tolerant of noise on the host, since "twice the stddev" is a
-# significantly smaller absolute value.
-MICRO_STDDEV_TOLERANCE = 2.0
-# A microbenchmark will routinely show persistent speed changes of much larger magnitude
-# than a larger, more varied benchmark. For example, setivar is surprisingly prone to
-# large swings in time taken depending on tiny changes to memory layout. So the drop
-# tolerance needs to be significantly larger to avoid frequent false positives.
-MICRO_DROP_TOLERANCE = 0.20
-
 BUILT_REPORTS_ROOT = YJITMetrics::ContinuousReporting::BUILT_REPORTS_ROOT
 RAW_BENCHMARK_ROOT = YJITMetrics::ContinuousReporting::RAW_BENCHMARK_ROOT
 
@@ -91,11 +63,6 @@ def args_for_bench_type(bench_type_arg)
 
 benchmark_args = nil
 bench_params_file = nil
-should_file_gh_issue = true
-no_perf_tripwires = false
-all_perf_tripwires = false
-single_perf_tripwire = nil
-is_verbose = false
 data_dir = nil
 full_rebuild = nil
 
@@ -110,33 +77,6 @@
         benchmark_args = args_for_bench_type(btype)
     end
 
-    # NOTE: GH issue filing is currently disabled
-    opts.on("-g", "--no-gh-issue", "Do not file an actual GitHub issue, only print failures to console") do
-        should_file_gh_issue = false
-    end
-
-    opts.on("-a", "--all-perf-tripwires", "Check performance tripwires on all pairs of benchmarks (implies --no-gh-issue)") do
-        if no_perf_tripwires || single_perf_tripwire
-            raise "Error! Please do not specify --all-perf-tripwires along with --no-perf-tripwires or --single-perf-tripwire!"
-        end
-        all_perf_tripwires = true
-        should_file_gh_issue = false
-    end
-
-    opts.on("-t TS", "--perf-timestamp TIMESTAMP", "Check performance tripwire at this specific timestamp") do |ts|
-        if all_perf_tripwires || no_perf_tripwires
-            raise "Error! Please do not specify a specific perf tripwire along with --no-perf-tripwires or --all-perf-tripwires!"
-        end
-        single_perf_tripwire = ts.strip
-    end
-
-    opts.on("-np", "--no-perf-tripwires", "Do not check performance tripwires") do
-        if all_perf_tripwires || single_perf_tripwire
-            raise "Error! Please do not specify --no-perf-tripwires along with --single-perf-tripwire or --all-perf-tripwires!"
-        end
-        no_perf_tripwires = true
-    end
-
     opts.on("-fr YN", "--full-rebuild YN") do |fr|
         if fr.nil? || fr.strip == ""
             full_rebuild = true
@@ -154,51 +94,17 @@ def args_for_bench_type(bench_type_arg)
         raise("No such directory: #{ddir.inspect}!") unless File.directory?(ddir)
         data_dir = ddir
     end
-
-    opts.on("-v", "--verbose", "Print verbose output about tripwire checks") do
-        is_verbose = true
-    end
 end.parse!
 
 bench_params_data = bench_params_file ? JSON.parse(File.read bench_params_file) : {}
 
 BENCHMARK_ARGS = benchmark_args || (bench_params_data["bench_type"] ? args_for_bench_type(bench_params_data["bench_type"]) : BENCH_TYPES["default"])
 FULL_REBUILD = !full_rebuild.nil? ? full_rebuild : (bench_params_data["full_rebuild"] || false)
-FILE_GH_ISSUE = should_file_gh_issue
-ALL_PERF_TRIPWIRES = all_perf_tripwires
-SINGLE_PERF_TRIPWIRE = single_perf_tripwire
-VERBOSE = is_verbose
 BENCH_PARAMS_FILE = bench_params_file
 
 DATA_DIR = File.expand_path(data_dir || bench_params_data["data_directory"] || "continuous_reporting/data")
 
 PIDFILE = "/home/ubuntu/benchmark_ci.pid"
-GITHUB_TOKEN=ENV["BENCHMARK_CI_GITHUB_TOKEN"]
-if FILE_GH_ISSUE && !GITHUB_TOKEN
-    raise "Set BENCHMARK_CI_GITHUB_TOKEN to an appropriate GitHub token to allow auto-filing issues! (or set --no-gh-issue)"
-end
-
-def ghapi_post(api_uri, params, verb: :post)
-    uri = URI("https://api.github.com" + api_uri)
-
-    req = Net::HTTP::Post.new(uri)
-    #req.basic_auth GITHUB_USER, GITHUB_TOKEN
-    req['Accept'] = "application/vnd.github.v3+json"
-    req['Content-Type'] = "application/json"
-    req['Authorization'] = "Bearer #{GITHUB_TOKEN}"
-    req.body = JSON.dump(params)
-    result = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { |http| http.request(req) }
-
-    unless result.is_a?(Net::HTTPSuccess)
-        $stderr.puts "Error in HTTP #{verb.upcase}: #{result.inspect}"
-        $stderr.puts result.body
-        $stderr.puts "------"
-        raise "HTTP error when posting to #{api_uri}!"
-    end
-
-    JSON.load(result.body)
-end
-
 class BenchmarkDetails
     def initialize(timestamp)
         @timestamp = timestamp
@@ -227,32 +133,6 @@ def escape_markdown(s)
     s.gsub(/(\*|\_|\`)/) { '\\' + $1 }.gsub("<", "&lt;")
 end
 
-def file_gh_issue(title, message)
-    host = `uname -a`.chomp
-    issue_body = <<~ISSUE
-        Error running benchmark CI job on #{host}:
-
-        #{message}
-    ISSUE
-
-    unless FILE_GH_ISSUE
-        print "We would file a GitHub issue, but we were asked not to. Details:\n\n"
-
-        print "==============================\n"
-        print "Title: CI Benchmarking: #{title}\n"
-        puts issue_body
-        print "==============================\n"
-        return
-    end
-
-    # Note: if you're set up as the GitHub user, it's not gonna email you since it thinks you filed it yourself.
-    ghapi_post "/repos/Shopify/yjit-metrics/issues",
-        {
-            "title" => "CI Benchmarking: #{title}",
-            "body" => issue_body
-        }
-end
-
 if File.exist?(PIDFILE)
     pid = File.read(PIDFILE).to_i
     if pid && pid > 0
@@ -280,10 +160,6 @@ def run_benchmarks
     end
 end
 
-def ts_from_tripwire_filename(filename)
-    filename.split("blog_speed_details_")[1].split(".")[0]
-end
-
 def timestr_from_ts(ts)
     if ts =~ /\A(\d{4}-\d{2}-\d{2})-(\d{6})\Z/
         year_month_day = $1
@@ -297,154 +173,6 @@ def timestr_from_ts(ts)
     end
 end
 
-# If benchmark results drop noticeably, file a Github issue
-def check_perf_tripwires
-    Dir.chdir(YM_REPORT_DIR) do
-        # Grab only non-platform-specific tripwire files that do *not* have a platform name in them,
-        # but instead end in a six-digit timestamp.
-        tripwire_files = Dir["*.tripwires.json"].to_a.select {|f| f =~ /\d{6}\.tripwires\.json\Z/}.sort
-
-        if ALL_PERF_TRIPWIRES
-            (tripwire_files.size - 1).times do |index|
-                check_one_perf_tripwire(tripwire_files[index], tripwire_files[index - 1])
-            end
-        elsif SINGLE_PERF_TRIPWIRE
-            specified_file = tripwire_files.detect { |f| f.include?(SINGLE_PERF_TRIPWIRE) }
-            raise "Couldn't find perf tripwire report containing #{SINGLE_PERF_TRIPWIRE.inspect}!" unless specified_file
-
-            specified_index = tripwire_files.index(specified_file)
-            raise "Can't check perf on the very first report!" if specified_index == 0
-
-            check_one_perf_tripwire(tripwire_files[specified_index], tripwire_files[specified_index - 1])
-        else
-            check_one_perf_tripwire(tripwire_files[-1], tripwire_files[-2])
-        end
-    end
-end
-
-def check_one_perf_tripwire(current_filename, compared_filename, verbose: VERBOSE)
-    YJITMetrics::PLATFORMS.each do |platform|
-        platform_current_filename = current_filename.gsub(".tripwires.json", ".#{platform}.tripwires.json")
-    end
-
-end
-
-def check_one_perf_tripwire_file(current_filename, compared_filename, verbose: VERBOSE)
-    current_data = JSON.parse File.read(current_filename)
-    compared_data = JSON.parse File.read(compared_filename)
-
-    check_failures = []
-
-    compared_data.each do |bench_name, values|
-        # Only compare if both sets of data have the benchmark
-        next unless current_data[bench_name]
-
-        current_mean = current_data[bench_name]["mean"]
-        current_rsd_pct = current_data[bench_name]["rsd_pct"]
-        compared_mean = values["mean"]
-        compared_rsd_pct = values["rsd_pct"]
-
-        current_stddev = (current_rsd_pct.to_f / 100.0) * current_mean
-        compared_stddev = (compared_rsd_pct.to_f / 100.0) * compared_mean
-
-        # Normally is_micro should be the same in all cases for any one specific benchmark.
-        # So we just assume the most recent data has the correct value.
-        is_micro = current_data[bench_name]["micro"]
-
-        if is_micro
-            bench_stddev_tolerance = MICRO_STDDEV_TOLERANCE
-            bench_drop_tolerance = MICRO_DROP_TOLERANCE
-        else
-            bench_stddev_tolerance = NORMAL_STDDEV_TOLERANCE
-            bench_drop_tolerance = NORMAL_DROP_TOLERANCE
-        end
-
-        # Occasionally stddev can change pretty wildly from run to run. Take the most tolerant of multiple-of-recent-stddev,
-        # or a percentage of the larger mean runtime. Basically, a drop must be unusual enough (stddev) and large enough (mean)
-        # for us to flag it.
-        tolerance = [ current_stddev * bench_stddev_tolerance, compared_stddev * bench_stddev_tolerance,
-            current_mean * bench_drop_tolerance, compared_mean * bench_drop_tolerance ].max
-
-        drop = current_mean - compared_mean
-
-        if verbose
-            puts "#{is_micro ? "Microbenchmark" : "Benchmark"} #{bench_name}, tolerance is #{ "%.2f" % tolerance }, latest mean is #{ "%.2f" % current_mean } (stddev #{"%.2f" % current_stddev}), " +
-                "next-latest mean is #{ "%.2f" % compared_mean } (stddev #{ "%.2f" % compared_stddev}), drop is #{ "%.2f" % drop }..."
-        end
-
-        if drop > tolerance
-            puts "Benchmark #{bench_name} marked as failure!" if verbose
-            check_failures.push({
-                benchmark: bench_name,
-                current_mean: current_mean,
-                second_current_mean: compared_mean,
-                current_stddev: current_stddev,
-                current_rsd_pct: current_rsd_pct,
-                second_current_stddev: compared_stddev,
-                second_current_rsd_pct: compared_rsd_pct,
-            })
-        end
-    end
-
-    if check_failures.empty?
-        puts "No benchmarks failing performance tripwire (#{current_filename})"
-        return
-    end
-
-    puts "Failing benchmarks (#{current_filename}): #{check_failures.map { |h| h[:benchmark] }}"
-    file_perf_bug(current_filename, compared_filename, check_failures)
-end
-
-def file_perf_bug(current_filename, compared_filename, check_failures)
-    ts_latest = ts_from_tripwire_filename(current_filename)
-    ts_penultimate = ts_from_tripwire_filename(compared_filename)
-
-    # This expects to be called from the _includes/reports directory
-    latest_yjit_results = BenchmarkDetails.new(ts_latest)
-    latest_yjit_result_file = latest_yjit_results.yjit_test_result
-    penultimate_yjit_results = BenchmarkDetails.new(ts_penultimate)
-    penultimate_yjit_result_file = penultimate_yjit_results.yjit_test_result
-    latest_yjit_data = JSON.parse File.read(latest_yjit_result_file)
-    penultimate_yjit_data = JSON.parse File.read(penultimate_yjit_result_file)
-    latest_yjit_ruby_desc = latest_yjit_data["ruby_metadata"]["RUBY_DESCRIPTION"]
-    penultimate_yjit_ruby_desc = penultimate_yjit_data["ruby_metadata"]["RUBY_DESCRIPTION"]
-
-    failing_benchmarks = check_failures.map { |h| h[:benchmark] }
-
-    puts "Filing Github issue - slower benchmark(s) found."
-    body = <<~BODY_TOP
-        Latest failing benchmark:
-
-        * Time: #{timestr_from_ts(ts_latest)}
-        * Ruby: #{escape_markdown latest_yjit_ruby_desc}
-        * [Raw YJIT prod data](#{latest_yjit_results.yjit_permalink})
-
-        Compared to previous benchmark:
-
-        * Time: #{timestr_from_ts(ts_penultimate)}
-        * Ruby: #{escape_markdown penultimate_yjit_ruby_desc}
-        * [Raw YJIT prod data](#{penultimate_yjit_results.yjit_permalink})
-
-        Slower benchmarks: #{failing_benchmarks.join(", ")}
-
-        [Timeline Graph](https://speed.yjit.org/timeline-deep##{failing_benchmarks.join("+")})
-
-        Slowdown details:
-
-    BODY_TOP
-
-    check_failures.each do |bench_hash|
-        # Indentation with here-docs is hard - use the old-style with extremely literal whitespace handling.
-        body += <<
[unrecoverable: the heredoc body and the remainder of this hunk were lost in extraction; the file header below is restored from this patch's diffstat]

diff --git a/continuous_reporting/generate_and_upload_reports.rb b/continuous_reporting/generate_and_upload_reports.rb
--- a/continuous_reporting/generate_and_upload_reports.rb
+++ b/continuous_reporting/generate_and_upload_reports.rb
     "blog_speed_details" => { report_type: :basic_report,
-        extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "tripwires.json", "csv" ],
+        extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "csv" ],
     },
     "blog_memory_details" => { report_type: :basic_report,
-        extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "tripwires.json", "csv" ],
+        extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "csv" ],
     },
     "blog_yjit_stats" => { report_type: :basic_report,
diff --git a/continuous_reporting/gh_tasks/run_benchmarks.sh b/continuous_reporting/gh_tasks/run_benchmarks.sh
index b09e45b9d..c1981fdd3 100755
--- a/continuous_reporting/gh_tasks/run_benchmarks.sh
+++ b/continuous_reporting/gh_tasks/run_benchmarks.sh
@@ -6,6 +6,6 @@ cd ~/ym/yjit-metrics
 
 bundle
 
-ruby continuous_reporting/benchmark_and_update.rb --no-gh-issue --no-perf-tripwires --bench-params=$BENCH_PARAMS
+ruby continuous_reporting/benchmark_and_update.rb --no-gh-issue --bench-params=$BENCH_PARAMS
 
 echo "Completed benchmarking successfully."
diff --git a/lib/yjit_metrics/reports/memory_details_report.rb b/lib/yjit_metrics/reports/memory_details_report.rb
index 143d2b46a..53c1e1d24 100644
--- a/lib/yjit_metrics/reports/memory_details_report.rb
+++ b/lib/yjit_metrics/reports/memory_details_report.rb
@@ -12,7 +12,7 @@ class MemoryDetailsReport < SpeedDetailsReport
     #end
 
     def self.report_extensions
-      [ "html", "svg", "head.svg", "back.svg", "micro.svg", "tripwires.json", "csv" ]
+      [ "html", "svg", "head.svg", "back.svg", "micro.svg", "csv" ]
     end
 
     def initialize(config_names, results, platform:, benchmarks: [])
@@ -82,10 +82,5 @@ def html_template_path
     def relative_values_by_config_and_benchmark
       @peak_mb_relative_by_config
     end
-
-    # FIXME: We aren't reporting on the tripwires currently, but it makes sense to implement it and report on it.
-    def tripwires
-      {}
-    end
   end
 end
diff --git a/lib/yjit_metrics/reports/speed_details_report.rb b/lib/yjit_metrics/reports/speed_details_report.rb
index 5e3297e3d..d23ac2f5c 100644
--- a/lib/yjit_metrics/reports/speed_details_report.rb
+++ b/lib/yjit_metrics/reports/speed_details_report.rb
@@ -14,7 +14,7 @@ class SpeedDetailsReport < BloggableSingleReport
     #end
 
     def self.report_extensions
-      [ "html", "svg", "head.svg", "back.svg", "micro.svg", "tripwires.json", "csv" ]
+      [ "html", "svg", "head.svg", "back.svg", "micro.svg", "csv" ]
     end
 
     def initialize(orig_config_names, results, platform:, benchmarks: [])
@@ -426,19 +426,6 @@ def svg_object(relative_values_by_config_and_benchmark, benchmarks: @benchmark_n
       svg
     end
 
-    def tripwires
-      tripwires = {}
-      micro = micro_benchmarks
-      @benchmark_names.each_with_index do |bench_name, idx|
-        tripwires[bench_name] = {
-          mean: @mean_by_config[@with_yjit_config][idx],
-          rsd_pct: @rsd_pct_by_config[@with_yjit_config][idx],
-          micro: micro.include?(bench_name),
-        }
-      end
-      tripwires
-    end
-
     def html_template_path
       File.expand_path("../report_templates/blog_speed_details.html.erb", __dir__)
     end
@@ -491,10 +478,6 @@ def write_file(filename)
       html_output = script_template.result(binding)
       File.open(filename + ".#{@platform}.html", "w") { |f| f.write(html_output) }
 
-      # The Tripwire report is used to tell when benchmark performance drops suddenly
-      json_data = tripwires
-      File.open(filename + ".#{@platform}.tripwires.json", "w") { |f| f.write JSON.pretty_generate json_data }
-
       write_to_csv(filename + ".#{@platform}.csv", [@headings] + report_table_data)
     end
   end
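[Reviewer note on PATCH 1/4] The detection logic this patch deletes reduces to the comparison below. This is a minimal, self-contained sketch reconstructed from the removed check_one_perf_tripwire_file; the constants and JSON field names come from the deleted code, while the sample entries at the bottom are invented for illustration.

  # Sketch of the deleted tripwire check. Constants and field names match the
  # old benchmark_and_update.rb; the sample data below is made up.
  NORMAL_STDDEV_TOLERANCE = 1.5
  NORMAL_DROP_TOLERANCE = 0.07
  MICRO_STDDEV_TOLERANCE = 2.0
  MICRO_DROP_TOLERANCE = 0.20

  # current/compared are per-benchmark entries from two *.tripwires.json files.
  def tripwire_failure?(current, compared)
    # rsd_pct is the relative standard deviation, as a percentage of the mean.
    current_stddev = (current["rsd_pct"].to_f / 100.0) * current["mean"]
    compared_stddev = (compared["rsd_pct"].to_f / 100.0) * compared["mean"]

    stddev_tol = current["micro"] ? MICRO_STDDEV_TOLERANCE : NORMAL_STDDEV_TOLERANCE
    drop_tol = current["micro"] ? MICRO_DROP_TOLERANCE : NORMAL_DROP_TOLERANCE

    # The drop must beat the *most* permissive of four bounds -- a multiple of
    # either run's stddev, or a fraction of either run's mean -- to count.
    tolerance = [current_stddev * stddev_tol, compared_stddev * stddev_tol,
                 current["mean"] * drop_tol, compared["mean"] * drop_tol].max

    (current["mean"] - compared["mean"]) > tolerance
  end

  latest   = { "mean" => 2.30, "rsd_pct" => 1.0, "micro" => false }  # newer, slower run
  previous = { "mean" => 2.00, "rsd_pct" => 1.0, "micro" => false }
  # Drop is 0.30; tolerance is max(0.0345, 0.030, 0.161, 0.140) = 0.161, so this trips.
  puts tripwire_failure?(latest, previous)  # => true

The .max over four bounds is what kept false positives down: a slowdown had to be both statistically unusual (a multiple of either run's stddev) and practically large (a fraction of either run's mean) before it counted as a failure.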
-#dirs_to_commit = [ "_benchmarks", "_includes", "reports" ]
-### Commit built reports if there is something to commit
-#diffs = (YJITMetrics.check_output "git status --porcelain #{dirs_to_commit.join(" ")}").chomp
-#if diffs == ""
-#    puts "No changes found. Not committing or pushing."
-#elsif no_push
-#    puts "Changes found, but --no-push was specified. Not committing or pushing."
-#else
-#    puts "Changes found. Committing and pushing."
-#    YJITMetrics.check_call "git add #{dirs_to_commit.join(" ")}"
-#    YJITMetrics.check_call 'git commit -m "Update reports via continuous_reporting.rb script"'
-#    YJITMetrics.check_call "git push"
-#end
-
 # Copy built _site directory into raw pages repo as a new single commit, to branch new_pages
 Dir.chdir GHPAGES_REPO
 puts "Switched to #{Dir.pwd}"
diff --git a/timeline_report.rb b/timeline_report.rb
index 358fbb0da..63660ddb5 100755
--- a/timeline_report.rb
+++ b/timeline_report.rb
@@ -83,16 +83,12 @@ def ruby_desc_to_sha(ruby_desc)
     exit -1
 end
 
-latest_ts = relevant_results.map { |_, _, timestamp, _, _| timestamp }.max
 puts "Loading #{relevant_results.size} data files..."
 
 result_set_by_ts = {}
-filepaths_by_ts = {}
 ruby_desc_by_config_and_ts = {}
 relevant_results.each do |filepath, config_name, timestamp, run_num, platform|
     benchmark_data = JSON.load(File.read(filepath))
-    filepaths_by_ts[timestamp] ||= []
-    filepaths_by_ts[timestamp].push filepath # Is this used? I'm not sure this is used.
 
     ruby_desc_by_config_and_ts[config_name] ||= {}
     ruby_desc_by_config_and_ts[config_name][timestamp] = ruby_desc_to_sha benchmark_data["ruby_metadata"]["RUBY_DESCRIPTION"]
@@ -109,9 +105,6 @@ def ruby_desc_to_sha(ruby_desc)
 configs = relevant_results.map { |_, config_name, _, _, _| config_name }.uniq.sort
 all_timestamps = result_set_by_ts.keys.sort
 
-# This should match the JS parser in the template files
-TIME_FORMAT = "%Y %m %d %H %M %S"
-
 # Grab a statistical summary of every timestamp, config and benchmark
 summary_by_ts = {}
 all_benchmarks = []

From aa876ec56b29bbc65582ed77eccac23065873c18 Mon Sep 17 00:00:00 2001
From: Randy Stauner
Date: Mon, 13 Jan 2025 15:02:29 -0700
Subject: [PATCH 4/4] Remove unnecessary nearly duplicate regexp

---
 timeline_report.rb | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/timeline_report.rb b/timeline_report.rb
index 63660ddb5..06e469776 100755
--- a/timeline_report.rb
+++ b/timeline_report.rb
@@ -41,7 +41,6 @@
 output_dir = File.expand_path(output_dir)
 
 DATASET_FILENAME_RE = /^(\d{4}-\d{2}-\d{2}-\d{6})_basic_benchmark_(\d{4}_)?(.*).json$/
-DATASET_FILEPATH_RE = /^(.*\/)?(\d{4}-\d{2}-\d{2}-\d{6})_basic_benchmark_(\d{4}_)?(.*).json$/
 # Return the information from the file path - run_num is nil if the file isn't in multi-run format
 def parse_dataset_filepath(filepath)
     filename = filepath.split("/")[-1]
@@ -75,12 +74,12 @@ def ruby_desc_to_sha(ruby_desc)
 
 Dir.chdir(data_dir)
 
-files_in_dir = Dir["**/*"].grep(DATASET_FILEPATH_RE)
+files_in_dir = Dir["**/*_basic_benchmark_*.json"].select { |x| File.basename(x) =~ DATASET_FILENAME_RE }
 relevant_results = files_in_dir.map { |filepath| parse_dataset_filepath(filepath) }
 
 if relevant_results.size == 0
     puts "No relevant data files found for directory #{data_dir.inspect} and specified arguments!"
-    exit -1
+    exit(-1)
 end
 
 puts "Loading #{relevant_results.size} data files..."
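[Reviewer note on PATCH 4/4] DATASET_FILEPATH_RE differed from DATASET_FILENAME_RE only by an optional leading-directory group, (.*\/)?, so matching File.basename(path) against the surviving regexp accepts the same files; parse_dataset_filepath already splits off the basename the same way. A standalone check follows as a sketch; the filenames are invented examples in the raw-data naming scheme, not real result files.

  # Check that File.basename plus the surviving DATASET_FILENAME_RE covers
  # what the deleted DATASET_FILEPATH_RE matched.
  DATASET_FILENAME_RE = /^(\d{4}-\d{2}-\d{2}-\d{6})_basic_benchmark_(\d{4}_)?(.*).json$/

  [
    "2025-01-13-094735_basic_benchmark_0001_prod_ruby_with_yjit.json",  # multi-run format
    "raw/2025-01-13-094735_basic_benchmark_prod_ruby_no_jit.json",      # nested path
    "notes.txt",                                                        # not a data file
  ].each do |path|
    m = File.basename(path).match(DATASET_FILENAME_RE)
    puts m ? "#{path}: ts=#{m[1]} run=#{m[2].inspect} config=#{m[3]}" : "#{path}: skipped"
  end

The tighter Dir glob ("**/*_basic_benchmark_*.json" instead of "**/*") also avoids listing every file in the data tree just to regexp-filter it afterward.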