Merge pull request #376 from Shopify/rwstauner/regression-detection
Remove unused code (mostly around perf tripwires)
maximecb authored Jan 14, 2025
2 parents fed94c3 + aa876ec commit 0d8bc9b
Showing 6 changed files with 7 additions and 326 deletions.
273 changes: 0 additions & 273 deletions continuous_reporting/benchmark_and_update.rb
@@ -9,34 +9,6 @@
require "optparse"
require 'rbconfig'

# TODO: should the benchmark-run and perf-check parts of this script be separated? Probably.

# This is intended to be the top-level script for running benchmarks, reporting on them
# and checking performance.

# The tolerances below are for detecting big drops in benchmark performance. So far I haven't
# had good luck with that -- it's either too sensitive and files spurious bugs a lot, or it's
# too insensitive and doesn't notice real bugs. Right now, auto-filing bugs is turned off.

# STDDEV_TOLERANCE is the multiple of the standard deviation that a benchmark is allowed
# to drop by on a given run. That effectively sets the false-positive rate, since we're
# comparing samples from a Gaussian-ish distribution.
NORMAL_STDDEV_TOLERANCE = 1.5
# DROP_TOLERANCE is the fraction of the mean (e.g. 0.05 means a 5% drop) that is
# assumed to be okay. For each metric we take the most permissive of the stddev-based
# and mean-based tolerances across both runs; a drop must exceed all four candidates
# for us to count a failure.
NORMAL_DROP_TOLERANCE = 0.07
# A microbenchmark will normally have a very small stddev from run to run. That means
# it's actually *less* tolerant of noise on the host, since "twice the stddev" is a
# significantly smaller absolute value.
MICRO_STDDEV_TOLERANCE = 2.0
# A microbenchmark will routinely show persistent speed changes of much larger magnitude
# than a larger, more varied benchmark. For example, setivar is surprisingly prone to
# large swings in time taken depending on tiny changes to memory layout. So the drop
# tolerance needs to be significantly larger to avoid frequent false positives.
MICRO_DROP_TOLERANCE = 0.20
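# Illustrative sketch (not part of this commit): how the four tolerance candidates
# combine. The numbers below are hypothetical: a non-micro benchmark whose mean
# time rises from 1.00s to 1.10s, with a 2% relative stddev on each run.
#
#   current_mean, compared_mean     = 1.10, 1.00
#   current_stddev, compared_stddev = 0.022, 0.020
#   tolerance = [ current_stddev * NORMAL_STDDEV_TOLERANCE,   # 0.033
#                 compared_stddev * NORMAL_STDDEV_TOLERANCE,  # 0.030
#                 current_mean * NORMAL_DROP_TOLERANCE,       # 0.077
#                 compared_mean * NORMAL_DROP_TOLERANCE ].max # 0.070 -> max is 0.077
#   drop = current_mean - compared_mean                       # 0.10 > 0.077, so flagged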

BUILT_REPORTS_ROOT = YJITMetrics::ContinuousReporting::BUILT_REPORTS_ROOT
RAW_BENCHMARK_ROOT = YJITMetrics::ContinuousReporting::RAW_BENCHMARK_ROOT

@@ -91,11 +63,6 @@ def args_for_bench_type(bench_type_arg)

benchmark_args = nil
bench_params_file = nil
should_file_gh_issue = true
no_perf_tripwires = false
all_perf_tripwires = false
single_perf_tripwire = nil
is_verbose = false
data_dir = nil
full_rebuild = nil

@@ -110,33 +77,6 @@ def args_for_bench_type(bench_type_arg)
benchmark_args = args_for_bench_type(btype)
end

# NOTE: GH issue filing is currently disabled
opts.on("-g", "--no-gh-issue", "Do not file an actual GitHub issue, only print failures to console") do
should_file_gh_issue = false
end

opts.on("-a", "--all-perf-tripwires", "Check performance tripwires on all pairs of benchmarks (implies --no-gh-issue)") do
if no_perf_tripwires || single_perf_tripwire
raise "Error! Please do not specify --all-perf-tripwires along with --no-perf-tripwires or --single-perf-tripwire!"
end
all_perf_tripwires = true
should_file_gh_issue = false
end

opts.on("-t TS", "--perf-timestamp TIMESTAMP", "Check performance tripwire at this specific timestamp") do |ts|
if all_perf_tripwires || no_perf_tripwires
raise "Error! Please do not specify a specific perf tripwire along with --no-perf-tripwires or --all-perf-tripwires!"
end
single_perf_tripwire = ts.strip
end

opts.on("-np", "--no-perf-tripwires", "Do not check performance tripwires") do
if all_perf_tripwires || single_perf_tripwire
raise "Error! Please do not specify --no-perf-tripwires along with --single-perf-tripwire or --all-perf-tripwires!"
end
no_perf_tripwires = true
end

opts.on("-fr YN", "--full-rebuild YN") do |fr|
if fr.nil? || fr.strip == ""
full_rebuild = true
@@ -154,51 +94,17 @@
raise("No such directory: #{ddir.inspect}!") unless File.directory?(ddir)
data_dir = ddir
end

opts.on("-v", "--verbose", "Print verbose output about tripwire checks") do
is_verbose = true
end
end.parse!

bench_params_data = bench_params_file ? JSON.parse(File.read bench_params_file) : {}

BENCHMARK_ARGS = benchmark_args || (bench_params_data["bench_type"] ? args_for_bench_type(bench_params_data["bench_type"]) : BENCH_TYPES["default"])
FULL_REBUILD = !full_rebuild.nil? ? full_rebuild : (bench_params_data["full_rebuild"] || false)
FILE_GH_ISSUE = should_file_gh_issue
ALL_PERF_TRIPWIRES = all_perf_tripwires
SINGLE_PERF_TRIPWIRE = single_perf_tripwire
VERBOSE = is_verbose
BENCH_PARAMS_FILE = bench_params_file
DATA_DIR = File.expand_path(data_dir || bench_params_data["data_directory"] || "continuous_reporting/data")

PIDFILE = "/home/ubuntu/benchmark_ci.pid"

GITHUB_TOKEN=ENV["BENCHMARK_CI_GITHUB_TOKEN"]
if FILE_GH_ISSUE && !GITHUB_TOKEN
raise "Set BENCHMARK_CI_GITHUB_TOKEN to an appropriate GitHub token to allow auto-filing issues! (or set --no-gh-issue)"
end

def ghapi_post(api_uri, params, verb: :post)
uri = URI("https://api.github.com" + api_uri)

req = Net::HTTP::Post.new(uri)
#req.basic_auth GITHUB_USER, GITHUB_TOKEN
req['Accept'] = "application/vnd.github.v3+json"
req['Content-Type'] = "application/json"
req['Authorization'] = "Bearer #{GITHUB_TOKEN}"
req.body = JSON.dump(params)
result = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { |http| http.request(req) }

unless result.is_a?(Net::HTTPSuccess)
$stderr.puts "Error in HTTP #{verb.upcase}: #{result.inspect}"
$stderr.puts result.body
$stderr.puts "------"
raise "HTTP error when posting to #{api_uri}!"
end

JSON.load(result.body)
end

class BenchmarkDetails
def initialize(timestamp)
@timestamp = timestamp
@@ -227,32 +133,6 @@ def escape_markdown(s)
s.gsub(/(\*|\_|\`)/) { '\\' + $1 }.gsub("<", "&lt;")
end

def file_gh_issue(title, message)
host = `uname -a`.chomp
issue_body = <<~ISSUE
Error running benchmark CI job on #{host}:
#{message}
ISSUE

unless FILE_GH_ISSUE
print "We would file a GitHub issue, but we were asked not to. Details:\n\n"

print "==============================\n"
print "Title: CI Benchmarking: #{title}\n"
puts issue_body
print "==============================\n"
return
end

# Note: if you're set up as the GitHub user, it's not gonna email you since it thinks you filed it yourself.
ghapi_post "/repos/Shopify/yjit-metrics/issues",
{
"title" => "CI Benchmarking: #{title}",
"body" => issue_body
}
end

if File.exist?(PIDFILE)
pid = File.read(PIDFILE).to_i
if pid && pid > 0
@@ -280,10 +160,6 @@ def run_benchmarks
end
end

def ts_from_tripwire_filename(filename)
filename.split("blog_speed_details_")[1].split(".")[0]
end

def timestr_from_ts(ts)
if ts =~ /\A(\d{4}-\d{2}-\d{2})-(\d{6})\Z/
year_month_day = $1
@@ -297,158 +173,9 @@ def timestr_from_ts(ts)
end
end

# If benchmark results drop noticeably, file a GitHub issue
def check_perf_tripwires
Dir.chdir(YM_REPORT_DIR) do
# Grab only the non-platform-specific tripwire files: the ones with no platform
# name, which instead end in a six-digit timestamp.
tripwire_files = Dir["*.tripwires.json"].to_a.select {|f| f =~ /\d{6}\.tripwires\.json\Z/}.sort

if ALL_PERF_TRIPWIRES
(tripwire_files.size - 1).times do |index|
check_one_perf_tripwire(tripwire_files[index + 1], tripwire_files[index])
end
elsif SINGLE_PERF_TRIPWIRE
specified_file = tripwire_files.detect { |f| f.include?(SINGLE_PERF_TRIPWIRE) }
raise "Couldn't find perf tripwire report containing #{SINGLE_PERF_TRIPWIRE.inspect}!" unless specified_file

specified_index = tripwire_files.index(specified_file)
raise "Can't check perf on the very first report!" if specified_index == 0

check_one_perf_tripwire(tripwire_files[specified_index], tripwire_files[specified_index - 1])
else
check_one_perf_tripwire(tripwire_files[-1], tripwire_files[-2])
end
end
end

def check_one_perf_tripwire(current_filename, compared_filename, verbose: VERBOSE)
YJITMetrics::PLATFORMS.each do |platform|
platform_current_filename = current_filename.gsub(".tripwires.json", ".#{platform}.tripwires.json")
platform_compared_filename = compared_filename.gsub(".tripwires.json", ".#{platform}.tripwires.json")

# Check each platform-specific pair of files (assumed from context: only when both exist).
if File.exist?(platform_current_filename) && File.exist?(platform_compared_filename)
check_one_perf_tripwire_file(platform_current_filename, platform_compared_filename, verbose: verbose)
end
end
end

def check_one_perf_tripwire_file(current_filename, compared_filename, verbose: VERBOSE)
current_data = JSON.parse File.read(current_filename)
compared_data = JSON.parse File.read(compared_filename)

check_failures = []

compared_data.each do |bench_name, values|
# Only compare if both sets of data have the benchmark
next unless current_data[bench_name]

current_mean = current_data[bench_name]["mean"]
current_rsd_pct = current_data[bench_name]["rsd_pct"]
compared_mean = values["mean"]
compared_rsd_pct = values["rsd_pct"]

current_stddev = (current_rsd_pct.to_f / 100.0) * current_mean
compared_stddev = (compared_rsd_pct.to_f / 100.0) * compared_mean

# Normally is_micro should be the same in all cases for any one specific benchmark.
# So we just assume the most recent data has the correct value.
is_micro = current_data[bench_name]["micro"]

if is_micro
bench_stddev_tolerance = MICRO_STDDEV_TOLERANCE
bench_drop_tolerance = MICRO_DROP_TOLERANCE
else
bench_stddev_tolerance = NORMAL_STDDEV_TOLERANCE
bench_drop_tolerance = NORMAL_DROP_TOLERANCE
end

# Occasionally stddev can change pretty wildly from run to run. Take the most tolerant of multiple-of-recent-stddev,
# or a percentage of the larger mean runtime. Basically, a drop must be unusual enough (stddev) and large enough (mean)
# for us to flag it.
tolerance = [ current_stddev * bench_stddev_tolerance, compared_stddev * bench_stddev_tolerance,
current_mean * bench_drop_tolerance, compared_mean * bench_drop_tolerance ].max

drop = current_mean - compared_mean

if verbose
puts "#{is_micro ? "Microbenchmark" : "Benchmark"} #{bench_name}, tolerance is #{ "%.2f" % tolerance }, latest mean is #{ "%.2f" % current_mean } (stddev #{"%.2f" % current_stddev}), " +
"next-latest mean is #{ "%.2f" % compared_mean } (stddev #{ "%.2f" % compared_stddev}), drop is #{ "%.2f" % drop }..."
end

if drop > tolerance
puts "Benchmark #{bench_name} marked as failure!" if verbose
check_failures.push({
benchmark: bench_name,
current_mean: current_mean,
second_current_mean: compared_mean,
current_stddev: current_stddev,
current_rsd_pct: current_rsd_pct,
second_current_stddev: compared_stddev,
second_current_rsd_pct: compared_rsd_pct,
})
end
end

if check_failures.empty?
puts "No benchmarks failing performance tripwire (#{current_filename})"
return
end

puts "Failing benchmarks (#{current_filename}): #{check_failures.map { |h| h[:benchmark] }}"
file_perf_bug(current_filename, compared_filename, check_failures)
end
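# Illustrative usage (hypothetical filenames, not from this commit): compare one
# platform's two most recent tripwire reports, printing per-benchmark detail.
#   check_one_perf_tripwire_file("blog_speed_details_2024-01-02-123456.x86_64.tripwires.json",
#                                "blog_speed_details_2024-01-01-123456.x86_64.tripwires.json",
#                                verbose: true)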

def file_perf_bug(current_filename, compared_filename, check_failures)
ts_latest = ts_from_tripwire_filename(current_filename)
ts_penultimate = ts_from_tripwire_filename(compared_filename)

# This expects to be called from the _includes/reports directory
latest_yjit_results = BenchmarkDetails.new(ts_latest)
latest_yjit_result_file = latest_yjit_results.yjit_test_result
penultimate_yjit_results = BenchmarkDetails.new(ts_penultimate)
penultimate_yjit_result_file = penultimate_yjit_results.yjit_test_result
latest_yjit_data = JSON.parse File.read(latest_yjit_result_file)
penultimate_yjit_data = JSON.parse File.read(penultimate_yjit_result_file)
latest_yjit_ruby_desc = latest_yjit_data["ruby_metadata"]["RUBY_DESCRIPTION"]
penultimate_yjit_ruby_desc = penultimate_yjit_data["ruby_metadata"]["RUBY_DESCRIPTION"]

failing_benchmarks = check_failures.map { |h| h[:benchmark] }

puts "Filing Github issue - slower benchmark(s) found."
body = <<~BODY_TOP
Latest failing benchmark:
* Time: #{timestr_from_ts(ts_latest)}
* Ruby: #{escape_markdown latest_yjit_ruby_desc}
* [Raw YJIT prod data](#{latest_yjit_results.yjit_permalink})
Compared to previous benchmark:
* Time: #{timestr_from_ts(ts_penultimate)}
* Ruby: #{escape_markdown penultimate_yjit_ruby_desc}
* [Raw YJIT prod data](#{penultimate_yjit_results.yjit_permalink})
Slower benchmarks: #{failing_benchmarks.join(", ")}
[Timeline Graph](https://speed.yjit.org/timeline-deep##{failing_benchmarks.join("+")})
Slowdown details:
BODY_TOP

check_failures.each do |bench_hash|
# Indentation with here-docs is hard - use the old-style with extremely literal whitespace handling.
body += <<ONE_BENCH_REPORT
* #{bench_hash[:benchmark]}:
* Speed before: #{"%.2f" % bench_hash[:second_current_mean]} +/- #{"%.1f" % bench_hash[:second_current_rsd_pct]}%
* Speed after: #{"%.2f" % bench_hash[:current_mean]} +/- #{"%.1f" % bench_hash[:current_rsd_pct]}%
ONE_BENCH_REPORT
end

file_gh_issue("Benchmark at #{ts_latest} is significantly slower than the one before (#{ts_penultimate})!", body)
end

begin
run_benchmarks
rescue
host = `uname -a`.chomp
puts $!.full_message
raise "Exception in CI benchmarks: #{$!.message}!"
end
20 changes: 2 additions & 18 deletions continuous_reporting/generate_and_upload_reports.rb
@@ -36,11 +36,11 @@
},
"blog_speed_details" => {
report_type: :basic_report,
extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "tripwires.json", "csv" ],
extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "csv" ],
},
"blog_memory_details" => {
report_type: :basic_report,
extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "tripwires.json", "csv" ],
extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "csv" ],
},
"blog_yjit_stats" => {
report_type: :basic_report,
@@ -168,7 +168,6 @@ def ruby_version_from_metadata(metadata)
report_name_in_file = $1
raise "Non-matching report name with filename, #{report_name.inspect} vs #{report_name_in_file.inspect}!" unless report_name == report_name_in_file
ts = $2
-ext = $3

report_timestamps[ts] ||= {}
report_timestamps[ts][report_name] ||= []
@@ -333,21 +332,6 @@ def ruby_version_from_metadata(metadata)

puts "Static site seems to build correctly. That means that GHPages should do the right thing on push."

# Benchmark raw data was already committed. Built reports are now locally cached, not pushed.
#dirs_to_commit = [ "_benchmarks", "_includes", "reports" ]
### Commit built reports if there is something to commit
#diffs = (YJITMetrics.check_output "git status --porcelain #{dirs_to_commit.join(" ")}").chomp
#if diffs == ""
# puts "No changes found. Not committing or pushing."
#elsif no_push
# puts "Changes found, but --no-push was specified. Not committing or pushing."
#else
# puts "Changes found. Committing and pushing."
# YJITMetrics.check_call "git add #{dirs_to_commit.join(" ")}"
# YJITMetrics.check_call 'git commit -m "Update reports via continuous_reporting.rb script"'
# YJITMetrics.check_call "git push"
#end

# Copy built _site directory into raw pages repo as a new single commit, to branch new_pages
Dir.chdir GHPAGES_REPO
puts "Switched to #{Dir.pwd}"
2 changes: 1 addition & 1 deletion continuous_reporting/gh_tasks/run_benchmarks.sh
@@ -6,6 +6,6 @@ cd ~/ym/yjit-metrics

bundle

-ruby continuous_reporting/benchmark_and_update.rb --no-gh-issue --no-perf-tripwires --bench-params=$BENCH_PARAMS
+ruby continuous_reporting/benchmark_and_update.rb --no-gh-issue --bench-params=$BENCH_PARAMS

echo "Completed benchmarking successfully."