Remove unused code (mostly around perf tripwires) #376

Merged: 4 commits, Jan 14, 2025
273 changes: 0 additions & 273 deletions continuous_reporting/benchmark_and_update.rb
@@ -9,34 +9,6 @@
require "optparse"
require 'rbconfig'

# TODO: should the benchmark-run and perf-check parts of this script be separated? Probably.

# This is intended to be the top-level script for running benchmarks, reporting on them
# and checking performance.

# The tolerances below are for detecting big drops in benchmark performance. So far I haven't
# had good luck with that -- it's either too sensitive and files spurious bugs a lot, or it's
# too insensitive and doesn't notice real bugs. Right now, auto-filing bugs is turned off.

# The STDDEV_TOLERANCE is what multiple of the standard deviation it's okay to drop on
# a given run. That effectively determines the false-positive rate since we're comparing samples
# from a Gaussian-ish distribution.
NORMAL_STDDEV_TOLERANCE = 1.5
# The DROP_TOLERANCE is what absolute multiple-of-the-mean drop (e.g. 0.05 means 5%) is
# assumed to be okay. For each metric we use the more permissive of the two tolerances
# on the more permissive of the two mean values. All four must be outside of tolerance
# for us to count a failure.
NORMAL_DROP_TOLERANCE = 0.07
# A microbenchmark will normally have a very small stddev from run to run. That means
# it's actually *less* tolerant of noise on the host, since "twice the stddev" is a
# significantly smaller absolute value.
MICRO_STDDEV_TOLERANCE = 2.0
# A microbenchmark will routinely show persistent speed changes of much larger magnitude
# than a larger, more varied benchmark. For example, setivar is surprisingly prone to
# large swings in time taken depending on tiny changes to memory layout. So the drop
# tolerance needs to be significantly larger to avoid frequent false positives.
MICRO_DROP_TOLERANCE = 0.20
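
(For context, not part of the diff: a rough numeric sketch of how these tolerances were combined further down in this same, now-removed script. All values are invented for illustration.)

# Illustrative only: a non-micro benchmark, means in ms, stddevs derived from rsd_pct.
current_mean    = 103.0   # latest run
compared_mean   = 100.0   # previous run
current_stddev  = 3.1     # roughly 3% relative stddev of 103.0
compared_stddev = 3.0

# Take the most permissive of the four bounds, as the comments above describe.
tolerance = [
  current_stddev  * NORMAL_STDDEV_TOLERANCE,   # 4.65
  compared_stddev * NORMAL_STDDEV_TOLERANCE,   # 4.50
  current_mean    * NORMAL_DROP_TOLERANCE,     # 7.21
  compared_mean   * NORMAL_DROP_TOLERANCE,     # 7.00
].max                                          # => 7.21

drop = current_mean - compared_mean            # => 3.0
drop > tolerance                               # => false: within tolerance, nothing is flagged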

BUILT_REPORTS_ROOT = YJITMetrics::ContinuousReporting::BUILT_REPORTS_ROOT
RAW_BENCHMARK_ROOT = YJITMetrics::ContinuousReporting::RAW_BENCHMARK_ROOT

@@ -91,11 +63,6 @@ def args_for_bench_type(bench_type_arg)

benchmark_args = nil
bench_params_file = nil
should_file_gh_issue = true
no_perf_tripwires = false
all_perf_tripwires = false
single_perf_tripwire = nil
is_verbose = false
data_dir = nil
full_rebuild = nil

@@ -110,33 +77,6 @@ def args_for_bench_type(bench_type_arg)
benchmark_args = args_for_bench_type(btype)
end

# NOTE: GH issue filing is currently disabled
opts.on("-g", "--no-gh-issue", "Do not file an actual GitHub issue, only print failures to console") do
should_file_gh_issue = false
end

opts.on("-a", "--all-perf-tripwires", "Check performance tripwires on all pairs of benchmarks (implies --no-gh-issue)") do
if no_perf_tripwires || single_perf_tripwire
raise "Error! Please do not specify --all-perf-tripwires along with --no-perf-tripwires or --single-perf-tripwire!"
end
all_perf_tripwires = true
should_file_gh_issue = false
end

opts.on("-t TS", "--perf-timestamp TIMESTAMP", "Check performance tripwire at this specific timestamp") do |ts|
if all_perf_tripwires || no_perf_tripwires
raise "Error! Please do not specify a specific perf tripwire along with --no-perf-tripwires or --all-perf-tripwires!"
end
single_perf_tripwire = ts.strip
end

opts.on("-np", "--no-perf-tripwires", "Do not check performance tripwires") do
if all_perf_tripwires || single_perf_tripwire
raise "Error! Please do not specify --no-perf-tripwires along with --single-perf-tripwire or --all-perf-tripwires!"
end
no_perf_tripwires = true
end

opts.on("-fr YN", "--full-rebuild YN") do |fr|
if fr.nil? || fr.strip == ""
full_rebuild = true
@@ -154,51 +94,17 @@ def args_for_bench_type(bench_type_arg)
raise("No such directory: #{ddir.inspect}!") unless File.directory?(ddir)
data_dir = ddir
end

opts.on("-v", "--verbose", "Print verbose output about tripwire checks") do
is_verbose = true
end
end.parse!

bench_params_data = bench_params_file ? JSON.parse(File.read bench_params_file) : {}

BENCHMARK_ARGS = benchmark_args || (bench_params_data["bench_type"] ? args_for_bench_type(bench_params_data["bench_type"]) : BENCH_TYPES["default"])
FULL_REBUILD = !full_rebuild.nil? ? full_rebuild : (bench_params_data["full_rebuild"] || false)
FILE_GH_ISSUE = should_file_gh_issue
ALL_PERF_TRIPWIRES = all_perf_tripwires
SINGLE_PERF_TRIPWIRE = single_perf_tripwire
VERBOSE = is_verbose
BENCH_PARAMS_FILE = bench_params_file
DATA_DIR = File.expand_path(data_dir || bench_params_data["data_directory"] || "continuous_reporting/data")

PIDFILE = "/home/ubuntu/benchmark_ci.pid"

GITHUB_TOKEN=ENV["BENCHMARK_CI_GITHUB_TOKEN"]
if FILE_GH_ISSUE && !GITHUB_TOKEN
raise "Set BENCHMARK_CI_GITHUB_TOKEN to an appropriate GitHub token to allow auto-filing issues! (or set --no-gh-issue)"
end

def ghapi_post(api_uri, params, verb: :post)
uri = URI("https://api.github.com" + api_uri)

req = Net::HTTP::Post.new(uri)
#req.basic_auth GITHUB_USER, GITHUB_TOKEN
req['Accept'] = "application/vnd.github.v3+json"
req['Content-Type'] = "application/json"
req['Authorization'] = "Bearer #{GITHUB_TOKEN}"
req.body = JSON.dump(params)
result = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { |http| http.request(req) }

unless result.is_a?(Net::HTTPSuccess)
$stderr.puts "Error in HTTP #{verb.upcase}: #{result.inspect}"
$stderr.puts result.body
$stderr.puts "------"
raise "HTTP error when posting to #{api_uri}!"
end

JSON.load(result.body)
end

class BenchmarkDetails
def initialize(timestamp)
@timestamp = timestamp
@@ -227,32 +133,6 @@ def escape_markdown(s)
s.gsub(/(\*|\_|\`)/) { '\\' + $1 }.gsub("<", "&lt;")
end

def file_gh_issue(title, message)
host = `uname -a`.chomp
issue_body = <<~ISSUE
Error running benchmark CI job on #{host}:

#{message}
ISSUE

unless FILE_GH_ISSUE
print "We would file a GitHub issue, but we were asked not to. Details:\n\n"

print "==============================\n"
print "Title: CI Benchmarking: #{title}\n"
puts issue_body
print "==============================\n"
return
end

# Note: if you're set up as the GitHub user, it's not gonna email you since it thinks you filed it yourself.
ghapi_post "/repos/Shopify/yjit-metrics/issues",
{
"title" => "CI Benchmarking: #{title}",
"body" => issue_body
}
end

if File.exist?(PIDFILE)
pid = File.read(PIDFILE).to_i
if pid && pid > 0
@@ -280,10 +160,6 @@ def run_benchmarks
end
end

def ts_from_tripwire_filename(filename)
filename.split("blog_speed_details_")[1].split(".")[0]
end
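
(For reference, a tripwire filename this parsed looked like the hypothetical one below; the date-plus-six-digit-time suffix is what the glob and regex later in this removed code match on.)

# Hypothetical filename following the blog_speed_details_<YYYY-MM-DD>-<HHMMSS> pattern:
ts_from_tripwire_filename("blog_speed_details_2025-01-13-153000.tripwires.json")
# => "2025-01-13-153000"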

def timestr_from_ts(ts)
if ts =~ /\A(\d{4}-\d{2}-\d{2})-(\d{6})\Z/
year_month_day = $1
@@ -297,158 +173,9 @@ def timestr_from_ts(ts)
end
end

# If benchmark results drop noticeably, file a Github issue
def check_perf_tripwires
Dir.chdir(YM_REPORT_DIR) do
# Grab only non-platform-specific tripwire files that do *not* have a platform name in them,
# but instead end in a six-digit timestamp.
tripwire_files = Dir["*.tripwires.json"].to_a.select {|f| f =~ /\d{6}\.tripwires\.json\Z/}.sort

if ALL_PERF_TRIPWIRES
(tripwire_files.size - 1).times do |index|
check_one_perf_tripwire(tripwire_files[index], tripwire_files[index - 1])
end
elsif SINGLE_PERF_TRIPWIRE
specified_file = tripwire_files.detect { |f| f.include?(SINGLE_PERF_TRIPWIRE) }
raise "Couldn't find perf tripwire report containing #{SINGLE_PERF_TRIPWIRE.inspect}!" unless specified_file

specified_index = tripwire_files.index(specified_file)
raise "Can't check perf on the very first report!" if specified_index == 0

check_one_perf_tripwire(tripwire_files[specified_index], tripwire_files[specified_index - 1])
else
check_one_perf_tripwire(tripwire_files[-1], tripwire_files[-2])
end
end
end
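
(Condensed paraphrase, not new behavior: by default the removed check compared only the newest non-platform tripwire report against the one just before it.)

Dir.chdir(YM_REPORT_DIR) do
  # Non-platform-specific files ending in a six-digit timestamp, sorted oldest first.
  files = Dir["*.tripwires.json"].select { |f| f =~ /\d{6}\.tripwires\.json\Z/ }.sort
  check_one_perf_tripwire(files[-1], files[-2])
end

(Also worth noting: in the --all-perf-tripwires branch the loop starts at index 0, so tripwire_files[index - 1] wraps around to the newest file on the first iteration, which looks unintended.)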

def check_one_perf_tripwire(current_filename, compared_filename, verbose: VERBOSE)
YJITMetrics::PLATFORMS.each do |platform|
platform_current_filename = current_filename.gsub(".tripwires.json", ".#{platform}.tripwires.json")
end

end
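
(The method above builds platform_current_filename but never uses it, so the per-platform check was effectively a no-op, presumably part of why this code is being removed. A speculative sketch of what the intended body might have been follows; the File.exist? guard and the compared-filename substitution are assumptions, not code from this repo.)

def check_one_perf_tripwire(current_filename, compared_filename, verbose: VERBOSE)
  YJITMetrics::PLATFORMS.each do |platform|
    platform_current  = current_filename.gsub(".tripwires.json", ".#{platform}.tripwires.json")
    platform_compared = compared_filename.gsub(".tripwires.json", ".#{platform}.tripwires.json")
    next unless File.exist?(platform_current) && File.exist?(platform_compared)

    check_one_perf_tripwire_file(platform_current, platform_compared, verbose: verbose)
  end
end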

def check_one_perf_tripwire_file(current_filename, compared_filename, verbose: VERBOSE)
current_data = JSON.parse File.read(current_filename)
compared_data = JSON.parse File.read(compared_filename)

check_failures = []

compared_data.each do |bench_name, values|
# Only compare if both sets of data have the benchmark
next unless current_data[bench_name]

current_mean = current_data[bench_name]["mean"]
current_rsd_pct = current_data[bench_name]["rsd_pct"]
compared_mean = values["mean"]
compared_rsd_pct = values["rsd_pct"]

current_stddev = (current_rsd_pct.to_f / 100.0) * current_mean
compared_stddev = (compared_rsd_pct.to_f / 100.0) * compared_mean

# Normally is_micro should be the same in all cases for any one specific benchmark.
# So we just assume the most recent data has the correct value.
is_micro = current_data[bench_name]["micro"]

if is_micro
bench_stddev_tolerance = MICRO_STDDEV_TOLERANCE
bench_drop_tolerance = MICRO_DROP_TOLERANCE
else
bench_stddev_tolerance = NORMAL_STDDEV_TOLERANCE
bench_drop_tolerance = NORMAL_DROP_TOLERANCE
end

# Occasionally stddev can change pretty wildly from run to run. Take the most tolerant of multiple-of-recent-stddev,
# or a percentage of the larger mean runtime. Basically, a drop must be unusual enough (stddev) and large enough (mean)
# for us to flag it.
tolerance = [ current_stddev * bench_stddev_tolerance, compared_stddev * bench_stddev_tolerance,
current_mean * bench_drop_tolerance, compared_mean * bench_drop_tolerance ].max

drop = current_mean - compared_mean

if verbose
puts "#{is_micro ? "Microbenchmark" : "Benchmark"} #{bench_name}, tolerance is #{ "%.2f" % tolerance }, latest mean is #{ "%.2f" % current_mean } (stddev #{"%.2f" % current_stddev}), " +
"next-latest mean is #{ "%.2f" % compared_mean } (stddev #{ "%.2f" % compared_stddev}), drop is #{ "%.2f" % drop }..."
end

if drop > tolerance
puts "Benchmark #{bench_name} marked as failure!" if verbose
check_failures.push({
benchmark: bench_name,
current_mean: current_mean,
second_current_mean: compared_mean,
current_stddev: current_stddev,
current_rsd_pct: current_rsd_pct,
second_current_stddev: compared_stddev,
second_current_rsd_pct: compared_rsd_pct,
})
end
end

if check_failures.empty?
puts "No benchmarks failing performance tripwire (#{current_filename})"
return
end

puts "Failing benchmarks (#{current_filename}): #{check_failures.map { |h| h[:benchmark] }}"
file_perf_bug(current_filename, compared_filename, check_failures)
end

def file_perf_bug(current_filename, compared_filename, check_failures)
ts_latest = ts_from_tripwire_filename(current_filename)
ts_penultimate = ts_from_tripwire_filename(compared_filename)

# This expects to be called from the _includes/reports directory
latest_yjit_results = BenchmarkDetails.new(ts_latest)
latest_yjit_result_file = latest_yjit_results.yjit_test_result
penultimate_yjit_results = BenchmarkDetails.new(ts_penultimate)
penultimate_yjit_result_file = penultimate_yjit_results.yjit_test_result
latest_yjit_data = JSON.parse File.read(latest_yjit_result_file)
penultimate_yjit_data = JSON.parse File.read(penultimate_yjit_result_file)
latest_yjit_ruby_desc = latest_yjit_data["ruby_metadata"]["RUBY_DESCRIPTION"]
penultimate_yjit_ruby_desc = penultimate_yjit_data["ruby_metadata"]["RUBY_DESCRIPTION"]

failing_benchmarks = check_failures.map { |h| h[:benchmark] }

puts "Filing Github issue - slower benchmark(s) found."
body = <<~BODY_TOP
Latest failing benchmark:

* Time: #{timestr_from_ts(ts_latest)}
* Ruby: #{escape_markdown latest_yjit_ruby_desc}
* [Raw YJIT prod data](#{latest_yjit_results.yjit_permalink})

Compared to previous benchmark:

* Time: #{timestr_from_ts(ts_penultimate)}
* Ruby: #{escape_markdown penultimate_yjit_ruby_desc}
* [Raw YJIT prod data](#{penultimate_yjit_results.yjit_permalink})

Slower benchmarks: #{failing_benchmarks.join(", ")}

[Timeline Graph](https://speed.yjit.org/timeline-deep##{failing_benchmarks.join("+")})

Slowdown details:

BODY_TOP

check_failures.each do |bench_hash|
# Indentation with here-docs is hard - use the old-style with extremely literal whitespace handling.
body += <<ONE_BENCH_REPORT
* #{bench_hash[:benchmark]}:
* Speed before: #{"%.2f" % bench_hash[:second_current_mean]} +/- #{"%.1f" % bench_hash[:second_current_rsd_pct]}%
* Speed after: #{"%.2f" % bench_hash[:current_mean]} +/- #{"%.1f" % bench_hash[:current_rsd_pct]}%
ONE_BENCH_REPORT
end

file_gh_issue("Benchmark at #{ts_latest} is significantly slower than the one before (#{ts_penultimate})!", body)
end

begin
run_benchmarks
rescue
host = `uname -a`.chomp
puts $!.full_message
raise "Exception in CI benchmarks: #{$!.message}!"
end
20 changes: 2 additions & 18 deletions continuous_reporting/generate_and_upload_reports.rb
@@ -36,11 +36,6 @@
},
"blog_speed_details" => {
report_type: :basic_report,
extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "tripwires.json", "csv" ],
extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "csv" ],
},
"blog_memory_details" => {
report_type: :basic_report,
extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "tripwires.json", "csv" ],
extensions: [ "html", "svg", "head.svg", "back.svg", "micro.svg", "csv" ],
},
"blog_yjit_stats" => {
report_type: :basic_report,
@@ -168,7 +168,6 @@ def ruby_version_from_metadata(metadata)
report_name_in_file = $1
raise "Non-matching report name with filename, #{report_name.inspect} vs #{report_name_in_file.inspect}!" unless report_name == report_name_in_file
ts = $2
ext = $3

report_timestamps[ts] ||= {}
report_timestamps[ts][report_name] ||= []
@@ -333,21 +332,6 @@ def ruby_version_from_metadata(metadata)

puts "Static site seems to build correctly. That means that GHPages should do the right thing on push."

# Benchmark raw data was already committed. Built reports are now locally cached, not pushed.
#dirs_to_commit = [ "_benchmarks", "_includes", "reports" ]
### Commit built reports if there is something to commit
#diffs = (YJITMetrics.check_output "git status --porcelain #{dirs_to_commit.join(" ")}").chomp
#if diffs == ""
# puts "No changes found. Not committing or pushing."
#elsif no_push
# puts "Changes found, but --no-push was specified. Not committing or pushing."
#else
# puts "Changes found. Committing and pushing."
# YJITMetrics.check_call "git add #{dirs_to_commit.join(" ")}"
# YJITMetrics.check_call 'git commit -m "Update reports via continuous_reporting.rb script"'
# YJITMetrics.check_call "git push"
#end

# Copy built _site directory into raw pages repo as a new single commit, to branch new_pages
Dir.chdir GHPAGES_REPO
puts "Switched to #{Dir.pwd}"
2 changes: 1 addition & 1 deletion continuous_reporting/gh_tasks/run_benchmarks.sh
@@ -6,6 +6,6 @@ cd ~/ym/yjit-metrics

bundle

ruby continuous_reporting/benchmark_and_update.rb --no-gh-issue --no-perf-tripwires --bench-params=$BENCH_PARAMS
ruby continuous_reporting/benchmark_and_update.rb --no-gh-issue --bench-params=$BENCH_PARAMS

echo "Completed benchmarking successfully."