#!/usr/bin/env python3
# Compare results generated by `blimp-test --perf-report`. For each test common
# to both runs, we report the ratio of the average times and flag likely
# performance regressions.
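#
# Typical usage (the file names here are just examples):
#
#   ./perf_compare.py baseline.csv new.csv
#
# Each report is expected to be a CSV with a header row followed by one row per
# benchmark containing the columns unpacked by Result below: group, test,
# benchmark, iter, ops, cycles_per_ns, avg, std_dev, min, max.
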
import argparse
import csv
from functools import reduce
import math
import itertools


class Result(object):
    def __init__(self, line):
        (self.group, self.test, self.benchmark, self.iter, self.ops,
         self.cycles_per_ns, self.avg, self.std_dev, self.min, self.max) = line
        self.iter = int(self.iter)
        self.ops = int(self.ops)
        self.cycles_per_ns = float(self.cycles_per_ns)
        self.avg = float(self.avg)
        self.std_dev = float(self.std_dev)
        self.min = float(self.min)
        self.max = float(self.max)

        self.key = (self.group, self.test, self.benchmark)


def read_report(path):
    with open(path, "r") as f:
        rows = csv.reader(f)
        # Drop the first line, which is a header.
        next(rows)
        return {result.key: result
                for result in (Result(row) for row in rows)}


def is_regression(r1, r2):
    # Some of the benchmarks are multimodal, which results in a very large
    # standard deviation that is not representative of the true uncertainty of
    # the benchmark. For these benchmarks, we use the average of the averages as
    # a tolerance. This method is imprecise and not statistically meaningful. A
    # better approach would be to get an actual distribution, try to find the
    # modes, and compare them.
    tolerance1 = 1.5*r1.std_dev if r1.std_dev < r1.avg else r1.avg/2
    tolerance2 = 1.5*r2.std_dev if r2.std_dev < r2.avg else r2.avg/2
    return (r2.avg - tolerance2) > (r1.avg + tolerance1)
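
# A hypothetical worked example of the heuristic above (all numbers are made
# up): if the baseline has avg=100 and std_dev=10, its tolerance is 15; if the
# new run has avg=130 and std_dev=8, its tolerance is 12. Since
# 130 - 12 = 118 > 100 + 15 = 115, the test is flagged as a regression. A new
# run with avg=112 and std_dev=8 would not be flagged.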


def ratio_dev(r1, r2):
    assert r1.avg != 0
    assert r2.avg != 0
    ret = (r2.avg/r1.avg)*math.sqrt((r1.std_dev/r1.avg)**2 + (r2.std_dev/r2.avg)**2)
    if math.isnan(ret):
        print((r1.avg, r1.std_dev, r2.avg, r2.std_dev))
    return ret
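
# The expression in ratio_dev is standard first-order error propagation for a
# quotient, assuming the two runs are independent:
#
#   std_dev(avg2/avg1) ~= (avg2/avg1) * sqrt((std_dev1/avg1)**2 + (std_dev2/avg2)**2)
#
# so the reported uncertainty combines the relative spread of both runs.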


def compare_results(r1, r2):
    def rmax(field):
        result_max = max(len(getattr(r, field))
                         for r in itertools.chain(r1.values(), r2.values()))
        special_max = max(map(len, ["Geomean"]))
        return max(result_max, special_max)

    fmt_params = {
        "group_len": rmax("group"),
        "test_len": rmax("test"),
        "benchmark_len": rmax("benchmark"),
    }

    print(" {group:<{group_len}} {test:<{test_len}} {benchmark:<{benchmark_len}} Ratio (lower is better)".format(
        group="Group", test="Test", benchmark="Benchmark", **fmt_params))
    print("-"*80)

    fmt = "{regression}{group:<{group_len}} {test:<{test_len}} {benchmark:<{benchmark_len}} {ratio:3.3f} +- {std_dev:2.3f}"

    # Find all the tests which have results in both result sets, and compute the
    # ratio of the average times for each (lower is better).
    compared = [(r, r2[key], r2[key].avg/r.avg)
                for key, r in r1.items() if key in r2]

    # Sort from worst to best.
    compared.sort(key=lambda c: (c[2], c[0].key), reverse=True)

    # Print the results.
    for result1, result2, ratio in compared:
        print(fmt.format(
            group=result1.group,
            test=result1.test,
            benchmark=result1.benchmark,
            ratio=ratio,
            std_dev=ratio_dev(result1, result2),
            regression="*" if is_regression(result1, result2) else " ",
            **fmt_params))

    print("-"*80)
    geomean = reduce(lambda x, y: x*y, (c[2] for c in compared))**(1.0/len(compared))
    print(fmt.format(
        group="Geomean",
        test="",
        benchmark="",
        ratio=geomean,
        std_dev=float("nan"),
        regression=" ",
        **fmt_params))
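
# For reference, the table printed above looks roughly like this (the names and
# values are made up, and a leading '*' marks a suspected regression):
#
#   *core   parse   large_input   1.312 +- 0.041
#    core   eval    small_input   0.974 +- 0.012
#
# The final Geomean row is the geometric mean of all the ratios, so, e.g.,
# ratios of 0.8 and 1.25 cancel out to 1.0.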


def main():
    parser = argparse.ArgumentParser(
        description="compare performance reports generated by blimp-test")
    parser.add_argument("baseline", help="the baseline CSV to compare against")
    parser.add_argument("report", help="the new CSV to compare")
    args = parser.parse_args()

    compare_results(read_report(args.baseline), read_report(args.report))


if __name__ == "__main__":
    main()