Add Style Control to Chatbot Arena Leaderboard 🔥 (#3495)
Co-authored-by: CodingWithTim <tim@inst-builder-debian-12-build-build-4zqb5.us-central1-a.c.gce-image-builder.internal>
CodingWithTim and CodingWithTim authored Aug 27, 2024
1 parent 282534b commit 05b9305
Showing 2 changed files with 219 additions and 4 deletions.
84 changes: 84 additions & 0 deletions fastchat/serve/monitor/add_markdown_info.py
@@ -0,0 +1,84 @@
import pandas as pd
import re
import argparse

from tqdm import tqdm

tqdm.pandas()


def count_markdown_elements(markdown_text, suffix):
    counters = {
        f"header_count{suffix}": {
            "h1": len(re.findall(r"^#{1}\s", markdown_text, re.MULTILINE)),
            "h2": len(re.findall(r"^#{2}\s", markdown_text, re.MULTILINE)),
            "h3": len(re.findall(r"^#{3}\s", markdown_text, re.MULTILINE)),
            "h4": len(re.findall(r"^#{4}\s", markdown_text, re.MULTILINE)),
            "h5": len(re.findall(r"^#{5}\s", markdown_text, re.MULTILINE)),
            "h6": len(re.findall(r"^#{6}\s", markdown_text, re.MULTILINE)),
        },
        f"list_count{suffix}": {
            "ordered": len(re.findall(r"^\s*\d+\.\s", markdown_text, re.MULTILINE)),
            "unordered": len(re.findall(r"^\s*[-*+]\s", markdown_text, re.MULTILINE)),
        },
        f"bold_count{suffix}": {
            "**": len(re.findall(r"\*\*[^*\n]+\*\*", markdown_text)),
            "__": len(re.findall(r"__[^_\n]+__", markdown_text)),
        },
    }
    return counters
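
A quick illustration of the counter output on a made-up markdown snippet (values verified against the regexes above; the suffix "_a" is just an example):

# count_markdown_elements("## Setup\n- step one\n- step two\n**note**", suffix="_a")
# -> {"header_count_a": {"h1": 0, "h2": 1, "h3": 0, "h4": 0, "h5": 0, "h6": 0},
#     "list_count_a": {"ordered": 0, "unordered": 2},
#     "bold_count_a": {"**": 1, "__": 0}}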


def remove_pattern(answer, pattern):
    blocks = pattern.findall(answer)
    for block in blocks:
        answer = answer.replace(block, "")
    return answer


def get_element_counts(df, column):
    pattern = re.compile("```([^`]*)```")
    answers = df[column].map(
        lambda convo: "\n".join(
            [turn["content"] for turn in convo if turn["role"] == "assistant"]
        )
    )
    results = answers.progress_map(
        lambda answer: count_markdown_elements(
            remove_pattern(answer, pattern),  # strip code blocks before counting
            suffix=column[-2:],  # "_a" or "_b"
        )
    )

    return results.tolist()


def add_markdown_meta(row):
    conv_meta = {k: v for k, v in row["conv_metadata"].items()}
    return conv_meta | row["markdown_meta_a"] | row["markdown_meta_b"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", type=str, required=True)
    parser.add_argument("--output-file", type=str, required=True)
    args = parser.parse_args()

    print("loading file...")
    data = pd.read_json(args.input_file)

    assert "conv_metadata" in data.columns

    temp = data[["question_id", "conv_metadata"]].copy()

    print("Processing conversation_a")
    temp["markdown_meta_a"] = get_element_counts(data, column="conversation_a")

    print("Processing conversation_b")
    temp["markdown_meta_b"] = get_element_counts(data, column="conversation_b")

    print("Post-processing...")
    data["conv_metadata"] = temp.apply(add_markdown_meta, axis=1)

    print("Saving to file...")
    data.to_json(args.output_file, orient="records", indent=4, force_ascii=False)
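
For context, a hedged usage sketch of this script; the file names are placeholders and the sample values are invented:

# Hypothetical invocation from the repository root:
#   python3 fastchat/serve/monitor/add_markdown_info.py \
#       --input-file clean_battle.json --output-file clean_battle_md.json
#
# After post-processing, each row's "conv_metadata" would additionally carry
# entries like:
#   "header_count_a": {"h1": 0, "h2": 2, "h3": 1, "h4": 0, "h5": 0, "h6": 0}
#   "list_count_a": {"ordered": 0, "unordered": 5}
#   "bold_count_a": {"**": 3, "__": 0}
# plus the matching *_b entries computed from conversation_b.
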
139 changes: 135 additions & 4 deletions fastchat/serve/monitor/elo_analysis.py
@@ -21,6 +21,18 @@
pd.options.display.float_format = "{:.2f}".format


# Style features used for style control: the first half describes answer A,
# the second half the matching features of answer B.
STYLE_CONTROL_ELEMENTS_V1 = [
    "sum_assistant_a_tokens",
    "header_count_a",
    "list_count_a",
    "bold_count_a",
    "sum_assistant_b_tokens",
    "header_count_b",
    "list_count_b",
    "bold_count_b",
]


def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
    rating = defaultdict(lambda: INIT_RATING)

@@ -399,6 +411,109 @@ def outlier_detect(
    return battles


def fit_mle_elo(X, Y, models, indices=None, SCALE=400, INIT_RATING=1000):
    from sklearn.linear_model import LogisticRegression

    p = len(models.index)

    lr = LogisticRegression(fit_intercept=False)
    if indices:
        lr.fit(X[indices], Y[indices])
    else:
        lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    # calibrate mixtral-8x7b-instruct-v0.1 to 1114 if applicable
    if "mixtral-8x7b-instruct-v0.1" in models.index:
        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
    return (
        pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False),
        lr.coef_[0][p:],
    )
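
For intuition (not part of the patch): the fitted model coefficients are mapped onto the usual Elo scale, so a rating gap translates into a predicted win probability under the same SCALE=400, BASE=10 convention used by compute_elo above. A minimal sketch:

def predicted_win_rate(elo_a, elo_b, SCALE=400, BASE=10):
    # Probability that A beats B implied by two ratings on the scale above.
    return 1 / (1 + BASE ** ((elo_b - elo_a) / SCALE))

# e.g. a 100-point rating gap implies roughly a 64% expected win rate:
# predicted_win_rate(1200.0, 1100.0) -> ~0.64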


def construct_style_matrices(
    df,
    BASE=10,
    apply_ratio=[1, 1, 1, 1],
    style_elements=STYLE_CONTROL_ELEMENTS_V1,
    add_one=True,
):
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]
    assert len(style_elements) % 2 == 0
    k = int(len(style_elements) / 2)

    X = np.zeros([n, p + k])
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

    # turn each of the specified elements in "conv_metadata" into a vector
    style_vector = np.array(
        [
            df.conv_metadata.map(
                lambda x: x[element]
                if type(x[element]) is int
                else sum(x[element].values())
            ).tolist()
            for element in style_elements
        ]
    )

    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)

    if add_one:
        style_sum = style_sum + np.ones(style_diff.shape)

    apply_ratio = np.flatnonzero(apply_ratio)

    # Apply ratio where necessary (length, etc)
    style_diff[apply_ratio] /= style_sum[apply_ratio]

    style_mean = np.mean(style_diff, axis=1)
    style_std = np.std(style_diff, axis=1)

    X[:, -k:] = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T

    # one A win => two A win
    Y = np.zeros(n)
    Y[df["winner"] == "model_a"] = 1.0

    # one tie => one A win + one B win
    # find tie + tie (both bad) index
    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
    tie_idx[len(tie_idx) // 2 :] = False
    Y[tie_idx] = 1.0

    return X, Y, models
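
A minimal sketch of what this produces, using a made-up two-battle DataFrame; the model names, winners, and metadata values are all invented for illustration:

# Illustrative only -- two fabricated battles with hand-written conv_metadata.
meta_1 = {
    "sum_assistant_a_tokens": 120,
    "header_count_a": {"h2": 1},
    "list_count_a": {"unordered": 3},
    "bold_count_a": {"**": 2},
    "sum_assistant_b_tokens": 80,
    "header_count_b": {},
    "list_count_b": {},
    "bold_count_b": {},
}
meta_2 = {
    "sum_assistant_a_tokens": 60,
    "header_count_a": {},
    "list_count_a": {},
    "bold_count_a": {},
    "sum_assistant_b_tokens": 90,
    "header_count_b": {},
    "list_count_b": {"unordered": 2},
    "bold_count_b": {},
}
toy = pd.DataFrame(
    {
        "model_a": ["model-x", "model-y"],
        "model_b": ["model-y", "model-x"],
        "winner": ["model_a", "tie"],
        "conv_metadata": [meta_1, meta_2],
    }
)
X, Y, models = construct_style_matrices(toy)
# X has shape (4, 2 + 4): the battles are duplicated, the first two columns hold
# the +/- log(10) model indicators, and the last four columns hold the normalized
# style differences (token length, headers, lists, bold markers).
# Y is [1, 1, 1, 0]: the model_a win counts once per copy, while the tie becomes
# one A win in the first copy and one B win in the duplicate.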


def get_bootstrap_result_style_control(X, Y, models, func_compute_elo, num_round=1000):
    elos = []
    coefs = []
    for _ in tqdm(range(num_round), desc="bootstrap"):
        indices = np.random.choice(
            list(range(len(Y))), size=(len(Y)), replace=True
        )
        _X = X[indices]
        _Y = Y[indices]
        # drop models that do not appear in this resample
        states = ~_X[:, : len(models)].any(axis=0)

        elo, coef = func_compute_elo(_X, _Y, models=models[~states])
        elos.append(elo)
        coefs.append(coef)

    df = pd.DataFrame(elos)
    return df[df.median().sort_values(ascending=False).index], coefs
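
The second return value collects the per-round style coefficients. A small illustrative helper (not part of the patch) for reading the coefficients returned by fit_mle_elo, assuming the default style elements:

def print_style_coefficients(coef, style_elements=STYLE_CONTROL_ELEMENTS_V1):
    # Each coefficient corresponds to one style feature; the "A-side" element
    # names double as labels since both sides share the same feature.
    k = len(style_elements) // 2
    for name, value in zip(style_elements[:k], coef[:k]):
        print(f"{name}: {value:+.3f}")

A positive coefficient means the feature (longer answers, more headers, lists, or bold text) is associated with winning, and its contribution is held out of the model ratings.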


def filter_long_conv(row):
    threshold = 768
    for conversation_type in ["conversation_a", "conversation_b"]:
@@ -421,6 +536,7 @@ def report_elo_analysis_results(
    run_outlier_detect=False,
    scale=1,
    filter_func=lambda x: True,
    style_control=False,
):
    battles = pd.DataFrame(battles_json)

@@ -461,10 +577,17 @@
    elo_rating_online = compute_elo(battles)

    if rating_system == "bt":
        bootstrap_df = get_bootstrap_result(
            battles, compute_elo_mle_with_tie, num_round=num_bootstrap
        )
        elo_rating_final = compute_elo_mle_with_tie(battles)
        if style_control:
            X, Y, models = construct_style_matrices(battles)
            bootstrap_df, bootstrap_coef = get_bootstrap_result_style_control(
                X, Y, models, fit_mle_elo, num_round=num_bootstrap
            )
            elo_rating_final, coef_final = fit_mle_elo(X, Y, models)
        else:
            bootstrap_df = get_bootstrap_result(
                battles, compute_elo_mle_with_tie, num_round=num_bootstrap
            )
            elo_rating_final = compute_elo_mle_with_tie(battles)
    elif rating_system == "elo":
        bootstrap_df = get_bootstrap_result(
            battles, compute_elo, num_round=num_bootstrap
@@ -538,6 +661,12 @@
"last_updated_tstamp": last_updated_tstamp,
"bootstrap_df": bootstrap_df,
"leaderboard_table_df": leaderboard_table_df,
"style_coefficients": {
"bootstrap": np.vstack(boostrap_coef),
"final": coef_final,
}
if rating_system == "bt" and style_control
else {},
}


@@ -565,6 +694,7 @@ def pretty_print_elo_rating(rating):
parser.add_argument("--run-outlier-detect", action="store_true", default=False)
parser.add_argument("--category", nargs="+", default=["full"])
parser.add_argument("--scale", type=float, default=1)
parser.add_argument("--style-control", action="store_true")
args = parser.parse_args()

np.random.seed(42)
@@ -602,6 +732,7 @@ def pretty_print_elo_rating(rating):
            run_outlier_detect=args.run_outlier_detect,
            scale=args.scale,
            filter_func=filter_func,
            style_control=args.style_control,
        )

    for cat in args.category:
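
For reference, a hedged invocation sketch: --style-control is the new flag, while --clean-battle-file and --rating-system are assumed to be the script's pre-existing arguments (they are not shown in this diff), and the input path is a placeholder produced by add_markdown_info.py above:

# python3 fastchat/serve/monitor/elo_analysis.py \
#     --clean-battle-file clean_battle_md.json \
#     --rating-system bt --style-control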
