From 05b930550cadc3e3ccfb1c0a5f2c6ff0bc0faf97 Mon Sep 17 00:00:00 2001
From: "Tianle (Tim) Li" <67527391+CodingWithTim@users.noreply.github.com>
Date: Mon, 26 Aug 2024 19:49:28 -0700
Subject: [PATCH] =?UTF-8?q?Add=20Style=20Control=20to=20Chatbot=20Arena=20?=
 =?UTF-8?q?Leaderboard=20=F0=9F=94=A5=20(#3495)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: CodingWithTim
---
 fastchat/serve/monitor/add_markdown_info.py |  84 ++++++++++++
 fastchat/serve/monitor/elo_analysis.py      | 139 +++++++++++++++++++-
 2 files changed, 219 insertions(+), 4 deletions(-)
 create mode 100644 fastchat/serve/monitor/add_markdown_info.py

diff --git a/fastchat/serve/monitor/add_markdown_info.py b/fastchat/serve/monitor/add_markdown_info.py
new file mode 100644
index 000000000..f05468ff9
--- /dev/null
+++ b/fastchat/serve/monitor/add_markdown_info.py
@@ -0,0 +1,84 @@
+import pandas as pd
+import re
+import argparse
+
+from tqdm import tqdm
+
+tqdm.pandas()
+
+
+def count_markdown_elements(markdown_text, suffix):
+    counters = {
+        f"header_count{suffix}": {
+            "h1": len(re.findall(r"^#{1}\s", markdown_text, re.MULTILINE)),
+            "h2": len(re.findall(r"^#{2}\s", markdown_text, re.MULTILINE)),
+            "h3": len(re.findall(r"^#{3}\s", markdown_text, re.MULTILINE)),
+            "h4": len(re.findall(r"^#{4}\s", markdown_text, re.MULTILINE)),
+            "h5": len(re.findall(r"^#{5}\s", markdown_text, re.MULTILINE)),
+            "h6": len(re.findall(r"^#{6}\s", markdown_text, re.MULTILINE)),
+        },
+        f"list_count{suffix}": {
+            "ordered": len(re.findall(r"^\s*\d+\.\s", markdown_text, re.MULTILINE)),
+            "unordered": len(re.findall(r"^\s*[-*+]\s", markdown_text, re.MULTILINE)),
+        },
+        f"bold_count{suffix}": {
+            "**": len(re.findall(r"\*\*[^*\n]+\*\*", markdown_text)),
+            "__": len(re.findall(r"__[^_\n]+__", markdown_text)),
+        },
+    }
+    return counters
+
+
+def remove_pattern(answer, pattern):
+    blocks = pattern.findall(answer)
+    for block in blocks:
+        answer = answer.replace(block, "")
+    return answer
+
+
+def get_element_counts(df, column):
+    pattern = re.compile("```([^`]*)```")
+    answers = df[column].map(
+        lambda convo: "\n".join(
+            [turn["content"] for turn in convo if turn["role"] == "assistant"]
+        )
+    )
+    results = answers.progress_map(
+        lambda answer: count_markdown_elements(
+            remove_pattern(answer, pattern),  # strip code blocks before counting
+            suffix=column[-2:],  # "_a" or "_b"
+        )
+    )
+
+    return results.tolist()
+
+
+def add_markdown_meta(row):
+    conv_meta = {k: v for k, v in row["conv_metadata"].items()}
+    return conv_meta | row["markdown_meta_a"] | row["markdown_meta_b"]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input-file", type=str, required=True)
+    parser.add_argument("--output-file", type=str, required=True)
+    args = parser.parse_args()
+
+    print("loading file...")
+    data = pd.read_json(args.input_file)
+
+    assert "conv_metadata" in data.columns
+
+    temp = data[["question_id", "conv_metadata"]].copy()
+
+    print("Processing conversation_a")
+    temp["markdown_meta_a"] = get_element_counts(data, column="conversation_a")
+
+    print("Processing conversation_b")
+    temp["markdown_meta_b"] = get_element_counts(data, column="conversation_b")
+
+    print("Post-processing...")
+    data["conv_metadata"] = temp.apply(add_markdown_meta, axis=1)
+
+    print("Saving to file...")
+    data.to_json(args.output_file, orient="records", indent=4, force_ascii=False)
diff --git a/fastchat/serve/monitor/elo_analysis.py b/fastchat/serve/monitor/elo_analysis.py
index 4982b2f0d..4eeb53fa1 100644
--- a/fastchat/serve/monitor/elo_analysis.py
+++ b/fastchat/serve/monitor/elo_analysis.py
@@ -21,6 +21,18 @@
 pd.options.display.float_format = "{:.2f}".format
 
 
+STYLE_CONTROL_ELEMENTS_V1 = [
+    "sum_assistant_a_tokens",
+    "header_count_a",
+    "list_count_a",
+    "bold_count_a",
+    "sum_assistant_b_tokens",
+    "header_count_b",
+    "list_count_b",
+    "bold_count_b",
+]
+
+
 def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
     rating = defaultdict(lambda: INIT_RATING)
 
@@ -399,6 +411,109 @@ def outlier_detect(
     return battles
 
 
+def fit_mle_elo(X, Y, models, indices=None, SCALE=400, INIT_RATING=1000):
+    from sklearn.linear_model import LogisticRegression
+
+    p = len(models.index)
+
+    lr = LogisticRegression(fit_intercept=False)
+    if indices is not None:
+        lr.fit(X[indices], Y[indices])
+    else:
+        lr.fit(X, Y)
+
+    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
+    # calibrate mixtral-8x7b-instruct-v0.1 to 1114 if applicable
+    if "mixtral-8x7b-instruct-v0.1" in models.index:
+        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
+    return (
+        pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False),
+        lr.coef_[0][p:],
+    )
+
+
+def construct_style_matrices(
+    df,
+    BASE=10,
+    apply_ratio=[1, 1, 1, 1],
+    style_elements=STYLE_CONTROL_ELEMENTS_V1,
+    add_one=True,
+):
+    models = pd.concat([df["model_a"], df["model_b"]]).unique()
+    models = pd.Series(np.arange(len(models)), index=models)
+
+    # duplicate battles
+    df = pd.concat([df, df], ignore_index=True)
+    p = len(models.index)
+    n = df.shape[0]
+    assert len(style_elements) % 2 == 0
+    k = int(len(style_elements) / 2)
+
+    X = np.zeros([n, p + k])
+    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
+    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
+
+    # turn each of the specified "conv_metadata" columns into a vector
+    style_vector = np.array(
+        [
+            df.conv_metadata.map(
+                lambda x: x[element]
+                if type(x[element]) is int
+                else sum(x[element].values())
+            ).tolist()
+            for element in style_elements
+        ]
+    )
+
+    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
+    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)
+
+    if add_one:
+        style_sum = style_sum + np.ones(style_diff.shape)
+
+    apply_ratio = np.flatnonzero(apply_ratio)
+
+    style_diff[apply_ratio] /= style_sum[
+        apply_ratio
+    ]  # Apply ratio where necessary (length, etc)
+
+    style_mean = np.mean(style_diff, axis=1)
+    style_std = np.std(style_diff, axis=1)
+
+    X[:, -k:] = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T
+
+    # one A win => two A wins (battles are duplicated)
+    Y = np.zeros(n)
+    Y[df["winner"] == "model_a"] = 1.0
+
+    # one tie => one A win + one B win
+    # find tie + tie (both bad) index
+    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
+    tie_idx[len(tie_idx) // 2 :] = False
+    Y[tie_idx] = 1.0
+
+    return X, Y, models
+
+
+def get_bootstrap_result_style_control(X, Y, models, func_compute_elo, num_round=1000):
+    elos = []
+    coefs = []
+    for _ in tqdm(range(num_round), desc="bootstrap"):
+        indices = np.random.choice(  # X holds each battle twice; sample the originals
+            list(range(X.shape[0] // 2)), size=(X.shape[0] // 2), replace=True
+        )
+        _X = X[indices]
+        _Y = Y[indices]
+        states = ~_X[:, : len(models)].any(axis=0)  # models absent from this resample
+
+        elo, coef = func_compute_elo(_X, _Y, models=models[~states])
+        elos.append(elo)
+        coefs.append(coef)
+
+    df = pd.DataFrame(elos)
+    return df[df.median().sort_values(ascending=False).index], coefs
+
+
 def filter_long_conv(row):
     threshold = 768
     for conversation_type in ["conversation_a", "conversation_b"]:
@@ -421,6 +536,7 @@ def report_elo_analysis_results(
     run_outlier_detect=False,
     scale=1,
     filter_func=lambda x: True,
+    style_control=False,
 ):
     battles = pd.DataFrame(battles_json)
 
@@ -461,10 +577,17 @@ def report_elo_analysis_results(
     elo_rating_online = compute_elo(battles)
 
     if rating_system == "bt":
-        bootstrap_df = get_bootstrap_result(
-            battles, compute_elo_mle_with_tie, num_round=num_bootstrap
-        )
-        elo_rating_final = compute_elo_mle_with_tie(battles)
+        if style_control:
+            X, Y, models = construct_style_matrices(battles)
+            bootstrap_df, bootstrap_coef = get_bootstrap_result_style_control(
+                X, Y, models, fit_mle_elo, num_round=num_bootstrap
+            )
+            elo_rating_final, coef_final = fit_mle_elo(X, Y, models)
+        else:
+            bootstrap_df = get_bootstrap_result(
+                battles, compute_elo_mle_with_tie, num_round=num_bootstrap
+            )
+            elo_rating_final = compute_elo_mle_with_tie(battles)
     elif rating_system == "elo":
         bootstrap_df = get_bootstrap_result(
             battles, compute_elo, num_round=num_bootstrap
@@ -538,6 +661,12 @@ def report_elo_analysis_results(
         "last_updated_tstamp": last_updated_tstamp,
         "bootstrap_df": bootstrap_df,
         "leaderboard_table_df": leaderboard_table_df,
+        "style_coefficients": {
+            "bootstrap": np.vstack(bootstrap_coef),
+            "final": coef_final,
+        }
+        if rating_system == "bt" and style_control
+        else {},
     }
 
 
@@ -565,6 +694,7 @@ def pretty_print_elo_rating(rating):
     parser.add_argument("--run-outlier-detect", action="store_true", default=False)
    parser.add_argument("--category", nargs="+", default=["full"])
     parser.add_argument("--scale", type=float, default=1)
+    parser.add_argument("--style-control", action="store_true")
     args = parser.parse_args()
 
     np.random.seed(42)
@@ -602,6 +732,7 @@ def pretty_print_elo_rating(rating):
             run_outlier_detect=args.run_outlier_detect,
             scale=args.scale,
             filter_func=filter_func,
+            style_control=args.style_control,
         )
 
     for cat in args.category:
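
Reviewer note (not part of the patch): a minimal sketch of how the new style-control helpers might be driven from a Python session, assuming `battles` is a cleaned battle DataFrame whose "conv_metadata" column already carries the assistant token counts plus the markdown counts produced by add_markdown_info.py; the input file name below is hypothetical.

    import pandas as pd

    from fastchat.serve.monitor.elo_analysis import (
        construct_style_matrices,
        fit_mle_elo,
        get_bootstrap_result_style_control,
    )

    # hypothetical output of add_markdown_info.py
    battles = pd.read_json("clean_battle_with_markdown.json")

    # design matrix: one column per model plus k=4 standardized style-difference
    # features (response length in tokens, header, list, and bold counts)
    X, Y, models = construct_style_matrices(battles)

    # bootstrap the style-controlled Bradley-Terry ratings
    # (100 rounds for speed; the default is 1000)
    bootstrap_df, bootstrap_coef = get_bootstrap_result_style_control(
        X, Y, models, fit_mle_elo, num_round=100
    )

    # point estimates: per-model ratings and the 4 style coefficients
    elo_rating_final, coef_final = fit_mle_elo(X, Y, models)
    print(elo_rating_final.head())
    print(coef_final)

The same computation is exposed through elo_analysis.py via the new --style-control flag; it only takes effect when the "bt" rating system is selected.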