Add Style Control to Chatbot Arena Leaderboard 🔥 (#3495)
Co-authored-by: CodingWithTim <tim@inst-builder-debian-12-build-build-4zqb5.us-central1-a.c.gce-image-builder.internal>
CodingWithTim and CodingWithTim authored Aug 27, 2024
1 parent 282534b commit 05b9305
Showing 2 changed files with 219 additions and 4 deletions.
84 changes: 84 additions & 0 deletions fastchat/serve/monitor/add_markdown_info.py
@@ -0,0 +1,84 @@
import pandas as pd
import re
import argparse

from tqdm import tqdm

tqdm.pandas()


def count_markdown_elements(markdown_text, suffix):
    counters = {
        f"header_count{suffix}": {
            "h1": len(re.findall(r"^#{1}\s", markdown_text, re.MULTILINE)),
            "h2": len(re.findall(r"^#{2}\s", markdown_text, re.MULTILINE)),
            "h3": len(re.findall(r"^#{3}\s", markdown_text, re.MULTILINE)),
            "h4": len(re.findall(r"^#{4}\s", markdown_text, re.MULTILINE)),
            "h5": len(re.findall(r"^#{5}\s", markdown_text, re.MULTILINE)),
            "h6": len(re.findall(r"^#{6}\s", markdown_text, re.MULTILINE)),
        },
        f"list_count{suffix}": {
            "ordered": len(re.findall(r"^\s*\d+\.\s", markdown_text, re.MULTILINE)),
            "unordered": len(re.findall(r"^\s*[-*+]\s", markdown_text, re.MULTILINE)),
        },
        f"bold_count{suffix}": {
            "**": len(re.findall(r"\*\*[^*\n]+\*\*", markdown_text)),
            "__": len(re.findall(r"__[^_\n]+__", markdown_text)),
        },
    }
    return counters
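
A quick illustration of the counter output on a made-up markdown snippet (values verified against the regexes above; the suffix "_a" is just an example):

# count_markdown_elements("## Setup\n- step one\n- step two\n**note**", suffix="_a")
# -> {"header_count_a": {"h1": 0, "h2": 1, "h3": 0, "h4": 0, "h5": 0, "h6": 0},
#     "list_count_a": {"ordered": 0, "unordered": 2},
#     "bold_count_a": {"**": 1, "__": 0}}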


def remove_pattern(answer, pattern):
    blocks = pattern.findall(answer)
    for block in blocks:
        answer = answer.replace(block, "")
    return answer


def get_element_counts(df, column):
    pattern = re.compile("```([^`]*)```")
    answers = df[column].map(
        lambda convo: "\n".join(
            [turn["content"] for turn in convo if turn["role"] == "assistant"]
        )
    )
    results = answers.progress_map(
        lambda answer: count_markdown_elements(
            remove_pattern(answer, pattern),  # strip code blocks before counting
            suffix=column[-2:],  # "_a" or "_b"
        )
    )

    return results.tolist()


def add_markdown_meta(row):
    conv_meta = {k: v for k, v in row["conv_metadata"].items()}
    return conv_meta | row["markdown_meta_a"] | row["markdown_meta_b"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", type=str, required=True)
    parser.add_argument("--output-file", type=str, required=True)
    args = parser.parse_args()

    print("loading file...")
    data = pd.read_json(args.input_file)

    assert "conv_metadata" in data.columns

    temp = data[["question_id", "conv_metadata"]].copy()

    print("Processing conversation_a")
    temp["markdown_meta_a"] = get_element_counts(data, column="conversation_a")

    print("Processing conversation_b")
    temp["markdown_meta_b"] = get_element_counts(data, column="conversation_b")

    print("Post-processing...")
    data["conv_metadata"] = temp.apply(add_markdown_meta, axis=1)

    print("Saving to file...")
    data.to_json(args.output_file, orient="records", indent=4, force_ascii=False)
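
For context, a hedged usage sketch of this script; the file names are placeholders and the sample values are invented:

# Hypothetical invocation from the repository root:
#   python3 fastchat/serve/monitor/add_markdown_info.py \
#       --input-file clean_battle.json --output-file clean_battle_md.json
#
# After post-processing, each row's "conv_metadata" would additionally carry
# entries like:
#   "header_count_a": {"h1": 0, "h2": 2, "h3": 1, "h4": 0, "h5": 0, "h6": 0}
#   "list_count_a": {"ordered": 0, "unordered": 5}
#   "bold_count_a": {"**": 3, "__": 0}
# plus the matching *_b entries computed from conversation_b.
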
139 changes: 135 additions & 4 deletions fastchat/serve/monitor/elo_analysis.py
@@ -21,6 +21,18 @@
pd.options.display.float_format = "{:.2f}".format


# Style features used for style control: the first half describes answer A,
# the second half the matching features of answer B.
STYLE_CONTROL_ELEMENTS_V1 = [
    "sum_assistant_a_tokens",
    "header_count_a",
    "list_count_a",
    "bold_count_a",
    "sum_assistant_b_tokens",
    "header_count_b",
    "list_count_b",
    "bold_count_b",
]


def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
    rating = defaultdict(lambda: INIT_RATING)

@@ -399,6 +411,109 @@ def outlier_detect(
    return battles


def fit_mle_elo(X, Y, models, indices=None, SCALE=400, INIT_RATING=1000):
    from sklearn.linear_model import LogisticRegression

    p = len(models.index)

    lr = LogisticRegression(fit_intercept=False)
    if indices:
        lr.fit(X[indices], Y[indices])
    else:
        lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    # calibrate mixtral-8x7b-instruct-v0.1 to 1114 if applicable
    if "mixtral-8x7b-instruct-v0.1" in models.index:
        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
    return (
        pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False),
        lr.coef_[0][p:],
    )
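
For intuition (not part of the patch): the fitted model coefficients are mapped onto the usual Elo scale, so a rating gap translates into a predicted win probability under the same SCALE=400, BASE=10 convention used by compute_elo above. A minimal sketch:

def predicted_win_rate(elo_a, elo_b, SCALE=400, BASE=10):
    # Probability that A beats B implied by two ratings on the scale above.
    return 1 / (1 + BASE ** ((elo_b - elo_a) / SCALE))

# e.g. a 100-point rating gap implies roughly a 64% expected win rate:
# predicted_win_rate(1200.0, 1100.0) -> ~0.64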


def construct_style_matrices(
    df,
    BASE=10,
    apply_ratio=[1, 1, 1, 1],
    style_elements=STYLE_CONTROL_ELEMENTS_V1,
    add_one=True,
):
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]
    assert len(style_elements) % 2 == 0
    k = int(len(style_elements) / 2)

    X = np.zeros([n, p + k])
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

    # turn each of the specified elements in "conv_metadata" into a vector
    style_vector = np.array(
        [
            df.conv_metadata.map(
                lambda x: x[element]
                if type(x[element]) is int
                else sum(x[element].values())
            ).tolist()
            for element in style_elements
        ]
    )

    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)

    if add_one:
        style_sum = style_sum + np.ones(style_diff.shape)

    apply_ratio = np.flatnonzero(apply_ratio)

    # Apply ratio where necessary (length, etc)
    style_diff[apply_ratio] /= style_sum[apply_ratio]

    style_mean = np.mean(style_diff, axis=1)
    style_std = np.std(style_diff, axis=1)

    X[:, -k:] = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T

    # one A win => two A win
    Y = np.zeros(n)
    Y[df["winner"] == "model_a"] = 1.0

    # one tie => one A win + one B win
    # find tie + tie (both bad) index
    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
    tie_idx[len(tie_idx) // 2 :] = False
    Y[tie_idx] = 1.0

    return X, Y, models
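
A minimal sketch of what this produces, using a made-up two-battle DataFrame; the model names, winners, and metadata values are all invented for illustration:

# Illustrative only -- two fabricated battles with hand-written conv_metadata.
meta_1 = {
    "sum_assistant_a_tokens": 120,
    "header_count_a": {"h2": 1},
    "list_count_a": {"unordered": 3},
    "bold_count_a": {"**": 2},
    "sum_assistant_b_tokens": 80,
    "header_count_b": {},
    "list_count_b": {},
    "bold_count_b": {},
}
meta_2 = {
    "sum_assistant_a_tokens": 60,
    "header_count_a": {},
    "list_count_a": {},
    "bold_count_a": {},
    "sum_assistant_b_tokens": 90,
    "header_count_b": {},
    "list_count_b": {"unordered": 2},
    "bold_count_b": {},
}
toy = pd.DataFrame(
    {
        "model_a": ["model-x", "model-y"],
        "model_b": ["model-y", "model-x"],
        "winner": ["model_a", "tie"],
        "conv_metadata": [meta_1, meta_2],
    }
)
X, Y, models = construct_style_matrices(toy)
# X has shape (4, 2 + 4): the battles are duplicated, the first two columns hold
# the +/- log(10) model indicators, and the last four columns hold the normalized
# style differences (token length, headers, lists, bold markers).
# Y is [1, 1, 1, 0]: the model_a win counts once per copy, while the tie becomes
# one A win in the first copy and one B win in the duplicate.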


def get_bootstrap_result_style_control(X, Y, models, func_compute_elo, num_round=1000):
    elos = []
    coefs = []
    for _ in tqdm(range(num_round), desc="bootstrap"):
        indices = np.random.choice(
            list(range(len(Y))), size=(len(Y)), replace=True
        )
        _X = X[indices]
        _Y = Y[indices]
        # drop models that do not appear in this resample
        states = ~_X[:, : len(models)].any(axis=0)

        elo, coef = func_compute_elo(_X, _Y, models=models[~states])
        elos.append(elo)
        coefs.append(coef)

    df = pd.DataFrame(elos)
    return df[df.median().sort_values(ascending=False).index], coefs
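
The second return value collects the per-round style coefficients. A small illustrative helper (not part of the patch) for reading the coefficients returned by fit_mle_elo, assuming the default style elements:

def print_style_coefficients(coef, style_elements=STYLE_CONTROL_ELEMENTS_V1):
    # Each coefficient corresponds to one style feature; the "A-side" element
    # names double as labels since both sides share the same feature.
    k = len(style_elements) // 2
    for name, value in zip(style_elements[:k], coef[:k]):
        print(f"{name}: {value:+.3f}")

A positive coefficient means the feature (longer answers, more headers, lists, or bold text) is associated with winning, and its contribution is held out of the model ratings.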


def filter_long_conv(row):
    threshold = 768
    for conversation_type in ["conversation_a", "conversation_b"]:
@@ -421,6 +536,7 @@ def report_elo_analysis_results(
    run_outlier_detect=False,
    scale=1,
    filter_func=lambda x: True,
    style_control=False,
):
    battles = pd.DataFrame(battles_json)

@@ -461,10 +577,17 @@
    elo_rating_online = compute_elo(battles)

    if rating_system == "bt":
        bootstrap_df = get_bootstrap_result(
            battles, compute_elo_mle_with_tie, num_round=num_bootstrap
        )
        elo_rating_final = compute_elo_mle_with_tie(battles)
        if style_control:
            X, Y, models = construct_style_matrices(battles)
            bootstrap_df, bootstrap_coef = get_bootstrap_result_style_control(
                X, Y, models, fit_mle_elo, num_round=num_bootstrap
            )
            elo_rating_final, coef_final = fit_mle_elo(X, Y, models)
        else:
            bootstrap_df = get_bootstrap_result(
                battles, compute_elo_mle_with_tie, num_round=num_bootstrap
            )
            elo_rating_final = compute_elo_mle_with_tie(battles)
    elif rating_system == "elo":
        bootstrap_df = get_bootstrap_result(
            battles, compute_elo, num_round=num_bootstrap
@@ -538,6 +661,12 @@
"last_updated_tstamp": last_updated_tstamp,
"bootstrap_df": bootstrap_df,
"leaderboard_table_df": leaderboard_table_df,
"style_coefficients": {
"bootstrap": np.vstack(boostrap_coef),
"final": coef_final,
}
if rating_system == "bt" and style_control
else {},
}


@@ -565,6 +694,7 @@ def pretty_print_elo_rating(rating):
parser.add_argument("--run-outlier-detect", action="store_true", default=False)
parser.add_argument("--category", nargs="+", default=["full"])
parser.add_argument("--scale", type=float, default=1)
parser.add_argument("--style-control", action="store_true")
args = parser.parse_args()

np.random.seed(42)
@@ -602,6 +732,7 @@ def pretty_print_elo_rating(rating):
            run_outlier_detect=args.run_outlier_detect,
            scale=args.scale,
            filter_func=filter_func,
            style_control=args.style_control,
        )

    for cat in args.category:
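
For reference, a hedged invocation sketch: --style-control is the new flag, while --clean-battle-file and --rating-system are assumed to be the script's pre-existing arguments (they are not shown in this diff), and the input path is a placeholder produced by add_markdown_info.py above:

# python3 fastchat/serve/monitor/elo_analysis.py \
#     --clean-battle-file clean_battle_md.json \
#     --rating-system bt --style-control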
