From d0ba312d22af9828e77621c3bb825f3a386468b4 Mon Sep 17 00:00:00 2001
From: Christopher Chou
Date: Sun, 15 Sep 2024 17:25:52 +0000
Subject: [PATCH 1/5] vision-category

---
 fastchat/serve/monitor/classify/category.py | 16 ++++++++++++++++
 pbcopy                                      |  0
 2 files changed, 16 insertions(+)
 create mode 100644 pbcopy

diff --git a/fastchat/serve/monitor/classify/category.py b/fastchat/serve/monitor/classify/category.py
index 223144a32..c20b0eeab 100644
--- a/fastchat/serve/monitor/classify/category.py
+++ b/fastchat/serve/monitor/classify/category.py
@@ -134,3 +134,19 @@ def pre_process(self, prompt):
     def post_process(self, judgment):
         score = self.get_score(judgment=judgment)
         return {"math": bool(score == "yes") if score else False}
+
+class CategoryVisionHardPrompt(CategoryHardPrompt):
+    def __init__(self):
+        super().__init__()
+        self.name_tag = "criteria_vision_v0.1"
+        self.pattern = re.compile(r"(\[\d(?:\,\s\d)*\])")
+        self.sys_prompt = "Your task is to evaluate how well the following input prompts can assess the capabilities of advanced AI assistants.\n\nFor the input prompt, please analyze it based on the following 7 criteria.\n1. Specificity: Does the prompt ask for a specific output, such as code, a mathematical solution, a logical simplification, a problem-solving strategy, or a hardware setup recommendation? This specificity allows the AI to demonstrate its ability to understand and generate precise responses.\n2. Domain Knowledge: Does the prompt cover a specific domain, such as programming, mathematics, logic, problem-solving, or hardware setup? Prompts spanning a range of topics test the AI's breadth of knowledge and its ability to apply that knowledge to different domains.\n3. Complexity: Does the prompt vary in complexity, from straightforward tasks to more complex, multi-step problems? This allows evaluators to assess the AI's capability to handle problems of varying difficulty.\n4. Problem-Solving Skills: Does the prompt directly require the AI to demonstrate active problem-solving skills, such as systematically coming up with a solution for a specific setup instead of regurgitating an existing fact? This tests the AI's ability to apply logical reasoning and provide practical solutions.\n5. Creativity: Does the prompt involve a level of creativity in approaching the problem? This criterion tests the AI's ability to provide tailored solutions that take into account the user's specific needs and limitations.\n6. Technical Accuracy: Does the prompt require technical accuracy in the response? This allows evaluators to assess the AI's precision and correctness in technical fields.\n7. Real-world Application: Does the prompt relate to real-world applications, such as setting up a functional system or writing code for a practical use case? This tests the AI's ability to provide practical and actionable information that could be implemented in real-life scenarios.\n\nYou must list the criteria numbers that the prompt satisfies in the format of a Python array. For example, \"[...]\". Do not explain your choice."
+        self.tags = {
+            1: "specificity",
+            2: "domain_knowledge",
+            3: "complexity",
+            4: "problem_solving",
+            5: "creativity",
+            6: "technical_accuracy",
+            7: "real_world",
+        }
\ No newline at end of file

diff --git a/pbcopy b/pbcopy
new file mode 100644
index 000000000..e69de29bb
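A note on the parsing contract this patch sets up: the compiled pattern captures a bracketed list of single-digit criterion numbers separated by ", " from the judge's reply, and the parent class's get_score (not shown in this series) presumably applies it along these lines. A minimal sketch, with a hypothetical judgment string:

    import ast
    import re

    pattern = re.compile(r"(\[\d(?:\,\s\d)*\])")
    tags = {
        1: "specificity", 2: "domain_knowledge", 3: "complexity",
        4: "problem_solving", 5: "creativity", 6: "technical_accuracy",
        7: "real_world",
    }

    judgment = 'The prompt is specific and technical. "[1, 2, 6]"'
    match = pattern.search(judgment)
    if match:
        criteria = ast.literal_eval(match.group(1))  # -> [1, 2, 6]
        labels = {name: num in criteria for num, name in tags.items()}
        # -> {"specificity": True, ..., "real_world": False}

Note that the pattern only matches single-digit numbers, which is sufficient for the 7 criteria used here.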
From 059b654a5c4bb53e3c61cb5c2d1721a9beabab98 Mon Sep 17 00:00:00 2001
From: Christopher Chou
Date: Sun, 15 Sep 2024 17:27:26 +0000
Subject: [PATCH 2/5] Keep same sys

---
 fastchat/serve/monitor/classify/category.py | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/fastchat/serve/monitor/classify/category.py b/fastchat/serve/monitor/classify/category.py
index c20b0eeab..f1a6dc3df 100644
--- a/fastchat/serve/monitor/classify/category.py
+++ b/fastchat/serve/monitor/classify/category.py
@@ -138,15 +138,4 @@ def post_process(self, judgment):
 class CategoryVisionHardPrompt(CategoryHardPrompt):
     def __init__(self):
         super().__init__()
-        self.name_tag = "criteria_vision_v0.1"
-        self.pattern = re.compile(r"(\[\d(?:\,\s\d)*\])")
-        self.sys_prompt = "Your task is to evaluate how well the following input prompts can assess the capabilities of advanced AI assistants.\n\nFor the input prompt, please analyze it based on the following 7 criteria.\n1. Specificity: Does the prompt ask for a specific output, such as code, a mathematical solution, a logical simplification, a problem-solving strategy, or a hardware setup recommendation? This specificity allows the AI to demonstrate its ability to understand and generate precise responses.\n2. Domain Knowledge: Does the prompt cover a specific domain, such as programming, mathematics, logic, problem-solving, or hardware setup? Prompts spanning a range of topics test the AI's breadth of knowledge and its ability to apply that knowledge to different domains.\n3. Complexity: Does the prompt vary in complexity, from straightforward tasks to more complex, multi-step problems? This allows evaluators to assess the AI's capability to handle problems of varying difficulty.\n4. Problem-Solving Skills: Does the prompt directly require the AI to demonstrate active problem-solving skills, such as systematically coming up with a solution for a specific setup instead of regurgitating an existing fact? This tests the AI's ability to apply logical reasoning and provide practical solutions.\n5. Creativity: Does the prompt involve a level of creativity in approaching the problem? This criterion tests the AI's ability to provide tailored solutions that take into account the user's specific needs and limitations.\n6. Technical Accuracy: Does the prompt require technical accuracy in the response? This allows evaluators to assess the AI's precision and correctness in technical fields.\n7. Real-world Application: Does the prompt relate to real-world applications, such as setting up a functional system or writing code for a practical use case? This tests the AI's ability to provide practical and actionable information that could be implemented in real-life scenarios.\n\nYou must list the criteria numbers that the prompt satisfies in the format of a Python array. For example, \"[...]\". Do not explain your choice."
-        self.tags = {
-            1: "specificity",
-            2: "domain_knowledge",
-            3: "complexity",
-            4: "problem_solving",
-            5: "creativity",
-            6: "technical_accuracy",
-            7: "real_world",
-        }
\ No newline at end of file
+        self.name_tag = "criteria_vision_v0.1"
\ No newline at end of file
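After this deduplication the subclass keeps the parent's sys_prompt, pattern, and tags and overrides only the tag name, so the text and vision criteria stay in sync by construction. A quick check, assuming both classes are imported from fastchat/serve/monitor/classify/category.py:

    from category import CategoryHardPrompt, CategoryVisionHardPrompt

    base, vision = CategoryHardPrompt(), CategoryVisionHardPrompt()
    assert vision.sys_prompt == base.sys_prompt  # inherited, no longer duplicated
    assert vision.name_tag == "criteria_vision_v0.1"  # the only override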
From d4b1fd3952a881b1dde3846fa201f953c982104e Mon Sep 17 00:00:00 2001
From: Christopher Chou
Date: Sun, 15 Sep 2024 18:41:32 +0000
Subject: [PATCH 3/5] add

---
 fastchat/serve/monitor/classify/category.py | 21 ++++++++++++++++++++-
 fastchat/serve/monitor/classify/label.py    | 36 ++++++++++++++++++++++++++++++++++---
 2 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/fastchat/serve/monitor/classify/category.py b/fastchat/serve/monitor/classify/category.py
index f1a6dc3df..62a845d9b 100644
--- a/fastchat/serve/monitor/classify/category.py
+++ b/fastchat/serve/monitor/classify/category.py
@@ -9,6 +9,8 @@
 # - if
 # - score
 import ast
+import base64
+import os
 import re
 
 
@@ -24,6 +26,8 @@ def create_category(name):
         return CategoryIF()
     elif name == "math_v0.1":
         return CategoryMath()
+    elif name == "criteria_vision_v0.1":
+        return CategoryVisionHardPrompt()
 
     raise Exception(f"Category name is incorrect: {name}")
 
@@ -138,4 +142,19 @@ def post_process(self, judgment):
 class CategoryVisionHardPrompt(CategoryHardPrompt):
     def __init__(self):
         super().__init__()
-        self.name_tag = "criteria_vision_v0.1"
\ No newline at end of file
+        self.name_tag = "criteria_vision_v0.1"
+
+    def _convert_filepath_to_base64(self, filepath):
+        with open(filepath, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    def pre_process(self, prompt: str, image_list: list):
+        # prompt holds the aggregated conversation text; image_list holds image file paths that are encoded into base64 data URLs below
+        conv = [{"role": "system", "content": self.sys_prompt}]
+        single_turn_content_list = []
+        single_turn_content_list.append({"type": "text", "text": prompt})
+        for image_url in image_list:
+            single_turn_content_list.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self._convert_filepath_to_base64(image_url)}"}})
+
+        conv.append({"role": "user", "content": single_turn_content_list})
+        return conv
\ No newline at end of file

diff --git a/fastchat/serve/monitor/classify/label.py b/fastchat/serve/monitor/classify/label.py
index 2d0471a1f..b411cf2e2 100644
--- a/fastchat/serve/monitor/classify/label.py
+++ b/fastchat/serve/monitor/classify/label.py
@@ -107,7 +107,10 @@ def get_answer(
     output_log = {}
 
     for category in categories:
-        conv = category.pre_process(question["prompt"])
+        if config["images_dir"]:
+            conv = category.pre_process(question["prompt"], question["image_list"])
+        else:
+            conv = category.pre_process(question["prompt"])
         output = chat_completion_openai(
             model=model_name,
             messages=conv,
@@ -164,6 +167,30 @@ def find_required_tasks(row):
         )
     ]
 
+def aggregate_entire_conversation(conversation, images_dir):
+    final_text_content = ""
+    final_image_list = []
+
+    for i in range(0, len(conversation), 2):
+        content = conversation[i]["content"]
+        if isinstance(content, str):
+            final_text_content += "\n" + content
+        elif isinstance(content, list):
+            text_content, image_list = content
+            final_text_content += "\n" + text_content
+
+            for image in image_list:
+                image_url = os.path.join(images_dir, f"{image}.png")
+                if os.path.exists(image_url):
+                    final_image_list.append(image_url)
+
+    return final_text_content, final_image_list
+
+def get_prompt_from_conversation(conversation):
+    return conversation[0]
+
+def get_image_list_from_conversation(conversation):
+    return conversation[1]
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -247,8 +274,13 @@ def find_required_tasks(row):
     )
 
     not_labeled["prompt"] = not_labeled.conversation_a.map(
-        lambda convo: "\n".join([convo[i]["content"] for i in range(0, len(convo), 2)])
+        lambda convo: aggregate_entire_conversation(convo, config["images_dir"])
    )
+
+    if config["images_dir"]:
+        not_labeled["image_list"] = not_labeled.prompt.map(get_image_list_from_conversation)
+        not_labeled = not_labeled[not_labeled.image_list.map(len) > 0]
+        not_labeled["prompt"] = not_labeled.prompt.map(get_prompt_from_conversation)
     not_labeled["prompt"] = not_labeled.prompt.map(lambda x: x[:12500])
 
     with concurrent.futures.ThreadPoolExecutor(
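The helpers above assume the battle-log layout: user turns sit at even indices, and each user turn's content is either a plain string or a [text, image_ids] pair. A worked example of what the aggregation would return; the conversation and the image id are hypothetical:

    conversation = [
        {"role": "user", "content": ["What is in this image?", ["a1b2c3"]]},
        {"role": "assistant", "content": "A cat on a sofa."},
        {"role": "user", "content": "Are you sure?"},
        {"role": "assistant", "content": "Yes."},
    ]

    text, images = aggregate_entire_conversation(conversation, "images")
    # text   == "\nWhat is in this image?\nAre you sure?"
    # images == ["images/a1b2c3.png"]  (kept only if that file exists on disk)

get_prompt_from_conversation and get_image_list_from_conversation then simply unpack this (text, image_list) tuple out of the "prompt" column.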
From 5e02abd69c660f0119317f94c6f9b8143adb4bee Mon Sep 17 00:00:00 2001
From: Christopher Chou
Date: Mon, 16 Sep 2024 20:10:04 +0000
Subject: [PATCH 4/5] Add filter

---
 fastchat/serve/monitor/elo_analysis.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fastchat/serve/monitor/elo_analysis.py b/fastchat/serve/monitor/elo_analysis.py
index bea808fc5..6db863eb3 100644
--- a/fastchat/serve/monitor/elo_analysis.py
+++ b/fastchat/serve/monitor/elo_analysis.py
@@ -721,7 +721,10 @@ def pretty_print_elo_rating(rating):
 
     if args.clean_battle_file:
         # Read data from a cleaned battle files
-        battles = pd.read_json(args.clean_battle_file)
+        if args.clean_battle_file.endswith(".jsonl"):
+            battles = pd.read_json(args.clean_battle_file, lines=True)
+        else:
+            battles = pd.read_json(args.clean_battle_file)
     else:
         # Read data from all log files
         log_files = get_log_files(args.max_num_files)
@@ -732,6 +735,7 @@ def pretty_print_elo_rating(rating):
         "long": filter_long_conv,
         "chinese": lambda x: x["language"] == "Chinese",
         "english": lambda x: x["language"] == "English",
+        "criteria_vision_v0.1": lambda x: sum(x["category_tag"]["criteria_vision_v0.1"].values()) >= 6,
     }
     assert all(
         [cat in filter_func_map for cat in args.category]
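The new filter relies on the labeling pass having stored one boolean per criterion under category_tag; summing booleans counts the True entries, so the lambda keeps battles whose prompt satisfied at least 6 of the 7 criteria. A sketch with a hypothetical battle record:

    battle = {
        "category_tag": {
            "criteria_vision_v0.1": {
                "specificity": True,
                "domain_knowledge": True,
                "complexity": True,
                "problem_solving": True,
                "creativity": True,
                "technical_accuracy": True,
                "real_world": False,
            }
        }
    }

    # 6 of 7 criteria are True, so this battle passes the >= 6 threshold.
    keep = sum(battle["category_tag"]["criteria_vision_v0.1"].values()) >= 6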
From 4debf32210b849942cc0403eed557aed8aa520a1 Mon Sep 17 00:00:00 2001
From: Christopher Chou
Date: Mon, 16 Sep 2024 20:11:03 +0000
Subject: [PATCH 5/5] Format

---
 fastchat/serve/monitor/classify/category.py | 18 +++++++++++-----
 fastchat/serve/monitor/classify/label.py    | 12 +++++++++---
 fastchat/serve/monitor/elo_analysis.py      |  5 ++++-
 3 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/fastchat/serve/monitor/classify/category.py b/fastchat/serve/monitor/classify/category.py
index 62a845d9b..12efbf4b3 100644
--- a/fastchat/serve/monitor/classify/category.py
+++ b/fastchat/serve/monitor/classify/category.py
@@ -139,14 +139,15 @@ def post_process(self, judgment):
         score = self.get_score(judgment=judgment)
         return {"math": bool(score == "yes") if score else False}
 
+
 class CategoryVisionHardPrompt(CategoryHardPrompt):
     def __init__(self):
         super().__init__()
         self.name_tag = "criteria_vision_v0.1"
-
+
     def _convert_filepath_to_base64(self, filepath):
         with open(filepath, "rb") as image_file:
-            return base64.b64encode(image_file.read()).decode('utf-8')
+            return base64.b64encode(image_file.read()).decode("utf-8")
 
     def pre_process(self, prompt: str, image_list: list):
         # prompt holds the aggregated conversation text; image_list holds image file paths that are encoded into base64 data URLs below
         conv = [{"role": "system", "content": self.sys_prompt}]
         single_turn_content_list = []
         single_turn_content_list.append({"type": "text", "text": prompt})
         for image_url in image_list:
-            single_turn_content_list.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self._convert_filepath_to_base64(image_url)}"}})
-
+            single_turn_content_list.append(
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/png;base64,{self._convert_filepath_to_base64(image_url)}"
+                    },
+                }
+            )
+
         conv.append({"role": "user", "content": single_turn_content_list})
-        return conv
\ No newline at end of file
+        return conv

diff --git a/fastchat/serve/monitor/classify/label.py b/fastchat/serve/monitor/classify/label.py
index b411cf2e2..deb15cc76 100644
--- a/fastchat/serve/monitor/classify/label.py
+++ b/fastchat/serve/monitor/classify/label.py
@@ -167,7 +167,8 @@ def find_required_tasks(row):
         )
     ]
 
+
 def aggregate_entire_conversation(conversation, images_dir):
     final_text_content = ""
     final_image_list = []
 
@@ -186,12 +187,15 @@ def aggregate_entire_conversation(conversation, images_dir):
 
     return final_text_content, final_image_list
 
+
 def get_prompt_from_conversation(conversation):
     return conversation[0]
 
+
 def get_image_list_from_conversation(conversation):
     return conversation[1]
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--config", type=str, required=True)
@@ -276,9 +280,11 @@ def get_image_list_from_conversation(conversation):
     not_labeled["prompt"] = not_labeled.conversation_a.map(
         lambda convo: aggregate_entire_conversation(convo, config["images_dir"])
     )
-
+
     if config["images_dir"]:
-        not_labeled["image_list"] = not_labeled.prompt.map(get_image_list_from_conversation)
+        not_labeled["image_list"] = not_labeled.prompt.map(
+            get_image_list_from_conversation
+        )
         not_labeled = not_labeled[not_labeled.image_list.map(len) > 0]
         not_labeled["prompt"] = not_labeled.prompt.map(get_prompt_from_conversation)
     not_labeled["prompt"] = not_labeled.prompt.map(lambda x: x[:12500])

diff --git a/fastchat/serve/monitor/elo_analysis.py b/fastchat/serve/monitor/elo_analysis.py
index 6db863eb3..b2fa24aab 100644
--- a/fastchat/serve/monitor/elo_analysis.py
+++ b/fastchat/serve/monitor/elo_analysis.py
@@ -735,7 +735,10 @@ def pretty_print_elo_rating(rating):
         "long": filter_long_conv,
         "chinese": lambda x: x["language"] == "Chinese",
         "english": lambda x: x["language"] == "English",
-        "criteria_vision_v0.1": lambda x: sum(x["category_tag"]["criteria_vision_v0.1"].values()) >= 6,
+        "criteria_vision_v0.1": lambda x: sum(
+            x["category_tag"]["criteria_vision_v0.1"].values()
+        )
+        >= 6,
     }
     assert all(
         [cat in filter_func_map for cat in args.category]
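Taken together, the series wires up as follows; a minimal end-to-end sketch, where the prompt text and image path are hypothetical and post_process is the inherited method not shown in this series:

    category = create_category("criteria_vision_v0.1")

    # One user turn carrying the aggregated text plus each image as a
    # base64 data URL, preceded by the criteria system prompt.
    conv = category.pre_process("Describe the trend in this chart.", ["images/q3.png"])

    # judgment = chat_completion_openai(model_name, conv, temperature, max_tokens)
    # category.post_process(judgment)  # maps the "[...]" reply to per-criterion booleans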