From d0ba312d22af9828e77621c3bb825f3a386468b4 Mon Sep 17 00:00:00 2001
From: Christopher Chou
Date: Sun, 15 Sep 2024 17:25:52 +0000
Subject: [PATCH 1/5] vision-category

---
 fastchat/serve/monitor/classify/category.py | 16 ++++++++++++++++
 pbcopy                                      |  0
 2 files changed, 16 insertions(+)
 create mode 100644 pbcopy

diff --git a/fastchat/serve/monitor/classify/category.py b/fastchat/serve/monitor/classify/category.py
index 223144a32..c20b0eeab 100644
--- a/fastchat/serve/monitor/classify/category.py
+++ b/fastchat/serve/monitor/classify/category.py
@@ -134,3 +134,19 @@ def pre_process(self, prompt):
     def post_process(self, judgment):
         score = self.get_score(judgment=judgment)
         return {"math": bool(score == "yes") if score else False}
+
+class CategoryVisionHardPrompt(CategoryHardPrompt):
+    def __init__(self):
+        super().__init__()
+        self.name_tag = "criteria_vision_v0.1"
+        self.pattern = re.compile(r"(\[\d(?:\,\s\d)*\])")
+        self.sys_prompt = "Your task is to evaluate how well the following input prompts can assess the capabilities of advanced AI assistants.\n\nFor the input prompt, please analyze it based on the following 7 criteria.\n1. Specificity: Does the prompt ask for a specific output, such as code, a mathematical solution, a logical simplification, a problem-solving strategy, or a hardware setup recommendation? This specificity allows the AI to demonstrate its ability to understand and generate precise responses.\n2. Domain Knowledge: Does the prompt cover a specific domain, such as programming, mathematics, logic, problem-solving, or hardware setup? Prompts spanning a range of topics test the AI's breadth of knowledge and its ability to apply that knowledge to different domains.\n3. Complexity: Does the prompt vary in complexity, from straightforward tasks to more complex, multi-step problems? This allows evaluators to assess the AI's capability to handle problems of varying difficulty.\n4. Problem-Solving Skills: Does the prompt directly require the AI to demonstrate active problem-solving skills, such as systematically coming up with a solution for a specific setup instead of regurgitating an existing fact? This tests the AI's ability to apply logical reasoning and provide practical solutions.\n5. Creativity: Does the prompt involve a level of creativity in approaching the problem? This criterion tests the AI's ability to provide tailored solutions that take into account the user's specific needs and limitations.\n6. Technical Accuracy: Does the prompt require technical accuracy in the response? This allows evaluators to assess the AI's precision and correctness in technical fields.\n7. Real-world Application: Does the prompt relate to real-world applications, such as setting up a functional system or writing code for a practical use case? This tests the AI's ability to provide practical and actionable information that could be implemented in real-life scenarios.\n\nYou must list the criteria numbers that the prompt satisfies in the format of a Python array. For example, \"[...]\". Do not explain your choice."
+        self.tags = {
+            1: "specificity",
+            2: "domain_knowledge",
+            3: "complexity",
+            4: "problem_solving",
+            5: "creativity",
+            6: "technical_accuracy",
+            7: "real_world",
+        }
\ No newline at end of file

diff --git a/pbcopy b/pbcopy
new file mode 100644
index 000000000..e69de29bb
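A note on the parsing contract this patch sets up: the compiled pattern captures a bracketed list of single-digit criterion numbers separated by ", " from the judge's reply, and the parent class's get_score (not shown in this series) presumably applies it along these lines. A minimal sketch, with a hypothetical judgment string:

    import ast
    import re

    pattern = re.compile(r"(\[\d(?:\,\s\d)*\])")
    tags = {
        1: "specificity", 2: "domain_knowledge", 3: "complexity",
        4: "problem_solving", 5: "creativity", 6: "technical_accuracy",
        7: "real_world",
    }

    judgment = 'The prompt is specific and technical. "[1, 2, 6]"'
    match = pattern.search(judgment)
    if match:
        criteria = ast.literal_eval(match.group(1))  # -> [1, 2, 6]
        labels = {name: num in criteria for num, name in tags.items()}
        # -> {"specificity": True, ..., "real_world": False}

Note that the pattern only matches single-digit numbers, which is sufficient for the 7 criteria used here.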
From 059b654a5c4bb53e3c61cb5c2d1721a9beabab98 Mon Sep 17 00:00:00 2001
From: Christopher Chou
Date: Sun, 15 Sep 2024 17:27:26 +0000
Subject: [PATCH 2/5] Keep same sys

---
 fastchat/serve/monitor/classify/category.py | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/fastchat/serve/monitor/classify/category.py b/fastchat/serve/monitor/classify/category.py
index c20b0eeab..f1a6dc3df 100644
--- a/fastchat/serve/monitor/classify/category.py
+++ b/fastchat/serve/monitor/classify/category.py
@@ -138,15 +138,4 @@ def post_process(self, judgment):
 class CategoryVisionHardPrompt(CategoryHardPrompt):
     def __init__(self):
         super().__init__()
-        self.name_tag = "criteria_vision_v0.1"
-        self.pattern = re.compile(r"(\[\d(?:\,\s\d)*\])")
-        self.sys_prompt = "Your task is to evaluate how well the following input prompts can assess the capabilities of advanced AI assistants.\n\nFor the input prompt, please analyze it based on the following 7 criteria.\n1. Specificity: Does the prompt ask for a specific output, such as code, a mathematical solution, a logical simplification, a problem-solving strategy, or a hardware setup recommendation? This specificity allows the AI to demonstrate its ability to understand and generate precise responses.\n2. Domain Knowledge: Does the prompt cover a specific domain, such as programming, mathematics, logic, problem-solving, or hardware setup? Prompts spanning a range of topics test the AI's breadth of knowledge and its ability to apply that knowledge to different domains.\n3. Complexity: Does the prompt vary in complexity, from straightforward tasks to more complex, multi-step problems? This allows evaluators to assess the AI's capability to handle problems of varying difficulty.\n4. Problem-Solving Skills: Does the prompt directly require the AI to demonstrate active problem-solving skills, such as systematically coming up with a solution for a specific setup instead of regurgitating an existing fact? This tests the AI's ability to apply logical reasoning and provide practical solutions.\n5. Creativity: Does the prompt involve a level of creativity in approaching the problem? This criterion tests the AI's ability to provide tailored solutions that take into account the user's specific needs and limitations.\n6. Technical Accuracy: Does the prompt require technical accuracy in the response? This allows evaluators to assess the AI's precision and correctness in technical fields.\n7. Real-world Application: Does the prompt relate to real-world applications, such as setting up a functional system or writing code for a practical use case? This tests the AI's ability to provide practical and actionable information that could be implemented in real-life scenarios.\n\nYou must list the criteria numbers that the prompt satisfies in the format of a Python array. For example, \"[...]\". Do not explain your choice."
-        self.tags = {
-            1: "specificity",
-            2: "domain_knowledge",
-            3: "complexity",
-            4: "problem_solving",
-            5: "creativity",
-            6: "technical_accuracy",
-            7: "real_world",
-        }
\ No newline at end of file
+        self.name_tag = "criteria_vision_v0.1"
\ No newline at end of file
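After this deduplication the subclass keeps the parent's sys_prompt, pattern, and tags and overrides only the tag name, so the text and vision criteria stay in sync by construction. A quick check, assuming both classes are imported from fastchat/serve/monitor/classify/category.py:

    from category import CategoryHardPrompt, CategoryVisionHardPrompt

    base, vision = CategoryHardPrompt(), CategoryVisionHardPrompt()
    assert vision.sys_prompt == base.sys_prompt  # inherited, no longer duplicated
    assert vision.name_tag == "criteria_vision_v0.1"  # the only override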
From d4b1fd3952a881b1dde3846fa201f953c982104e Mon Sep 17 00:00:00 2001
From: Christopher Chou
Date: Sun, 15 Sep 2024 18:41:32 +0000
Subject: [PATCH 3/5] add

---
 fastchat/serve/monitor/classify/category.py | 21 ++++++++++++++++++++-
 fastchat/serve/monitor/classify/label.py    | 36 ++++++++++++++++++++++++++++++++++---
 2 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/fastchat/serve/monitor/classify/category.py b/fastchat/serve/monitor/classify/category.py
index f1a6dc3df..62a845d9b 100644
--- a/fastchat/serve/monitor/classify/category.py
+++ b/fastchat/serve/monitor/classify/category.py
@@ -9,6 +9,8 @@
 # - if
 # - score
 import ast
+import base64
+import os
 import re
 
 
@@ -24,6 +26,8 @@ def create_category(name):
         return CategoryIF()
     elif name == "math_v0.1":
         return CategoryMath()
+    elif name == "criteria_vision_v0.1":
+        return CategoryVisionHardPrompt()
 
     raise Exception(f"Category name is incorrect: {name}")
 
@@ -138,4 +142,19 @@ def post_process(self, judgment):
 class CategoryVisionHardPrompt(CategoryHardPrompt):
     def __init__(self):
         super().__init__()
-        self.name_tag = "criteria_vision_v0.1"
\ No newline at end of file
+        self.name_tag = "criteria_vision_v0.1"
+
+    def _convert_filepath_to_base64(self, filepath):
+        with open(filepath, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    def pre_process(self, prompt: str, image_list: list):
+        # prompt holds the aggregated conversation text; image_list holds image file paths that are encoded into base64 data URLs below
+        conv = [{"role": "system", "content": self.sys_prompt}]
+        single_turn_content_list = []
+        single_turn_content_list.append({"type": "text", "text": prompt})
+        for image_url in image_list:
+            single_turn_content_list.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self._convert_filepath_to_base64(image_url)}"}})
+
+        conv.append({"role": "user", "content": single_turn_content_list})
+        return conv
\ No newline at end of file

diff --git a/fastchat/serve/monitor/classify/label.py b/fastchat/serve/monitor/classify/label.py
index 2d0471a1f..b411cf2e2 100644
--- a/fastchat/serve/monitor/classify/label.py
+++ b/fastchat/serve/monitor/classify/label.py
@@ -107,7 +107,10 @@ def get_answer(
     output_log = {}
 
     for category in categories:
-        conv = category.pre_process(question["prompt"])
+        if config["images_dir"]:
+            conv = category.pre_process(question["prompt"], question["image_list"])
+        else:
+            conv = category.pre_process(question["prompt"])
         output = chat_completion_openai(
             model=model_name,
             messages=conv,
@@ -164,6 +167,30 @@ def find_required_tasks(row):
         )
     ]
 
+def aggregate_entire_conversation(conversation, images_dir):
+    final_text_content = ""
+    final_image_list = []
+
+    for i in range(0, len(conversation), 2):
+        content = conversation[i]["content"]
+        if isinstance(content, str):
+            final_text_content += "\n" + content
+        elif isinstance(content, list):
+            text_content, image_list = content
+            final_text_content += "\n" + text_content
+
+            for image in image_list:
+                image_url = os.path.join(images_dir, f"{image}.png")
+                if os.path.exists(image_url):
+                    final_image_list.append(image_url)
+
+    return final_text_content, final_image_list
+
+def get_prompt_from_conversation(conversation):
+    return conversation[0]
+
+def get_image_list_from_conversation(conversation):
+    return conversation[1]
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -247,8 +274,13 @@ def find_required_tasks(row):
     )
 
     not_labeled["prompt"] = not_labeled.conversation_a.map(
-        lambda convo: "\n".join([convo[i]["content"] for i in range(0, len(convo), 2)])
+        lambda convo: aggregate_entire_conversation(convo, config["images_dir"])
    )
+
+    if config["images_dir"]:
+        not_labeled["image_list"] = not_labeled.prompt.map(get_image_list_from_conversation)
+        not_labeled = not_labeled[not_labeled.image_list.map(len) > 0]
+        not_labeled["prompt"] = not_labeled.prompt.map(get_prompt_from_conversation)
     not_labeled["prompt"] = not_labeled.prompt.map(lambda x: x[:12500])
 
     with concurrent.futures.ThreadPoolExecutor(
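The helpers above assume the battle-log layout: user turns sit at even indices, and each user turn's content is either a plain string or a [text, image_ids] pair. A worked example of what the aggregation would return; the conversation and the image id are hypothetical:

    conversation = [
        {"role": "user", "content": ["What is in this image?", ["a1b2c3"]]},
        {"role": "assistant", "content": "A cat on a sofa."},
        {"role": "user", "content": "Are you sure?"},
        {"role": "assistant", "content": "Yes."},
    ]

    text, images = aggregate_entire_conversation(conversation, "images")
    # text   == "\nWhat is in this image?\nAre you sure?"
    # images == ["images/a1b2c3.png"]  (kept only if that file exists on disk)

get_prompt_from_conversation and get_image_list_from_conversation then simply unpack this (text, image_list) tuple out of the "prompt" column.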
From 5e02abd69c660f0119317f94c6f9b8143adb4bee Mon Sep 17 00:00:00 2001
From: Christopher Chou
Date: Mon, 16 Sep 2024 20:10:04 +0000
Subject: [PATCH 4/5] Add filter

---
 fastchat/serve/monitor/elo_analysis.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fastchat/serve/monitor/elo_analysis.py b/fastchat/serve/monitor/elo_analysis.py
index bea808fc5..6db863eb3 100644
--- a/fastchat/serve/monitor/elo_analysis.py
+++ b/fastchat/serve/monitor/elo_analysis.py
@@ -721,7 +721,10 @@ def pretty_print_elo_rating(rating):
 
     if args.clean_battle_file:
         # Read data from a cleaned battle files
-        battles = pd.read_json(args.clean_battle_file)
+        if args.clean_battle_file.endswith(".jsonl"):
+            battles = pd.read_json(args.clean_battle_file, lines=True)
+        else:
+            battles = pd.read_json(args.clean_battle_file)
     else:
         # Read data from all log files
         log_files = get_log_files(args.max_num_files)
@@ -732,6 +735,7 @@ def pretty_print_elo_rating(rating):
         "long": filter_long_conv,
         "chinese": lambda x: x["language"] == "Chinese",
         "english": lambda x: x["language"] == "English",
+        "criteria_vision_v0.1": lambda x: sum(x["category_tag"]["criteria_vision_v0.1"].values()) >= 6,
     }
     assert all(
         [cat in filter_func_map for cat in args.category]
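The new filter relies on the labeling pass having stored one boolean per criterion under category_tag; summing booleans counts the True entries, so the lambda keeps battles whose prompt satisfied at least 6 of the 7 criteria. A sketch with a hypothetical battle record:

    battle = {
        "category_tag": {
            "criteria_vision_v0.1": {
                "specificity": True,
                "domain_knowledge": True,
                "complexity": True,
                "problem_solving": True,
                "creativity": True,
                "technical_accuracy": True,
                "real_world": False,
            }
        }
    }

    # 6 of 7 criteria are True, so this battle passes the >= 6 threshold.
    keep = sum(battle["category_tag"]["criteria_vision_v0.1"].values()) >= 6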
From 4debf32210b849942cc0403eed557aed8aa520a1 Mon Sep 17 00:00:00 2001
From: Christopher Chou
Date: Mon, 16 Sep 2024 20:11:03 +0000
Subject: [PATCH 5/5] Format

---
 fastchat/serve/monitor/classify/category.py | 18 +++++++++++-----
 fastchat/serve/monitor/classify/label.py    | 12 +++++++++---
 fastchat/serve/monitor/elo_analysis.py      |  5 ++++-
 3 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/fastchat/serve/monitor/classify/category.py b/fastchat/serve/monitor/classify/category.py
index 62a845d9b..12efbf4b3 100644
--- a/fastchat/serve/monitor/classify/category.py
+++ b/fastchat/serve/monitor/classify/category.py
@@ -139,14 +139,15 @@ def post_process(self, judgment):
         score = self.get_score(judgment=judgment)
         return {"math": bool(score == "yes") if score else False}
 
+
 class CategoryVisionHardPrompt(CategoryHardPrompt):
     def __init__(self):
         super().__init__()
         self.name_tag = "criteria_vision_v0.1"
-
+
     def _convert_filepath_to_base64(self, filepath):
         with open(filepath, "rb") as image_file:
-            return base64.b64encode(image_file.read()).decode('utf-8')
+            return base64.b64encode(image_file.read()).decode("utf-8")
 
     def pre_process(self, prompt: str, image_list: list):
         # prompt holds the aggregated conversation text; image_list holds image file paths that are encoded into base64 data URLs below
         conv = [{"role": "system", "content": self.sys_prompt}]
         single_turn_content_list = []
         single_turn_content_list.append({"type": "text", "text": prompt})
         for image_url in image_list:
-            single_turn_content_list.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self._convert_filepath_to_base64(image_url)}"}})
-
+            single_turn_content_list.append(
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/png;base64,{self._convert_filepath_to_base64(image_url)}"
+                    },
+                }
+            )
+
         conv.append({"role": "user", "content": single_turn_content_list})
-        return conv
\ No newline at end of file
+        return conv

diff --git a/fastchat/serve/monitor/classify/label.py b/fastchat/serve/monitor/classify/label.py
index b411cf2e2..deb15cc76 100644
--- a/fastchat/serve/monitor/classify/label.py
+++ b/fastchat/serve/monitor/classify/label.py
@@ -167,7 +167,8 @@ def find_required_tasks(row):
         )
     ]
 
+
 def aggregate_entire_conversation(conversation, images_dir):
     final_text_content = ""
     final_image_list = []
 
@@ -186,12 +187,15 @@ def aggregate_entire_conversation(conversation, images_dir):
 
     return final_text_content, final_image_list
 
+
 def get_prompt_from_conversation(conversation):
     return conversation[0]
 
+
 def get_image_list_from_conversation(conversation):
     return conversation[1]
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--config", type=str, required=True)
@@ -276,9 +280,11 @@ def get_image_list_from_conversation(conversation):
     not_labeled["prompt"] = not_labeled.conversation_a.map(
         lambda convo: aggregate_entire_conversation(convo, config["images_dir"])
     )
-
+
     if config["images_dir"]:
-        not_labeled["image_list"] = not_labeled.prompt.map(get_image_list_from_conversation)
+        not_labeled["image_list"] = not_labeled.prompt.map(
+            get_image_list_from_conversation
+        )
         not_labeled = not_labeled[not_labeled.image_list.map(len) > 0]
         not_labeled["prompt"] = not_labeled.prompt.map(get_prompt_from_conversation)
     not_labeled["prompt"] = not_labeled.prompt.map(lambda x: x[:12500])

diff --git a/fastchat/serve/monitor/elo_analysis.py b/fastchat/serve/monitor/elo_analysis.py
index 6db863eb3..b2fa24aab 100644
--- a/fastchat/serve/monitor/elo_analysis.py
+++ b/fastchat/serve/monitor/elo_analysis.py
@@ -735,7 +735,10 @@ def pretty_print_elo_rating(rating):
         "long": filter_long_conv,
         "chinese": lambda x: x["language"] == "Chinese",
         "english": lambda x: x["language"] == "English",
-        "criteria_vision_v0.1": lambda x: sum(x["category_tag"]["criteria_vision_v0.1"].values()) >= 6,
+        "criteria_vision_v0.1": lambda x: sum(
+            x["category_tag"]["criteria_vision_v0.1"].values()
+        )
+        >= 6,
     }
     assert all(
         [cat in filter_func_map for cat in args.category]
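Taken together, the series wires up as follows; a minimal end-to-end sketch, where the prompt text and image path are hypothetical and post_process is the inherited method not shown in this series:

    category = create_category("criteria_vision_v0.1")

    # One user turn carrying the aggregated text plus each image as a
    # base64 data URL, preceded by the criteria system prompt.
    conv = category.pre_process("Describe the trend in this chart.", ["images/q3.png"])

    # judgment = chat_completion_openai(model_name, conv, temperature, max_tokens)
    # category.post_process(judgment)  # maps the "[...]" reply to per-criterion booleans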