-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathrun_benchmarks.py
75 lines (65 loc) · 2.58 KB
/
run_benchmarks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
from bot_pipeline import BoT
import argparse
import os
import datetime
parser = argparse.ArgumentParser()
parser.add_argument('--task_name',type=str,default='gameof24',choices=['gameof24','checkmate','wordsorting'])
parser.add_argument('--api_key',default=None,type=str,help='input your api key here')
parser.add_argument('--model_id',type=str,default='gpt-4o',help='Input model id here, if use local model, input the path to the local model')
GameOf24 = """
Let's play a game called 24. You'll be given four integers, and your objective is to use each number only once, combined with any of the four arithmetic operations (addition, subtraction, multiplication, and division) and parentheses, to achieve a total of 24. For example, if the input is 4, 7, 8, and 8, the output could be 7 * 8 - 4 * 8 = 24. You only need to find one feasible solution!
Input:
"""
CheckmateInOne = """
Given a series of chess moves written in Standard Algebraic Notation (SAN), determine the next move that will result in a checkmate.
Input:
"""
WordSorting = """
Sort a list of words alphabetically, placing them in a single line of text separated by spaces.
Input:
"""
if __name__ == "__main__":
args = parser.parse_args()
task = args.task_name
api_key = args.api_key
model_id = args.model_id
now = datetime.datetime.now()
timestamp_str = now.strftime("%Y-%m-%d-%H:%M:%S")
output_dir = 'test_results'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
benchmark_dict = {
'gameof24':GameOf24,
'checkmate':CheckmateInOne,
'wordsorting':WordSorting
}
path_dict = {
'gameof24':'benchmarks/gameof24.jsonl',
'checkmate':'benchmarks/CheckmateInOne.jsonl',
'wordsorting':'benchmarks/word_sorting.jsonl'
}
buffer_dict = {
'gameof24':0,
'checkmate':1,
'wordsorting':2
}
user_prompt = benchmark_dict[task]
path = path_dict[task]
problem_id = buffer_dict[task]
test_bot = BoT(
user_input = None,
problem_id= problem_id,
api_key= api_key,
model_id= model_id,
need_check = True
)
for line in (open(path)):
input = json.loads(line)['input']
user_input = user_prompt + input
test_bot.update_input(user_input)
result = test_bot.bot_run()
tmp = {'input':input,'result':result}
with open(f'test_results/BoT_{task}_{timestamp_str}.jsonl', 'a+', encoding='utf-8') as file:
json_str = json.dumps(tmp)
file.write(json_str + '\n')