puzzle_pair_solve.py
import chess
import numpy as np
import io
import json
import csv
from pathlib import Path
from tqdm import tqdm
from puzzle_solver import convert_pgn_to_game, solve_puzzle
import chessllm
from matplotlib import pyplot as plt
DATA_DIR = Path("/data/chess-data/lichess_puzzles")
FILE_NAME = DATA_DIR / "pairs.csv"
"""
Solve puzzle pairs given in FILE_NAME, and report whether the model can solve them.
Separate by rating buckets; take 40 samples from each bucket.
It has the following columns: uid,rating,pgn,proofgame,solution
Helper functions:
def solve_puzzle(board, solution) -> bool: whether model can solve the puzzle
convert_pgn_to_game(pgn_moves) -> game
"""

def plot_acc_pairs(engine, bucket_size=200, enough_samples=10):
    # Create rating buckets: [0, bucket_size), [bucket_size, 2*bucket_size), ...
    buckets = {i * bucket_size: [] for i in range(30)}

    # Read the data and sort puzzles into buckets
    with open(FILE_NAME) as f:
        reader = csv.reader(f)
        print(next(reader))  # print (and skip) the header row
        for uid, rating, pgn, proofgame, solution in tqdm(list(reader)):
            rating_bucket = int(rating) // bucket_size * bucket_size
            if len(buckets[rating_bucket]) < enough_samples:
                buckets[rating_bucket].append((pgn, proofgame, solution))

    # Print how many elements landed in each bucket
    for k, v in buckets.items():
        print(f'rating [{k}, {k + bucket_size})', 'n', len(v))
    nonempty_buckets = [k for k, v in buckets.items() if len(v) > 0]

    # Test the puzzles
    ok_pgn = {i * bucket_size: [] for i in range(30)}
    ok_proofgame = {i * bucket_size: [] for i in range(30)}
    for rating_bucket, puzzles in tqdm(buckets.items()):
        for pgn, proofgame, solution in puzzles:
            board_pgn = chess.Board()
            board_proofgame = chess.Board()
            print("pgn original", pgn)
            print("proofgame", proofgame)
            # Iterate over the moves and apply them to each board
            for move in convert_pgn_to_game(pgn).mainline_moves():
                board_pgn.push(move)
            for move in convert_pgn_to_game(proofgame).mainline_moves():
                board_proofgame.push(move)
            is_right_pgn = solve_puzzle(board_pgn, solution, engine)
            is_right_proofgame = solve_puzzle(board_proofgame, solution, engine)
            ok_pgn[rating_bucket].append(is_right_pgn)
            ok_proofgame[rating_bucket].append(is_right_proofgame)

    # Compare the results per bucket
    for bucket_start in nonempty_buckets:
        if len(ok_pgn[bucket_start]) > 0 and len(ok_proofgame[bucket_start]) > 0:
            pgn_acc = np.mean(ok_pgn[bucket_start])
            proofgame_acc = np.mean(ok_proofgame[bucket_start])
            print(f'rating [{bucket_start}, {bucket_start + bucket_size})',
                  f'pgn acc {pgn_acc:.3f}', f'proofgame acc {proofgame_acc:.3f}',
                  'n', len(ok_pgn[bucket_start]))

    # Plot pgn and proofgame accuracy side by side, one pair of bars per bucket
    bucket_ranges = [(k, k + bucket_size) for k in nonempty_buckets]
    bucket_labels = [f"{low}-{high}" for low, high in bucket_ranges]
    pgn_acc = [np.mean(ok_pgn[bucket_start]) for bucket_start in nonempty_buckets]
    proofgame_acc = [np.mean(ok_proofgame[bucket_start]) for bucket_start in nonempty_buckets]

    x = np.arange(len(bucket_labels))
    width = 0.4
    plt.figure(figsize=(8, 4))
    plt.bar(x - width / 2, pgn_acc, width, label="pgn")
    plt.bar(x + width / 2, proofgame_acc, width, label="proofgame")
    plt.xlabel('Puzzle Rating (Elo)')
    plt.ylabel('Probability correct')
    plt.title('Accuracy by puzzle rating: pgn vs. proofgame')
    plt.xticks(x, bucket_labels, rotation=45)
    plt.tight_layout()
    plt.legend()
    # Save before show(); otherwise the figure may be cleared and saved blank
    plt.savefig("/tmp/b.png", dpi=600)
    plt.savefig("accuracy_both.png", dpi=600)
    plt.show()

if __name__ == "__main__":
    api_key = open("OPENAI_API_KEY").read().strip()
    config = {"temperature": 0, "num_lookahead_tokens": 30}
    engine = chessllm.ChessLLM(api_key, config, model="gpt-3.5-turbo-instruct")
    plot_acc_pairs(engine)
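
# Usage sketch (assumptions: the chessllm and puzzle_solver modules plus the
# lichess_puzzles data directory are available, and an OpenAI key is stored in a
# local file named OPENAI_API_KEY):
#   $ python puzzle_pair_solve.py
# The script prints per-bucket accuracy for original pgn vs. proofgame prompts and
# writes the bar chart to accuracy_both.png (and /tmp/b.png).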