import os

import torch
import torch.utils.data as data
from datasets import Dataset
from tqdm import tqdm

from models import DEVICE, AttentionModel


def pipeline(
    model: torch.nn.Module,
    input_ds: str,
    out: str,
    batch_size: int,
):
    """
    Make predictions on the test dataset and write them to a CSV file.

    Parameters:
    - model: torch.nn.Module - the model trained on both the 2A3 and DMS distributions
    - input_ds: str - name of the dataset to load
    - out: str - name of the file to write to
    - batch_size: int - size of the batches used to process the data.
      In general, larger batch sizes mean faster runtime
    """
    ds = Dataset.load_from_disk(input_ds).with_format("torch")
    loader = data.DataLoader(ds, batch_size=batch_size, shuffle=False)
    with open(out, "w") as outfile:
        # write the header
        outfile.write("id,reactivity_DMS_MaP,reactivity_2A3_MaP\n")
        for tdata in tqdm(loader):
            tokens = tdata["simple_tokens"].to(DEVICE)
            bpp = tdata["bpp"].to(DEVICE)
            min_ids = tdata["id_min"].numpy()
            max_ids = tdata["id_max"].numpy()
            # make predictions w/o gradients
            with torch.no_grad():
                preds = model(tokens, bpp).cpu().numpy()
            # write one row per position; DMS is channel 0, 2A3 is channel 1
            for i in range(tokens.shape[0]):
                outfile.writelines(
                    f"{seq_idx},{preds[i, seq_idx - min_ids[i], 0]:.3f},{preds[i, seq_idx - min_ids[i], 1]:.3f}\n"
                    # +1 since id_max is inclusive
                    for seq_idx in range(min_ids[i], max_ids[i] + 1)
                )
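

# Illustrative sketch only (not used by the pipeline): `pipeline` assumes each
# dataset record provides "simple_tokens", "bpp", "id_min", and "id_max". The
# helper below builds a tiny dataset with that schema to make the contract
# concrete; the shapes, dtypes, and dummy values are assumptions, not taken
# from the real preprocessing step.
def _example_dataset(seq_len: int = 8) -> Dataset:
    return Dataset.from_dict(
        {
            # token ids for a single sequence (the padding scheme is hypothetical)
            "simple_tokens": [[0] * seq_len],
            # one seq_len x seq_len base-pair probability matrix
            "bpp": [[[0.0] * seq_len for _ in range(seq_len)]],
            # global row ids covered by the sequence (id_max is inclusive)
            "id_min": [0],
            "id_max": [seq_len - 1],
        }
    ).with_format("torch")
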

def submit(
    batch_size: int = 64,
    model_dict: dict = dict(
        latent_dim=32,
        n_heads=1,
        enc_layers=4,
        dec_layers=4,
        ff_dim=2048,
    ),
    model_name: str = "full",
):
"""
Generate a submission.csv.zip file for submitting
Arguments:
- batch_size: int - the batch size to use during inference. Defaults to 64
- model_2a3_dict: dict - a dictionary containing all the arguments to be passed when instantiating
the 2a3 `AttentionModel`
- model_dms_dict: a dictionary containing all the arguments to be passed when instantiating
the dms `AttentionModel`
"""
    # initialize the model
    model = AttentionModel(**model_dict)
    # load weights (map_location keeps this working on CPU-only machines)
    model.load_state_dict(torch.load(f"{model_name}_model.pt", map_location=DEVICE))
    # set in evaluation mode and move to device
    model.eval().to(DEVICE)
    pipeline(
        model,
        "test_data_preprocessed",
        "submission.csv",
        batch_size=batch_size,
    )
    # zip our submission into an easily-uploadable zip file
    print("zipping submissions. This may take a while...")
    os.system("zip submission.csv.zip submission.csv")
    print("Done zipping submissions!")