-
Notifications
You must be signed in to change notification settings - Fork 12
/
regression_model_assurance.py
102 lines (86 loc) · 4.07 KB
/
regression_model_assurance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Run a simple linear regression for each pathology using the model’s probability output as the single independent variable and using the predicted evaluation metric (IoU or hit/miss) as the dependent variable. The script also runs a simple regression that uses the same approach as above, but that includes all 10 pathologies.
"""
from argparse import ArgumentParser
import json
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import torch
from eval_constants import CHEXPERT_TASKS, LOCALIZATION_TASKS
from utils import format_ci, parse_pkl_filename, run_linear_regression
def get_model_probability(map_dir):
    """
    Extract model's predicted probability per cxr and per pathology.

    For every task in LOCALIZATION_TASKS, recursively collects the pickle
    files under `map_dir` whose names end in `{task}_map.pkl` and reads the
    'prob' entry stored in each of them.

    Args:
        map_dir: directory that is searched recursively for pickle files.

    Returns:
        A pandas DataFrame with one column per localization task holding the
        extracted probabilities and an `img_id` column holding the sorted
        unique image ids. Rows are ordered by image id.

    Note:
        Building the DataFrame requires every task to contribute the same
        number of entries as there are unique image ids; pandas raises a
        ValueError otherwise.
    """
    prob_dict = {}
    # A set gives O(1) de-duplication; it is sorted once at the end.
    img_ids = set()
    for task in sorted(LOCALIZATION_TASKS):
        print(f'Extracting model probability for {task}')
        entries = []
        for pkl_path in sorted(Path(map_dir).rglob(f"*{task}_map.pkl")):
            # Only the image id is taken from the filename. The task parsed
            # from it is discarded so the loop variable is not overwritten
            # and the column is always keyed by the task being processed.
            _, img_id = parse_pkl_filename(pkl_path)

            # NOTE(security): pickle can execute arbitrary code while
            # loading; only point `map_dir` at trusted files.
            with open(pkl_path, 'rb') as pkl_file:
                info = pickle.load(pkl_file)

            pred_prob = info['prob']
            if torch.is_tensor(pred_prob) and pred_prob.size()[0] == 14:
                # A 14-element tensor is indexed by the position of the
                # pickle's own task name in CHEXPERT_TASKS.
                pred_prob = pred_prob[CHEXPERT_TASKS.index(info['task'])]

            entries.append((img_id, pred_prob))
            img_ids.add(img_id)

        # Order by image id (stable, so path order breaks ties) so that each
        # probability lines up with the sorted `img_id` column below even
        # when path order and id order differ.
        entries.sort(key=lambda entry: entry[0])
        prob_dict[task] = [pred_prob for _, pred_prob in entries]

    prob_df = pd.DataFrame.from_dict(prob_dict)
    prob_df['img_id'] = sorted(img_ids)
    return prob_df
def run_model_assurance_regression(args):
    """Run regression using model probability as the independent variable.

    Fits one regression per task in LOCALIZATION_TASKS plus an 'Overall'
    regression on the rows of all tasks stacked together, then writes the
    formatted summary to
    `{args.save_dir}/regression_modelprob_{args.metric}.csv`.

    Args:
        args: namespace providing
            pred_results: path to a csv with an `img_id` column and one
                column per localization task.
            map_dir: directory passed to `get_model_probability`.
            metric: name used for the dependent-variable column and in the
                output filename.
            save_dir: directory the summary csv is written to.

    Raises:
        KeyError: if an `img_id` in the csv has no extracted probability.
    """
    pred_results = pd.read_csv(args.pred_results)
    model_probs_df = get_model_probability(args.map_dir)
    y = args.metric

    # Align localization perf metrics and probabilities by looking up each
    # csv row's probabilities through its img_id. This keeps the pairing
    # correct regardless of the row order of the csv, and fails loudly if an
    # image has no probability instead of pairing rows by position.
    ids = pred_results['img_id'].tolist()
    prob_results = model_probs_df.set_index('img_id').loc[ids]

    summary_frames = [pd.DataFrame(columns=["lower", "mean", "upper",
                                            "coef_pval", "corr_lower", "corr",
                                            "corr_upper", "corr_pval",
                                            "feature", "task"])]
    regression_frames = []
    for task in sorted(LOCALIZATION_TASKS):
        # create regression data frame
        regression_df = pd.DataFrame({y: pred_results[task].values,
                                      'prob': prob_results[task].tolist()})
        regression_frames.append(regression_df)

        # run regression
        summary_frames.append(
            run_linear_regression(regression_df, task, y, 'prob'))

    # add overall regression on the stacked per-task rows
    overall_regression = pd.concat(regression_frames)
    summary_frames.append(
        run_linear_regression(overall_regression, 'Overall', y, 'prob'))

    # Concatenate once instead of growing the frames inside the loop.
    coef_summary = pd.concat(summary_frames)
    coef_summary = coef_summary.apply(format_ci,
                                      bonferroni_correction=1,
                                      axis=1)
    coef_summary = coef_summary[['task', 'n',
                                 'Linear regression coefficients',
                                 'Spearman correlations']]
    coef_summary.to_csv(f'{args.save_dir}/regression_modelprob_{y}.csv',
                        index=False)
if __name__ == "__main__":
    # Command-line entry point: parse the flags, validate the metric, then
    # run the model-probability regressions.
    parser = ArgumentParser()
    parser.add_argument('--metric', type=str,
                        help='options are: iou or hitmiss')
    parser.add_argument('--map_dir', type=str,
                        help='directory with pickle files containing heatmaps')
    # Adjacent literals are used instead of a backslash continuation inside
    # the string so that source indentation does not leak into the help text.
    parser.add_argument('--pred_results', type=str,
                        help='path to csv file with saliency method IoU or '
                             'hit/miss results for each CXR and each pathology.')
    parser.add_argument('--save_dir', type=str, default='.',
                        help='where to save regression results')
    args = parser.parse_args()

    # Validate explicitly: an `assert` is removed when Python runs with -O,
    # which would let an invalid or missing metric through.
    if args.metric not in ('iou', 'hitmiss'):
        parser.error("`metric` flag must be either `iou` or `hitmiss`")

    run_model_assurance_regression(args)