
Commit

Merge pull request #8 from rajpurkarlab/paper_code
Paper code
asaporta authored Aug 5, 2022
2 parents 14f83ae + ff8114b commit 71d6d38
Showing 16 changed files with 1,390 additions and 100 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -123,3 +123,5 @@ lightning_logs/

# sandbox
sandbox

.DS_Store
221 changes: 194 additions & 27 deletions README.md

Large diffs are not rendered by default.

94 changes: 94 additions & 0 deletions calculate_percentage_decrease.py
@@ -0,0 +1,94 @@
"""
Calculate the percentage decrease from human benchmark localization
performance to saliency method pipeline localization performance, along
with the 95% CIs.
"""
from argparse import ArgumentParser
import numpy as np
import pandas as pd

from eval import compute_cis, create_ci_record


def create_pct_diff_df(metric, pred_bootstrap_results, hb_bootstrap_results,
save_dir):
"""
Calculate percentage decrease per pathology from human benchmark
localization metric to saliency method pipeline localization metric,
and obtain 95% CI on the percentage decreases.
"""
# get 1000 bootstrap samples of IoU or hit/miss
pred_bs = pd.read_csv(pred_bootstrap_results)
hb_bs = pd.read_csv(hb_bootstrap_results)
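    # each CSV has one column per pathology and one row per bootstrap sample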

# use the percentage difference as the statistic;
# get the CI (2.5th and 97.5th percentile) on the percentage difference
pct_diff_bs = (hb_bs - pred_bs)/hb_bs
records = []
for task in pct_diff_bs.columns:
records.append(create_ci_record(pct_diff_bs[task], task))
pct_diff_ci = pd.DataFrame.from_records(records)

# create results df
pct_diff_df = pd.DataFrame()
pct_diff_df['hb'] = hb_bs.mean()
pct_diff_df['pred'] = pred_bs.mean()
    pct_diff_df['pct_diff'] = round(
        (pct_diff_df['hb'] - pct_diff_df['pred']) / pct_diff_df['hb'] * 100,
        3)
    pct_diff_df['pct_diff_lower'] = round(pct_diff_ci['lower'] * 100,
                                          3).tolist()
    pct_diff_df['pct_diff_upper'] = round(pct_diff_ci['upper'] * 100,
                                          3).tolist()
pct_diff_df = pct_diff_df.sort_values(['pct_diff'], ascending=False)

# calculate avg human benchmark and saliency method localization metric
avg_pred = round(pct_diff_df['pred'].mean(), 3)
avg_hb = round(pct_diff_df['hb'].mean(), 3)
avg_pct_diff = round((avg_hb-avg_pred)/avg_hb * 100, 3)

# find the 95% CI of the percentage difference between average saliency
# method and average human benchmark
# - get bootstrap sample of average saliency method localization metric
avg_pred_bs = pred_bs.mean(axis=1)
# - get bootstrap sample of average human benchmark localization metric
avg_hb_bs = hb_bs.mean(axis=1)
# - use pct diff between avg saliency and avg human benchmark as the
# statistic, and get the 2.5th and 97.5th percentile of the bootstrap
# distribution to create the CI
avg_bs_df = 100 * (avg_hb_bs - avg_pred_bs)/avg_hb_bs
    lower, mean, upper = compute_cis(avg_bs_df, confidence_level=0.05)

pct_diff_df.loc['Average'] = {'pred': avg_pred, 'hb': avg_hb,
'pct_diff': avg_pct_diff,
'pct_diff_lower': round(lower, 3),
'pct_diff_upper': round(upper, 3)}
print(pct_diff_df)
pct_diff_df.to_csv(f'{save_dir}/{metric}_pct_decrease.csv')


if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--metric', type=str,
help='options are: miou or hitrate')
parser.add_argument('--hb_bootstrap_results', type=str,
help='path to csv file with 1000 bootstrap samples of \
human benchmark IoU or hit/miss for each \
pathology')
parser.add_argument('--pred_bootstrap_results', type=str,
help='path to csv file with 1000 bootstrap samples of \
saliency method IoU or hit/miss for each \
pathology')
parser.add_argument('--save_dir', default='.',
help='where to save results')
parser.add_argument('--seed', type=int, default=0,
help='random seed to fix')
args = parser.parse_args()

assert args.metric in ['miou', 'hitrate'], \
"`metric` flag must be either `miou` or `hitrate`"

np.random.seed(args.seed)

create_pct_diff_df(args.metric, args.pred_bootstrap_results,
args.hb_bootstrap_results, args.save_dir)
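
Note: compute_cis and create_ci_record are imported from eval.py, which is not shown in this diff. Below is a minimal sketch of how they plausibly behave, inferred only from their call sites above; the names are real, but the bodies are assumptions, not the repository's actual implementation.

# Hypothetical sketch of the eval.py helpers used above, inferred from
# their call sites; the repository's actual implementation may differ.
def compute_cis(series, confidence_level=0.05):
    """Return (lower, mean, upper) of a bootstrap sample via percentiles."""
    lower = series.quantile(confidence_level / 2)      # 2.5th percentile
    upper = series.quantile(1 - confidence_level / 2)  # 97.5th percentile
    return lower, series.mean(), upper

def create_ci_record(series, task):
    """Return one per-pathology CI row as a dict."""
    lower, mean, upper = compute_cis(series)
    return {'task': task, 'lower': lower, 'mean': mean, 'upper': upper}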
121 changes: 121 additions & 0 deletions compute_pathology_features.py
@@ -0,0 +1,121 @@
"""
Compute four pathological features:

(1) number of instances (for example, bilateral Pleural Effusion would
    have two instances, whereas Cardiomegaly has only one),
(2) size (pathology area with respect to the area of the whole CXR),
(3) elongation, and
(4) irrectangularity (the last two features measure the complexity of
    the pathology shape and are calculated by fitting a rectangle of
    minimum area enclosing the binary mask).

Note that we use the ground-truth annotations to extract the number of
instances, and the ground-truth segmentation masks to calculate area,
elongation and irrectangularity. We extract the number of instances from
annotations because radiologists sometimes draw two overlapping instances
of a pathology; in that case, the number of annotations would be 2, but
the number of segmentations would be 1.
"""
from argparse import ArgumentParser
import cv2
import json
import numpy as np
import pandas as pd
from pycocotools import mask

from eval_constants import LOCALIZATION_TASKS


def get_geometric_features(segm):
    """
    Given a binary segmentation mask, return geometric features.

    Args:
        segm (np.array): the binary segmentation mask

    Returns:
        tuple: (n_instance, area_ratio, elongation, rec_area_ratio)
    """
    # find contours of the binary mask
    contours, _ = cv2.findContours(segm.copy(), cv2.RETR_LIST,
                                   cv2.CHAIN_APPROX_NONE)

# get number of instances and area
n_instance = len(contours)
area_ratio = np.sum(segm) / (segm.shape[0] * segm.shape[1])

    # use the longest contour to calculate geometric features
max_idx = np.argmax([len(contour) for contour in contours])
cnt = contours[max_idx]

rect = cv2.minAreaRect(cnt)
(x, y), (w, h), a = rect

instance_area = cv2.contourArea(cnt)
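    # elongation = aspect ratio of the min-area rectangle; rec_area_ratio =
    # fraction of that rectangle covered by the contour (lower means a less
    # rectangular, i.e. more "irrectangular", shape)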
elongation = max(w, h) / min(w, h)
rec_area_ratio = instance_area / (w * h)

return n_instance, area_ratio, elongation, rec_area_ratio


def main(args):
# load ground-truth annotations (needed to extract number of instances)
# and ground-truth segmentations
with open(args.gt_ann) as f:
gt_ann = json.load(f)
with open(args.gt_seg) as f:
gt_seg = json.load(f)

# extract features from all cxrs with at least one pathology
all_instances = {}
all_areas = {}
all_elongations = {}
all_rec_area_ratios = {}
all_ids = sorted(gt_ann.keys())
pos_ids = sorted(gt_seg.keys())
for task in sorted(LOCALIZATION_TASKS):
print(task)
n_instances = []
areas = []
elongations = []
rec_area_ratios = []
for img_id in all_ids:
n_instance = 0
area = 0
elongation = np.nan
rec_area_ratio = np.nan
# calculate features for cxr with a pathology segmentation
if img_id in pos_ids:
gt_item = gt_seg[img_id][task]
gt_mask = mask.decode(gt_item)
if np.sum(gt_mask) > 0:
# use annotation to get number of instances
n_instance = len(gt_ann[img_id][task]) \
if task in gt_ann[img_id] else 0
# use segmentation to get other features
n_instance_segm, area, elongation, rec_area_ratio = \
get_geometric_features(gt_mask)
n_instances.append(n_instance)
areas.append(area)
elongations.append(elongation)
rec_area_ratios.append(rec_area_ratio)
all_instances[task] = n_instances
all_areas[task] = areas
all_elongations[task] = elongations
all_rec_area_ratios[task] = rec_area_ratios

instance_df = pd.DataFrame(all_instances)
area_df = pd.DataFrame(all_areas)
elongation_df = pd.DataFrame(all_elongations)
rec_area_ratio_df = pd.DataFrame(all_rec_area_ratios)

instance_df['img_id'] = all_ids
area_df['img_id'] = all_ids
elongation_df['img_id'] = all_ids
rec_area_ratio_df['img_id'] = all_ids

instance_df.to_csv(f'{args.save_dir}/num_instances.csv', index=False)
area_df.to_csv(f'{args.save_dir}/area_ratio.csv', index=False)
elongation_df.to_csv(f'{args.save_dir}/elongation.csv', index=False)
rec_area_ratio_df.to_csv(f'{args.save_dir}/rec_area_ratio.csv', index=False)


if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--gt_ann', type=str,
help='path to json file with raw ground-truth annotations')
parser.add_argument('--gt_seg', type=str,
help='path to json file with ground-truth segmentations \
(encoded)')
parser.add_argument('--save_dir', default='.',
help='where to save feature dataframes')
args = parser.parse_args()
main(args)
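
To make the geometry concrete, here is a small, self-contained example (illustrative only, not part of this commit) that runs the same OpenCV calls as get_geometric_features on a synthetic rectangular mask; elongation should come out near 3 and the rectangle-area ratio near 1.

# Illustrative only: exercise the same OpenCV calls as
# get_geometric_features on a synthetic 20x60 rectangular mask.
import cv2
import numpy as np

segm = np.zeros((100, 100), dtype=np.uint8)
segm[40:60, 20:80] = 1  # one rectangular "pathology", 20 rows x 60 cols

contours, _ = cv2.findContours(segm.copy(), cv2.RETR_LIST,
                               cv2.CHAIN_APPROX_NONE)
cnt = max(contours, key=len)  # longest contour
(_, _), (w, h), _ = cv2.minAreaRect(cnt)

elongation = max(w, h) / min(w, h)               # ~3 for a 20x60 box
rec_area_ratio = cv2.contourArea(cnt) / (w * h)  # ~1: box fills its rectangle
print(round(elongation, 2), round(rec_area_ratio, 2))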
44 changes: 44 additions & 0 deletions count_segs.py
@@ -0,0 +1,44 @@
from argparse import ArgumentParser
import json
import numpy as np
import pandas as pd
from pycocotools import mask

from eval_constants import LOCALIZATION_TASKS

def main(args):
"""
For each pathology, count the number of CXRs with at least one segmentation.
"""
with open(args.seg_path) as f:
seg_dict = json.load(f)

cxr_ids = sorted(seg_dict.keys())
segmentation_label = {}
for task in sorted(LOCALIZATION_TASKS):
print(task)
has_seg = []
for cxr_id in cxr_ids:
seg_item = seg_dict[cxr_id][task]
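            # masks are stored as COCO run-length encodings; decode to binary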
seg_mask = mask.decode(seg_item)
            has_segmentation = int(np.sum(seg_mask) > 0)
has_seg.append(has_segmentation)
segmentation_label[task] = has_seg

df = pd.DataFrame.from_dict(segmentation_label)
n_cxr_per_pathology = df.sum()
print(n_cxr_per_pathology)
n_cxr_per_pathology.to_csv(f'{args.save_dir}/n_segs.csv')

if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument('--seg_path', type=str,
help='json file path where segmentations are saved \
(encoded)')
parser.add_argument('--save_dir', default='.',
help='where to save results')
args = parser.parse_args()
main(args)
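
The segmentation JSONs read above store masks as COCO run-length encodings (RLE). Below is a minimal round-trip sketch of that format with pycocotools; the toy mask is invented for illustration, but encode/decode are the actual pycocotools API.

# Illustrative RLE round trip with pycocotools (not part of this commit).
import numpy as np
from pycocotools import mask

binary = np.zeros((4, 4), dtype=np.uint8)
binary[1:3, 1:3] = 1

# encode() requires a Fortran-contiguous uint8 array and returns a dict
# with 'size' and 'counts' -- the structure stored in the seg json
encoded = mask.encode(np.asfortranarray(binary))
decoded = mask.decode(encoded)

assert (decoded == binary).all()
print(encoded['size'])  # [4, 4]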