Merge pull request #8 from rajpurkarlab/paper_code

Paper code
rajpurkarlab · Aug 5, 2022 · 71d6d38 · 71d6d38
2 parents 14f83ae + ff8114b
commit 71d6d38
Show file tree

Hide file tree

Showing 16 changed files with 1,390 additions and 100 deletions.
diff --git a/.gitignore b/.gitignore
@@ -123,3 +123,5 @@ lightning_logs/
 
 # sandbox
 sandbox
+
+.DS_Store
diff --git a/README.md b/README.md
diff --git a/calculate_percentage_decrease.py b/calculate_percentage_decrease.py
@@ -0,0 +1,94 @@
+"""
+Calculate the percentage decrease from human benchmark localization
+performance to saliency method pipeline localization performance, along
+with the 95% CIs.
+"""
+from argparse import ArgumentParser
+import numpy as np
+import pandas as pd
+
+from eval import compute_cis, create_ci_record
+
+
+def create_pct_diff_df(metric, pred_bootstrap_results, hb_bootstrap_results,
+                       save_dir):
+    """
+    Calculate percentage decrease per pathology from human benchmark
+    localization metric to saliency method pipeline localization metric,
+    and obtain 95% CI on the percentage decreases.
+    """
+    # get 1000 bootstrap samples of IoU or hit/miss
+    pred_bs = pd.read_csv(pred_bootstrap_results)
+    hb_bs = pd.read_csv(hb_bootstrap_results)
+
+    # use the percentage difference as the statistic;
+    # get the CI (2.5th and 97.5th percentile) on the percentage difference
+    pct_diff_bs = (hb_bs - pred_bs)/hb_bs
+    records = []
+    for task in pct_diff_bs.columns:
+        records.append(create_ci_record(pct_diff_bs[task], task))
+    pct_diff_ci = pd.DataFrame.from_records(records)
+
+    # create results df
+    pct_diff_df = pd.DataFrame()
+    pct_diff_df['hb'] = hb_bs.mean()
+    pct_diff_df['pred'] = pred_bs.mean()
+    pct_diff_df['pct_diff'] = round(
+            (pct_diff_df['hb']-pct_diff_df['pred'])/pct_diff_df['hb']*100,
+            3)
+    pct_diff_df['pct_diff_lower'] = round(pct_diff_ci['lower'] * 100, 3).\
+                                        tolist()
+    pct_diff_df['pct_diff_upper'] = round(pct_diff_ci['upper'] * 100, 3).\
+                                        tolist()
+    pct_diff_df = pct_diff_df.sort_values(['pct_diff'], ascending=False)
+
+    # calculate avg human benchmark and saliency method localization metric
+    avg_pred = round(pct_diff_df['pred'].mean(), 3)
+    avg_hb = round(pct_diff_df['hb'].mean(), 3)
+    avg_pct_diff = round((avg_hb-avg_pred)/avg_hb * 100, 3)
+
+    # find the 95% CI of the percentage difference between average saliency
+    # method and average human benchmark
+    #   - get bootstrap sample of average saliency method localization metric
+    avg_pred_bs = pred_bs.mean(axis=1)
+    #   - get bootstrap sample of average human benchmark localization metric
+    avg_hb_bs = hb_bs.mean(axis=1)
+    #   - use pct diff between avg saliency and avg human benchmark as the
+    #     statistic, and get the 2.5th and 97.5th percentile of the bootstrap
+    #     distribution to create the CI
+    avg_bs_df = 100 * (avg_hb_bs - avg_pred_bs)/avg_hb_bs
+    lower, mean, upper = compute_cis(avg_bs_df, confidence_level = 0.05)
+
+    pct_diff_df.loc['Average'] = {'pred': avg_pred, 'hb': avg_hb,
+                                  'pct_diff': avg_pct_diff,
+                                  'pct_diff_lower': round(lower, 3),
+                                  'pct_diff_upper': round(upper, 3)}
+    print(pct_diff_df)
+    pct_diff_df.to_csv(f'{save_dir}/{metric}_pct_decrease.csv')
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--metric', type=str,
+                        help='options are: miou or hitrate')
+    parser.add_argument('--hb_bootstrap_results', type=str,
+                        help='path to csv file with 1000 bootstrap samples of \
+                              human benchmark IoU or hit/miss for each \
+                              pathology')
+    parser.add_argument('--pred_bootstrap_results', type=str,
+                        help='path to csv file with 1000 bootstrap samples of \
+                              saliency method IoU or hit/miss for each \
+                              pathology')
+    parser.add_argument('--save_dir', default='.',
+                        help='where to save results')
+    parser.add_argument('--seed', type=int, default=0,
+                        help='random seed to fix')
+    args = parser.parse_args()
+
+    assert args.metric in ['miou', 'hitrate'], \
+        "`metric` flag must be either `miou` or `hitrate`"
+
+    np.random.seed(args.seed)
+
+    create_pct_diff_df(args.metric, args.pred_bootstrap_results,
+                       args.hb_bootstrap_results, args.save_dir)
diff --git a/compute_pathology_features.py b/compute_pathology_features.py
@@ -0,0 +1,121 @@
+"""
+Compute four pathological features: (1) number of instances (for example, bilateral Pleural Effusion would have two instances, whereas there is only one instance for Cardiomegaly), (2) size (pathology area with respect to the area of the whole CXR), (3) elongation and (4) irrectangularity (the last two features measure the complexity of the pathology shape and were calculated by fitting a rectangle of minimum area enclosing the binary mask).
+
+Note that we use the ground-truth annotations to extract the number of instances, and we use the ground-truth segmentation masks to calculate area, elongation and rectangularity. We chose to extract number of instances from annotations because sometimes radiologists draw two instances for a pathology that are overlapping; in this case, the number of annotations would be 2, but the number of segmentations would be 1.
+"""
+from argparse import ArgumentParser
+import cv2
+import glob
+import json
+import numpy as np
+import pandas as pd
+import pickle
+from pycocotools import mask
+
+from eval_constants import LOCALIZATION_TASKS
+
+
+def get_geometric_features(segm):
+    """
+    Given a segmentation mask, return geometric features.
+
+    Args:
+        segm (np.array): the binary segmentation mask
+    """
+    # load segmentation
+    rgb_img = cv2.cvtColor(255 * segm, cv2.COLOR_GRAY2RGB)
+
+    # find contours
+    contours, _ = cv2.findContours(segm.copy(), 1, 1)
+
+    # get number of instances and area
+    n_instance = len(contours)
+    area_ratio = np.sum(segm) / (segm.shape[0] * segm.shape[1])
+
+    # use the longest coutour to calculate geometric features
+    max_idx = np.argmax([len(contour) for contour in contours])
+    cnt = contours[max_idx]
+
+    rect = cv2.minAreaRect(cnt)
+    (x, y), (w, h), a = rect
+
+    instance_area = cv2.contourArea(cnt)
+    elongation = max(w, h) / min(w, h)
+    rec_area_ratio = instance_area / (w * h)
+
+    return n_instance, area_ratio, elongation, rec_area_ratio
+
+
+def main(args):
+    # load ground-truth annotations (needed to extract number of instances)
+    # and ground-truth segmentations
+    with open(args.gt_ann) as f:
+        gt_ann = json.load(f)
+    with open(args.gt_seg) as f:
+        gt_seg = json.load(f)
+
+    # extract features from all cxrs with at least one pathology
+    all_instances = {}
+    all_areas = {}
+    all_elongations = {}
+    all_rec_area_ratios = {}
+    all_ids = sorted(gt_ann.keys())
+    pos_ids = sorted(gt_seg.keys())
+    for task in sorted(LOCALIZATION_TASKS):
+        print(task)
+        n_instances = []
+        areas = []
+        elongations = []
+        rec_area_ratios = []
+        for img_id in all_ids:
+            n_instance = 0
+            area = 0
+            elongation = np.nan
+            rec_area_ratio = np.nan
+            # calculate features for cxr with a pathology segmentation
+            if img_id in pos_ids:
+                gt_item = gt_seg[img_id][task]
+                gt_mask = mask.decode(gt_item)
+                if np.sum(gt_mask) > 0:
+                    # use annotation to get number of instances
+                    n_instance = len(gt_ann[img_id][task]) \
+                            if task in gt_ann[img_id] else 0
+                    # use segmentation to get other features
+                    n_instance_segm, area, elongation, rec_area_ratio = \
+                            get_geometric_features(gt_mask)
+            n_instances.append(n_instance)
+            areas.append(area)
+            elongations.append(elongation)
+            rec_area_ratios.append(rec_area_ratio)
+        all_instances[task] = n_instances
+        all_areas[task] = areas
+        all_elongations[task] = elongations
+        all_rec_area_ratios[task] = rec_area_ratios
+
+    instance_df = pd.DataFrame(all_instances)
+    area_df = pd.DataFrame(all_areas)
+    elongation_df = pd.DataFrame(all_elongations)
+    rec_area_ratio_df = pd.DataFrame(all_rec_area_ratios)
+
+    instance_df['img_id'] = all_ids
+    area_df['img_id'] = all_ids
+    elongation_df['img_id'] = all_ids
+    rec_area_ratio_df['img_id'] = all_ids
+
+    instance_df.to_csv(f'{args.save_dir}/num_instances.csv', index=False)
+    area_df.to_csv(f'{args.save_dir}/area_ratio.csv', index=False)
+    elongation_df.to_csv(f'{args.save_dir}/elongation.csv', index=False)
+    rec_area_ratio_df.to_csv(f'{args.save_dir}/rec_area_ratio.csv', index=False)
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--gt_ann', type=str,
+                        help='path to json file with raw ground-truth annotations')
+    parser.add_argument('--gt_seg', type=str,
+                        help='path to json file with ground-truth segmentations \
+                              (encoded)')
+    parser.add_argument('--save_dir', default='.',
+                        help='where to save feature dataframes')
+    args = parser.parse_args()
+    main(args)
diff --git a/count_segs.py b/count_segs.py
@@ -0,0 +1,44 @@
+from argparse import ArgumentParser
+import json
+import numpy as np
+import pandas as pd
+from pycocotools import mask
+
+from eval_constants import LOCALIZATION_TASKS
+
+def main(args):
+    """
+    For each pathology, count the number of CXRs with at least one segmentation.
+    """
+    with open(args.seg_path) as f:
+        seg_dict = json.load(f)
+
+    cxr_ids = sorted(seg_dict.keys())
+    segmentation_label = {}
+    for task in sorted(LOCALIZATION_TASKS):
+        print(task)
+        has_seg = []
+        for cxr_id in cxr_ids:
+            seg_item = seg_dict[cxr_id][task]
+            seg_mask = mask.decode(seg_item)
+            if np.sum(seg_mask) == 0:
+                has_segmentation = 0
+            else:
+                has_segmentation = 1
+            has_seg.append(has_segmentation)
+        segmentation_label[task] = has_seg
+
+    df = pd.DataFrame.from_dict(segmentation_label)
+    n_cxr_per_pathology = df.sum()
+    print(n_cxr_per_pathology)
+    n_cxr_per_pathology.to_csv(f'{args.save_dir}/n_segs.csv')
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument('--seg_path', type=str,
+                        help='json file path where segmentations are saved \
+                              (encoded)')
+    parser.add_argument('--save_dir', default='.',
+                        help='where to save results')
+    args = parser.parse_args()
+    main(args)