add pert signal benchmarks
safiyecelik committed Aug 6, 2024
1 parent 51ee0a5 commit dcedec9
Showing 2 changed files with 85 additions and 53 deletions.
10 changes: 6 additions & 4 deletions README.md
@@ -30,11 +30,13 @@ pip install efaar_benchmarking

## Usage guidance:

First, run `notebooks/map_building_benchmarking.ipynb` for GWPS, JUMP, and PERISCOPE individually. This process will build each of these maps and report the perturbation signal and biological relationship benchmarks. Afterwards, run `notebooks/map_evaluation_comparison.ipynb` to explore the constructed maps using the methods presented in our paper. In order for the latter notebook to work, make sure to set the `save_results` parameter to True in the former notebook.
First, run `notebooks/map_building_benchmarking.ipynb` for GWPS, cpg0016, and cpg0021 individually. This builds each of these maps and reports the perturbation signal and biological relationship benchmarks. Afterwards, run `notebooks/map_evaluation_comparison.ipynb` to explore the constructed maps using the methods presented in our paper. For the latter notebook to work, set the `save_results` parameter to `True` in the former notebook.
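
If you prefer to execute the map-building notebook non-interactively, a minimal sketch using papermill is shown below (papermill is not a dependency of this repo; remember to set `save_results = True` inside the notebook first so the map files are written for the comparison notebook):

```python
# Hypothetical driver, assuming papermill is installed (pip install papermill).
# Set save_results = True inside the notebook beforehand so that the map files
# are written to notebooks/data for map_evaluation_comparison.ipynb.
import papermill as pm

pm.execute_notebook(
    "notebooks/map_building_benchmarking.ipynb",
    "notebooks/map_building_benchmarking.out.ipynb",  # executed copy with outputs
)
```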

We've uploaded the 128-dimensional TVN maps we constructed for GWPS, JUMP, and PERISCOPE to the `notebooks/data` directory. So, for convenience, one can run `notebooks/map_evaluation_comparison.ipynb` directly on these uploaded map files if they wish to explore the maps further without running `notebooks/map_building_benchmarking.ipynb`. See `notebooks/data/LICENSE` for terms of use for each dataset.
RPIE CNNBC embeddings are available as separate parquet files per plate in the embeddings.tar file, downloadable from https://rxrx3.rxrx.ai/downloads. It's important to note that in the rxrx3 data, all but ~733 genes are anonymized.
Note that you have to install GitHub LFS (Large File Storage) to properly utilize the files under `notebooks/data` and `efaar_benchmarking/expression_data` in your local clone of the repository.
We've uploaded the 128-dimensional PCA-TVN maps we constructed for GWPS, cpg0016, and cpg0021 to the `notebooks/data` directory. For convenience, you can run `notebooks/map_evaluation_comparison.ipynb` directly on these uploaded map files to explore the maps further without running `notebooks/map_building_benchmarking.ipynb`. See `notebooks/data/LICENSE` for the terms of use for each dataset.
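
As a rough illustration, these uploaded files are the pickles written by the map-building notebook, so they can be loaded directly; the sketch below assumes the `{dataset}_map_cache.pkl` / `{dataset}_metadata.pkl` naming used in that notebook and a hypothetical dataset label:

```python
# Minimal sketch: load one of the prebuilt 128-dimensional PCA-TVN maps.
# File names mirror what map_building_benchmarking.ipynb saves; the exact
# dataset labels are whatever the `dataset` variable is set to in that notebook.
import pickle

dataset = "gwps"  # hypothetical label; check the actual file names under notebooks/data
with open(f"notebooks/data/{dataset}_map_cache.pkl", "rb") as f:
    map_data = pickle.load(f)
with open(f"notebooks/data/{dataset}_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)
```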

RxRx3 embeddings are available as separate parquet files per plate in the embeddings.tar file, downloadable from https://rxrx3.rxrx.ai/downloads. Note that in this data, all but 733 genes are anonymized.
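
For example, a hedged sketch of assembling the per-plate parquet files after extracting embeddings.tar (the extraction directory and internal layout are assumptions; check the actual archive structure):

```python
# Sketch only: concatenate per-plate RxRx3 embedding parquet files into one frame.
# "embeddings/" is an assumed extraction directory for embeddings.tar.
from pathlib import Path
import pandas as pd

plate_files = sorted(Path("embeddings").rglob("*.parquet"))
embeddings = pd.concat((pd.read_parquet(p) for p in plate_files), ignore_index=True)
print(embeddings.shape)
```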

You must install Git LFS (Large File Storage) to properly use the files under `notebooks/data` and `efaar_benchmarking/expression_data` in your local clone of the repository.

## Contribution guidance:

128 changes: 79 additions & 49 deletions notebooks/map_building_benchmarking.ipynb
@@ -20,7 +20,10 @@
"from efaar_benchmarking.plotting import *\n",
"import pickle\n",
"\n",
"pc_count = 128"
"pc_count = 128\n",
"save_results = False # Results already uploaded to the notebooks/data folder in the repo. If True, will replace these files.\n",
"pert_signal_pval_cutoff = 0.05\n",
"recall_thr_pairs = [(.05, .95)]"
]
},
{
@@ -42,10 +45,12 @@
"control_key = \"non-targeting\"\n",
"all_controls = [\"non-targeting\"]\n",
"\n",
"# Load the perturbation dataset\n",
"adata_raw = load_gwps(\"raw\")\n",
"print(\"Perturbation dataset loaded\")\n",
"metadata = adata_raw.obs\n",
"\n",
"# Run EFAAR pipelines\n",
"all_embeddings_pre_agg = {}\n",
"print(\"Running for embedding size\", pc_count)\n",
"all_embeddings_pre_agg[f\"scVI{pc_count}\"] = embed_by_scvi_anndata(adata_raw, batch_col=gem_group_colname, n_latent=pc_count, n_hidden=pc_count*2)\n",
@@ -61,22 +66,21 @@
"all_embeddings_pre_agg[f\"PCA{pc_count}-TVN\"] = tvn_on_controls(all_embeddings_pre_agg[f\"PCA{pc_count}\"], metadata, pert_col=pert_colname, control_key=control_key, batch_col=gem_group_colname)\n",
"print(\"tvn completed\")\n",
"\n",
"array_list = []\n",
"# Run biological relationship benchmarks\n",
"for k, emb in all_embeddings_pre_agg.items():\n",
" print(k)\n",
" print(\"Aggregating...\")\n",
" map_data = aggregate(emb, metadata, pert_col=pert_colname, keys_to_remove=all_controls)\n",
" print(\"Computing recall...\")\n",
" metrics = known_relationship_benchmark(map_data, recall_thr_pairs=[(.05, .95)], pert_col=pert_colname)\n",
" array_list.append((metrics.groupby(\"source\")[\"recall_0.05_0.95\"].mean() * 100).round(1).values)\n",
"res = np.vstack(array_list)\n",
"formatted_result = np.array2string(res, separator=\", \")\n",
"print(formatted_result)\n",
"\n",
"with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n",
" pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n",
"\n",
"with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n",
" pickle.dump(metadata, f) # storing the metadata for downstream analysis"
" metrics = known_relationship_benchmark(map_data, recall_thr_pairs=recall_thr_pairs, pert_col=pert_colname)\n",
" print(metrics[list(metrics.columns)[::-1]])\n",
"\n",
"# Save results\n",
"if save_results:\n",
" with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n",
" pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n",
" with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n",
" pickle.dump(metadata, f) # storing the metadata for downstream analysis"
]
},
{
@@ -99,10 +103,18 @@
"control_key = \"non-targeting\"\n",
"all_controls = [\"non-targeting\", \"no-guide\"]\n",
"\n",
"# Load the perturbation dataset and expression information\n",
"features, metadata = load_cpg16_crispr()\n",
"print(\"Perturbation dataset loaded\")\n",
"features, metadata = filter_cell_profiler_features(features, metadata)\n",
"\n",
"expression_data_folder = \"../efaar_benchmarking/expression_data\"\n",
"expr = pd.read_csv(f\"{expression_data_folder}/U2OS_expression.csv\", index_col=0).groupby(\"gene\").zfpkm.agg(\"median\").reset_index()\n",
"unexpr_genes = list(expr.loc[expr.zfpkm < -3, \"gene\"])\n",
"expr_genes = list(expr.loc[expr.zfpkm >= -3, \"gene\"])\n",
"expr_ind = metadata[pert_colname].isin(expr_genes + [control_key])\n",
"\n",
"# Run EFAAR pipelines\n",
"all_embeddings_pre_agg = {}\n",
"print(\"Computing PCA embedding for\", pc_count, \"dimensions...\")\n",
"all_embeddings_pre_agg[f\"PCA{pc_count}\"] = embed_by_pca(features.values, metadata, variance_or_ncomp=pc_count, batch_col=plate_colname)\n",
@@ -111,28 +123,29 @@
"print(\"Computing TVN...\")\n",
"all_embeddings_pre_agg[f\"PCA{pc_count}-TVN\"] = tvn_on_controls(all_embeddings_pre_agg[f\"PCA{pc_count}\"], metadata, pert_col=pert_colname, control_key=control_key, batch_col=run_colname)\n",
"\n",
"expression_data_folder = \"../efaar_benchmarking/expression_data\"\n",
"expr = pd.read_csv(f\"{expression_data_folder}/U2OS_expression.csv\", index_col=0).groupby(\"gene\").zfpkm.agg(\"median\").reset_index()\n",
"unexpr_genes = list(expr.loc[expr.zfpkm < -3, \"gene\"])\n",
"expr_genes = list(expr.loc[expr.zfpkm >= -3, \"gene\"])\n",
"ind = metadata[pert_colname].isin(expr_genes + [control_key])\n",
"# Run perturbation signal benchmarks\n",
"for k, emb in all_embeddings_pre_agg.items():\n",
" cons_res = pert_signal_consistency_benchmark(emb, metadata, pert_col=pert_colname, neg_ctrl_perts=unexpr_genes, keys_to_drop=all_controls)\n",
" print(k, round(sum(cons_res.pval <= pert_signal_pval_cutoff) / sum(~pd.isna(cons_res.pval)) * 100, 1))\n",
"\n",
"array_list = []\n",
" magn_res = pert_signal_distance_benchmark(emb, metadata, pert_col=pert_colname, neg_ctrl_perts=unexpr_genes, control_key=control_key, keys_to_drop=[x for x in all_controls if x!=control_key])\n",
" print(k, round(sum(magn_res.pval <= pert_signal_pval_cutoff) / sum(~pd.isna(magn_res.pval)) * 100, 1))\n",
"\n",
"# Run biological relationship benchmarks\n",
"for k, emb in all_embeddings_pre_agg.items():\n",
" print(k)\n",
" print(\"Aggregating...\")\n",
" map_data = aggregate(emb[ind], metadata[ind], pert_col=pert_colname, keys_to_remove=all_controls)\n",
" map_data = aggregate(emb[expr_ind], metadata[expr_ind], pert_col=pert_colname, keys_to_remove=all_controls)\n",
" print(\"Computing recall...\")\n",
" metrics = known_relationship_benchmark(map_data, recall_thr_pairs=[(.05, .95)], pert_col=pert_colname)\n",
" array_list.append((metrics.groupby(\"source\")[\"recall_0.05_0.95\"].mean() * 100).round(1).values)\n",
"res = np.vstack(array_list)\n",
"formatted_result = np.array2string(res, separator=\", \")\n",
"print(formatted_result)\n",
"\n",
"with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n",
" pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n",
"\n",
"with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n",
" pickle.dump(metadata, f) # storing the metadata for downstream analysis"
" print(metrics[list(metrics.columns)[::-1]])\n",
"\n",
"# Save results\n",
"if save_results:\n",
" with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n",
" pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n",
" with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n",
" pickle.dump(metadata, f) # storing the metadata for downstream analysis"
]
},
{
@@ -154,9 +167,19 @@
"control_key = \"nontargeting\"\n",
"all_controls = [\"nontargeting\", \"negCtrl\"]\n",
"\n",
"# Load the perturbation dataset and expression information\n",
"features, metadata = load_periscope()\n",
"print(\"Perturbation dataset loaded\")\n",
"\n",
"expression_data_folder = \"../efaar_benchmarking/expression_data\"\n",
"expr = pd.read_csv(f\"{expression_data_folder}/HeLa_expression.csv\") # note that we assume the HeLa expression data was used for PERISCOPE which is the default option in load_periscope()\n",
"expr.columns = [\"gene\", \"tpm\"]\n",
"expr.gene = expr.gene.apply(lambda x: x.split(\" \")[0])\n",
"unexpr_genes = list(expr.loc[expr.tpm == 0, \"gene\"])\n",
"expr_genes = list(expr.loc[expr.tpm > 0, \"gene\"])\n",
"expr_ind = metadata[pert_colname].isin(expr_genes + [control_key])\n",
"\n",
"# Run EFAAR pipelines\n",
"all_embeddings_pre_agg = {}\n",
"print(\"Computing PCA embedding for\", pc_count, \"dimensions...\")\n",
"all_embeddings_pre_agg[f\"PCA{pc_count}\"] = embed_by_pca(features.values, metadata, variance_or_ncomp=pc_count, batch_col=plate_colname)\n",
@@ -165,30 +188,29 @@
"print(\"Computing TVN...\")\n",
"all_embeddings_pre_agg[f\"PCA{pc_count}-TVN\"] = tvn_on_controls(all_embeddings_pre_agg[f\"PCA{pc_count}\"], metadata, pert_col=pert_colname, control_key=control_key, batch_col=plate_colname)\n",
"\n",
"expression_data_folder = \"../efaar_benchmarking/expression_data\"\n",
"expr = pd.read_csv(f\"{expression_data_folder}/HeLa_expression.csv\") # note that we assume the HeLa expression data was used for PERISCOPE which is the default option in load_periscope()\n",
"expr.columns = [\"gene\", \"tpm\"]\n",
"expr.gene = expr.gene.apply(lambda x: x.split(\" \")[0])\n",
"unexpr_genes = list(expr.loc[expr.tpm == 0, \"gene\"])\n",
"expr_genes = list(expr.loc[expr.tpm > 0, \"gene\"])\n",
"ind = metadata[pert_colname].isin(expr_genes + [control_key])\n",
"# Run perturbation signal benchmarks\n",
"for k, emb in all_embeddings_pre_agg.items():\n",
" cons_res = pert_signal_consistency_benchmark(emb, metadata, pert_col=pert_colname, neg_ctrl_perts=unexpr_genes, keys_to_drop=all_controls)\n",
" print(k, round(sum(cons_res.pval <= pert_signal_pval_cutoff) / sum(~pd.isna(cons_res.pval)) * 100, 1))\n",
"\n",
" magn_res = pert_signal_distance_benchmark(emb, metadata, pert_col=pert_colname, neg_ctrl_perts=unexpr_genes, control_key=control_key, keys_to_drop=[x for x in all_controls if x!=control_key])\n",
" print(k, round(sum(magn_res.pval <= pert_signal_pval_cutoff) / sum(~pd.isna(magn_res.pval)) * 100, 1))\n",
"\n",
"array_list = []\n",
"# Run biological relationship benchmarks\n",
"for k, emb in all_embeddings_pre_agg.items():\n",
" print(k)\n",
" print(\"Aggregating...\")\n",
" map_data = aggregate(emb[ind], metadata[ind], pert_col=pert_colname, keys_to_remove=all_controls)\n",
" map_data = aggregate(emb[expr_ind], metadata[expr_ind], pert_col=pert_colname, keys_to_remove=all_controls)\n",
" print(\"Computing recall...\")\n",
" metrics = known_relationship_benchmark(map_data, recall_thr_pairs=[(.05, .95)], pert_col=pert_colname)\n",
" array_list.append((metrics.groupby(\"source\")[\"recall_0.05_0.95\"].mean() * 100).round(1).values)\n",
"res = np.vstack(array_list)\n",
"formatted_result = np.array2string(res, separator=\", \")\n",
"print(formatted_result)\n",
"\n",
"with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n",
" pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n",
"\n",
"with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n",
" pickle.dump(metadata, f) # storing the metadata for downstream analysis"
" print(metrics[list(metrics.columns)[::-1]])\n",
"\n",
"# Save results\n",
"if save_results:\n",
" with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n",
" pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n",
" with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n",
" pickle.dump(metadata, f) # storing the metadata for downstream analysis"
]
}
],
@@ -199,7 +221,15 @@
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
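
The perturbation signal cells added in this commit report, for each embedding, the percentage of perturbations whose benchmark p-value is at or below `pert_signal_pval_cutoff`, computed over non-NaN p-values. A standalone restatement of that inline summary (hypothetical helper name; `res` stands for the dataframe returned by `pert_signal_consistency_benchmark` or `pert_signal_distance_benchmark`):

```python
# Hypothetical helper mirroring the notebook's inline summary of the
# perturbation signal benchmark results (a dataframe with a `pval` column).
import pandas as pd

def pct_significant(res: pd.DataFrame, pval_cutoff: float = 0.05) -> float:
    """Percent of perturbations with p-value <= cutoff, ignoring NaN p-values."""
    valid = res.pval.notna()
    return round((res.pval[valid] <= pval_cutoff).sum() / valid.sum() * 100, 1)
```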
