diff --git a/README.md b/README.md index 31c44da..2e57123 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,13 @@ pip install efaar_benchmarking ## Usage guidance: -First, run `notebooks/map_building_benchmarking.ipynb` for GWPS, JUMP, and PERISCOPE individually. This process will build each of these maps and report the perturbation signal and biological relationship benchmarks. Afterwards, run `notebooks/map_evaluation_comparison.ipynb` to explore the constructed maps using the methods presented in our paper. In order for the latter notebook to work, make sure to set the `save_results` parameter to True in the former notebook. +First, run `notebooks/map_building_benchmarking.ipynb` for GWPS, cpg0016, and cpg0021 individually. This process will build each of these maps and report the perturbation signal and biological relationship benchmarks. Afterwards, run `notebooks/map_evaluation_comparison.ipynb` to explore the constructed maps using the methods presented in our paper. In order for the latter notebook to work, make sure to set the `save_results` parameter to True in the former notebook. -We've uploaded the 128-dimensional TVN maps we constructed for GWPS, JUMP, and PERISCOPE to the `notebooks/data` directory. So, for convenience, one can run `notebooks/map_evaluation_comparison.ipynb` directly on these uploaded map files if they wish to explore the maps further without running `notebooks/map_building_benchmarking.ipynb`. See `notebooks/data/LICENSE` for terms of use for each dataset. -RPIE CNNBC embeddings are available as separate parquet files per plate in the embeddings.tar file, downloadable from https://rxrx3.rxrx.ai/downloads. It's important to note that in the rxrx3 data, all but ~733 genes are anonymized. -Note that you have to install GitHub LFS (Large File Storage) to properly utilize the files under `notebooks/data` and `efaar_benchmarking/expression_data` in your local clone of the repository. +We've uploaded the 128-dimensional PCA-TVN maps we constructed for GWPS, cpg0016, and cpg0021 to the `notebooks/data` directory. So, for convenience, one can run `notebooks/map_evaluation_comparison.ipynb` directly on these uploaded map files if they wish to explore the maps further without running `notebooks/map_building_benchmarking.ipynb`. See `notebooks/data/LICENSE` for terms of use for each dataset. + +RxRx3 embeddings are available as separate parquet files per plate in the embeddings.tar file, downloadable from https://rxrx3.rxrx.ai/downloads. Note that in this data, all but 733 genes are anonymized. + +You have to install GitHub LFS (Large File Storage) to properly utilize the files under `notebooks/data` and `efaar_benchmarking/expression_data` in your local clone of the repository. ## Contribution guidance: diff --git a/notebooks/map_building_benchmarking.ipynb b/notebooks/map_building_benchmarking.ipynb index 1193bab..b10bcc1 100644 --- a/notebooks/map_building_benchmarking.ipynb +++ b/notebooks/map_building_benchmarking.ipynb @@ -20,7 +20,10 @@ "from efaar_benchmarking.plotting import *\n", "import pickle\n", "\n", - "pc_count = 128" + "pc_count = 128\n", + "save_results = False # Results already uploaded to the notebooks/data folder in the repo. If True, will replace these files.\n", + "pert_signal_pval_cutoff = 0.05\n", + "recall_thr_pairs = [(.05, .95)]" ] }, { @@ -42,10 +45,12 @@ "control_key = \"non-targeting\"\n", "all_controls = [\"non-targeting\"]\n", "\n", + "# Load the perturbation dataset\n", "adata_raw = load_gwps(\"raw\")\n", "print(\"Perturbation dataset loaded\")\n", "metadata = adata_raw.obs\n", "\n", + "# Run EFAAR pipelines\n", "all_embeddings_pre_agg = {}\n", "print(\"Running for embedding size\", pc_count)\n", "all_embeddings_pre_agg[f\"scVI{pc_count}\"] = embed_by_scvi_anndata(adata_raw, batch_col=gem_group_colname, n_latent=pc_count, n_hidden=pc_count*2)\n", @@ -61,22 +66,21 @@ "all_embeddings_pre_agg[f\"PCA{pc_count}-TVN\"] = tvn_on_controls(all_embeddings_pre_agg[f\"PCA{pc_count}\"], metadata, pert_col=pert_colname, control_key=control_key, batch_col=gem_group_colname)\n", "print(\"tvn completed\")\n", "\n", - "array_list = []\n", + "# Run biological relationship benchmarks\n", "for k, emb in all_embeddings_pre_agg.items():\n", + " print(k)\n", " print(\"Aggregating...\")\n", " map_data = aggregate(emb, metadata, pert_col=pert_colname, keys_to_remove=all_controls)\n", " print(\"Computing recall...\")\n", - " metrics = known_relationship_benchmark(map_data, recall_thr_pairs=[(.05, .95)], pert_col=pert_colname)\n", - " array_list.append((metrics.groupby(\"source\")[\"recall_0.05_0.95\"].mean() * 100).round(1).values)\n", - "res = np.vstack(array_list)\n", - "formatted_result = np.array2string(res, separator=\", \")\n", - "print(formatted_result)\n", - "\n", - "with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n", - " pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n", - "\n", - "with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n", - " pickle.dump(metadata, f) # storing the metadata for downstream analysis" + " metrics = known_relationship_benchmark(map_data, recall_thr_pairs=recall_thr_pairs, pert_col=pert_colname)\n", + " print(metrics[list(metrics.columns)[::-1]])\n", + "\n", + "# Save results\n", + "if save_results:\n", + " with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n", + " pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n", + " with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n", + " pickle.dump(metadata, f) # storing the metadata for downstream analysis" ] }, { @@ -99,10 +103,18 @@ "control_key = \"non-targeting\"\n", "all_controls = [\"non-targeting\", \"no-guide\"]\n", "\n", + "# Load the perturbation dataset and expression information\n", "features, metadata = load_cpg16_crispr()\n", "print(\"Perturbation dataset loaded\")\n", "features, metadata = filter_cell_profiler_features(features, metadata)\n", "\n", + "expression_data_folder = \"../efaar_benchmarking/expression_data\"\n", + "expr = pd.read_csv(f\"{expression_data_folder}/U2OS_expression.csv\", index_col=0).groupby(\"gene\").zfpkm.agg(\"median\").reset_index()\n", + "unexpr_genes = list(expr.loc[expr.zfpkm < -3, \"gene\"])\n", + "expr_genes = list(expr.loc[expr.zfpkm >= -3, \"gene\"])\n", + "expr_ind = metadata[pert_colname].isin(expr_genes + [control_key])\n", + "\n", + "# Run EFAAR pipelines\n", "all_embeddings_pre_agg = {}\n", "print(\"Computing PCA embedding for\", pc_count, \"dimensions...\")\n", "all_embeddings_pre_agg[f\"PCA{pc_count}\"] = embed_by_pca(features.values, metadata, variance_or_ncomp=pc_count, batch_col=plate_colname)\n", @@ -111,28 +123,29 @@ "print(\"Computing TVN...\")\n", "all_embeddings_pre_agg[f\"PCA{pc_count}-TVN\"] = tvn_on_controls(all_embeddings_pre_agg[f\"PCA{pc_count}\"], metadata, pert_col=pert_colname, control_key=control_key, batch_col=run_colname)\n", "\n", - "expression_data_folder = \"../efaar_benchmarking/expression_data\"\n", - "expr = pd.read_csv(f\"{expression_data_folder}/U2OS_expression.csv\", index_col=0).groupby(\"gene\").zfpkm.agg(\"median\").reset_index()\n", - "unexpr_genes = list(expr.loc[expr.zfpkm < -3, \"gene\"])\n", - "expr_genes = list(expr.loc[expr.zfpkm >= -3, \"gene\"])\n", - "ind = metadata[pert_colname].isin(expr_genes + [control_key])\n", + "# Run perturbation signal benchmarks\n", + "for k, emb in all_embeddings_pre_agg.items():\n", + " cons_res = pert_signal_consistency_benchmark(emb, metadata, pert_col=pert_colname, neg_ctrl_perts=unexpr_genes, keys_to_drop=all_controls)\n", + " print(k, round(sum(cons_res.pval <= pert_signal_pval_cutoff) / sum(~pd.isna(cons_res.pval)) * 100, 1))\n", "\n", - "array_list = []\n", + " magn_res = pert_signal_distance_benchmark(emb, metadata, pert_col=pert_colname, neg_ctrl_perts=unexpr_genes, control_key=control_key, keys_to_drop=[x for x in all_controls if x!=control_key])\n", + " print(k, round(sum(magn_res.pval <= pert_signal_pval_cutoff) / sum(~pd.isna(magn_res.pval)) * 100, 1))\n", + "\n", + "# Run biological relationship benchmarks\n", "for k, emb in all_embeddings_pre_agg.items():\n", + " print(k)\n", " print(\"Aggregating...\")\n", - " map_data = aggregate(emb[ind], metadata[ind], pert_col=pert_colname, keys_to_remove=all_controls)\n", + " map_data = aggregate(emb[expr_ind], metadata[expr_ind], pert_col=pert_colname, keys_to_remove=all_controls)\n", " print(\"Computing recall...\")\n", " metrics = known_relationship_benchmark(map_data, recall_thr_pairs=[(.05, .95)], pert_col=pert_colname)\n", - " array_list.append((metrics.groupby(\"source\")[\"recall_0.05_0.95\"].mean() * 100).round(1).values)\n", - "res = np.vstack(array_list)\n", - "formatted_result = np.array2string(res, separator=\", \")\n", - "print(formatted_result)\n", - "\n", - "with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n", - " pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n", - "\n", - "with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n", - " pickle.dump(metadata, f) # storing the metadata for downstream analysis" + " print(metrics[list(metrics.columns)[::-1]])\n", + "\n", + "# Save results\n", + "if save_results:\n", + " with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n", + " pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n", + " with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n", + " pickle.dump(metadata, f) # storing the metadata for downstream analysis" ] }, { @@ -154,9 +167,19 @@ "control_key = \"nontargeting\"\n", "all_controls = [\"nontargeting\", \"negCtrl\"]\n", "\n", + "# Load the perturbation dataset and expression information\n", "features, metadata = load_periscope()\n", "print(\"Perturbation dataset loaded\")\n", "\n", + "expression_data_folder = \"../efaar_benchmarking/expression_data\"\n", + "expr = pd.read_csv(f\"{expression_data_folder}/HeLa_expression.csv\") # note that we assume the HeLa expression data was used for PERISCOPE which is the default option in load_periscope()\n", + "expr.columns = [\"gene\", \"tpm\"]\n", + "expr.gene = expr.gene.apply(lambda x: x.split(\" \")[0])\n", + "unexpr_genes = list(expr.loc[expr.tpm == 0, \"gene\"])\n", + "expr_genes = list(expr.loc[expr.tpm > 0, \"gene\"])\n", + "expr_ind = metadata[pert_colname].isin(expr_genes + [control_key])\n", + "\n", + "# Run EFAAR pipelines\n", "all_embeddings_pre_agg = {}\n", "print(\"Computing PCA embedding for\", pc_count, \"dimensions...\")\n", "all_embeddings_pre_agg[f\"PCA{pc_count}\"] = embed_by_pca(features.values, metadata, variance_or_ncomp=pc_count, batch_col=plate_colname)\n", @@ -165,30 +188,29 @@ "print(\"Computing TVN...\")\n", "all_embeddings_pre_agg[f\"PCA{pc_count}-TVN\"] = tvn_on_controls(all_embeddings_pre_agg[f\"PCA{pc_count}\"], metadata, pert_col=pert_colname, control_key=control_key, batch_col=plate_colname)\n", "\n", - "expression_data_folder = \"../efaar_benchmarking/expression_data\"\n", - "expr = pd.read_csv(f\"{expression_data_folder}/HeLa_expression.csv\") # note that we assume the HeLa expression data was used for PERISCOPE which is the default option in load_periscope()\n", - "expr.columns = [\"gene\", \"tpm\"]\n", - "expr.gene = expr.gene.apply(lambda x: x.split(\" \")[0])\n", - "unexpr_genes = list(expr.loc[expr.tpm == 0, \"gene\"])\n", - "expr_genes = list(expr.loc[expr.tpm > 0, \"gene\"])\n", - "ind = metadata[pert_colname].isin(expr_genes + [control_key])\n", + "# Run perturbation signal benchmarks\n", + "for k, emb in all_embeddings_pre_agg.items():\n", + " cons_res = pert_signal_consistency_benchmark(emb, metadata, pert_col=pert_colname, neg_ctrl_perts=unexpr_genes, keys_to_drop=all_controls)\n", + " print(k, round(sum(cons_res.pval <= pert_signal_pval_cutoff) / sum(~pd.isna(cons_res.pval)) * 100, 1))\n", + "\n", + " magn_res = pert_signal_distance_benchmark(emb, metadata, pert_col=pert_colname, neg_ctrl_perts=unexpr_genes, control_key=control_key, keys_to_drop=[x for x in all_controls if x!=control_key])\n", + " print(k, round(sum(magn_res.pval <= pert_signal_pval_cutoff) / sum(~pd.isna(magn_res.pval)) * 100, 1))\n", "\n", - "array_list = []\n", + "# Run biological relationship benchmarks\n", "for k, emb in all_embeddings_pre_agg.items():\n", + " print(k)\n", " print(\"Aggregating...\")\n", - " map_data = aggregate(emb[ind], metadata[ind], pert_col=pert_colname, keys_to_remove=all_controls)\n", + " map_data = aggregate(emb[expr_ind], metadata[expr_ind], pert_col=pert_colname, keys_to_remove=all_controls)\n", " print(\"Computing recall...\")\n", " metrics = known_relationship_benchmark(map_data, recall_thr_pairs=[(.05, .95)], pert_col=pert_colname)\n", - " array_list.append((metrics.groupby(\"source\")[\"recall_0.05_0.95\"].mean() * 100).round(1).values)\n", - "res = np.vstack(array_list)\n", - "formatted_result = np.array2string(res, separator=\", \")\n", - "print(formatted_result)\n", - "\n", - "with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n", - " pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n", - "\n", - "with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n", - " pickle.dump(metadata, f) # storing the metadata for downstream analysis" + " print(metrics[list(metrics.columns)[::-1]])\n", + "\n", + "# Save results\n", + "if save_results:\n", + " with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n", + " pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n", + " with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n", + " pickle.dump(metadata, f) # storing the metadata for downstream analysis" ] } ], @@ -199,7 +221,15 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", "version": "3.11.5" } },