add pert signal benchmarks
safiyecelik committed Aug 6, 2024
1 parent 51ee0a5 commit dcedec9
Showing 2 changed files with 85 additions and 53 deletions.
10 changes: 6 additions & 4 deletions README.md
@@ -30,11 +30,13 @@ pip install efaar_benchmarking

## Usage guidance:

First, run `notebooks/map_building_benchmarking.ipynb` for GWPS, JUMP, and PERISCOPE individually. This process will build each of these maps and report the perturbation signal and biological relationship benchmarks. Afterwards, run `notebooks/map_evaluation_comparison.ipynb` to explore the constructed maps using the methods presented in our paper. In order for the latter notebook to work, make sure to set the `save_results` parameter to True in the former notebook.
First, run `notebooks/map_building_benchmarking.ipynb` for GWPS, cpg0016, and cpg0021 individually. This builds each of these maps and reports the perturbation signal and biological relationship benchmarks. Afterwards, run `notebooks/map_evaluation_comparison.ipynb` to explore the constructed maps using the methods presented in our paper. For the latter notebook to work, set the `save_results` parameter to `True` in the former notebook.
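
If you prefer to execute the map-building notebook non-interactively, a minimal sketch using papermill is shown below (papermill is not a dependency of this repo; remember to set `save_results = True` inside the notebook first so the map files are written for the comparison notebook):

```python
# Hypothetical driver, assuming papermill is installed (pip install papermill).
# Set save_results = True inside the notebook beforehand so that the map files
# are written to notebooks/data for map_evaluation_comparison.ipynb.
import papermill as pm

pm.execute_notebook(
    "notebooks/map_building_benchmarking.ipynb",
    "notebooks/map_building_benchmarking.out.ipynb",  # executed copy with outputs
)
```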

We've uploaded the 128-dimensional TVN maps we constructed for GWPS, JUMP, and PERISCOPE to the `notebooks/data` directory. So, for convenience, one can run `notebooks/map_evaluation_comparison.ipynb` directly on these uploaded map files if they wish to explore the maps further without running `notebooks/map_building_benchmarking.ipynb`. See `notebooks/data/LICENSE` for terms of use for each dataset.
RPIE CNNBC embeddings are available as separate parquet files per plate in the embeddings.tar file, downloadable from https://rxrx3.rxrx.ai/downloads. It's important to note that in the rxrx3 data, all but ~733 genes are anonymized.
Note that you have to install GitHub LFS (Large File Storage) to properly utilize the files under `notebooks/data` and `efaar_benchmarking/expression_data` in your local clone of the repository.
We've uploaded the 128-dimensional PCA-TVN maps we constructed for GWPS, cpg0016, and cpg0021 to the `notebooks/data` directory. For convenience, you can run `notebooks/map_evaluation_comparison.ipynb` directly on these uploaded map files to explore the maps further without running `notebooks/map_building_benchmarking.ipynb`. See `notebooks/data/LICENSE` for the terms of use for each dataset.
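
As a rough illustration, these uploaded files are the pickles written by the map-building notebook, so they can be loaded directly; the sketch below assumes the `{dataset}_map_cache.pkl` / `{dataset}_metadata.pkl` naming used in that notebook and a hypothetical dataset label:

```python
# Minimal sketch: load one of the prebuilt 128-dimensional PCA-TVN maps.
# File names mirror what map_building_benchmarking.ipynb saves; the exact
# dataset labels are whatever the `dataset` variable is set to in that notebook.
import pickle

dataset = "gwps"  # hypothetical label; check the actual file names under notebooks/data
with open(f"notebooks/data/{dataset}_map_cache.pkl", "rb") as f:
    map_data = pickle.load(f)
with open(f"notebooks/data/{dataset}_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)
```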

RxRx3 embeddings are available as separate parquet files per plate in the embeddings.tar file, downloadable from https://rxrx3.rxrx.ai/downloads. Note that in this data, all but 733 genes are anonymized.
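
For example, a hedged sketch of assembling the per-plate parquet files after extracting embeddings.tar (the extraction directory and internal layout are assumptions; check the actual archive structure):

```python
# Sketch only: concatenate per-plate RxRx3 embedding parquet files into one frame.
# "embeddings/" is an assumed extraction directory for embeddings.tar.
from pathlib import Path
import pandas as pd

plate_files = sorted(Path("embeddings").rglob("*.parquet"))
embeddings = pd.concat((pd.read_parquet(p) for p in plate_files), ignore_index=True)
print(embeddings.shape)
```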

You must install Git LFS (Large File Storage) to properly use the files under `notebooks/data` and `efaar_benchmarking/expression_data` in your local clone of the repository.

## Contribution guidance:

128 changes: 79 additions & 49 deletions notebooks/map_building_benchmarking.ipynb
@@ -20,7 +20,10 @@
"from efaar_benchmarking.plotting import *\n",
"import pickle\n",
"\n",
"pc_count = 128"
"pc_count = 128\n",
"save_results = False # Results already uploaded to the notebooks/data folder in the repo. If True, will replace these files.\n",
"pert_signal_pval_cutoff = 0.05\n",
"recall_thr_pairs = [(.05, .95)]"
]
},
{
@@ -42,10 +45,12 @@
"control_key = \"non-targeting\"\n",
"all_controls = [\"non-targeting\"]\n",
"\n",
"# Load the perturbation dataset\n",
"adata_raw = load_gwps(\"raw\")\n",
"print(\"Perturbation dataset loaded\")\n",
"metadata = adata_raw.obs\n",
"\n",
"# Run EFAAR pipelines\n",
"all_embeddings_pre_agg = {}\n",
"print(\"Running for embedding size\", pc_count)\n",
"all_embeddings_pre_agg[f\"scVI{pc_count}\"] = embed_by_scvi_anndata(adata_raw, batch_col=gem_group_colname, n_latent=pc_count, n_hidden=pc_count*2)\n",
@@ -61,22 +66,21 @@
"all_embeddings_pre_agg[f\"PCA{pc_count}-TVN\"] = tvn_on_controls(all_embeddings_pre_agg[f\"PCA{pc_count}\"], metadata, pert_col=pert_colname, control_key=control_key, batch_col=gem_group_colname)\n",
"print(\"tvn completed\")\n",
"\n",
"array_list = []\n",
"# Run biological relationship benchmarks\n",
"for k, emb in all_embeddings_pre_agg.items():\n",
" print(k)\n",
" print(\"Aggregating...\")\n",
" map_data = aggregate(emb, metadata, pert_col=pert_colname, keys_to_remove=all_controls)\n",
" print(\"Computing recall...\")\n",
" metrics = known_relationship_benchmark(map_data, recall_thr_pairs=[(.05, .95)], pert_col=pert_colname)\n",
" array_list.append((metrics.groupby(\"source\")[\"recall_0.05_0.95\"].mean() * 100).round(1).values)\n",
"res = np.vstack(array_list)\n",
"formatted_result = np.array2string(res, separator=\", \")\n",
"print(formatted_result)\n",
"\n",
"with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n",
" pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n",
"\n",
"with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n",
" pickle.dump(metadata, f) # storing the metadata for downstream analysis"
" metrics = known_relationship_benchmark(map_data, recall_thr_pairs=recall_thr_pairs, pert_col=pert_colname)\n",
" print(metrics[list(metrics.columns)[::-1]])\n",
"\n",
"# Save results\n",
"if save_results:\n",
" with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n",
" pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n",
" with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n",
" pickle.dump(metadata, f) # storing the metadata for downstream analysis"
]
},
{
@@ -99,10 +103,18 @@
"control_key = \"non-targeting\"\n",
"all_controls = [\"non-targeting\", \"no-guide\"]\n",
"\n",
"# Load the perturbation dataset and expression information\n",
"features, metadata = load_cpg16_crispr()\n",
"print(\"Perturbation dataset loaded\")\n",
"features, metadata = filter_cell_profiler_features(features, metadata)\n",
"\n",
"expression_data_folder = \"../efaar_benchmarking/expression_data\"\n",
"expr = pd.read_csv(f\"{expression_data_folder}/U2OS_expression.csv\", index_col=0).groupby(\"gene\").zfpkm.agg(\"median\").reset_index()\n",
"unexpr_genes = list(expr.loc[expr.zfpkm < -3, \"gene\"])\n",
"expr_genes = list(expr.loc[expr.zfpkm >= -3, \"gene\"])\n",
"expr_ind = metadata[pert_colname].isin(expr_genes + [control_key])\n",
"\n",
"# Run EFAAR pipelines\n",
"all_embeddings_pre_agg = {}\n",
"print(\"Computing PCA embedding for\", pc_count, \"dimensions...\")\n",
"all_embeddings_pre_agg[f\"PCA{pc_count}\"] = embed_by_pca(features.values, metadata, variance_or_ncomp=pc_count, batch_col=plate_colname)\n",
@@ -111,28 +123,29 @@
"print(\"Computing TVN...\")\n",
"all_embeddings_pre_agg[f\"PCA{pc_count}-TVN\"] = tvn_on_controls(all_embeddings_pre_agg[f\"PCA{pc_count}\"], metadata, pert_col=pert_colname, control_key=control_key, batch_col=run_colname)\n",
"\n",
"expression_data_folder = \"../efaar_benchmarking/expression_data\"\n",
"expr = pd.read_csv(f\"{expression_data_folder}/U2OS_expression.csv\", index_col=0).groupby(\"gene\").zfpkm.agg(\"median\").reset_index()\n",
"unexpr_genes = list(expr.loc[expr.zfpkm < -3, \"gene\"])\n",
"expr_genes = list(expr.loc[expr.zfpkm >= -3, \"gene\"])\n",
"ind = metadata[pert_colname].isin(expr_genes + [control_key])\n",
"# Run perturbation signal benchmarks\n",
"for k, emb in all_embeddings_pre_agg.items():\n",
" cons_res = pert_signal_consistency_benchmark(emb, metadata, pert_col=pert_colname, neg_ctrl_perts=unexpr_genes, keys_to_drop=all_controls)\n",
" print(k, round(sum(cons_res.pval <= pert_signal_pval_cutoff) / sum(~pd.isna(cons_res.pval)) * 100, 1))\n",
"\n",
"array_list = []\n",
" magn_res = pert_signal_distance_benchmark(emb, metadata, pert_col=pert_colname, neg_ctrl_perts=unexpr_genes, control_key=control_key, keys_to_drop=[x for x in all_controls if x!=control_key])\n",
" print(k, round(sum(magn_res.pval <= pert_signal_pval_cutoff) / sum(~pd.isna(magn_res.pval)) * 100, 1))\n",
"\n",
"# Run biological relationship benchmarks\n",
"for k, emb in all_embeddings_pre_agg.items():\n",
" print(k)\n",
" print(\"Aggregating...\")\n",
" map_data = aggregate(emb[ind], metadata[ind], pert_col=pert_colname, keys_to_remove=all_controls)\n",
" map_data = aggregate(emb[expr_ind], metadata[expr_ind], pert_col=pert_colname, keys_to_remove=all_controls)\n",
" print(\"Computing recall...\")\n",
" metrics = known_relationship_benchmark(map_data, recall_thr_pairs=[(.05, .95)], pert_col=pert_colname)\n",
" array_list.append((metrics.groupby(\"source\")[\"recall_0.05_0.95\"].mean() * 100).round(1).values)\n",
"res = np.vstack(array_list)\n",
"formatted_result = np.array2string(res, separator=\", \")\n",
"print(formatted_result)\n",
"\n",
"with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n",
" pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n",
"\n",
"with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n",
" pickle.dump(metadata, f) # storing the metadata for downstream analysis"
" print(metrics[list(metrics.columns)[::-1]])\n",
"\n",
"# Save results\n",
"if save_results:\n",
" with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n",
" pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n",
" with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n",
" pickle.dump(metadata, f) # storing the metadata for downstream analysis"
]
},
{
@@ -154,9 +167,19 @@
"control_key = \"nontargeting\"\n",
"all_controls = [\"nontargeting\", \"negCtrl\"]\n",
"\n",
"# Load the perturbation dataset and expression information\n",
"features, metadata = load_periscope()\n",
"print(\"Perturbation dataset loaded\")\n",
"\n",
"expression_data_folder = \"../efaar_benchmarking/expression_data\"\n",
"expr = pd.read_csv(f\"{expression_data_folder}/HeLa_expression.csv\") # note that we assume the HeLa expression data was used for PERISCOPE which is the default option in load_periscope()\n",
"expr.columns = [\"gene\", \"tpm\"]\n",
"expr.gene = expr.gene.apply(lambda x: x.split(\" \")[0])\n",
"unexpr_genes = list(expr.loc[expr.tpm == 0, \"gene\"])\n",
"expr_genes = list(expr.loc[expr.tpm > 0, \"gene\"])\n",
"expr_ind = metadata[pert_colname].isin(expr_genes + [control_key])\n",
"\n",
"# Run EFAAR pipelines\n",
"all_embeddings_pre_agg = {}\n",
"print(\"Computing PCA embedding for\", pc_count, \"dimensions...\")\n",
"all_embeddings_pre_agg[f\"PCA{pc_count}\"] = embed_by_pca(features.values, metadata, variance_or_ncomp=pc_count, batch_col=plate_colname)\n",
@@ -165,30 +188,29 @@
"print(\"Computing TVN...\")\n",
"all_embeddings_pre_agg[f\"PCA{pc_count}-TVN\"] = tvn_on_controls(all_embeddings_pre_agg[f\"PCA{pc_count}\"], metadata, pert_col=pert_colname, control_key=control_key, batch_col=plate_colname)\n",
"\n",
"expression_data_folder = \"../efaar_benchmarking/expression_data\"\n",
"expr = pd.read_csv(f\"{expression_data_folder}/HeLa_expression.csv\") # note that we assume the HeLa expression data was used for PERISCOPE which is the default option in load_periscope()\n",
"expr.columns = [\"gene\", \"tpm\"]\n",
"expr.gene = expr.gene.apply(lambda x: x.split(\" \")[0])\n",
"unexpr_genes = list(expr.loc[expr.tpm == 0, \"gene\"])\n",
"expr_genes = list(expr.loc[expr.tpm > 0, \"gene\"])\n",
"ind = metadata[pert_colname].isin(expr_genes + [control_key])\n",
"# Run perturbation signal benchmarks\n",
"for k, emb in all_embeddings_pre_agg.items():\n",
" cons_res = pert_signal_consistency_benchmark(emb, metadata, pert_col=pert_colname, neg_ctrl_perts=unexpr_genes, keys_to_drop=all_controls)\n",
" print(k, round(sum(cons_res.pval <= pert_signal_pval_cutoff) / sum(~pd.isna(cons_res.pval)) * 100, 1))\n",
"\n",
" magn_res = pert_signal_distance_benchmark(emb, metadata, pert_col=pert_colname, neg_ctrl_perts=unexpr_genes, control_key=control_key, keys_to_drop=[x for x in all_controls if x!=control_key])\n",
" print(k, round(sum(magn_res.pval <= pert_signal_pval_cutoff) / sum(~pd.isna(magn_res.pval)) * 100, 1))\n",
"\n",
"array_list = []\n",
"# Run biological relationship benchmarks\n",
"for k, emb in all_embeddings_pre_agg.items():\n",
" print(k)\n",
" print(\"Aggregating...\")\n",
" map_data = aggregate(emb[ind], metadata[ind], pert_col=pert_colname, keys_to_remove=all_controls)\n",
" map_data = aggregate(emb[expr_ind], metadata[expr_ind], pert_col=pert_colname, keys_to_remove=all_controls)\n",
" print(\"Computing recall...\")\n",
" metrics = known_relationship_benchmark(map_data, recall_thr_pairs=[(.05, .95)], pert_col=pert_colname)\n",
" array_list.append((metrics.groupby(\"source\")[\"recall_0.05_0.95\"].mean() * 100).round(1).values)\n",
"res = np.vstack(array_list)\n",
"formatted_result = np.array2string(res, separator=\", \")\n",
"print(formatted_result)\n",
"\n",
"with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n",
" pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n",
"\n",
"with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n",
" pickle.dump(metadata, f) # storing the metadata for downstream analysis"
" print(metrics[list(metrics.columns)[::-1]])\n",
"\n",
"# Save results\n",
"if save_results:\n",
" with open(f'data/{dataset}_map_cache.pkl', 'wb') as f:\n",
" pickle.dump(map_data, f) # storing the PCA-TVN map data for downstream analysis\n",
" with open(f'data/{dataset}_metadata.pkl', 'wb') as f:\n",
" pickle.dump(metadata, f) # storing the metadata for downstream analysis"
]
}
],
@@ -199,7 +221,15 @@
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
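
The perturbation signal cells added in this commit report, for each embedding, the percentage of perturbations whose benchmark p-value is at or below `pert_signal_pval_cutoff`, computed over non-NaN p-values. A standalone restatement of that inline summary (hypothetical helper name; `res` stands for the dataframe returned by `pert_signal_consistency_benchmark` or `pert_signal_distance_benchmark`):

```python
# Hypothetical helper mirroring the notebook's inline summary of the
# perturbation signal benchmark results (a dataframe with a `pval` column).
import pandas as pd

def pct_significant(res: pd.DataFrame, pval_cutoff: float = 0.05) -> float:
    """Percent of perturbations with p-value <= cutoff, ignoring NaN p-values."""
    valid = res.pval.notna()
    return round((res.pval[valid] <= pval_cutoff).sum() / valid.sum() * 100, 1)
```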
