update readme

recursionpharma · Oct 26, 2023 · 9b8ae5b · 9b8ae5b
1 parent 063755c
commit 9b8ae5b
Show file tree

Hide file tree

Showing 3 changed files with 54 additions and 35 deletions.
diff --git a/README.md b/README.md
@@ -1,8 +1,8 @@
 # EFAAR_benchmarking
 
 This library enables computation and retrieval of metrics to benchmark a whole-genome perturbative map created by the pipeline.
-Metrics that can be computed using this repo are pairwise gene-gene recall for the reactome, corum, and humap datasets which
-are publicly available.
+Metrics that can be computed using this repo are pairwise gene-gene recall for the Reactome, HuMAP, CORUM, SIGNOR, and StringDB datasets which
+are publicly available (see `efaar_benchmarking/benchmark_annotations/LICENSE` for terms of use for each source).
 
 By default, we do not filter on perturbation fingerprint, although filtering is available through the parameters to the `benchmark` function.
 We compute the metrics for three different random seeds used to generate empirical null entities.
@@ -13,27 +13,21 @@ Here are the descriptions for the constants used in the code to configure and co
 
 `BENCHMARK_DATA_DIR`: The directory path to the benchmark annotations data. It is obtained using the resources module from the importlib package.
 
-`BENCHMARK_SOURCES`: A list of benchmark sources, including "Reactome", "HuMAP", and "CORUM".
+`BENCHMARK_SOURCES`: A list of benchmark sources, including "Reactome", "HuMAP", "CORUM", "SIGNOR", and "StringDB".
 
 `PERT_LABEL_COL`: The column name for the gene perturbation labels.
 
+`CONTROL_PERT_LABEL`: The perturbation label value for the control perturbation units.
+
 `PERT_SIG_PVAL_COL`: The column name for the perturbation p-value.
 
 `PERT_SIG_PVAL_THR`: The threshold value for the perturbation p-value.
 
-`PERT_TYPE_COL`: The column name for the perturbation type.
-
-`PERT_TYPE`: The specific perturbation type, which is "GENE" by default.
-
-`WELL_TYPE_COL`: The column name for the well type.
-
-`WELL_TYPE`: The specific well type, which is "query_guides".
-
-`RECALL_PERC_THR_PAIR`: A tuple representing the threshold pair (lower threshold, upper threshold) for calculating recall percentages.
+`RECALL_PERC_THRS`: A list of tuples of two floats between 0 and 1 representing the threshold pair (lower threshold, upper threshold) for calculating recall.
 
-`RANDOM_SEED`: The random seed value used for random number generation.
+`RANDOM_SEED`: The random seed value used for random number generation for sampling the null distribution.
 
-`RANDOM_COUNT`: The number of runs for benchmarking.
+`RANDOM_COUNT`: The number of runs for benchmarking to compute error in metrics.
 
 `N_NULL_SAMPLES`: The number of null samples used in benchmarking.
 
@@ -47,6 +41,23 @@ This package is installable via `pip`.
 pip install efaar_benchmarking
 ```
 
+## Example code:
+
+```from efaar_benchmarking.data_loading import load_replogle
+from efaar_benchmarking.efaar import embed_by_scvi, align_by_centering, aggregate_by_mean
+from efaar_benchmarking.benchmarking import benchmark
+from efaar_benchmarking.plotting import plot_recall
+
+adata = load_replogle("genome_wide", "raw")
+metadata = adata.obs
+embeddings_scvi = embed_by_scvi(adata)
+embeddings_aligned = align_by_centering(embeddings_scvi, metadata)
+map_data = aggregate_by_mean(embeddings_aligned, metadata)
+metrics = benchmark(map_data, 
+recall_thr_pairs=[(0.01,0.99),(0.02,0.98),(0.03,0.97),(0.04,0.96),(0.05,0.95),(0.06,0.94),(0.07,0.93),(0.08,0.92),(0.09,0.91),(0.1,0.9)])
+plot_recall(metrics["summary"])
+```
+
 ## References
 **Reactome:**
 
@@ -59,3 +70,9 @@ _Giurgiu, M., Reinhard, J., Brauner, B., Dunger-Kaltenbach, I., Fobo, G., Frishm
 **HuMAP:**
 
 _Drew, K., Wallingford, J.B., and Marcotte, E.M. (2021). hu.MAP 2.0: integration of over 15,000 proteomic experiments builds a global compendium of human multiprotein assemblies. Mol. Syst. Biol. 17, e10016. 10.15252/msb.202010016._
+
+**SIGNOR:**
+_Licata, L., Lo Surdo, P., Iannuccelli, M., Palma, A., Micarelli, E., Perfetto, L., Peluso, D., Calderone, A., Castagnoli, L., and Cesareni, G. (2019). SIGNOR 2.0, the SIGnaling Network Open Resource 2.0: 2019 update. Nucleic Acids Research. 10.1093/nar/gkz949._
+
+**StringDB:**
+_von Mering C, Jensen LJ, Snel B, Hooper SD, Krupp M, Foglierini M, Jouffre N, Huynen MA, Bork P. STRING: known and predicted protein-protein associations, integrated and transferred across organisms. Nucleic Acids Res. 2005 Jan 1;33(Database issue):D433-7. doi: 10.1093/nar/gki005._
diff --git a/efaar_benchmarking/constants.py b/efaar_benchmarking/constants.py
@@ -4,9 +4,9 @@
     "benchmark_annotations"
 )
 BENCHMARK_SOURCES = ["Reactome", "HuMAP", "CORUM", "SIGNOR", "StringDB"]
-PERT_LABEL_COL = "perturbation"
+PERT_LABEL_COL = "gene"
 CONTROL_PERT_LABEL = "non-targeting"
-PERT_SIG_PVAL_COL = "perturbation_pvalue"
+PERT_SIG_PVAL_COL = "gene_pvalue"
 PERT_SIG_PVAL_THR = 0.01
 RECALL_PERC_THRS = [(0.05, 0.95), (0.1, 0.9)]
 RANDOM_SEED = 42

diff --git a/efaar_benchmarking/efaar.py b/efaar_benchmarking/efaar.py
@@ -7,79 +7,81 @@
 import efaar_benchmarking.constants as cst
 
 
-def embed_by_scvi(adata, BATCH_KEY="gem_group", N_LATENT=128, N_HIDDEN=256):
+def embed_by_scvi(adata, batch_key="gem_group", n_latent=128, n_hidden=256):
     """
     Embeds the input AnnData object using scVI.
 
     Parameters:
     adata (anndata.AnnData): The AnnData object to be embedded.
-    BATCH_KEY (str): The batch key in the AnnData object. Default is "gem_group".
-    N_LATENT (int): The number of latent dimensions. Default is 128.
-    N_HIDDEN (int): The number of hidden dimensions. Default is 256.
+    batch_key (str): The batch key in the AnnData object. Default is "gem_group".
+    n_latent (int): The number of latent dimensions. Default is 128.
+    n_hidden (int): The number of hidden dimensions. Default is 256.
 
     Returns:
     None
     """
-    SCVI.setup_anndata(adata, batch_key=BATCH_KEY)
-    vae = SCVI(adata, n_hidden=N_HIDDEN, n_latent=N_LATENT)
+    SCVI.setup_anndata(adata, batch_key=batch_key)
+    vae = SCVI(adata, n_hidden=n_hidden, n_latent=n_latent)
     vae.train(use_gpu=True)
     return vae.get_latent_representation()
 
 
-def embed_by_pca(adata, N_LATENT=100):
+def embed_by_pca(adata, n_latent=100):
     """
     Embeds the input data using PCA.
 
     Parameters:
     adata (AnnData): Annotated data matrix.
-    N_LATENT (int): Number of principal components to use.
+    n_latent (int): Number of principal components to use.
 
     Returns:
     numpy.ndarray: Embedding of the input data using PCA.
     """
-    sc.pp.pca(adata, n_comps=N_LATENT)
+    sc.pp.pca(adata, n_comps=n_latent)
     return adata.obsm["X_pca"]
 
 
-def align_by_centering(embeddings, metadata, NTC_KEY="non-targeting", PERT_COL="gene"):
+def align_by_centering(embeddings, metadata, control_key=cst.CONTROL_PERT_LABEL, pert_col=cst.PERT_LABEL_COL):
     """
     Applies the centerscale method to align embeddings based on the centering perturbations in the metadata.
 
     Args:
         embeddings (numpy.ndarray): The embeddings to be aligned.
         metadata (pandas.DataFrame): The metadata containing information about the embeddings.
-        NTC_KEY (str, optional): The key for non-targeting controls in the metadata. Defaults to "non-targeting".
-        PERT_COL (str, optional): The column in the metadata containing perturbation information. Defaults to "gene".
+        control_key (str, optional): The key for non-targeting controls in the metadata.
+            Defaults to cst.CONTROL_PERT_LABEL.
+        pert_col (str, optional): The column in the metadata containing perturbation information.
+            Defaults to cst.PERT_LABEL_COL.
 
     Returns:
         numpy.ndarray: The aligned embeddings.
     """
-    ntc_idxs = np.where(metadata[PERT_COL].values == NTC_KEY)[0]
+    ntc_idxs = np.where(metadata[pert_col].values == control_key)[0]
     ntc_center = embeddings[ntc_idxs].mean(0)
     return embeddings - ntc_center
 
 
-def aggregate_by_mean(embeddings, metadata, NTC_KEY="non-targeting", PERT_COL="gene"):
+def aggregate_by_mean(embeddings, metadata, control_key=cst.CONTROL_PERT_LABEL, pert_col=cst.PERT_LABEL_COL):
     """
     Applies the mean aggregation to aggregate replicate embeddings for each perturbation.
 
     Args:
         embeddings (numpy.ndarray): The embeddings to be aggregated.
         metadata (pandas.DataFrame): The metadata containing information about the embeddings.
-        NTC_KEY (str, optional): The key for non-targeting controls in the metadata. Defaults to "non-targeting".
+        control_key (str, optional): The key for non-targeting controls in the metadata. Defaults to "non-targeting".
         PERT_COL (str, optional): The column in the metadata containing perturbation information. Defaults to "gene".
 
     Returns:
         Bunch: A named tuple containing two pandas DataFrames:
             - 'features': The aggregated embeddings.
             - 'metadata': A DataFrame containing the perturbation labels for each row in 'data'.
     """
-    unique_perts = list(np.unique(metadata[PERT_COL].values))
-    unique_perts.remove(NTC_KEY)
+    unique_perts = list(np.unique(metadata[pert_col].values))
+    unique_perts.remove(control_key)
     final_embeddings = np.zeros((len(unique_perts), embeddings.shape[1]))
     for i, pert in enumerate(unique_perts):
-        idxs = np.where(metadata[PERT_COL].values == pert)[0]
+        idxs = np.where(metadata[pert_col].values == pert)[0]
         final_embeddings[i, :] = embeddings[idxs, :].mean(0)
     return Bunch(
-        features=pd.DataFrame(final_embeddings), metadata=pd.DataFrame.from_dict({cst.PERT_LABEL_COL: unique_perts})
+        features=pd.DataFrame(final_embeddings), metadata=pd.DataFrame.from_dict({pert_col: unique_perts})
     )