From 9b8ae5b8476de5681f7728ffc02c188fc659e769 Mon Sep 17 00:00:00 2001 From: Safiye Celik Date: Wed, 25 Oct 2023 20:23:55 -0400 Subject: [PATCH] update readme --- README.md | 45 +++++++++++++++++++++++---------- efaar_benchmarking/constants.py | 4 +-- efaar_benchmarking/efaar.py | 40 +++++++++++++++-------------- 3 files changed, 54 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 3951c4c..f4c2de6 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # EFAAR_benchmarking This library enables computation and retrieval of metrics to benchmark a whole-genome perturbative map created by the pipeline. -Metrics that can be computed using this repo are pairwise gene-gene recall for the reactome, corum, and humap datasets which -are publicly available. +Metrics that can be computed using this repo are pairwise gene-gene recall for the Reactome, HuMAP, CORUM, SIGNOR, and StringDB datasets which +are publicly available (see `efaar_benchmarking/benchmark_annotations/LICENSE` for terms of use for each source). By default, we do not filter on perturbation fingerprint, although filtering is available through the parameters to the `benchmark` function. We compute the metrics for three different random seeds used to generate empirical null entities. @@ -13,27 +13,21 @@ Here are the descriptions for the constants used in the code to configure and co `BENCHMARK_DATA_DIR`: The directory path to the benchmark annotations data. It is obtained using the resources module from the importlib package. -`BENCHMARK_SOURCES`: A list of benchmark sources, including "Reactome", "HuMAP", and "CORUM". +`BENCHMARK_SOURCES`: A list of benchmark sources, including "Reactome", "HuMAP", "CORUM", "SIGNOR", and "StringDB". `PERT_LABEL_COL`: The column name for the gene perturbation labels. +`CONTROL_PERT_LABEL`: The perturbation label value for the control perturbation units. + `PERT_SIG_PVAL_COL`: The column name for the perturbation p-value. `PERT_SIG_PVAL_THR`: The threshold value for the perturbation p-value. -`PERT_TYPE_COL`: The column name for the perturbation type. - -`PERT_TYPE`: The specific perturbation type, which is "GENE" by default. - -`WELL_TYPE_COL`: The column name for the well type. - -`WELL_TYPE`: The specific well type, which is "query_guides". - -`RECALL_PERC_THR_PAIR`: A tuple representing the threshold pair (lower threshold, upper threshold) for calculating recall percentages. +`RECALL_PERC_THRS`: A list of tuples of two floats between 0 and 1 representing the threshold pair (lower threshold, upper threshold) for calculating recall. -`RANDOM_SEED`: The random seed value used for random number generation. +`RANDOM_SEED`: The random seed value used for random number generation for sampling the null distribution. -`RANDOM_COUNT`: The number of runs for benchmarking. +`RANDOM_COUNT`: The number of runs for benchmarking to compute error in metrics. `N_NULL_SAMPLES`: The number of null samples used in benchmarking. @@ -47,6 +41,23 @@ This package is installable via `pip`. pip install efaar_benchmarking ``` +## Example code: + +```from efaar_benchmarking.data_loading import load_replogle +from efaar_benchmarking.efaar import embed_by_scvi, align_by_centering, aggregate_by_mean +from efaar_benchmarking.benchmarking import benchmark +from efaar_benchmarking.plotting import plot_recall + +adata = load_replogle("genome_wide", "raw") +metadata = adata.obs +embeddings_scvi = embed_by_scvi(adata) +embeddings_aligned = align_by_centering(embeddings_scvi, metadata) +map_data = aggregate_by_mean(embeddings_aligned, metadata) +metrics = benchmark(map_data, +recall_thr_pairs=[(0.01,0.99),(0.02,0.98),(0.03,0.97),(0.04,0.96),(0.05,0.95),(0.06,0.94),(0.07,0.93),(0.08,0.92),(0.09,0.91),(0.1,0.9)]) +plot_recall(metrics["summary"]) +``` + ## References **Reactome:** @@ -59,3 +70,9 @@ _Giurgiu, M., Reinhard, J., Brauner, B., Dunger-Kaltenbach, I., Fobo, G., Frishm **HuMAP:** _Drew, K., Wallingford, J.B., and Marcotte, E.M. (2021). hu.MAP 2.0: integration of over 15,000 proteomic experiments builds a global compendium of human multiprotein assemblies. Mol. Syst. Biol. 17, e10016. 10.15252/msb.202010016._ + +**SIGNOR:** +_Licata, L., Lo Surdo, P., Iannuccelli, M., Palma, A., Micarelli, E., Perfetto, L., Peluso, D., Calderone, A., Castagnoli, L., and Cesareni, G. (2019). SIGNOR 2.0, the SIGnaling Network Open Resource 2.0: 2019 update. Nucleic Acids Research. 10.1093/nar/gkz949._ + +**StringDB:** +_von Mering C, Jensen LJ, Snel B, Hooper SD, Krupp M, Foglierini M, Jouffre N, Huynen MA, Bork P. STRING: known and predicted protein-protein associations, integrated and transferred across organisms. Nucleic Acids Res. 2005 Jan 1;33(Database issue):D433-7. doi: 10.1093/nar/gki005._ \ No newline at end of file diff --git a/efaar_benchmarking/constants.py b/efaar_benchmarking/constants.py index c2a52f6..417a0ac 100644 --- a/efaar_benchmarking/constants.py +++ b/efaar_benchmarking/constants.py @@ -4,9 +4,9 @@ "benchmark_annotations" ) BENCHMARK_SOURCES = ["Reactome", "HuMAP", "CORUM", "SIGNOR", "StringDB"] -PERT_LABEL_COL = "perturbation" +PERT_LABEL_COL = "gene" CONTROL_PERT_LABEL = "non-targeting" -PERT_SIG_PVAL_COL = "perturbation_pvalue" +PERT_SIG_PVAL_COL = "gene_pvalue" PERT_SIG_PVAL_THR = 0.01 RECALL_PERC_THRS = [(0.05, 0.95), (0.1, 0.9)] RANDOM_SEED = 42 diff --git a/efaar_benchmarking/efaar.py b/efaar_benchmarking/efaar.py index d6a0450..600ff3a 100644 --- a/efaar_benchmarking/efaar.py +++ b/efaar_benchmarking/efaar.py @@ -7,66 +7,68 @@ import efaar_benchmarking.constants as cst -def embed_by_scvi(adata, BATCH_KEY="gem_group", N_LATENT=128, N_HIDDEN=256): +def embed_by_scvi(adata, batch_key="gem_group", n_latent=128, n_hidden=256): """ Embeds the input AnnData object using scVI. Parameters: adata (anndata.AnnData): The AnnData object to be embedded. - BATCH_KEY (str): The batch key in the AnnData object. Default is "gem_group". - N_LATENT (int): The number of latent dimensions. Default is 128. - N_HIDDEN (int): The number of hidden dimensions. Default is 256. + batch_key (str): The batch key in the AnnData object. Default is "gem_group". + n_latent (int): The number of latent dimensions. Default is 128. + n_hidden (int): The number of hidden dimensions. Default is 256. Returns: None """ - SCVI.setup_anndata(adata, batch_key=BATCH_KEY) - vae = SCVI(adata, n_hidden=N_HIDDEN, n_latent=N_LATENT) + SCVI.setup_anndata(adata, batch_key=batch_key) + vae = SCVI(adata, n_hidden=n_hidden, n_latent=n_latent) vae.train(use_gpu=True) return vae.get_latent_representation() -def embed_by_pca(adata, N_LATENT=100): +def embed_by_pca(adata, n_latent=100): """ Embeds the input data using PCA. Parameters: adata (AnnData): Annotated data matrix. - N_LATENT (int): Number of principal components to use. + n_latent (int): Number of principal components to use. Returns: numpy.ndarray: Embedding of the input data using PCA. """ - sc.pp.pca(adata, n_comps=N_LATENT) + sc.pp.pca(adata, n_comps=n_latent) return adata.obsm["X_pca"] -def align_by_centering(embeddings, metadata, NTC_KEY="non-targeting", PERT_COL="gene"): +def align_by_centering(embeddings, metadata, control_key=cst.CONTROL_PERT_LABEL, pert_col=cst.PERT_LABEL_COL): """ Applies the centerscale method to align embeddings based on the centering perturbations in the metadata. Args: embeddings (numpy.ndarray): The embeddings to be aligned. metadata (pandas.DataFrame): The metadata containing information about the embeddings. - NTC_KEY (str, optional): The key for non-targeting controls in the metadata. Defaults to "non-targeting". - PERT_COL (str, optional): The column in the metadata containing perturbation information. Defaults to "gene". + control_key (str, optional): The key for non-targeting controls in the metadata. + Defaults to cst.CONTROL_PERT_LABEL. + pert_col (str, optional): The column in the metadata containing perturbation information. + Defaults to cst.PERT_LABEL_COL. Returns: numpy.ndarray: The aligned embeddings. """ - ntc_idxs = np.where(metadata[PERT_COL].values == NTC_KEY)[0] + ntc_idxs = np.where(metadata[pert_col].values == control_key)[0] ntc_center = embeddings[ntc_idxs].mean(0) return embeddings - ntc_center -def aggregate_by_mean(embeddings, metadata, NTC_KEY="non-targeting", PERT_COL="gene"): +def aggregate_by_mean(embeddings, metadata, control_key=cst.CONTROL_PERT_LABEL, pert_col=cst.PERT_LABEL_COL): """ Applies the mean aggregation to aggregate replicate embeddings for each perturbation. Args: embeddings (numpy.ndarray): The embeddings to be aggregated. metadata (pandas.DataFrame): The metadata containing information about the embeddings. - NTC_KEY (str, optional): The key for non-targeting controls in the metadata. Defaults to "non-targeting". + control_key (str, optional): The key for non-targeting controls in the metadata. Defaults to "non-targeting". PERT_COL (str, optional): The column in the metadata containing perturbation information. Defaults to "gene". Returns: @@ -74,12 +76,12 @@ def aggregate_by_mean(embeddings, metadata, NTC_KEY="non-targeting", PERT_COL="g - 'features': The aggregated embeddings. - 'metadata': A DataFrame containing the perturbation labels for each row in 'data'. """ - unique_perts = list(np.unique(metadata[PERT_COL].values)) - unique_perts.remove(NTC_KEY) + unique_perts = list(np.unique(metadata[pert_col].values)) + unique_perts.remove(control_key) final_embeddings = np.zeros((len(unique_perts), embeddings.shape[1])) for i, pert in enumerate(unique_perts): - idxs = np.where(metadata[PERT_COL].values == pert)[0] + idxs = np.where(metadata[pert_col].values == pert)[0] final_embeddings[i, :] = embeddings[idxs, :].mean(0) return Bunch( - features=pd.DataFrame(final_embeddings), metadata=pd.DataFrame.from_dict({cst.PERT_LABEL_COL: unique_perts}) + features=pd.DataFrame(final_embeddings), metadata=pd.DataFrame.from_dict({pert_col: unique_perts}) )