periscope pipeline (#29)

recursionpharma · Dec 3, 2023 · 5ec8b44 · 5ec8b44
1 parent f3090a9
commit 5ec8b44
Show file tree

Hide file tree

Showing 5 changed files with 1,502 additions and 11 deletions.
diff --git a/efaar_benchmarking/constants.py b/efaar_benchmarking/constants.py
@@ -9,9 +9,16 @@
 MIN_REQ_ENT_CNT = 20
 PERT_SIG_PVAL_COL = "gene_pvalue"
 PERT_SIG_PVAL_THR = 0.01
-CONTROL_PERT_LABEL = "non-targeting"
+
+REPLOGLE_CONTROL_PERT_LABEL = "non-targeting"
 REPLOGLE_PERT_LABEL_COL = "gene"
 REPLOGLE_BATCH_COL = "gem_group"
+
+JUMP_CONTROL_PERT_LABEL = "non-targeting"
 JUMP_PERT_LABEL_COL = "Metadata_Symbol"
 JUMP_PLATE_COL = "Metadata_Plate"
 JUMP_BATCH_COL = "Metadata_Batch"
+
+PERISCOPE_CONTROL_PERT_LABEL = "nontargeting"
+PERISCOPE_PERT_LABEL_COL = "Metadata_Foci_Barcode_MatchedTo_GeneCode"
+PERISCOPE_PLATE_COL = "Metadata_Plate"
diff --git a/efaar_benchmarking/data_loading.py b/efaar_benchmarking/data_loading.py
@@ -9,23 +9,78 @@
 import scanpy as sc
 import wget
 
+from efaar_benchmarking.constants import PERISCOPE_PLATE_COL
+
+
+def load_periscope(cell_type: str = "HeLa", normalized: bool = False) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """
+    Load PERISCOPE (cpg0021) data for a specific cell type.
+    Find more information about the dataset here: https://www.biorxiv.org/content/10.1101/2023.08.06.552164v1
+    The files containing metadata and CellProfiler features are downloaded from here:
+        https://cellpainting-gallery.s3.amazonaws.com/index.html#cpg0021-periscope/
+
+    One gzipped CSV is read per plate (in parallel, with anonymous S3 access) and the per-plate tables are
+    concatenated in the order of the plate list, so the row order is the same on every call.
+
+    Parameters:
+    cell_type (str, optional): The cell type to load data for, either "HeLa" or "A549". Defaults to "HeLa".
+    normalized (bool, optional): Whether to load normalized data. Defaults to False.
+
+    Returns:
+    tuple[pd.DataFrame, pd.DataFrame]: A tuple containing two DataFrames named `features` and `metadata`,
+        covering all plates of the requested cell type. `metadata` holds the gene code, barcode and plate
+        columns; `features` holds every remaining column that has no missing values.
+
+    Raises:
+    ValueError: If `cell_type` is neither "HeLa" nor "A549".
+    """
+    cp_feature_source_formatter = "s3://cellpainting-gallery/cpg0021-periscope/broad/workspace/profiles/{cell_type}/"
+    if cell_type == "A549":
+        plates = ["A", "B", "C", "D", "E", "F", "G", "H", "N"]
+        if normalized:
+            filename_formatter = "20200805_A549_WG_Screen_guide_normalized_ALLBATCHES___CP186{plate}___ALLWELLS.csv.gz"
+        else:
+            filename_formatter = "20200805_A549_WG_Screen_guide_ALLBATCHES___CP186{plate}___ALLWELLS.csv.gz"
+    elif cell_type == "HeLa":
+        plates = ["A", "B", "D", "F", "H", "J", "K", "L", "N"]
+        if normalized:
+            filename_formatter = "20210422_6W_CP257_guide_normalized_ALLBATCHES___CP257{plate}___ALLWELLS.csv.gz"
+        else:
+            filename_formatter = "20210422_6W_CP257_guide_ALLBATCHES___CP257{plate}___ALLWELLS.csv.gz"
+    else:
+        raise ValueError("cell_type must be either HeLa or A549")
+    cp_feature_source_formatter += filename_formatter
+
+    def read_plate(plate: str) -> pd.DataFrame:
+        # Read one plate's profile table and tag every row with its plate letter.
+        path = cp_feature_source_formatter.format(cell_type=cell_type, plate=plate)
+        per_plate = pd.read_csv(path, compression="gzip", storage_options={"anon": True})
+        per_plate[PERISCOPE_PLATE_COL] = plate
+        return per_plate
+
+    # executor.map yields results in the order of `plates` (not completion order), which keeps the
+    # concatenated row order deterministic across runs.
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        per_plate_frames = list(executor.map(read_plate, plates))
+
+    # ignore_index gives the combined table unique row labels instead of repeating each plate's own index.
+    per_data_all = pd.concat(per_plate_frames, ignore_index=True)
+    mcols = ["Metadata_Foci_Barcode_MatchedTo_GeneCode", "Metadata_Foci_Barcode_MatchedTo_Barcode", PERISCOPE_PLATE_COL]
+    # Split the concatenated table (all plates), not a single plate's frame.
+    metadata = per_data_all[mcols]
+    features = per_data_all.drop(mcols, axis=1).dropna(axis=1)
+    return features, metadata
+
 
 def load_cpg16_crispr(data_path: str = "data/") -> tuple[pd.DataFrame, pd.DataFrame]:
     """
-    Load and return the JUMP-CP (cpg0016) CRISPR dataset.
+    Load and return the JUMP-CP (cpg0016) CRISPR dataset CellProfiler features.
+    Find more information about the dataset here: https://www.biorxiv.org/content/10.1101/2023.03.23.534023v2
+    We download the metadata first, filter it to CRISPR plates, and load the features for these plates only.
     The metadata is downloaded from here:
         https://zenodo.org/records/7661296/files/jump-cellpainting/datasets-v0.5.0.zip?download=1
-    The cellprofiler features are downloaded from here:
+    The CellProfiler features corresponding to the appropriate plates are loaded from here:
         https://cellpainting-gallery.s3.amazonaws.com/index.html#cpg0016-jump/
-    We read the metadata first, filter it to CRISPR plates, and download the features for these plates only.
 
     Parameters:
     data_path (str): Path to the directory containing the dataset files.
 
     Returns:
-    tuple[pd.DataFrame, pd.DataFrame]: A tuple containing two DataFrames:
-        - features: A DataFrame containing the CRISPR dataset features.
-        - metadata: A DataFrame containing the CRISPR dataset metadata.
+    tuple[pd.DataFrame, pd.DataFrame]: A tuple containing two DataFrames named `features` and `metadata`,
+        representing the loaded data.
     """
     metadata_source_path = "https://zenodo.org/records/7661296/files/jump-cellpainting/datasets-v0.5.0.zip?download=1"
     plate_file_name = "plate.csv.gz"
@@ -81,8 +136,8 @@ def load_cpg16_crispr(data_path: str = "data/") -> tuple[pd.DataFrame, pd.DataFr
 
 def load_replogle(gene_type: str, data_type: str, data_path: str = "data/") -> sc.AnnData:
     """
-    Load Replogle et al. 2022 single-cell RNA-seq data for K562 cells  published here:
-        https://pubmed.ncbi.nlm.nih.gov/35688146/
+    Load Replogle et al. 2022 single-cell RNA-seq data for K562 cells.
+    Find more information about the dataset here: https://pubmed.ncbi.nlm.nih.gov/35688146/
     Four types of K562 data are downloaded using the links at:
         plus.figshare.com/articles/dataset/_Mapping_information-rich_genotype-phenotype_landscapes_with_genome-scale_Perturb-seq_Replogle_et_al_2022_processed_Perturb-seq_datasets/20029387
 

diff --git a/efaar_benchmarking/efaar.py b/efaar_benchmarking/efaar.py
@@ -90,6 +90,7 @@ def embed_align_by_pca(
         variance_or_ncomp (float, optional): Variance or number of components to keep after PCA.
             Defaults to 100 (n_components). If between 0 and 1, select the number of components such that
             the amount of variance that needs to be explained is greater than the percentage specified.
+            If 1, a single component is kept, and if None, all components are kept.
         plate_col (str, optional): Column name for plate metadata. Defaults to None.
     Returns:
         np.ndarray: Transformed data using PCA.
@@ -107,7 +108,7 @@ def align_on_controls(
     metadata: pd.DataFrame,
     scale: bool = True,
     pert_col: str = cst.REPLOGLE_PERT_LABEL_COL,
-    control_key: str = cst.CONTROL_PERT_LABEL,
+    control_key: str = cst.REPLOGLE_CONTROL_PERT_LABEL,
 ) -> np.ndarray:
     """
     Center the embeddings by the control perturbation units in the metadata.
@@ -133,7 +134,7 @@ def aggregate(
     embeddings: np.ndarray,
     metadata: pd.DataFrame,
     pert_col: str = cst.REPLOGLE_PERT_LABEL_COL,
-    control_key: str = cst.CONTROL_PERT_LABEL,
+    control_key: str = cst.REPLOGLE_CONTROL_PERT_LABEL,
     method="mean",
 ) -> Bunch[pd.DataFrame, pd.DataFrame]:
     """
@@ -168,6 +169,26 @@ def aggregate(
     return Bunch(features=pd.DataFrame(final_embeddings), metadata=pd.DataFrame.from_dict({pert_col: unique_perts}))
 
 
+def filter_to_perturbations(
+    features: pd.DataFrame, metadata: pd.DataFrame, perts: list[str], pert_col: str = cst.REPLOGLE_PERT_LABEL_COL
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """
+    Keep only the rows whose perturbation label is in the given list.
+
+    A boolean mask is computed from the metadata and the same mask is applied to both dataframes.
+
+    Args:
+        features (pd.DataFrame): Feature rows to subset.
+        metadata (pd.DataFrame): Metadata rows to subset; must contain `pert_col`.
+        perts (list[str]): Perturbation labels to retain.
+        pert_col (str, optional): Name of the metadata column holding the perturbation labels.
+            Defaults to cst.REPLOGLE_PERT_LABEL_COL.
+
+    Returns:
+        tuple[pd.DataFrame, pd.DataFrame]: The subset of `features` and the subset of `metadata`, in that order.
+    """
+    # True for every metadata row labelled with one of the requested perturbations.
+    keep_mask = metadata[pert_col].isin(perts)
+    kept_features = features[keep_mask]
+    kept_metadata = metadata[keep_mask]
+    return kept_features, kept_metadata
+
+
 def filter_cpg16_crispr(
     features: pd.DataFrame,
     metadata: pd.DataFrame,