Murali-group · neelspatel999 · Oct 3, 2021 · Oct 11, 2021 · Nov 15, 2021 · Nov 15, 2021
diff --git a/BoolODE/__init__.py b/BoolODE/__init__.py
@@ -36,12 +36,14 @@ def __init__(self,
                  dimred_jobs,
                  slingshot_jobs,
                  gensample_jobs,
+                 compare_ss_jobs,
                  geneexpression_jobs) -> None:
 
         self.dropout_jobs = dropout_jobs
         self.dimred_jobs = dimred_jobs
         self.slingshot_jobs = slingshot_jobs
         self.gensample_jobs = gensample_jobs
+        self.compare_ss_jobs = compare_ss_jobs
         self.geneexpression_jobs = geneexpression_jobs        
 
 class BoolODE(object):
@@ -147,6 +149,7 @@ def do_post_processing(self):
         if self.post_settings.dropout_jobs\
            or self.post_settings.dimred_jobs\
            or self.post_settings.geneexpression_jobs\
+           or self.post_settings.compare_ss_jobs\
            or self.post_settings.slingshot_jobs:
             doOtherAnalysis = True
         generatedPaths = {}
@@ -203,7 +206,37 @@ def do_post_processing(self):
                             po.genDropouts(settings)
                     if num_invalid == len(alljobs):
                         break
-
+        # compare_ss_jobs (compare steady states jobs)
+        if self.post_settings.compare_ss_jobs is not None:
+            print("Starting to compare steady states...")
+            for ss_comparison_job in self.post_settings.compare_ss_jobs:
+                for jobid in alljobs:
+                    for gsampPath in generatedPaths[jobid]:
+                        # outputPath = self.jobs[jobid]['outputPath']
+                        settings = {}
+                        settings['outputPath'] = ss_comparison_job.get('outputPath')
+                        settings['perform_PyBoolNet'] = ss_comparison_job.get('perform_PyBoolNet', False)
+                        # To use genSamples generated ExpressionData for
+                        # this specific post-processing test as ExpressionData
+                        # settings['expressionDataFileLocation'] = Path(gsampPath,\
+                        #                     'ExpressionData.csv')
+
+                        # Use BoolODE output (the one the user sees) as ExpressionData
+                        settings['expressionDataFileLocation'] = str(
+                            self.jobs[jobid]['outprefix']) + "/ExpressionData.csv"
+
+                        data_file_name = ""
+                        for jobid2, joby in enumerate(self.job_settings.jobs):
+                            data_file_name = str(Path(self.global_settings.model_dir, joby.get('model_definition', '')))
+                        parent_directory_path = Path(self.global_settings.model_dir)
+                        parent_directory_path = parent_directory_path.parent.absolute()
+
+                        # Will need to know what model is being analyzed if
+                        # it is running PyBoolNet
+
+                        settings['modelPath'] = str(parent_directory_path) + "/" + data_file_name
+                        po.compare_ss(settings)
+
         if self.post_settings.dimred_jobs is not None:
             print("Starting dimesionality reduction using tSNE")
             for dimred_jobs in self.post_settings.dimred_jobs:
@@ -342,7 +375,8 @@ def __parse_postproc_settings(input_settings_map) -> GlobalSettings:
         slingshot_jobs = input_settings_map.get('Slingshot', None)
         dimred_jobs = input_settings_map.get('DimRed', None)
         gensample_jobs = input_settings_map.get('GenSamples', None)
+        compare_ss_jobs = input_settings_map.get('CompareSteadyStates', None)
         geneexpression_jobs = input_settings_map.get('GeneExpression', None)        
         return PostProcSettings(dropout_jobs, dimred_jobs,
                                 slingshot_jobs, gensample_jobs,
-                                geneexpression_jobs)
+                                compare_ss_jobs, geneexpression_jobs)
diff --git a/BoolODE/compare_steady_states.py b/BoolODE/compare_steady_states.py
@@ -0,0 +1,241 @@
+"""
+ This script DBSCAN clustering
+ on a provided ExpressionData.CSV file to estimate
+ the number of steady states for a particular boolean model.
+
+ @Author: Neel Patel ([email protected])
+ @Author: Madeline Shaklee ([email protected])
+ @Author: Amelia Whitehead ([email protected])
+"""
+
+def compare_ss(expression_data_location, output_file_path, compare_with_pyboolnet, model_path):
+    """
+     Main caller of any steady state comparison techniques.
+
+     @Params:       expression_data_location: the full file path of ExpressionData
+                    output_file_path: the file path where output files should go
+                    compare_with_pyboolnet: 1 if compare output with PyBoolNet, 0 otherwise
+                    model_path: States the path of the model. Needed for PyBoolNet to analyze steady states
+    """
+    import os
+    # Perform DBSCAN, a clustering technique to approximate the number of steady states.
+    n_clusters_ = run_dbscan_clustering(expression_data_location, output_file_path)
+
+    # (Optional) Perform PyBoolNet to calculate the steady states and report the number of steady states.
+    pyboolnet_result = -1
+    if compare_with_pyboolnet:
+        # Note: run_pyboolnet still has initial conditions section to be added
+        pyboolnet_result = run_pyboolnet(False, model_path)
+
+    # Only do these print statements if PyBoolNet was ran.
+    if pyboolnet_result != -1:
+        if n_clusters_ == pyboolnet_result:
+            if n_clusters_ == 1:
+                print("\nDBSCAN and PyBoolNet results match and found " + str(n_clusters_) + " steady state\n")
+            else:
+                print("\nDBSCAN and PyBoolNet results match and found " + str(n_clusters_) + " steady states\n")
+        else:
+            print("\nDBSCAN and PyBoolNet outputs did not match. \nDBSCAN: " + str(n_clusters_) + ", PyBoolNet: " + str(
+                pyboolnet_result) + "\n")
+    output_file_path = os.path.join(output_file_path, "steady_state_comparison.txt")
+    outputfile = open(output_file_path, "w")
+    outputfile.write("DBSCAN Number of Steady State Estimation: " + str(n_clusters_) + "\n")
+    if pyboolnet_result != -1:
+        outputfile.write("PyBoolNet Number of Steady States: " + str(pyboolnet_result) + "\n")
+        outputfile.write("\n")
+        if n_clusters_ == pyboolnet_result:
+            if n_clusters_ == 1:
+                outputfile.write("DBSCAN and PyBoolNet outputs match and found " + str(n_clusters_) + " steady state")
+            else:
+                outputfile.write("DBSCAN and PyBoolNet outputs match and found " + str(n_clusters_) + " steady states")
+        else:
+            outputfile.write("DBSCAN and PyBoolNet outputs did not match.")
+    else:
+        outputfile.write("PyBoolNet was not ran because it was not requested")
+    print("Steady state comparison results were written to steady_state_comparison.txt\n")
+
+
+
+def run_dbscan_clustering(expression_data_location, output_file_path):
+    """
+     Performs DBSCAN on the ExpressionData file and generates a DBSCAN_ClusterIDs.csv file, which specifies
+     which cell belongs to what cluster, according to DBSCAN. All noise points are grouped together into
+     a separate cluster for visualization purposes.
+
+     @Precondition: The number of samples (cells) needs to be greater than two
+                    times the number of genes
+
+     @Note:         DBSCAN does not work well with datasets of varying density
+
+     @Params:       expression_data_location: the full file path of ExpressionData
+                    output_file_path: the file path where output files should go
+
+     @Citations:    https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py
+                    https://medium.com/@tarammullin/dbscan-parameter-estimation-ff8330e3a3bd
+    """
+    import os
+    from sklearn.cluster import DBSCAN
+    from sklearn.neighbors import NearestNeighbors
+    from kneed import KneeLocator
+    import numpy as np
+    import pandas
+
+    # ExpressionData is a matrix, where rows are genes and columns are cells
+    expression_df = pandas.read_csv(expression_data_location, sep=',', index_col=0)
+
+    # Performs transpose for later clustering on the samples.
+    # We cluster the cell columns of ExpressionData
+    expression_data_transpose = np.array(expression_df.T.to_numpy().tolist())
+    # ExpressionData is a matrix, where rows are genes and columns are cells
+    # expression_df = pandas.read_csv(expression_data_location, sep=',', index_col=0)
+
+    # Performs transpose for later clustering on the samples.
+    # We cluster the cell columns of ExpressionData
+    # expression_data_transpose = np.array(data_frame.tolist())
+
+    # DBSCAN takes two parameters, MinSamples and Epsilon
+    # MinSamples is described now, and Epsilon will be described later
+
+    # The MinSamples parameter is the fewest number of samples (cells)
+    # DBSCAN will put in a cluster. We assign MinSamples
+    # to two times the number of genes in ExpressionData
+    min_samples_num = len(expression_df.index) * 2
+
+    ############# Below is from Medium.com reference #############
+
+    # Compute nearest neighbors and distances
+    neighbors = NearestNeighbors(n_neighbors=min_samples_num)
+
+    neighbors_fit = neighbors.fit(expression_data_transpose)
+    distances, indices = neighbors_fit.kneighbors(expression_data_transpose)
+
+    # Sort distances in ascending order
+    distances = np.sort(distances, axis=0)
+
+    ############# End of Medium.com reference #############
+
+    # Find the average k-distances then plot to find the knee,
+    # which is the point of maximum curvature
+    # y_values = compute_average_distances(distances)
+    #############################################################
+    denominator_of_average = len(distances[0])
+    # Will be of length number of samples (cells) in the
+    # ExpressionData, once full
+
+    average_distances = []
+    for distances_element in distances:
+        distance_sum = 0
+        for distance_value in distances_element:
+            distance_sum = distance_sum + distance_value
+        average_distance = distance_sum / denominator_of_average
+        average_distances.append(average_distance)
+    y_values = average_distances
+    ############################################
+    x_values = list(range(1, len(y_values) + 1))
+    knee_locator = KneeLocator(x_values, y_values, S=1.0, curve='convex', direction='increasing')
+    maximum_curvature_position = round(knee_locator.knee, 20)
+
+    # The Epsilon parameter of DBSCAN represents the upper bound (exclusive)
+    # distance between two points to be clustered together.
+    epsilon = y_values[maximum_curvature_position - 1]
+
+    ############# Below is from the scikit-learn reference #############
+
+    # Performs DBSCAN with the calculated parameters
+    db = DBSCAN(eps=epsilon, min_samples=min_samples_num).fit(expression_data_transpose)
+
+    clusters_identifiers = db.fit_predict(expression_data_transpose)
+    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
+    core_samples_mask[db.core_sample_indices_] = True
+    labels = db.labels_
+
+    # Number of clusters in labels, ignoring noise if present.
+    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
+    n_noise_ = list(labels).count(-1)
+
+    print("DBSCAN result:")
+    print('Estimated number of clusters: %d' % n_clusters_)
+    print('Estimated number of noise points: %d \n' % n_noise_)
+
+    ############# End of scikit-learn reference #############
+
+    # Adding the cluster labels to the dataframe
+    expression_df.loc['cluster_labels', :] = clusters_identifiers
+
+    # Write a new ClusterID file according to DBSCAN's clustering
+    cluster_values = expression_df.loc['cluster_labels'].tolist()
+    cluster_values = [int(number) for number in cluster_values]
+    cell_names = expression_df.columns.tolist()
+    dictionary = {'': cell_names, 'cl': cluster_values}
+    cluster_df = pandas.DataFrame(dictionary)
+    output_file_path = os.path.join(output_file_path, 'DBSCAN_ClusterIDs.csv')
+    cluster_df.to_csv(output_file_path, index=False)
+
+    print("DBSCAN analysis generated a DBSCAN_ClusterIDs.csv file.")
+    return n_clusters_
+
+
+# Runs PyBoolNet
+def run_pyboolnet(ics_present, model_path):
+    """
+         Performs PyBoolNet to get the steady states. Information on PyBoolNet can be
+         found at https://pyboolnet.readthedocs.io/en/master/
+
+         @Params:       ics_present: True if initial conditions are present, False otherwise
+                        model_path: States the path of the model. Needed for PyBoolNet to analyze steady states
+        """
+    #### Still has initial conditions section to be added
+    print("Starting PyBoolNet...")
+
+    import PyBoolNet
+    import pandas as pd
+    import os
+
+    input_file = open(model_path, "r")
+    temp_bnet_file = open(model_path + "_temporary_bnet.txt", "x")
+    for line in input_file:
+        # If it is not the first line
+        if "".join(line.split()).lower() != "generule":
+            # Convert the format and remove tabs.
+            py_formatted_line = line.replace('\t', ',', 1)
+            py_formatted_line = py_formatted_line.replace("and", "&")
+            py_formatted_line = py_formatted_line.replace("or", "|")
+            py_formatted_line = py_formatted_line.replace("not", "!")
+            py_formatted_line = py_formatted_line.replace('\t', ' ')
+            temp_bnet_file.write(py_formatted_line)
+            temp_bnet_file.write('\n')
+
+    input_file.close()
+    temp_bnet_file.close()
+
+    primes = PyBoolNet.FileExchange.bnet2primes(model_path + "_temporary_bnet.txt")
+    steady_states = PyBoolNet.AspSolver.steady_states(primes)
+    df = pd.DataFrame(steady_states)
+
+    print("PyBoolNet found " + str(len(df)) + " steady state/s total")
+
+    # Delete the temporary file.
+    os.remove(model_path + "_temporary_bnet.txt")
+
+    # Initial conditions and reachability from a given initial condition is TBD
+    # We currently always set this to always false. Still need to work on this section
+    unreachable_steady_states = 0
+    if ics_present:
+        current_goal_state = ""
+        ic = "1--00"
+        list_of_steady_states = df.values.tolist()
+        # Find the number of unreachable_steady_states
+        for steady_state in list_of_steady_states:
+            current_goal_state = ','.join(str(v) for v in steady_state)
+            current_goal_state = current_goal_state.replace(',', '')
+            current_goal_state = current_goal_state.strip()
+            path = PyBoolNet.StateTransitionGraphs.best_first_reachability(primes, InitialSpace=ic,
+                                                                           GoalSpace=current_goal_state)
+            if path:
+                # for x in path:
+                #     print(x)
+                print("A path was found for steady state: " + current_goal_state)
+            else:
+                print("No path was found for steady state: " + current_goal_state)
+
+    return len(df) - unreachable_steady_states