Refactor Bandage script and add docstrings
ivartb committed Nov 3, 2023
1 parent 83b2fb2 commit 617ab3e
Showing 13 changed files with 156 additions and 109 deletions.
162 changes: 83 additions & 79 deletions bin/metafx-scripts/build_model_for_bandage.py
@@ -11,15 +11,27 @@
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# dump and load classification model
# to save and load classification model
from joblib import dump, load


def buildModelRandomForest(dataFile, rawLabels, nEstimators, maxDepth):
"""Fit Random Forest classification model
Arguments:
- dataFile (str): path to tab-separated file with features table
- rawLabels (pd.DataFrame): DataFrame with class labels
- nEstimators (int): number of Decision Trees in model
- maxDepth (int): maximal depth of Decision Tree
Returns:
sklearn.ensemble.RandomForestClassifier: fitted model
"""
data = pd.read_csv(dataFile, header=0, index_col=0, sep='\t')
if set(data.columns) != set(rawLabels.index):
data = data.filter(items=rawLabels.index, axis=1)
print("Samples from feature table and metadata does not match! Will use only " + str(data.shape[1]) + " common samples")
print("Samples from feature table and metadata does not match! " +
"Will use only " + str(data.shape[1]) + " common samples")
data = data.T
labels = np.array([rawLabels.loc[i, 1] for i in data.index])
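A minimal usage sketch for the builder above, assuming the two-column samples_categories.tsv layout that buildAndPrintModel loads further down (paths are illustrative):

    # Hypothetical usage of buildModelRandomForest; paths are illustrative.
    import pandas as pd

    # Assumed layout: sample name -> class label, tab-separated, no header,
    # mirroring the read_csv call in buildAndPrintModel below.
    rawLabels = pd.read_csv("workDir/samples_categories.tsv", sep="\t", index_col=0, header=None)
    rawLabels.index = rawLabels.index.astype(str)

    model = buildModelRandomForest("workDir/feature_table.tsv", rawLabels, nEstimators=100, maxDepth=5)
    print(model.classes_)  # class labels seen during fitting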

@@ -38,10 +50,22 @@ def buildModelRandomForest(dataFile, rawLabels, nEstimators, maxDepth):


def buildModelGradientBoosting(dataFile, rawLabels, nEstimators, maxDepth):
"""Fit Gradient Boosting classification model
Arguments:
- dataFile (str): path to tab-separated file with features table
- rawLabels (pd.DataFrame): DataFrame with class labels
- nEstimators (int): number of Decision Trees in model
- maxDepth (int): maximal depth of Decision Tree
Returns:
sklearn.ensemble.GradientBoostingClassifier: fitted model
"""
data = pd.read_csv(dataFile, header=0, index_col=0, sep='\t')
if set(data.columns) != set(rawLabels.index):
data = data.filter(items=rawLabels.index, axis=1)
print("Samples from feature table and metadata does not match! Will use only " + str(data.shape[1]) + " common samples")
print("Samples from feature table and metadata does not match! " +
"Will use only " + str(data.shape[1]) + " common samples")
data = data.T
labels = np.array([rawLabels.loc[i, 1] for i in data.index])

@@ -60,10 +84,22 @@ def buildModelGradientBoosting(dataFile, rawLabels, nEstimators, maxDepth):


def buildModelAdaBoost(dataFile, rawLabels, nEstimators, maxDepth):
"""Fit AdaBoost classification model
Arguments:
- dataFile (str): path to tab-separated file with features table
- rawLabels (pd.DataFrame): DataFrame with class labels
- nEstimators (int): number of Decision Trees in model
- maxDepth (int): maximal depth of Decision Tree
Returns:
sklearn.ensemble.AdaBoostClassifier: fitted model
"""
data = pd.read_csv(dataFile, header=0, index_col=0, sep='\t')
if set(data.columns) != set(rawLabels.index):
data = data.filter(items=rawLabels.index, axis=1)
print("Samples from feature table and metadata does not match! Will use only " + str(data.shape[1]) + " common samples")
print("Samples from feature table and metadata does not match! " +
"Will use only " + str(data.shape[1]) + " common samples")
data = data.T
labels = np.array([rawLabels.loc[i, 1] for i in data.index])

@@ -83,13 +119,26 @@ def buildModelAdaBoost(dataFile, rawLabels, nEstimators, maxDepth):
return model


def printModelBase(model, resFileName, sourceDir):
def printModel(model, resFileName, sourceDir, typeOfForest=0):
"""Print fitted model to file in format supported by BandageNG
Arguments:
- model: fitted model
- resFileName (str): filename to output result
- sourceDir (str): path to directory with features' sequences
- typeOfForest (int): 0 (RandomForest), 1 (GradientBoosting) or 2 (AdaBoost)
Returns:
None
"""
f = open(resFileName, 'w')
prefix = 0
features = dict()
treeClassifierList = model.estimators_
feature_names = model.feature_names_in_
classes = model.classes_
if typeOfForest == 1:
treeClassifierList = np.ravel(treeClassifierList)
for tc in treeClassifierList:
tree = tc.tree_
nodeIds = [0]
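The typeOfForest == 1 branch above exists because scikit-learn stores these ensembles differently: RandomForestClassifier.estimators_ and AdaBoostClassifier.estimators_ are flat lists of fitted trees, while GradientBoostingClassifier.estimators_ is a 2-D ndarray with one regression tree per boosting stage and class, hence the np.ravel. A small self-contained check (illustrative data, not part of the commit):

    # Why np.ravel is needed for GradientBoosting only (illustrative data).
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

    X, y = make_classification(n_samples=60, n_features=8, n_classes=3,
                               n_informative=4, random_state=0)
    rf = RandomForestClassifier(n_estimators=5, random_state=0).fit(X, y)
    gb = GradientBoostingClassifier(n_estimators=5, random_state=0).fit(X, y)

    print(len(rf.estimators_))            # 5  -> already a flat list of trees
    print(gb.estimators_.shape)           # (5, 3) -> one tree per stage and class
    print(len(np.ravel(gb.estimators_)))  # 15 -> flattened, as printModel expects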
@@ -104,79 +153,25 @@
value = tree.value[nodeId].T[0]
class_name = np.argmax(value)
print("C", nodeId + prefix, classes[class_name], sep="\t", file=f)
continue
childLeftId = tree.children_left[nodeId]
nodeIds.append(childLeftId)
print(childLeftId + prefix, sep="\t", end="\t", file=f)

childRightId = tree.children_right[nodeId]
nodeIds.append(childRightId)
print(childRightId + prefix, sep="\t", end="\t", file=f)
print(file=f)
feature = feature_names[tree.feature[nodeId]]
threshold = tree.threshold[nodeId]
classF = feature.split("_")[0]
print("C", nodeId + prefix, classF, sep="\t", file=f)
print("F", nodeId + prefix, feature, "{:.2f}".format(threshold), sep="\t", end="\n", file=f)
if feature in features:
features[feature].append(nodeId + prefix)
else:
features[feature] = [nodeId + prefix]
prefix += tree.node_count

for fClass in classes:
file = open(sourceDir + "/contigs_" + fClass + "/components.seq.fasta", 'r')
line = file.readline()
while line:
feature = fClass + "_" + line[1:].split("_")[0]
seq = file.readline().strip()
if feature in features:
print("S", feature, seq, sep="\t", file=f)
line = file.readline()
f.close()


def printModelGradientBoosting(model, resFileName, sourceDir):
f = open(resFileName, 'w')
prefix = 0
features = dict()
treeClassifierList = model.estimators_
feature_names = model.feature_names_in_
classes = model.classes_
for i in treeClassifierList:
for tc in i:
tree = tc.tree_
nodeIds = [0]
while (len(nodeIds) > 0):
nodeId = nodeIds.pop(0)
print("N", nodeId + prefix, sep="\t", end="\t", file=f)
if tree.children_left[nodeId] == tree.children_right[nodeId]:
print(file=f)
if tree.n_outputs == 1:
value = tree.value[nodeId][0]
else:
value = tree.value[nodeId].T[0]
class_name = np.argmax(value)
print("C", nodeId + prefix, classes[class_name], sep="\t", file=f)
continue
childLeftId = tree.children_left[nodeId]
nodeIds.append(childLeftId)
print(childLeftId + prefix, sep="\t", end="\t", file=f)
print(childLeftId + prefix, end="\t", file=f)

childRightId = tree.children_right[nodeId]
nodeIds.append(childRightId)
print(childRightId + prefix, sep="\t", end="\t", file=f)
print(file=f)
print(childRightId + prefix, file=f)

feature = feature_names[tree.feature[nodeId]]
threshold = tree.threshold[nodeId]
classF = feature.split("_")[0]
print("C", nodeId + prefix, classF, sep="\t", file=f)
print("F", nodeId + prefix, feature, "{:.2f}".format(threshold), sep="\t", end="\n", file=f)
print("F", nodeId + prefix, feature, "{:.2f}".format(threshold), sep="\t", file=f)
if feature in features:
features[feature].append(nodeId + prefix)
else:
features[feature] = [nodeId + prefix]
prefix += tree.node_count
prefix += tree.node_count

for fClass in classes:
file = open(sourceDir + "/contigs_" + fClass + "/components.seq.fasta", 'r')
@@ -191,24 +186,33 @@ def printModelGradientBoosting(model, resFileName, sourceDir):


def buildAndPrintModel(sourceDir, treeNum, maxDepth, typeOfForest, resFile, model=None):
"""Wrapper to fit and print classification model
Arguments:
- sourceDir (str): path to directory with features' sequences
- treeNum (int): number of Decision Trees in model
- maxDepth (int): maximal depth of Decision Tree
- typeOfForest (int): 0 (RandomForest), 1 (GradientBoosting) or 2 (AdaBoost)
- resFile (str): filename to output result
- model: pre-fitted model
Returns:
None
"""
rawLabels = pd.read_csv(sourceDir + '/samples_categories.tsv', sep="\t", index_col=0, header=None)
rawLabels.index = rawLabels.index.astype(str)

if typeOfForest == 0:
if model is None:
model = buildModelRandomForest(sourceDir + '/feature_table.tsv', rawLabels, treeNum, maxDepth)
dump(model, resFile[:-4]+".joblib")
printModelBase(model, resFile, sourceDir)
elif typeOfForest == 1:
if model is None:
model = buildModelGradientBoosting(sourceDir + '/feature_table.tsv', rawLabels, treeNum, maxDepth)
dump(model, resFile[:-4]+".joblib")
printModelGradientBoosting(model, resFile, sourceDir)
else:
if model is None:
model = buildModelAdaBoost(sourceDir + '/feature_table.tsv', rawLabels, treeNum, maxDepth)
dump(model, resFile[:-4]+".joblib")
printModelBase(model, resFile, sourceDir)
if model is None:
dataFile = sourceDir + '/feature_table.tsv'
if typeOfForest == 0:
model = buildModelRandomForest(dataFile, rawLabels, treeNum, maxDepth)
elif typeOfForest == 1:
model = buildModelGradientBoosting(dataFile, rawLabels, treeNum, maxDepth)
elif typeOfForest == 2:
model = buildModelAdaBoost(dataFile, rawLabels, treeNum, maxDepth)
dump(model, resFile[:-4] + ".joblib")

printModel(model, resFile, sourceDir, typeOfForest)


if __name__ == "__main__":
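The __main__ block is collapsed in this view; based on the signature above, a direct call could look like this (a sketch assuming the directory layout implied by the docstrings — feature_table.tsv, samples_categories.tsv and contigs_<class>/components.seq.fasta under sourceDir):

    # Hypothetical invocation; the directory layout is inferred from the docstrings.
    buildAndPrintModel(sourceDir="workDir",  # holds feature_table.tsv and metadata
                       treeNum=100,          # number of Decision Trees
                       maxDepth=5,
                       typeOfForest=0,       # 0 = RandomForest
                       resFile="model.txt")  # model.joblib is written alongside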
9 changes: 5 additions & 4 deletions bin/metafx-scripts/cv.py
@@ -20,7 +20,8 @@

if set(features.columns) != set(metadata.index):
features = features.filter(items=metadata.index, axis=1)
print("Samples from feature table and metadata does not match! Will use only " + str(features.shape[1]) + " common samples")
print("Samples from feature table and metadata does not match! " +
"Will use only " + str(features.shape[1]) + " common samples")

M = features.shape[0] # features count
N = features.shape[1] # samples count
@@ -38,12 +39,12 @@
df_grid = pd.DataFrame.from_dict(clf.cv_results_).filter(regex='param_.*|mean_test_score|std_test_score|rank_test_score')
df_grid = df_grid.sort_values(by="rank_test_score", kind="mergesort")
print("\nGrid search cross-validation accuracy:")
print(df_grid.head(10).to_string(index=False), "."*88, df_grid.tail(10).to_string(index=False), sep="\n")
print(df_grid.head(10).to_string(index=False), "." * 88, df_grid.tail(10).to_string(index=False), sep="\n")
print("\nSelected parameters for best Random Forest classifier:")
for k, v in clf.best_params_.items():
print("\t", k, "=", v)
print()
dump(clf.best_estimator_, outName+".joblib")
dump(clf.best_estimator_, outName + ".joblib")
print("Model accuracy after training:")
print(classification_report(y, clf.best_estimator_.predict(X)))
else: # performing cross-validation
@@ -63,6 +64,6 @@

model = RandomForestClassifier(n_estimators=100)
model.fit(X, y)
dump(model, outName+".joblib")
dump(model, outName + ".joblib")
print("Model accuracy after training:")
print(classification_report(y, model.predict(X)))
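Both branches persist the trained estimator via joblib, so downstream reuse is a short round-trip (sketch; the filenames are illustrative and the feature matrix must keep the same samples-as-columns layout used at fit time):

    # Reload the persisted classifier and reuse it (illustrative paths).
    from joblib import load
    import pandas as pd

    model = load("rf_model.joblib")  # written as outName + ".joblib" above
    features = pd.read_csv("feature_table.tsv", sep="\t", header=0, index_col=0)
    print(model.predict(features.T))  # transpose: rows become samples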
5 changes: 3 additions & 2 deletions bin/metafx-scripts/fit.py
@@ -15,7 +15,8 @@

if set(features.columns) != set(metadata.index):
features = features.filter(items=metadata.index, axis=1)
print("Samples from feature table and metadata does not match! Will use only " + str(features.shape[1]) + " common samples")
print("Samples from feature table and metadata does not match! " +
"Will use only " + str(features.shape[1]) + " common samples")

M = features.shape[0] # features count
N = features.shape[1] # samples count
@@ -25,7 +26,7 @@
y = [metadata.loc[i, 1] for i in X.index]

model.fit(X, y)
dump(model, outName+".joblib")
dump(model, outName + ".joblib")

print("Model accuracy after training:")
print(classification_report(y, model.predict(X)))
9 changes: 5 additions & 4 deletions bin/metafx-scripts/fit_predict.py
@@ -17,7 +17,8 @@
features_train = features.filter(items=metadata.index, axis=1)
features_test = features[features.columns.difference(metadata.index)]
predict = True
print("Will use " + str(features_train.shape[1]) + " common samples for model training and " + str(features_test.shape[1]) + " samples to predict new labels")
print("Will use " + str(features_train.shape[1]) + " common samples for model training " +
"and " + str(features_test.shape[1]) + " samples to predict new labels")
else:
features_train = features
predict = False
@@ -28,7 +29,7 @@
y_train = [metadata.loc[i, 1] for i in X_train.index]

model.fit(X_train, y_train)
dump(model, outName+".joblib")
dump(model, outName + ".joblib")

print("Model accuracy after training:")
print(classification_report(y_train, model.predict(X_train)))
@@ -37,8 +38,8 @@
X_test = features_test.T
y_pred = model.predict(X_test)

outFile = open(outName+".tsv", "w")
outFile = open(outName + ".tsv", "w")
for sam, pred in zip(X_test.index, y_pred):
print(sam, pred, sep="\t", file=outFile)
outFile.close()
print("Predicted labels saved to "+outName+".tsv")
print("Predicted labels saved to " + outName + ".tsv")
2 changes: 2 additions & 0 deletions bin/metafx-scripts/get_G.sh
@@ -1,4 +1,6 @@
#!/bin/bash
# Utility for detecting the greatest G value such that the number of unique k-mers exceeds 100000

a=$(grep "of them is good" ${1} | grep -o -E ", [0-9']+ \(")
a=$(echo "$a" | sed "s/'//g" | grep -o -E "[0-9]+")

6 changes: 3 additions & 3 deletions bin/metafx-scripts/graph2contigs.py
@@ -5,10 +5,10 @@

if __name__ == "__main__":
wd = sys.argv[1]
file = open(wd+"/components.seq.fasta", "w")
file = open(wd + "/components.seq.fasta", "w")
comp = -1
comp_i = 0
for line in open(wd+"/components-graph.gfa"):
for line in open(wd + "/components-graph.gfa"):
if line.split()[0] == 'S':
_, name, seq, *_ = line.strip().split(sep="\t")
name = int(name.split("_")[1][1:])
@@ -18,4 +18,4 @@
comp_i += 1
print(">" + str(comp) + "_" + str(comp_i), file=file)
print(seq, file=file)
file.close()
file.close()
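To make the renaming concrete: only GFA S (segment) records are kept, and the component id is recovered from the segment name. Assuming hypothetical names of the form k141_C7_0, where the second underscore-separated field encodes the component (inferred from the split above), one record transforms as:

    # Hypothetical walk-through of one GFA segment record; the name format
    # "k141_C7_0" is an assumption inferred from name.split("_")[1][1:].
    line = "S\tk141_C7_0\tACGTACGT\t*\n"
    _, name, seq, *_ = line.strip().split(sep="\t")
    comp = int(name.split("_")[1][1:])  # "C7" -> 7
    print(comp, seq)                    # 7 ACGTACGT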
12 changes: 10 additions & 2 deletions bin/metafx-scripts/join_feature_vectors.py
@@ -6,7 +6,15 @@


def load_cat(cat, wd):
# df_list = [pd.read_csv(wd + "/features_" + cat + "/vectors/" + file + ".breadth", header=None, index_col=None) for file in all_files]
"""Concatenate feature vectors for all files for given category into one table
Arguments:
cat (str): name of category
wd (str): path to directory with feature vectors files
Returns:
pd.DataFrame: table of features of shape (n_features, n_samples)
"""
all_files = glob.glob(wd + "/features_" + cat + "/vectors/" + "*.breadth")
df_list = [pd.read_csv(file, header=None, index_col=None) for file in all_files]
data = pd.concat(df_list, axis=1)
@@ -29,5 +37,5 @@ def load_cat(cat, wd):
subtables.append(load_cat(cat, wd))

feature_table = pd.concat(subtables, axis=0)
feature_table.to_csv(wd+"/feature_table.tsv", sep="\t")
feature_table.to_csv(wd + "/feature_table.tsv", sep="\t")
print("Total " + str(feature_table.shape[0]) + " features found!")
4 changes: 2 additions & 2 deletions bin/metafx-scripts/join_gfa.py
@@ -5,7 +5,7 @@

if __name__ == "__main__":
wd = sys.argv[1]
file = open(wd+"/all-components-graph.gfa", "w")
file = open(wd + "/all-components-graph.gfa", "w")
cat = 0
m = dict()
for fin in sys.argv[2:]:
@@ -19,4 +19,4 @@
if line.split()[0] == 'L':
a, b, c, d, e, f = line.strip().split(sep="\t")
print(a, m[b], c, m[d], e, f, sep="\t", file=file)
file.close()
file.close()