Refactor Bandage script and add docstrings
ivartb committed Nov 3, 2023
1 parent 83b2fb2 commit 617ab3e
Showing 13 changed files with 156 additions and 109 deletions.
162 changes: 83 additions & 79 deletions bin/metafx-scripts/build_model_for_bandage.py
@@ -11,15 +11,27 @@
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# dump and load classification model
# to save and load classification model
from joblib import dump, load


def buildModelRandomForest(dataFile, rawLabels, nEstimators, maxDepth):
"""Fit Random Forest classification model
Arguments:
- dataFile (str): path to tab-separated file with features table
- rawLabels (pd.DataFrame): DataFrame with class labels
- nEstimators (int): number of Decision Trees in model
- maxDepth (int): maximal depth of Decision Tree
Returns:
sklearn.ensemble.RandomForestClassifier: fitted model
"""
data = pd.read_csv(dataFile, header=0, index_col=0, sep='\t')
if set(data.columns) != set(rawLabels.index):
data = data.filter(items=rawLabels.index, axis=1)
print("Samples from feature table and metadata does not match! Will use only " + str(data.shape[1]) + " common samples")
print("Samples from feature table and metadata does not match! " +
"Will use only " + str(data.shape[1]) + " common samples")
data = data.T
labels = np.array([rawLabels.loc[i, 1] for i in data.index])
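A minimal usage sketch for the builder above, assuming the two-column samples_categories.tsv layout that buildAndPrintModel loads further down (paths are illustrative):

    # Hypothetical usage of buildModelRandomForest; paths are illustrative.
    import pandas as pd

    # Assumed layout: sample name -> class label, tab-separated, no header,
    # mirroring the read_csv call in buildAndPrintModel below.
    rawLabels = pd.read_csv("workDir/samples_categories.tsv", sep="\t", index_col=0, header=None)
    rawLabels.index = rawLabels.index.astype(str)

    model = buildModelRandomForest("workDir/feature_table.tsv", rawLabels, nEstimators=100, maxDepth=5)
    print(model.classes_)  # class labels seen during fitting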

@@ -38,10 +50,22 @@ def buildModelRandomForest(dataFile, rawLabels, nEstimators, maxDepth):


def buildModelGradientBoosting(dataFile, rawLabels, nEstimators, maxDepth):
"""Fit Gradient Boosting classification model
Arguments:
- dataFile (str): path to tab-separated file with features table
- rawLabels (pd.DataFrame): DataFrame with class labels
- nEstimators (int): number of Decision Trees in model
- maxDepth (int): maximal depth of Decision Tree
Returns:
sklearn.ensemble.GradientBoostingClassifier: fitted model
"""
data = pd.read_csv(dataFile, header=0, index_col=0, sep='\t')
if set(data.columns) != set(rawLabels.index):
data = data.filter(items=rawLabels.index, axis=1)
print("Samples from feature table and metadata does not match! Will use only " + str(data.shape[1]) + " common samples")
print("Samples from feature table and metadata does not match! " +
"Will use only " + str(data.shape[1]) + " common samples")
data = data.T
labels = np.array([rawLabels.loc[i, 1] for i in data.index])

@@ -60,10 +84,22 @@ def buildModelGradientBoosting(dataFile, rawLabels, nEstimators, maxDepth):


def buildModelAdaBoost(dataFile, rawLabels, nEstimators, maxDepth):
"""Fit AdaBoost classification model
Arguments:
- dataFile (str): path to tab-separated file with features table
- rawLabels (pd.DataFrame): DataFrame with class labels
- nEstimators (int): number of Decision Trees in model
- maxDepth (int): maximal depth of Decision Tree
Returns:
sklearn.ensemble.AdaBoostClassifier: fitted model
"""
data = pd.read_csv(dataFile, header=0, index_col=0, sep='\t')
if set(data.columns) != set(rawLabels.index):
data = data.filter(items=rawLabels.index, axis=1)
print("Samples from feature table and metadata does not match! Will use only " + str(data.shape[1]) + " common samples")
print("Samples from feature table and metadata does not match! " +
"Will use only " + str(data.shape[1]) + " common samples")
data = data.T
labels = np.array([rawLabels.loc[i, 1] for i in data.index])

@@ -83,13 +119,26 @@ def buildModelAdaBoost(dataFile, rawLabels, nEstimators, maxDepth):
return model


def printModelBase(model, resFileName, sourceDir):
def printModel(model, resFileName, sourceDir, typeOfForest=0):
"""Print fitted model to file in format supported by BandageNG
Arguments:
- model: fitted model
- resFileName (str): filename to output result
- sourceDir (str): path to directory with features' sequences
- typeOfForest (int): 0 (RandomForest), 1 (GradientBoosting) or 2 (AdaBoost)
Returns:
None
"""
f = open(resFileName, 'w')
prefix = 0
features = dict()
treeClassifierList = model.estimators_
feature_names = model.feature_names_in_
classes = model.classes_
if typeOfForest == 1:
treeClassifierList = np.ravel(treeClassifierList)
for tc in treeClassifierList:
tree = tc.tree_
nodeIds = [0]
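The typeOfForest == 1 branch above exists because scikit-learn stores these ensembles differently: RandomForestClassifier.estimators_ and AdaBoostClassifier.estimators_ are flat lists of fitted trees, while GradientBoostingClassifier.estimators_ is a 2-D ndarray with one regression tree per boosting stage and class, hence the np.ravel. A small self-contained check (illustrative data, not part of the commit):

    # Why np.ravel is needed for GradientBoosting only (illustrative data).
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

    X, y = make_classification(n_samples=60, n_features=8, n_classes=3,
                               n_informative=4, random_state=0)
    rf = RandomForestClassifier(n_estimators=5, random_state=0).fit(X, y)
    gb = GradientBoostingClassifier(n_estimators=5, random_state=0).fit(X, y)

    print(len(rf.estimators_))            # 5  -> already a flat list of trees
    print(gb.estimators_.shape)           # (5, 3) -> one tree per stage and class
    print(len(np.ravel(gb.estimators_)))  # 15 -> flattened, as printModel expects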
@@ -104,79 +153,25 @@
value = tree.value[nodeId].T[0]
class_name = np.argmax(value)
print("C", nodeId + prefix, classes[class_name], sep="\t", file=f)
continue
childLeftId = tree.children_left[nodeId]
nodeIds.append(childLeftId)
print(childLeftId + prefix, sep="\t", end="\t", file=f)

childRightId = tree.children_right[nodeId]
nodeIds.append(childRightId)
print(childRightId + prefix, sep="\t", end="\t", file=f)
print(file=f)
feature = feature_names[tree.feature[nodeId]]
threshold = tree.threshold[nodeId]
classF = feature.split("_")[0]
print("C", nodeId + prefix, classF, sep="\t", file=f)
print("F", nodeId + prefix, feature, "{:.2f}".format(threshold), sep="\t", end="\n", file=f)
if feature in features:
features[feature].append(nodeId + prefix)
else:
features[feature] = [nodeId + prefix]
prefix += tree.node_count

for fClass in classes:
file = open(sourceDir + "/contigs_" + fClass + "/components.seq.fasta", 'r')
line = file.readline()
while line:
feature = fClass + "_" + line[1:].split("_")[0]
seq = file.readline().strip()
if feature in features:
print("S", feature, seq, sep="\t", file=f)
line = file.readline()
f.close()


def printModelGradientBoosting(model, resFileName, sourceDir):
f = open(resFileName, 'w')
prefix = 0
features = dict()
treeClassifierList = model.estimators_
feature_names = model.feature_names_in_
classes = model.classes_
for i in treeClassifierList:
for tc in i:
tree = tc.tree_
nodeIds = [0]
while (len(nodeIds) > 0):
nodeId = nodeIds.pop(0)
print("N", nodeId + prefix, sep="\t", end="\t", file=f)
if tree.children_left[nodeId] == tree.children_right[nodeId]:
print(file=f)
if tree.n_outputs == 1:
value = tree.value[nodeId][0]
else:
value = tree.value[nodeId].T[0]
class_name = np.argmax(value)
print("C", nodeId + prefix, classes[class_name], sep="\t", file=f)
continue
childLeftId = tree.children_left[nodeId]
nodeIds.append(childLeftId)
print(childLeftId + prefix, sep="\t", end="\t", file=f)
print(childLeftId + prefix, end="\t", file=f)

childRightId = tree.children_right[nodeId]
nodeIds.append(childRightId)
print(childRightId + prefix, sep="\t", end="\t", file=f)
print(file=f)
print(childRightId + prefix, file=f)

feature = feature_names[tree.feature[nodeId]]
threshold = tree.threshold[nodeId]
classF = feature.split("_")[0]
print("C", nodeId + prefix, classF, sep="\t", file=f)
print("F", nodeId + prefix, feature, "{:.2f}".format(threshold), sep="\t", end="\n", file=f)
print("F", nodeId + prefix, feature, "{:.2f}".format(threshold), sep="\t", file=f)
if feature in features:
features[feature].append(nodeId + prefix)
else:
features[feature] = [nodeId + prefix]
prefix += tree.node_count
prefix += tree.node_count

for fClass in classes:
file = open(sourceDir + "/contigs_" + fClass + "/components.seq.fasta", 'r')
@@ -191,24 +186,33 @@ def printModelGradientBoosting(model, resFileName, sourceDir):


def buildAndPrintModel(sourceDir, treeNum, maxDepth, typeOfForest, resFile, model=None):
"""Wrapper to fit and print classification model
Arguments:
- sourceDir (str): path to directory with features' sequences
- treeNum (int): number of Decision Trees in model
- maxDepth (int): maximal depth of Decision Tree
- typeOfForest (int): 0 (RandomForest), 1 (GradientBoosting) or 2 (AdaBoost)
- resFile (str): filename to output result
- model: pre-fitted model
Returns:
None
"""
rawLabels = pd.read_csv(sourceDir + '/samples_categories.tsv', sep="\t", index_col=0, header=None)
rawLabels.index = rawLabels.index.astype(str)

if typeOfForest == 0:
if model is None:
model = buildModelRandomForest(sourceDir + '/feature_table.tsv', rawLabels, treeNum, maxDepth)
dump(model, resFile[:-4]+".joblib")
printModelBase(model, resFile, sourceDir)
elif typeOfForest == 1:
if model is None:
model = buildModelGradientBoosting(sourceDir + '/feature_table.tsv', rawLabels, treeNum, maxDepth)
dump(model, resFile[:-4]+".joblib")
printModelGradientBoosting(model, resFile, sourceDir)
else:
if model is None:
model = buildModelAdaBoost(sourceDir + '/feature_table.tsv', rawLabels, treeNum, maxDepth)
dump(model, resFile[:-4]+".joblib")
printModelBase(model, resFile, sourceDir)
if model is None:
dataFile = sourceDir + '/feature_table.tsv'
if typeOfForest == 0:
model = buildModelRandomForest(dataFile, rawLabels, treeNum, maxDepth)
elif typeOfForest == 1:
model = buildModelGradientBoosting(dataFile, rawLabels, treeNum, maxDepth)
elif typeOfForest == 2:
model = buildModelAdaBoost(dataFile, rawLabels, treeNum, maxDepth)
dump(model, resFile[:-4] + ".joblib")

printModel(model, resFile, sourceDir, typeOfForest)


if __name__ == "__main__":
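The __main__ block is collapsed in this view; based on the signature above, a direct call could look like this (a sketch assuming the directory layout implied by the docstrings — feature_table.tsv, samples_categories.tsv and contigs_<class>/components.seq.fasta under sourceDir):

    # Hypothetical invocation; the directory layout is inferred from the docstrings.
    buildAndPrintModel(sourceDir="workDir",  # holds feature_table.tsv and metadata
                       treeNum=100,          # number of Decision Trees
                       maxDepth=5,
                       typeOfForest=0,       # 0 = RandomForest
                       resFile="model.txt")  # model.joblib is written alongside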
9 changes: 5 additions & 4 deletions bin/metafx-scripts/cv.py
@@ -20,7 +20,8 @@

if set(features.columns) != set(metadata.index):
features = features.filter(items=metadata.index, axis=1)
print("Samples from feature table and metadata does not match! Will use only " + str(features.shape[1]) + " common samples")
print("Samples from feature table and metadata does not match! " +
"Will use only " + str(features.shape[1]) + " common samples")

M = features.shape[0] # features count
N = features.shape[1] # samples count
@@ -38,12 +39,12 @@
df_grid = pd.DataFrame.from_dict(clf.cv_results_).filter(regex='param_.*|mean_test_score|std_test_score|rank_test_score')
df_grid = df_grid.sort_values(by="rank_test_score", kind="mergesort")
print("\nGrid search cross-validation accuracy:")
print(df_grid.head(10).to_string(index=False), "."*88, df_grid.tail(10).to_string(index=False), sep="\n")
print(df_grid.head(10).to_string(index=False), "." * 88, df_grid.tail(10).to_string(index=False), sep="\n")
print("\nSelected parameters for best Random Forest classifier:")
for k, v in clf.best_params_.items():
print("\t", k, "=", v)
print()
dump(clf.best_estimator_, outName+".joblib")
dump(clf.best_estimator_, outName + ".joblib")
print("Model accuracy after training:")
print(classification_report(y, clf.best_estimator_.predict(X)))
else: # performing cross-validation
@@ -63,6 +64,6 @@

model = RandomForestClassifier(n_estimators=100)
model.fit(X, y)
dump(model, outName+".joblib")
dump(model, outName + ".joblib")
print("Model accuracy after training:")
print(classification_report(y, model.predict(X)))
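Both branches persist the trained estimator via joblib, so downstream reuse is a short round-trip (sketch; the filenames are illustrative and the feature matrix must keep the same samples-as-columns layout used at fit time):

    # Reload the persisted classifier and reuse it (illustrative paths).
    from joblib import load
    import pandas as pd

    model = load("rf_model.joblib")  # written as outName + ".joblib" above
    features = pd.read_csv("feature_table.tsv", sep="\t", header=0, index_col=0)
    print(model.predict(features.T))  # transpose: rows become samples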
5 changes: 3 additions & 2 deletions bin/metafx-scripts/fit.py
@@ -15,7 +15,8 @@

if set(features.columns) != set(metadata.index):
features = features.filter(items=metadata.index, axis=1)
print("Samples from feature table and metadata does not match! Will use only " + str(features.shape[1]) + " common samples")
print("Samples from feature table and metadata does not match! " +
"Will use only " + str(features.shape[1]) + " common samples")

M = features.shape[0] # features count
N = features.shape[1] # samples count
@@ -25,7 +26,7 @@
y = [metadata.loc[i, 1] for i in X.index]

model.fit(X, y)
dump(model, outName+".joblib")
dump(model, outName + ".joblib")

print("Model accuracy after training:")
print(classification_report(y, model.predict(X)))
9 changes: 5 additions & 4 deletions bin/metafx-scripts/fit_predict.py
@@ -17,7 +17,8 @@
features_train = features.filter(items=metadata.index, axis=1)
features_test = features[features.columns.difference(metadata.index)]
predict = True
print("Will use " + str(features_train.shape[1]) + " common samples for model training and " + str(features_test.shape[1]) + " samples to predict new labels")
print("Will use " + str(features_train.shape[1]) + " common samples for model training " +
"and " + str(features_test.shape[1]) + " samples to predict new labels")
else:
features_train = features
predict = False
@@ -28,7 +29,7 @@
y_train = [metadata.loc[i, 1] for i in X_train.index]

model.fit(X_train, y_train)
dump(model, outName+".joblib")
dump(model, outName + ".joblib")

print("Model accuracy after training:")
print(classification_report(y_train, model.predict(X_train)))
@@ -37,8 +38,8 @@
X_test = features_test.T
y_pred = model.predict(X_test)

outFile = open(outName+".tsv", "w")
outFile = open(outName + ".tsv", "w")
for sam, pred in zip(X_test.index, y_pred):
print(sam, pred, sep="\t", file=outFile)
outFile.close()
print("Predicted labels saved to "+outName+".tsv")
print("Predicted labels saved to " + outName + ".tsv")
2 changes: 2 additions & 0 deletions bin/metafx-scripts/get_G.sh
@@ -1,4 +1,6 @@
#!/bin/bash
# Utility for detecting the greatest G value such that the number of unique k-mers exceeds 100000

a=$(grep "of them is good" ${1} | grep -o -E ", [0-9']+ \(")
a=$(echo "$a" | sed "s/'//g" | grep -o -E "[0-9]+")

6 changes: 3 additions & 3 deletions bin/metafx-scripts/graph2contigs.py
@@ -5,10 +5,10 @@

if __name__ == "__main__":
wd = sys.argv[1]
file = open(wd+"/components.seq.fasta", "w")
file = open(wd + "/components.seq.fasta", "w")
comp = -1
comp_i = 0
for line in open(wd+"/components-graph.gfa"):
for line in open(wd + "/components-graph.gfa"):
if line.split()[0] == 'S':
_, name, seq, *_ = line.strip().split(sep="\t")
name = int(name.split("_")[1][1:])
@@ -18,4 +18,4 @@
comp_i += 1
print(">" + str(comp) + "_" + str(comp_i), file=file)
print(seq, file=file)
file.close()
file.close()
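To make the renaming concrete: only GFA S (segment) records are kept, and the component id is recovered from the segment name. Assuming hypothetical names of the form k141_C7_0, where the second underscore-separated field encodes the component (inferred from the split above), one record transforms as:

    # Hypothetical walk-through of one GFA segment record; the name format
    # "k141_C7_0" is an assumption inferred from name.split("_")[1][1:].
    line = "S\tk141_C7_0\tACGTACGT\t*\n"
    _, name, seq, *_ = line.strip().split(sep="\t")
    comp = int(name.split("_")[1][1:])  # "C7" -> 7
    print(comp, seq)                    # 7 ACGTACGT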
12 changes: 10 additions & 2 deletions bin/metafx-scripts/join_feature_vectors.py
@@ -6,7 +6,15 @@


def load_cat(cat, wd):
# df_list = [pd.read_csv(wd + "/features_" + cat + "/vectors/" + file + ".breadth", header=None, index_col=None) for file in all_files]
"""Concatenate feature vectors for all files for given category into one table
Arguments:
cat (str): name of category
wd (str): path to directory with feature vectors files
Returns:
pd.DataFrame: table of features of shape (n_features, n_samples)
"""
all_files = glob.glob(wd + "/features_" + cat + "/vectors/" + "*.breadth")
df_list = [pd.read_csv(file, header=None, index_col=None) for file in all_files]
data = pd.concat(df_list, axis=1)
@@ -29,5 +37,5 @@ def load_cat(cat, wd):
subtables.append(load_cat(cat, wd))

feature_table = pd.concat(subtables, axis=0)
feature_table.to_csv(wd+"/feature_table.tsv", sep="\t")
feature_table.to_csv(wd + "/feature_table.tsv", sep="\t")
print("Total " + str(feature_table.shape[0]) + " features found!")
4 changes: 2 additions & 2 deletions bin/metafx-scripts/join_gfa.py
@@ -5,7 +5,7 @@

if __name__ == "__main__":
wd = sys.argv[1]
file = open(wd+"/all-components-graph.gfa", "w")
file = open(wd + "/all-components-graph.gfa", "w")
cat = 0
m = dict()
for fin in sys.argv[2:]:
@@ -19,4 +19,4 @@
if line.split()[0] == 'L':
a, b, c, d, e, f = line.strip().split(sep="\t")
print(a, m[b], c, m[d], e, f, sep="\t", file=file)
file.close()
file.close()