-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclustering_lib.py
48 lines (41 loc) · 1.68 KB
/
clustering_lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import io
import numpy as np
import pandas as pd
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from Bio import Phylo
from Bio.Phylo.TreeConstruction import _DistanceMatrix, DistanceTreeConstructor
# Module-level tree constructor, created once at import time; cluster() calls
# its nj() method when neighbour joining is requested.
constructor = DistanceTreeConstructor()
def read_matrix(file_path):
    """Load a labelled matrix from a CSV file.

    The file is parsed with ``pandas.read_csv`` using its defaults. The
    first column supplies the row labels and every remaining column
    supplies the numeric cells.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first
            argument.

    Returns:
        A ``(matrix, languages)`` tuple: ``matrix`` is a list of rows
        (each a list of cell values) and ``languages`` is the list of
        values found in the first column.
    """
    table = pd.read_csv(file_path)
    labels = table.iloc[:, 0].tolist()
    rows = table.iloc[:, 1:].values.tolist()
    return rows, labels
def node2newick(node, parent_dist, leaf_names, newick=''):
    """Serialise the subtree rooted at ``node`` into Newick text.

    Adapted from https://stackoverflow.com/a/31878514

    The string is assembled back to front: ``newick`` carries the text
    already produced for whatever follows this subtree, and every call
    prepends its own contribution to it.

    Args:
        node: Tree node exposing ``is_leaf()``, ``get_left()``,
            ``get_right()``, ``id`` and ``dist``.
        parent_dist: ``dist`` of the node's parent. The branch length that
            is written out is ``parent_dist - node.dist``.
        leaf_names: Sequence indexed by ``node.id`` that yields the label
            of each leaf.
        newick: Suffix built so far; leave empty for the outermost call.

    Returns:
        The Newick text of this subtree followed by ``newick``. For the
        outermost call on an internal node the result ends with ``);``.
    """
    if node.is_leaf():
        label = leaf_names[node.id]
        return f'{label!s}:{parent_dist - node.dist:f}{newick}'

    # An empty suffix marks the outermost call, which closes the whole tree;
    # otherwise close this clade and record its branch length.
    tail = f'):{parent_dist - node.dist:f}{newick}' if newick else ');'

    tail = node2newick(node.get_left(), node.dist, leaf_names, newick=tail)
    tail = node2newick(
        node.get_right(), node.dist, leaf_names, newick=f',{tail}')
    return f'({tail}'
def cluster(matrix, languages, method):
    """Build a tree from a distance matrix and return it as Newick text.

    Args:
        matrix: Square distance matrix as nested sequences, one row per
            entry of ``languages``. The ``'nj'`` path reads only the lower
            triangle (including the diagonal); the other path converts the
            full matrix with ``scipy.spatial.distance.squareform``.
        languages: Labels for the rows of ``matrix``, in row order.
        method: ``'nj'`` selects neighbour joining through the module-level
            Biopython ``constructor``. Any other value is forwarded as the
            ``method`` argument of ``scipy.cluster.hierarchy.linkage``.

    Returns:
        The tree as a single-line Newick string (newlines removed).
    """
    if method == 'nj':
        # Row i keeps columns 0..i, i.e. the lower triangle with diagonal.
        # Since j never exceeds i, no mirrored lookup into the upper
        # triangle is needed.
        lower_triangle = [[row[j] for j in range(i + 1)]
                          for i, row in enumerate(matrix)]
        distance_matrix = _DistanceMatrix(languages, lower_triangle)
        tree = constructor.nj(distance_matrix)
        buffer = io.StringIO()
        Phylo.write(tree, buffer, 'newick')
        return buffer.getvalue().replace('\n', '')

    # Hierarchical clustering: linkage() is given the condensed
    # (1-D) form produced by squareform.
    condensed = squareform(np.array(matrix))
    linkage_matrix = hierarchy.linkage(condensed, method)
    root_node = hierarchy.to_tree(linkage_matrix)
    newick = node2newick(root_node, root_node.dist, languages)
    return newick.replace('\n', '')