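"""Interactive validation tool for detreefy edit-script matches.

Given an edit script, this script looks up matching change locations in a
pre-built pool (./pool.tree), lets the user inspect the matched source code
(retrieved via the `pp_run` make target), pick one match as a pivot, and rank
the remaining matches by Doc2Vec cosine similarity to that pivot, using the
model at ./train/d2v.model.

Usage:
    python validate.py <editscript>
"""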
import pickle
import subprocess
import sys
from typing import Optional

import detreefy as dt
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize
from scipy import spatial
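
# Each entry in `matches` appears to be a tuple indexed as:
#   match[0] -> project, match[1] -> commit (cpc), match[3] -> file path,
#   match[4] -> blamed source line (match[2] is not used here).
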
def main():
    editscript = sys.argv[1]
    matches, count = dt.detreefy(pool, editscript)
    if matches is None:
        print('No matches found')
        return
    print('Found ' + str(count) + ' matches')
    while True:
        ans = input('Do you want to continue? (y/p/n): ')
        if ans == 'y':
            break
        elif ans == 'p':
            # 0-based indices, matching the lookups in the prompts below
            for i, match in enumerate(matches):
                print(f'{i}: {match}')
        elif ans == 'n':
            sys.exit()
    while True:
        ans = input('p for setting pivot, v for viewing source code, q for exit: ')
        if ans == 'v':
            i = int(input('Enter the index of the file you want to view: '))
            proj, cpc = matches[i][0], matches[i][1]
            file, blame_line = matches[i][3], matches[i][4]
            code = suggestions_builder(proj, cpc, file, blame_line)
            if code is None:
                print('No code found on index ' + str(i))
            else:
                print(code)
        elif ans == 'q':
            break
        elif ans == 'p':
            i = int(input('Enter the index of the file you want to set as pivot: '))
            proj, cpc = matches[i][0], matches[i][1]
            file, blame_line = matches[i][3], matches[i][4]
            code = suggestions_builder(proj, cpc, file, blame_line)
            if code is None:
                print('No code found on index ' + str(i))
                continue
            print('Selected pivot code is as follows:')
            print(code)
            pivot = code
            print('computing cosine similarity...')
            snap_vec = d2v_model.infer_vector(tokenize(pivot))
            similarity_list = []
            code_list = []
            j = 10  # next progress percentage to report
            for i, match in enumerate(matches, start=1):
                proj, cpc = match[0], match[1]
                file, blame_line = match[3], match[4]
                code = suggestions_builder(proj, cpc, file, blame_line)
                if code is None:
                    code_list.append('')
                    # sentinel so matches with no retrievable code sort last
                    similarity_list.append(-1.0)
                else:
                    code_list.append(code)
                    # cosine similarity = 1 - cosine distance
                    similarity_list.append(1 - cos_sim(code, snap_vec))
                # rough progress indicator in 10% steps
                if i >= len(matches) * j // 100:
                    print(f'{j}% done')
                    j = j + 10
            copy = similarity_list.copy()
            while True:
                ans = input('top? [num/q/r]: ')
                if ans == 'q':
                    break
                elif ans == 'r':
                    copy = similarity_list.copy()
                else:
                    topk = int(ans)
                    copy.sort(reverse=True)  # highest similarity first
                    for i in range(topk):
                        print(copy[i])
                        print(code_list[similarity_list.index(copy[i])])
                    # drop the entries just shown so the next query pages further down
                    copy = copy[topk:]
def suggestions_builder(proj: str, cpc: str, file: str, blame_line: str) -> Optional[str]:
    """Fetch the source code for a match by shelling out to the `pp_run` make target."""
    if len(blame_line) > 500:
        return None
    if blame_line.endswith('\n'):
        blame_line = blame_line[:-1]
    # Unused delimiter, since a double quotation mark confuses the parser.
    blame_line = blame_line.replace('"', 'ㅗ')
    process = subprocess.Popen(
        ['make', 'pp_run', 'proj=' + proj, 'pc=' + cpc, 'file=' + file,
         'line=' + '"' + blame_line + '"'],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    code = ''
    for raw in process.stdout:
        line = raw.decode('utf-8', errors='replace').rstrip('\n') + '\n'
        if line.startswith('SLF4J:'):
            continue
        if ('error: unable to get target hunk' in line or '[pp_run]' in line
                or 'git: changes are all deleted null' in line or '.ipynb' in line):
            print("can't find the code")
            return None
        code = code + line
    return code
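
# For reference, the Popen call above is roughly equivalent to running, with
# hypothetical values:
#   make pp_run proj=myproj pc=abc123 file=src/Foo.java line="some blamed line"
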
def cos_sim(diff: str, snap_vec):
    """Return the cosine *distance* (not similarity) between the Doc2Vec vectors."""
    vec = d2v_model.infer_vector(tokenize(diff))
    return spatial.distance.cosine(vec, snap_vec)
def tokenize(code: str):
    # Skip the first seven lines of the snippet (presumably header output)
    # before tokenizing.
    code = '\n'.join(code.split('\n')[7:])
    tokens = word_tokenize(code.lower())
    # Drop purely numeric tokens and any token containing punctuation or operators.
    symbols = set('#!-:*\'=/\\.,;()[]{}<>@$%^&~`|"?+')
    through_token = [t for t in tokens if not (set(t) & symbols) and not t.isnumeric()]
    # Split snake_case identifiers into their components and flatten.
    non_flat = [t.split('_') for t in through_token]
    return [part for parts in non_flat for part in parts if part != '']
# Load the trained Doc2Vec model and the match pool at import time.
d2v_model = Doc2Vec.load('./train/d2v.model')
with open('./pool.tree', 'rb') as file:
    pool = pickle.load(file)
if __name__ == '__main__':
    main()