-
Notifications
You must be signed in to change notification settings - Fork 1
/
overlap_test.py
59 lines (48 loc) · 2.04 KB
/
overlap_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from collections import Counter
import pickle
dataset_1 = '/Volumes/Shared/shared/amarcozzi/manuscript_2015_11_08/cleaned_dataset_exact_breaks_only_2015-10-29_correct_cancertype.txt'
limit = 10  # max difference in bp between any two events


def parse_samples(lines):
    """Group the events found in *lines* by sample name.

    Each data line is tab-separated with the columns
    ``chr1 break1 o1 chr2 break2 o2 source sample_name sv_type cancer_type``.
    Lines whose first character is a newline, ``#`` or a space are skipped.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        dict mapping each sample name to a list of events, each event being
        ``[chr1, brk1, o1, chr2, brk2, o2, src, sv_type, cancer_type]`` with
        the two break positions converted to ``int``.

    Raises:
        ValueError: if a break position is not an integer.
        IndexError: if a data line has fewer than ten columns.
    """
    samples = {}
    for line in lines:
        if not line or line[0] in ('\n', '#', ' '):
            continue
        data = line.split('\t')
        chr1 = data[0]
        brk1 = int(data[1])
        o1 = data[2]
        chr2 = data[3]
        brk2 = int(data[4])
        o2 = data[5]
        src = data[6].strip()
        sample = data[7].strip()
        sv_type = data[8].strip()
        cancer_type = data[9].strip()
        # Store o2 for the second break (o1 used to be stored twice here).
        samples.setdefault(sample, []).append(
            [chr1, brk1, o1, chr2, brk2, o2, src, sv_type, cancer_type])
    return samples


def find_overlaps(samples, limit):
    """Find events shared between different samples.

    Two events match when both chromosomes are equal and each break position
    differs by at most *limit* bp. Every unordered pair of samples is
    compared exactly once; a sample is never compared with itself.

    Args:
        samples: mapping as returned by ``parse_samples``.
        limit: maximum allowed distance in bp between corresponding breaks.

    Returns:
        ``(sim_table, duplicates)`` where ``sim_table`` is a ``Counter`` keyed
        by ``'sampleA:sampleB'`` counting the matching event pairs, and
        ``duplicates`` is a list of tuples
        ``(chr1, brk1, chr2, brk2, chr1', brk1', chr2', brk2')`` holding the
        coordinates of both matching events.
    """
    sim_table = Counter()  # keep track of the common events among samples
    duplicates = []
    names = list(samples)
    for position, sample in enumerate(names):
        # Only look at samples that come later, so each pair is visited once.
        others = names[position + 1:]
        for event in samples[sample]:
            for s in others:
                for e in samples[s]:
                    # The event must involve the same pair of chromosomes...
                    if event[0] != e[0] or event[3] != e[3]:
                        continue
                    # ...and both breaks must be within `limit` bp.
                    if abs(event[1] - e[1]) <= limit and abs(event[4] - e[4]) <= limit:
                        sim_table['{}:{}'.format(sample, s)] += 1
                        duplicates.append(
                            (event[0], event[1], event[3], event[4],
                             e[0], e[1], e[3], e[4]))
    return sim_table, duplicates


if __name__ == '__main__':
    with open(dataset_1, 'r') as f:
        samples = parse_samples(f)

    sim_table, duplicates = find_overlaps(samples, limit)

    # Results are pickled (binary) despite the .txt extension of the paths.
    with open('/Volumes/Shared/shared/amarcozzi/manuscript_2015_11_08/sim_table.txt','wb') as f:
        pickle.dump(sim_table, f)
    with open('/Volumes/Shared/shared/amarcozzi/manuscript_2015_11_08/samples.txt','wb') as f:
        pickle.dump(samples, f)
    with open('/Volumes/Shared/shared/amarcozzi/manuscript_2015_11_08/duplicates.txt','wb') as f:
        pickle.dump(duplicates, f)
# total_count = 0
# for sample in sim_table.most_common():
# id1,id2 = sample[0].split(':')
# len1 = len(samples[id1])
# len2 = len(samples[id2])
# total_count += sim_table[sample[0]]
# print(len1,len2,sim_table[sample[0]],sample[0])