-
Notifications
You must be signed in to change notification settings - Fork 1
/
MaskReferenceFromGATKTable.py
118 lines (104 loc) · 4.04 KB
/
MaskReferenceFromGATKTable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/python
from __future__ import print_function
import pandas as pd
from argparse import ArgumentParser, FileType
from Bio import SeqIO
from Bio.Seq import MutableSeq
from time import time
from progressbar import ProgressBar
def mask_sites(chrom_sequence, sub_table):
corrected_sites = 0
masked_sites = 0
bed = []
for i, var in sub_table.iterrows():
if var['HOM-VAR'] == var['NCALLED'] and len(var['ALT']) == 1:
chrom_sequence[var.POS-1] = var['ALT']
corrected_sites += 1
elif var['HOM-REF'] != var['NCALLED']:
if ((var['HOM-REF'] == 1)
and (var['HOM-VAR'] == 1)
and (len(var['ALT']) == 1)
and (len(var['REF'])== 1)):
bed.append('{}\t{}\t{}\t{}\n'.format(
var.CHROM,
var.POS - 1,
var.POS,
var[reference_column][0] +"|"+var[alternate_column][0],
)
)
for p in range(var.POS-1, var.POS + len(var.REF) - 1):
chrom_sequence[p] = 'N'
masked_sites += 1
return chrom_sequence, bed, masked_sites, corrected_sites
tic = time()
parser = ArgumentParser()
parser.add_argument('--emit-bed', '-b', default=None, type=FileType('w'))
parser.add_argument('--threads', '-t', default=1, type=int)
parser.add_argument('--outfasta', type=str)
parser.add_argument('--target-species', '-S', default='sim')
parser.add_argument('fasta_file', type=open)
parser.add_argument('table', type=open)
args = parser.parse_args()
seq_recs = [rec for rec in SeqIO.parse(args.fasta_file, 'fasta')]
sequence = {rec.id: MutableSeq(str(rec.seq)) for rec in seq_recs}
print("Finished Parsing: {}s".format(time() - tic))
tic = time()
joint_vars = pd.read_table(args.table)
reference_column = ''
alternate_column = ''
for column in joint_vars.columns:
if column.startswith(args.target_species) and column.endswith('.GT'):
reference_column = column
elif column.endswith('.GT'):
alternate_column = column
if reference_column and alternate_column:
break
print("Using genotypes: Reference: {} \tAlternate: {}".format(reference_column, alternate_column))
print("Finished reading in table: {}s".format(time() - tic))
masked_sites = 0
corrected_sites = 0
if args.threads == 1:
pb = ProgressBar(maxval=len(joint_vars))
pb.start()
for i, var in joint_vars.iterrows():
pb.update(i)
if var['HOM-VAR'] == var['NCALLED'] and len(var['ALT']) == 1:
sequence[var.CHROM][var.POS-1] = var['ALT']
corrected_sites += 1
elif var['HOM-REF'] != var['NCALLED']:
if (args.emit_bed
and (var['HOM-REF'] == 1)
and (var['HOM-VAR'] == 1)
and (len(var['ALT']) == 1)
and (len(var['REF'])== 1)):
args.emit_bed.write('{}\t{}\t{}\t{}\n'.format(
var.CHROM,
var.POS - 1,
var.POS,
var[reference_column][0] +"|"+var[alternate_column][0],
)
)
for p in range(var.POS-1, var.POS + len(var.REF) - 1):
sequence[var.CHROM][p] = 'N'
masked_sites += 1
pb.finish()
for rec in seq_recs:
rec.seq = sequence[rec.id]
else:
from multiprocessing import Pool
p = Pool(8)
groups = joint_vars.groupby('CHROM').groups
res = p.starmap(mask_sites,
[(sequence[rec.id], joint_vars.ix[groups[rec.id]])
for rec in seq_recs
if rec.id in groups])
for rec, (seq, bed, mask, corr) in zip(seq_recs, res):
rec.seq = seq
masked_sites += mask
corrected_sites += corr
if args.emit_bed:
args.emit_bed.writelines(bed)
if args.outfasta:
SeqIO.write(seq_recs, args.outfasta, 'fasta')
print("Corrected sites: {:,}".format(corrected_sites))
print("Masked sites: {:,}".format(masked_sites))