-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcandidate_gene.py
92 lines (76 loc) · 3.1 KB
/
candidate_gene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
import argparse
def get_genes(gene_list):
with open(gene_list) as f:
gene_file=f.readlines()
cls_genes = {i.strip().upper() for i in gene_file}
return cls_genes
#reads .bed file with gene coordinates
def gene_coord(gene_bed, cls_genes):
with open(gene_bed) as f:
file = f.readlines()
file = file[1:]
loc_dict = {}
for i in file:
ele = i.split()
gene_name = ele[4].upper()
if gene_name not in cls_genes:
continue
if ele[0] not in loc_dict:
loc_dict[ele[0]] = []
loc_dict[ele[0]].append(ele[1]+":"+ele[2]+":"+gene_name)
return loc_dict
def varto_gene(plink_assoc, loc_dict):
with open(plink_assoc) as f:
file=f.readlines()
file = file[1:]
plink_annot = {"23" : "X", "24" : "Y", "25" : "X", "26" : "MT"}
var_num = 0
cls_var = []
for i in file:
var_num += 1
print("Plink output variant no.:", var_num)
mut = i.split()
if mut[8] == "NA":
continue
chr_num = mut[0]
if chr_num in plink_annot:
chr_num = plink_annot[chr_num]
chrom = "chr" + chr_num
if chrom not in loc_dict:
continue
coord = float(mut[2])
for ele in loc_dict[chrom]:
gene_coord = ele.split(":")
if coord >= float(gene_coord[0]) and coord <= float(gene_coord[1]):
gene_name = gene_coord[2]
cls_var.append(mut + [gene_name])
break
else:
pass
return cls_var
def create_csv(cls_var, out_file):
adj_list = []
header = ["CHR", "SNP", "BP", "A1", "C_A", "C_U", "A2", "CHISQ", "P", "OR", "GENE", "BONF"]
for ele in cls_var:
bonf = float(ele[8]) * len(cls_var)
line = ele + [bonf]
adj_list.append(line)
df = pd.DataFrame(adj_list, columns=header)
sorted_df = df.sort_values("P")
sorted_df.to_csv(out_file, index=False)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Multiple testing correction using genetic variants belonging to a category of genes')
parser.add_argument('--gene_list', default = 'gene_list.txt', help='input file having list of genes')
parser.add_argument('--gene_bed', default = 'UCSC_canonical.bed', help='bed file with genomic coordinates of genes')
parser.add_argument('--plink_assoc', default = 'plink.assoc', help='plink association output file')
parser.add_argument('--out_file', default = 'plink.gene.assoc.csv', help='output file with gene-list based associations')
args, unknown = parser.parse_known_args()
print("Extracting genes from text file into a set...")
cls_genes = get_genes(args.gene_list)
print("Reading .bed file with gene coordinates")
loc_dict = gene_coord(args.gene_bed, cls_genes)
print("Retrieving plink variants that belong to the input list of genes...")
cls_var = varto_gene(args.plink_assoc, loc_dict)
print("Creating output csv file with plink variants, genes and their corrected p-value...")
create_csv(cls_var, args.out_file)