forked from omerwe/polyfun
-
Notifications
You must be signed in to change notification settings - Fork 0
/
aggregate_finemapper_results.py
114 lines (89 loc) · 5.18 KB
/
aggregate_finemapper_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import numpy as np; np.set_printoptions(precision=4, linewidth=200)
import pandas as pd; pd.set_option('display.width', 200)
import os
import logging
import scipy.stats as stats
from tqdm import tqdm
from polyfun import configure_logger, check_package_versions
from polyfun_utils import set_snpid_index
from pyarrow import ArrowIOError
from pyarrow.lib import ArrowInvalid
from polyfun_utils import DEFAULT_REGIONS_FILE
def main(args):
#read sumstats file
try:
df_sumstats = pd.read_parquet(args.sumstats)
except (ArrowIOError, ArrowInvalid):
df_sumstats = pd.read_table(args.sumstats, sep='\s+')
#compute p-values if needed
if args.pvalue_cutoff is not None:
df_sumstats['P'] = stats.chi2(1).sf(df_sumstats['Z']**2)
#read regions file
df_regions = pd.read_table(args.regions_file)
if args.chr is not None:
df_regions = df_regions.query('CHR==%d'%(args.chr))
if df_regions.shape[0]==0: raise ValueError('no SNPs found in chromosome %d'%(args.chr))
df_regions_keep = df_regions.apply(lambda r: np.sum((df_sumstats['CHR']==r['CHR']) & (df_sumstats['BP'].between(r['START'], r['END']))) > 1, axis=1)
df_regions = df_regions.loc[df_regions_keep]
#aggregate outputs
df_sumstats_list = []
logging.info('Aggregating results...')
for _, r in tqdm(df_regions.iterrows()):
chr_num, start, end, url_prefix = r['CHR'], r['START'], r['END'], r['URL_PREFIX']
#apply p-value filter if needed
if args.pvalue_cutoff is not None:
df_sumstats_r = df_sumstats.query('CHR==%d & %d <= BP <= %d'%(chr_num, start, end))
if np.all(df_sumstats_r['P'] > args.pvalue_cutoff): continue
output_file_r = '%s.chr%s.%s_%s.gz'%(args.out_prefix, chr_num, start, end)
if not os.path.exists(output_file_r):
err_msg = 'output file for chromosome %d bp %d-%d doesn\'t exist'%(chr_num, start, end)
if args.allow_missing_jobs:
logging.warning(err_msg)
continue
else:
raise IOError(err_msg + '.\nTo override this error, please provide the flag --allow-missing-jobs')
df_sumstats_r = pd.read_table(output_file_r)
#add the current region to the credible set
df_sumstats_r['CREDIBLE_SET'] = 'chr%s:%s-%s:'%(chr_num, start, end) + df_sumstats_r['CREDIBLE_SET'].astype(str)
#mark distance from center
middle = (start+end)//2
df_sumstats_r['DISTANCE_FROM_CENTER'] = np.abs(df_sumstats_r['BP'] - middle)
df_sumstats_list.append(df_sumstats_r)
if len(df_sumstats_list)==0:
raise ValueError('no output files found')
#keep only the most central result for each SNP
df_sumstats = pd.concat(df_sumstats_list, axis=0)
df_sumstats.sort_values('DISTANCE_FROM_CENTER', inplace=True, ascending=True)
df_sumstats = set_snpid_index(df_sumstats, allow_duplicates=True)
df_sumstats = df_sumstats.loc[~df_sumstats.index.duplicated(keep='first')]
del df_sumstats['DISTANCE_FROM_CENTER']
df_sumstats.sort_values(['CHR', 'BP'], inplace=True, ascending=True)
#write output file
if args.adjust_beta_freq:
df_sumstats['BETA_MEAN'] /= np.sqrt(2*df_sumstats['MAF']*(1-df_sumstats['MAF']))
df_sumstats['BETA_SD'] /= np.sqrt(2*df_sumstats['MAF']*(1-df_sumstats['MAF']))
df_sumstats.to_csv(args.out, sep='\t', index=False)
logging.info('Wrote aggregated results to %s'%(args.out))
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
#general parameters
parser.add_argument('--sumstats', required=True, help='Name of sumstats file')
parser.add_argument('--out-prefix', required=True, help='prefix of output files')
parser.add_argument('--out', required=True, help='name of the aggregated output files')
parser.add_argument('--allow-missing-jobs', default=False, action='store_true', help='whether to allow missing jobs')
parser.add_argument('--regions-file', default=DEFAULT_REGIONS_FILE, help='name of file of regions and their URLs')
parser.add_argument('--chr', default=None, type=int, help='Target chromosome (if not provided, all chromosomes will be considered)')
parser.add_argument('--pvalue-cutoff', type=float, default=None, help='only consider regions that have at least one SNP with a p-value greater than this cutoff')
parser.add_argument('--adjust-beta-freq', default=False, action='store_true', help='If specified, the posterior estimates of the SNP effect sizes will be on per-allele scale rather than a per-standardized genotype scale')
#check package versions
check_package_versions()
#extract args
args = parser.parse_args()
#check that the output directory exists
if len(os.path.dirname(args.out))>0 and not os.path.exists(os.path.dirname(args.out)):
raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.out)))
#configure logger
configure_logger(args.out_prefix)
#invoke main function
main(args)