import numpy as np
import pandas as pd
import os
import time
from ldsc_polyfun import jackknife, regressions, sumstats, ldscore, parse
import logging
from copy import deepcopy
from tqdm import tqdm
from polyfun_utils import Logger, check_package_versions, set_snpid_index, configure_logger, get_file_name
from polyfun_utils import SNP_COLUMNS
from pyarrow import ArrowIOError
from pyarrow.lib import ArrowInvalid
from compute_ldscores_from_ld import compute_ldscores_chr
import tempfile
MAX_CHI2=80
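#Typical three-stage PolyFun workflow (a sketch only; the file names below are hypothetical):
#  1) python polyfun.py --compute-h2-L2 --output-prefix out/run1 \
#         --sumstats sumstats.parquet --ref-ld-chr annot. --w-ld-chr weights.
#  2) python polyfun.py --compute-ldscores --output-prefix out/run1 --bfile-chr plink/chr
#  3) python polyfun.py --compute-h2-bins --output-prefix out/run1 \
#         --sumstats sumstats.parquet --w-ld-chr weights.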
def __filter__(fname, noun, verb, merge_obj):
merged_list = None
if fname:
f = lambda x,n: x.format(noun=noun, verb=verb, fname=fname, num=n)
x = parse.FilterFile(fname)
c = 'Read list of {num} {noun} to {verb} from {fname}'
logging.info(f(c, len(x.IDList)))
merged_list = merge_obj.loj(x.IDList)
len_merged_list = len(merged_list)
if len_merged_list > 0:
c = 'After merging, {num} {noun} remain'
logging.info(f(c, len_merged_list))
else:
error_msg = 'No {noun} retained for analysis'
raise ValueError(f(error_msg, 0))
return merged_list
def splash_screen():
print('*********************************************************************')
print('* PolyFun (POLYgenic FUNctionally-informed fine-mapping)')
print('* Version 1.0.0')
print('* (C) 2019-2024 Omer Weissbrod')
print('*********************************************************************')
print()
def check_args(args):
#verify that the requested computations are valid
mode_params = np.array([args.compute_h2_L2, args.compute_ldscores, args.compute_h2_bins])
if np.sum(mode_params)==0:
raise ValueError('must specify at least one of --compute-h2-L2, --compute-ldscores, --compute-h2-bins')
if args.compute_h2_L2 and args.compute_h2_bins and not args.compute_ldscores:
        raise ValueError('cannot use both --compute-h2-L2 and --compute-h2-bins without also specifying --compute-ldscores')
if args.chr is not None:
if args.compute_h2_L2 or args.compute_h2_bins:
raise ValueError('--chr can only be specified when using only --compute-ldscores')
if args.bfile_chr is not None:
if not args.compute_ldscores:
raise ValueError('--bfile-chr can only be specified when using --compute-ldscores')
if args.ld_ukb:
if not args.compute_ldscores:
raise ValueError('--ld-ukb can only be specified when using --compute-ldscores')
if args.no_partitions:
if not args.compute_h2_L2:
raise ValueError('cannot specify --no-partitions without specifying --compute-h2-L2')
if args.compute_ldscores:
raise ValueError('cannot specify both --no-partitions and --compute-ldscores')
if args.compute_h2_bins:
raise ValueError('cannot specify both --no-partitions and --compute-h2-bins')
if args.compute_ldscores and args.compute_h2_bins and not args.compute_h2_L2:
        raise ValueError('cannot use both --compute-ldscores and --compute-h2-bins without also specifying --compute-h2-L2')
#verify partitioning parameters
if args.skip_Ckmedian and (args.num_bins is None or args.num_bins<=0):
raise ValueError('You must specify --num-bins when using --skip-Ckmedian')
#verify LD-score related parameters
if args.ld_dir is not None and not args.ld_ukb:
raise ValueError('You cannot specify --ld-dir without also specifying --ld-ukb')
if args.bfile_chr is not None and args.ld_ukb:
raise ValueError('You can specify only one of --bfile-chr and --ld-ukb')
if args.compute_ldscores:
if args.bfile_chr is None and not args.ld_ukb:
raise ValueError('You must specify either --bfile-chr or --ld-ukb when you specify --compute-ldscores')
if not args.ld_ukb and (args.ld_wind_cm is None and args.ld_wind_kb is None and args.ld_wind_snps is None):
args.ld_wind_cm = 1.0
            logging.warning('no --ld-wind argument specified; PolyFun will use --ld-wind-cm 1.0')
if not args.compute_ldscores:
if not (args.ld_wind_cm is None and args.ld_wind_kb is None and args.ld_wind_snps is None):
raise ValueError('--ld-wind parameters can only be specified together with --compute-ldscores')
if args.keep is not None:
raise ValueError('--keep can only be specified together with --compute-ldscores')
if args.chr is not None:
raise ValueError('--chr can only be specified together with --compute-ldscores')
if args.compute_h2_L2:
if args.sumstats is None:
raise ValueError('--sumstats must be specified when using --compute-h2-L2')
if args.ref_ld_chr is None:
raise ValueError('--ref-ld-chr must be specified when using --compute-h2-L2')
if args.w_ld_chr is None:
raise ValueError('--w-ld-chr must be specified when using --compute-h2-L2')
if args.compute_h2_bins:
if args.sumstats is None:
raise ValueError('--sumstats must be specified when using --compute-h2-bins')
if args.w_ld_chr is None:
raise ValueError('--w-ld-chr must be specified when using --compute-h2-bins')
if args.ref_ld_chr is not None and not args.compute_ldscores:
raise ValueError('--ref-ld-chr should not be specified when using --compute-h2-bins, unless you also use --compute-ldscores')
return args
def check_files(args):
#check that required input files exist
if args.compute_h2_L2:
if not os.path.exists(args.sumstats):
raise IOError('Cannot find sumstats file %s'%(args.sumstats))
for chr_num in range(1, args.num_chr+1):
get_file_name(args, 'ref-ld', chr_num, verify_exists=True, allow_multiple=True)
get_file_name(args, 'w-ld', chr_num, verify_exists=True)
get_file_name(args, 'annot', chr_num, verify_exists=True, allow_multiple=True)
if args.compute_ldscores:
if args.chr is None: chr_range = range(1, args.num_chr+1)
else: chr_range = range(args.chr, args.chr+1)
for chr_num in chr_range:
if args.bfile_chr is not None:
get_file_name(args, 'bim', chr_num, verify_exists=True)
get_file_name(args, 'fam', chr_num, verify_exists=True)
get_file_name(args, 'bed', chr_num, verify_exists=True)
if not args.compute_h2_L2:
get_file_name(args, 'snpvar_ridge', chr_num, verify_exists=True)
get_file_name(args, 'bins', chr_num, verify_exists=True)
if args.compute_h2_bins and not args.compute_ldscores:
for chr_num in range(1, args.num_chr+1):
get_file_name(args, 'w-ld', chr_num, verify_exists=True)
if not args.compute_h2_L2:
get_file_name(args, 'bins', chr_num, verify_exists=True)
class PolyFun:
def __init__(self):
pass
def run_ldsc(self, args, use_ridge, nn, keep_large, evenodd_split, n_blocks=2):
#prepare LDSC objects
log = Logger()
args.h2 = args.sumstats
args.ref_ld = None
args.w_ld = None
args.n_blocks = n_blocks
args.M = None
args.not_M_5_50 = True
        #if not using ridge, we'll use the LD-scores of our bins
if not use_ridge:
args = deepcopy(args)
args.ref_ld_chr = args.output_prefix+'.'
#read input data
        #NOTE: the 'or True' deliberately forces this branch, so the LD-scores and sumstats are always re-read (see the TODO below)
        if use_ridge or not args.compute_ldscores or True:
M_annot, w_ld_cname, ref_ld_cnames, df_sumstats, _ = sumstats._read_ld_sumstats(args, log, args.h2)
else:
#TODO: Don't reload files if we don't have to...
M_annot = self.M
w_ld_cname = 'w_ld'
ref_ld_cnames = self.df_bins.columns
try:
df_sumstats = pd.read_parquet(args.sumstats)
except (ArrowIOError, ArrowInvalid):
                df_sumstats = pd.read_table(args.sumstats, sep=r'\s+')
###merge everything together...
#prepare LD-scores for S-LDSC run
ref_ld = np.array(df_sumstats[ref_ld_cnames], dtype=np.float32)
sumstats._check_ld_condnum(args, log, ref_ld_cnames)
if df_sumstats.shape[0] < 200000:
logging.warning('number of SNPs is smaller than 200k; this is almost always bad.')
n_snp = len(df_sumstats)
n_blocks = np.minimum(n_snp, args.n_blocks)
n_annot = len(ref_ld_cnames)
if n_annot<=1:
raise ValueError('Only one annotation found')
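        #cap chi^2 statistics at max(0.001*max(N), MAX_CHI2), following the ldsc convention, to limit the influence of outlier SNPs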
chisq_max = max(0.001*df_sumstats['N'].max(), MAX_CHI2)
#prepare chi2 statistics
s = lambda x: np.array(x).reshape((n_snp, 1))
chisq = s(df_sumstats.Z**2).astype(np.float32)
ii = np.ravel(chisq < chisq_max)
df_sumstats = df_sumstats.loc[ii, :]
if np.any(~ii):
logging.info('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
C=chisq_max, N=np.sum(ii), M=n_snp-np.sum(ii)))
n_snp = np.sum(ii) # lambdas are late-binding, so this works
ref_ld = np.array(df_sumstats[ref_ld_cnames], dtype=np.float32)
chisq = chisq[ii].reshape((n_snp, 1))
#Run S-LDSC
self.ref_ld_cnames = [c[:c.rfind('_')] for c in ref_ld_cnames if c not in SNP_COLUMNS]
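        #PolyFun calls a modified version of ldsc's Hsq regression (from the bundled ldsc_polyfun
        #package) that adds ridge regularization, non-negativity (nn), and
        #leave-one-chromosome-set-out (LOCO) estimation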
hsqhat = regressions.Hsq(chisq,
ref_ld,
s(df_sumstats[w_ld_cname]),
s(df_sumstats.N),
M_annot, n_blocks=n_blocks, intercept=None,
twostep=None, old_weights=True,
chr_num=df_sumstats['CHR'],
loco=use_ridge, ridge_lambda=None,
standardize_ridge=True,
approx_ridge=True,
num_chr_sets=2,
evenodd_split=evenodd_split,
nn=nn,
keep_large=keep_large,
nnls_exact=args.nnls_exact,
num_chr=args.num_chr
)
#save the results object
if use_ridge:
self.hsqhat_ridge = hsqhat
else:
self.hsqhat = hsqhat
def load_annotations_file(self, args, chr_num, use_ridge):
#load annotations file for this chromosome
if use_ridge:
annot_filenames = get_file_name(args, 'annot', chr_num, allow_multiple=True)
else:
annot_filenames = [get_file_name(args, 'bins', chr_num)]
#load annotation file(s)
df_annot_chr_list = []
for annot_filename in annot_filenames:
try:
df_annot_chr = pd.read_parquet(annot_filename)
except (ArrowIOError, ArrowInvalid):
df_annot_chr = pd.read_table(annot_filename)
df_annot_chr_list.append(df_annot_chr)
if len(df_annot_chr_list)==1:
df_annot_chr = df_annot_chr_list[0]
else:
for df in df_annot_chr_list[1:]:
for snp_col in SNP_COLUMNS:
if (df.shape[0] != df_annot_chr_list[0].shape[0]) or (np.any(df[snp_col] != df_annot_chr_list[0][snp_col])):
raise ValueError('Different annotation files of chromosome %d must be perfectly aligned'%(chr_num))
df.drop(columns=['CM'], inplace=True, errors='ignore')
df.drop(columns=SNP_COLUMNS, inplace=True, errors='raise')
df_annot_chr = pd.concat(df_annot_chr_list, axis=1)
#make sure all required columns were found
df_annot_chr.drop(columns=['CM'], inplace=True, errors='ignore')
found_missing_col = False
for colname in SNP_COLUMNS:
if colname not in df_annot_chr.columns:
logging.error('%s has a missing column: %s'%(annot_filename, colname))
found_missing_col = True
if found_missing_col:
raise ValueError('Missing columns found in %s'%(annot_filename))
#subset annotations if requested
if args.anno is not None:
anno_to_use = args.anno.split(',')
assert np.all(np.isin(anno_to_use, df_annot_chr.columns))
df_annot_chr = df_annot_chr[SNP_COLUMNS + anno_to_use]
        #if we have more annotations than ref-ld columns, some annotations may have been removed, so remove them here as well
if not np.all(np.isin(self.ref_ld_cnames, df_annot_chr.columns)):
missing_annot = [c for c in self.ref_ld_cnames if c not in df_annot_chr.columns]
raise ValueError('The following annotations have LD-scores but are not in any of the annotation files: %s'%(missing_annot))
if len(self.ref_ld_cnames) < len(df_annot_chr.columns) - len(SNP_COLUMNS):
df_annot_chr = df_annot_chr[SNP_COLUMNS + self.ref_ld_cnames]
#make sure that we get the same columns as the ones in the LD-score files
        if [c for c in df_annot_chr.columns if c not in SNP_COLUMNS] != self.ref_ld_cnames:
            raise ValueError('Annotation names in the annotation files do not match the ones in the LD-scores files')
return df_annot_chr
def compute_snpvar_chr(self, args, chr_num, use_ridge):
#load annotations file from disk
df_annot_chr = self.load_annotations_file(args, chr_num, use_ridge)
#extract taus from a jknife object
if use_ridge:
hsqhat = self.hsqhat_ridge
jknife = hsqhat.jknife_ridge
#make sure that the chromosome exists in one set
found_chrom = np.any([chr_num in chr_set for chr_set in jknife.chromosome_sets])
if not found_chrom:
                raise ValueError('not all chromosomes have tau estimates - please make sure that the intersection of SNPs in the sumstats and annotation data spans all chromosomes')
#find the relevant set number
set_num=None
for chr_set_i, chr_set in enumerate(jknife.chromosome_sets):
if chr_num not in chr_set:
assert set_num is None
set_num = chr_set_i
if set_num is None:
raise ValueError('Could not find Ridge predictions for chromosome %d'%(chr_num))
#compute and return snpvar
taus = jknife.est_loco_ridge[set_num][:hsqhat.n_annot] / hsqhat.Nbar
else:
hsqhat = self.hsqhat
jknife = hsqhat.jknife
if len(jknife.est_loco) != args.num_chr:
                raise ValueError('not all chromosomes have tau estimates - please make sure that the intersection of SNPs in the sumstats and annotation data spans all chromosomes')
taus = jknife.est_loco[chr_num-1][:hsqhat.n_annot] / hsqhat.Nbar
#save the taus to disk
taus_output_file = get_file_name(args, ('taus_ridge' if use_ridge else 'taus_nn'), chr_num, verify_exists=False)
df_taus = pd.Series(taus, index=df_annot_chr.drop(columns=SNP_COLUMNS, errors='raise').columns)
df_taus.index.name = 'ANNOTATION'
df_taus.name = 'ANNOTATION_COEFFICIENT'
df_taus.to_csv(taus_output_file, header=True, index=True, sep='\t')
#compute and return the snp variances
df_snpvar_chr = df_annot_chr.drop(columns=SNP_COLUMNS, errors='raise').dot(taus)
df_snpvar_chr = df_snpvar_chr.to_frame(name='SNPVAR')
df_snpvar_chr = pd.concat((df_annot_chr[SNP_COLUMNS], df_snpvar_chr), axis=1)
return df_snpvar_chr
def compute_snpvar(self, args, use_ridge):
logging.info('Computing per-SNP h^2 for each chromosome...')
#iterate over chromosomes
df_snpvar_chr_list = []
for chr_num in tqdm(range(1, args.num_chr+1)):
df_snpvar_chr = self.compute_snpvar_chr(args, chr_num, use_ridge=use_ridge)
df_snpvar_chr_list.append(df_snpvar_chr)
df_snpvar = pd.concat(df_snpvar_chr_list, axis=0)
df_snpvar.reset_index(inplace=True, drop=True)
#save snpvar to a class member
if use_ridge:
self.df_snpvar_ridge = df_snpvar
else:
self.df_snpvar = df_snpvar
def create_df_bins(self, bin_sizes, df_snpvar, df_snpvar_sorted=None, min_bin_size=10):
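        #Bin-rearrangement sketch: bins smaller than min_bin_size are topped up with SNPs
        #from the previous (weaker) bin, starting from the strongest bin and moving backwards.
        #Illustrative example: with bin_sizes=[200, 3, 500] and min_bin_size=10, seven SNPs
        #move from the first bin into the second, yielding bin_sizes=[193, 10, 500].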
#sort df_snpvar if needed
if df_snpvar_sorted is None:
df_snpvar_sorted = df_snpvar['SNPVAR'].sort_values()
assert bin_sizes.sum() == df_snpvar_sorted.shape[0]
#rearrange bins to prevent very small bins
bin_i = len(bin_sizes)-1
while True:
#if the current bin is large enough, proceed to the previous one
if bin_sizes[bin_i] >= min_bin_size:
bin_i -= 1
if bin_i==0: break
continue
            #Look up the weakest SNP in the current bin and the strongest SNP in the previous bin
            bin_start_ind = bin_sizes[:bin_i].sum()
            weakest_bin_snp = df_snpvar_sorted.iloc[::-1].iloc[bin_start_ind]
            strongest_lastbin_snp = df_snpvar_sorted.iloc[::-1].iloc[bin_start_ind-1]
num_snps_to_transfer = np.minimum(min_bin_size-bin_sizes[bin_i], bin_sizes[bin_i-1])
bin_sizes[bin_i] += num_snps_to_transfer
bin_sizes[bin_i-1] -= num_snps_to_transfer
#if we emptied the previous bin, delete it
if bin_sizes[bin_i-1]==0:
bin_sizes = np.concatenate((bin_sizes[:bin_i-1], bin_sizes[bin_i:]))
bin_i -= 1
#if the current bin is large enough, move to the previous one
if bin_sizes[bin_i] >= min_bin_size:
bin_i -= 1
if bin_i==0: break
#create df_bins
ind=0
df_bins = pd.DataFrame(index=df_snpvar_sorted.index)
for bin_i, bin_size in enumerate(bin_sizes):
snpvar_bin = np.zeros(df_bins.shape[0], dtype=bool)
snpvar_bin[ind : ind+bin_size] = True
df_bins['snpvar_bin%d'%(len(bin_sizes) - bin_i)] = snpvar_bin
ind += bin_size
assert np.all(df_bins.sum(axis=0) == bin_sizes)
df_bins = df_bins.iloc[:, ::-1]
assert df_bins.shape[0] == df_snpvar.shape[0]
assert np.all(df_bins.sum(axis=1)==1)
#reorder df_bins
df_bins = df_bins.loc[df_snpvar.index]
df_bins = pd.concat((df_snpvar[SNP_COLUMNS], df_bins), axis=1)
assert np.all(df_bins.index == df_snpvar.index)
return df_bins
def partition_snps_Ckmedian(self, args, use_ridge):
logging.info('Clustering SNPs into bins using the R Ckmeans.1d.dp package')
#try loading the Ckmeans.1d.dp package
try:
import rpy2
import rpy2.robjects.numpy2ri as numpy2ri
try:
from importlib import reload
reload(rpy2.robjects.numpy2ri)
            except Exception:
pass
import rpy2.robjects as ro
ro.conversion.py2ri = numpy2ri
numpy2ri.activate()
from rpy2.robjects.packages import importr
importr('Ckmeans.1d.dp')
median_seg_func = ro.r('Ckmedian.1d.dp')
mean_seg_func = ro.r('Ckmeans.1d.dp')
        except Exception:
            logging.error('Could not load the R package Ckmeans.1d.dp. Either install it or rerun PolyFun with --skip-Ckmedian')
            logging.error('')
            raise
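        #Ckmeans.1d.dp is available on CRAN; it can typically be installed by running install.packages('Ckmeans.1d.dp') in an R session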
#access the right class member
if use_ridge:
df_snpvar = self.df_snpvar_ridge
else:
df_snpvar = self.df_snpvar
#sort df_snpvar
df_snpvar_sorted = df_snpvar['SNPVAR'].sort_values()
#perform the segmentation
if args.num_bins is None or args.num_bins<=0:
            logging.info('Determining the optimal number of bins (if this is slow, consider specifying e.g. --num-bins 20)')
seg_obj = median_seg_func(df_snpvar_sorted.values, k=np.array([5,30]))
else:
seg_obj = median_seg_func(df_snpvar_sorted.values, k=args.num_bins)
bin_sizes = np.array(seg_obj.rx2('size')).astype(np.int64)
num_bins = len(bin_sizes)
logging.info('Ckmedian.1d.dp partitioned SNPs into %d bins'%(num_bins))
#define df_bins
df_bins = self.create_df_bins(bin_sizes, df_snpvar, df_snpvar_sorted=df_snpvar_sorted)
return df_bins
def partition_snps_Kmeans(self, args, use_ridge):
logging.info('Clustering SNPs into bins using K-means clustering with %d bins'%(args.num_bins))
#make sure that we can run K-means clustering
assert args.num_bins is not None and args.num_bins>0
try:
from sklearn.cluster import KMeans
except ImportError:
raise ImportError('sklearn not properly installed. Please reinstall it')
#access the right class member
if use_ridge: df_snpvar = self.df_snpvar_ridge
else: df_snpvar = self.df_snpvar
#perform K-means clustering
kmeans_obj = KMeans(n_clusters=args.num_bins)
kmeans_obj.fit(df_snpvar[['SNPVAR']])
assert kmeans_obj.cluster_centers_.shape[0] == args.num_bins
#Make sure that clusters are contiguous
bins_order = np.argsort(kmeans_obj.cluster_centers_[:,0])
for bin_i, cluster_label in enumerate(bins_order[:-1]):
next_cluster_label = bins_order[bin_i+1]
assert df_snpvar.loc[kmeans_obj.labels_==cluster_label, 'SNPVAR'].max() <= df_snpvar.loc[kmeans_obj.labels_==next_cluster_label, 'SNPVAR'].min()
#define bin_sizes
bin_sizes = np.bincount(kmeans_obj.labels_)[bins_order]
#define df_bins
df_bins = self.create_df_bins(bin_sizes, df_snpvar, df_snpvar_sorted=None)
return df_bins
def partition_snps_to_bins(self, args, use_ridge):
#if skip_ckmedian was specified, run regular K-means
if args.skip_Ckmedian:
self.df_bins = self.partition_snps_Kmeans(args, use_ridge=use_ridge)
else:
self.df_bins = self.partition_snps_Ckmedian(args, use_ridge=use_ridge)
def save_bins_to_disk(self, args):
logging.info('Saving SNP-bins to disk')
for chr_num in tqdm(range(1, args.num_chr+1)):
#save bins file to disk
df_bins_chr = self.df_bins.query('CHR==%d'%(chr_num))
bins_chr_file = get_file_name(args, 'bins', chr_num, verify_exists=False)
df_bins_chr.to_parquet(bins_chr_file, index=False)
#save M files to disk
M_chr_file = get_file_name(args, 'M', chr_num, verify_exists=False)
M_chr = df_bins_chr.drop(columns=SNP_COLUMNS).sum(axis=0).values
np.savetxt(M_chr_file, M_chr.reshape((1, M_chr.shape[0])), fmt='%i')
def save_snpvar_to_disk(self, args, use_ridge, constrain_range):
if constrain_range:
logging.info('Saving constrained SNP variances to disk')
else:
logging.info('Saving SNP variances to disk')
#determine which df_snpvar to use
if use_ridge: df_snpvar = self.df_snpvar_ridge
else: df_snpvar = self.df_snpvar
#constrain the ratio between the largest and smallest snp-var
if constrain_range:
df_snpvar = df_snpvar.copy()
h2_total = df_snpvar['SNPVAR'].sum()
min_snpvar = df_snpvar['SNPVAR'].max() / args.q
df_snpvar.loc[df_snpvar['SNPVAR'] < min_snpvar, 'SNPVAR'] = min_snpvar
df_snpvar['SNPVAR'] *= h2_total / df_snpvar['SNPVAR'].sum()
assert np.isclose(df_snpvar['SNPVAR'].sum(), h2_total)
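            #Illustrative example: with q=100 and SNPVAR values [0.1, 0.0001], min_snpvar=0.001,
            #so 0.0001 is raised to 0.001 and all values are then rescaled to preserve the total h^2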
#merge snpvar with sumstats
try:
df_sumstats = pd.read_parquet(args.sumstats)
except (ArrowIOError, ArrowInvalid):
            df_sumstats = pd.read_table(args.sumstats, sep=r'\s+')
df_sumstats.drop(columns=['SNP'], errors='ignore', inplace=True)
for col in ['CHR', 'BP', 'A1', 'A2']:
if col not in df_sumstats.columns:
raise ValueError('sumstats file has a missing column: %s'%(col))
df_snpvar = set_snpid_index(df_snpvar, copy=True)
df_sumstats = set_snpid_index(df_sumstats)
        snpvar_cols = df_snpvar.columns.copy()
df_snpvar.drop(columns=['CHR', 'BP', 'A1', 'A2'], inplace=True)
df_snpvar = df_snpvar.merge(df_sumstats, left_index=True, right_index=True)
        df_snpvar = df_snpvar[list(snpvar_cols) + [c for c in df_sumstats.columns if c not in list(snpvar_cols)]]
if df_snpvar.shape[0] < df_sumstats.shape[0]:
error_message = 'not all SNPs in the sumstats file and/or in the LD reference files are also in the annotations file'
if args.allow_missing:
logging.warning(error_message + '. Keeping %d/%d SNPs'%(df_snpvar.shape[0], df_sumstats.shape[0]))
else:
raise ValueError(error_message + '. If you wish to omit the missing SNPs, please use the flag --allow-missing')
#iterate over chromosomes
for chr_num in tqdm(range(1, args.num_chr+1)):
#define output file name
output_fname = 'snpvar'
if use_ridge: output_fname += '_ridge'
if constrain_range: output_fname += '_constrained'
snpvar_chr_file = get_file_name(args, output_fname, chr_num, verify_exists=False)
#save snpvar to file
df_snpvar_chr = df_snpvar.query('CHR==%d'%(chr_num))
df_snpvar_chr.to_csv(snpvar_chr_file, index=False, sep='\t', compression='gzip', float_format='%0.4e')
def polyfun_h2_L2(self, args):
#run Ridge regression
self.run_ldsc(args, use_ridge=True, nn=False, evenodd_split=False, keep_large=False)
#compute per-SNP h^2 based on L2-regularized S-LDSC coefficients
self.compute_snpvar(args, use_ridge=True)
#save L2-regularized S-LDSC per-SNP h^2 to disk
self.save_snpvar_to_disk(args, use_ridge=True, constrain_range=True)
self.save_snpvar_to_disk(args, use_ridge=True, constrain_range=False)
        #partition SNPs into bins and save them to disk, unless explicitly requested not to
if not args.no_partitions:
self.partition_snps_to_bins(args, use_ridge=True)
self.save_bins_to_disk(args)
def load_bins_chr(self, args, chr_num):
bins_file = get_file_name(args, 'bins', chr_num)
df_bins_chr = pd.read_parquet(bins_file)
return df_bins_chr
def compute_ld_scores(self, args):
#define the range of chromosomes to iterate over
if args.chr is None:
chr_range = range(1, args.num_chr+1)
else:
chr_range = range(args.chr, args.chr+1)
#iterate over chromosomes and compute LD-scores
for chr_num in tqdm(chr_range, disable=len(chr_range)==1):
#load or extract the bins for the current chromosome
try:
df_bins_chr = self.df_bins.query('CHR==%d'%(chr_num))
except AttributeError:
df_bins_chr = self.load_bins_chr(args, chr_num)
#compute LD-scores for this chromosome
if args.ld_ukb:
if args.ld_dir is None: ld_dir = tempfile.mkdtemp()
else: ld_dir = args.ld_dir
df_bins_chr = set_snpid_index(df_bins_chr)
df_ldscores_chr = compute_ldscores_chr(df_bins_chr, ld_dir=ld_dir, use_ukb=True)
elif args.bfile_chr is not None:
df_ldscores_chr = self.compute_ldscores_plink_chr(args, chr_num, df_bins_chr)
else:
                raise ValueError('no LD-score computation method specified')
#save the LD-scores to disk
ldscores_output_file = get_file_name(args, 'ldscores', chr_num, verify_exists=False)
df_ldscores_chr.to_parquet(ldscores_output_file, index=False)
def compute_ldscores_plink_chr(self, args, chr_num, df_bins_chr):
# read bim/snp
bim_file = get_file_name(args, 'bim', chr_num)
array_snps = parse.PlinkBIMFile(bim_file)
df_bim = array_snps.df
df_bim = set_snpid_index(df_bim)
#Remove annotations of SNPs that are not in the .bim file
df_bins_chr = set_snpid_index(df_bins_chr)
df_bins_chr = df_bins_chr.loc[df_bins_chr.index.isin(df_bim.index)]
#make sure that all SNPs have a bin
keep_snps = None
if np.any(~df_bim.index.isin(df_bins_chr.index)):
            error_msg = 'Not all SNPs were assigned a bin (meaning some SNPs in the summary statistics and/or in the LD reference files are not in the annotation files)'
if args.allow_missing:
is_good_snp = df_bim.index.isin(df_bins_chr.index)
if not np.any(is_good_snp):
raise ValueError('No SNPs in chromosome %d have annotations'%(chr_num))
keep_snps = np.where(is_good_snp)[0]
logging.warning(error_msg)
                logging.warning('Keeping only %d/%d SNPs in chromosome %d that have annotations'%(np.sum(is_good_snp), len(is_good_snp), chr_num))
else:
raise ValueError(error_msg + '. If you wish to omit the missing SNPs, please use the flag --allow-missing')
#find #individuals in bfile
fam_file = get_file_name(args, 'fam', chr_num)
        df_fam = pd.read_table(fam_file, header=None, usecols=[5], sep=r'\s+')
n = df_fam.shape[0]
#find keep_indivs
        if args.keep is None:
            keep_indivs = None
        else:
            #args defines --bfile-chr (not --bfile), so use the per-chromosome fam file found above
            array_indivs = parse.PlinkFAMFile(fam_file)
            keep_indivs = __filter__(args.keep, 'individuals', 'include', array_indivs)
            logging.info('after applying --keep, %d individuals remain'%(len(keep_indivs)))
#read plink file
logging.info('Loading SNP file...')
bed_file = get_file_name(args, 'bed', chr_num)
geno_array = ldscore.PlinkBEDFile(bed_file, n, array_snps, keep_snps=keep_snps,
keep_indivs=keep_indivs, mafMin=None)
#remove omitted SNPs from df_bim
if len(geno_array.kept_snps) != df_bim.shape[0]:
assert np.all(np.array(geno_array.kept_snps) == np.sort(np.array(geno_array.kept_snps)))
assert geno_array.kept_snps[-1] < df_bim.shape[0]
df_bim = df_bim.iloc[geno_array.kept_snps]
#rearrange annotations to match the order of SNPs in the plink file
assert df_bins_chr.shape[0] >= df_bim.shape[0]
if (df_bins_chr.shape[0] > df_bim.shape[0]) or np.any(df_bins_chr.index != df_bim.index):
assert np.all(df_bim.index.isin(df_bins_chr.index))
df_bins_chr = df_bins_chr.loc[df_bim.index]
# determine block widths
num_wind_args = np.array((args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm), dtype=bool)
if np.sum(num_wind_args) != 1:
raise ValueError('Must specify exactly one --ld-wind option')
if args.ld_wind_snps:
max_dist = args.ld_wind_snps
            coords = np.arange(geno_array.m)
elif args.ld_wind_kb:
max_dist = args.ld_wind_kb*1000
coords = np.array(df_bim['BP'])
if len(np.unique(coords)) == 1:
raise ValueError('bim file has no basepair data --- please use a different ld-wind option')
elif args.ld_wind_cm:
max_dist = args.ld_wind_cm
coords = np.array(df_bim['CM'])
if len(np.unique(coords)) == 1:
raise ValueError('bim file has no CM data --- please use a different ld-wind option')
#compute LD-scores
block_left = ldscore.getBlockLefts(coords, max_dist)
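        #block_left[i] is the index of the leftmost SNP within max_dist of SNP i; e.g.
        #(illustrative) for coords=[1,4,6,7,8] and max_dist=2, block_left=[0,1,1,2,2]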
        if block_left[-1] == 0:
error_msg = 'Only a single block selected - this is probably a mistake'
raise ValueError(error_msg)
t0 = time.time()
geno_array._currentSNP = 0
logging.info('Computing LD scores for chromosome %d'%(chr_num))
ldscores = geno_array.ldScoreVarBlocks(block_left, args.chunk_size, annot=df_bins_chr.drop(columns=SNP_COLUMNS).values)
#create an ldscores df
df_ldscores = pd.DataFrame(ldscores, index=df_bins_chr.index, columns=df_bins_chr.drop(columns=SNP_COLUMNS).columns)
df_ldscores = pd.concat((df_bins_chr[SNP_COLUMNS], df_ldscores), axis=1)
return df_ldscores
def compute_h2_bins(self, args, constrain_range):
#run S-LDSC
self.run_ldsc(args, use_ridge=False, nn=True, evenodd_split=True, keep_large=False)
#compute per-SNP h^2
self.compute_snpvar(args, use_ridge=False)
#save per-SNP h^2 to disk
self.save_snpvar_to_disk(args, use_ridge=False, constrain_range=constrain_range)
def polyfun_main(self, args):
#compute snp variances using L2-regularized S-LDSC with an odd/even chromosome split
if args.compute_h2_L2:
self.polyfun_h2_L2(args)
#compute LD-scores of SNP partitions
if args.compute_ldscores:
self.compute_ld_scores(args)
if args.compute_h2_bins:
self.compute_h2_bins(args, constrain_range=True)
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
#partitioning-related parameters
parser.add_argument('--num-bins', type=int, default=None, help='Number of bins to partition SNPs into. If not specified, PolyFun will automatically select this number based on a BIC criterion')
parser.add_argument('--anno', default=None, help='Comma-delimited list of annotations to use (if not specified, will use all annotations)')
parser.add_argument('--skip-Ckmedian', default=False, action='store_true', help='If specified, use a regular K-means algorithm instead of the R Ckmeans.1d.dp package')
#mode related parameters
parser.add_argument('--compute-ldscores', default=False, action='store_true', help='If specified, PolyFun will compute LD-scores of SNP bins')
parser.add_argument('--compute-h2-L2', default=False, action='store_true', help='If specified, PolyFun will compute per-SNP h2 using L2-regularized S-LDSC')
parser.add_argument('--compute-h2-bins', default=False, action='store_true', help='If specified, PolyFun will robustly compute per-SNP h2 based on SNP bins')
parser.add_argument('--no-partitions', default=False, action='store_true', help='If specified, PolyFun will not partition SNPs into bins (do this only if you want to use the per-SNP h^2 from L2-regularized S-LDSC as prior causal probabilities, which is typically not recommended)')
#ld-score related parameters
    parser.add_argument('--chr', type=int, default=None, help='Chromosome number (only applicable together with --compute-ldscores). If not set, PolyFun will compute LD-scores for all chromosomes')
parser.add_argument('--ld-wind-cm', type=float, default=None, help='window size to be used for estimating LD-scores in units of centiMorgans (cM).')
parser.add_argument('--ld-wind-kb', type=int, default=None, help='window size to be used for estimating LD-scores in units of Kb.')
parser.add_argument('--ld-wind-snps', type=int, default=None, help='window size to be used for estimating LD-scores in units of SNPs.')
parser.add_argument('--chunk-size', type=int, default=50, help='chunk size for LD-scores calculation')
parser.add_argument('--keep', default=None, help='File with ids of individuals to use when computing LD-scores')
#per-SNP h2 related parameters
    parser.add_argument('--q', type=float, default=100, help='The maximum ratio between the largest and smallest truncated per-SNP heritabilities')
#data input/output parameters
parser.add_argument('--sumstats', help='Input summary statistics file')
parser.add_argument('--ref-ld-chr', help='Suffix of LD-score files (as in ldsc)')
parser.add_argument('--w-ld-chr', help='Suffix of LD-score weights files (as in ldsc)')
parser.add_argument('--bfile-chr', default=None, help='Prefix of plink files (used to compute LD-scores)')
parser.add_argument('--ld-ukb', default=False, action='store_true', help='If specified, PolyFun will use UKB LD matrices to compute LD-scores')
parser.add_argument('--ld-dir', default=None, help='The path of a directory with UKB LD files (if not specified PolyFun will create a temporary directory)')
parser.add_argument('--output-prefix', required=True, help='Prefix of all PolyFun output file names')
parser.add_argument('--allow-missing', default=False, action='store_true', help='If specified, PolyFun will not terminate if some SNPs with sumstats are not found in the annotations files')
parser.add_argument('--num-chr', type=int, default=22, help='Number of chromosomes for your target organism (default is 22)')
#LDSC parameters
parser.add_argument('--nnls-exact', default=False, action='store_true', help='If specified, S-LDSC will estimate non-negative taus using an exact instead of an approximate solver (this will be slower but slightly more accurate)')
#check package versions
check_package_versions()
#show splash screen
splash_screen()
#extract args
args = parser.parse_args()
#check that the output directory exists
if len(os.path.dirname(args.output_prefix))>0 and not os.path.exists(os.path.dirname(args.output_prefix)):
raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.output_prefix)))
#configure logger
configure_logger(args.output_prefix)
#check and fix args
args = check_args(args)
check_files(args)
#create and run PolyFun object
polyfun_obj = PolyFun()
polyfun_obj.polyfun_main(args)
print()