#!/bin/bash
#
# Set these environment variables to point to
# your local installation of WASP
WASP=$HOME/proj/WASP
DATA_DIR=$WASP/examples/example_data
#
# Convert SNP files to HDF5 format. This program can be used
# on output from impute2 or on VCF files. Note that
# you do not need to repeat this step here if it was already
# done in the mapping pipeline.
#
$WASP/snp2h5/snp2h5 --chrom $DATA_DIR/chromInfo.hg19.txt \
--format impute \
--geno_prob $DATA_DIR/geno_probs.h5 \
--snp_index $DATA_DIR/snp_index.h5 \
--snp_tab $DATA_DIR/snp_tab.h5 \
--haplotype $DATA_DIR/haps.h5 \
--samples $DATA_DIR/genotypes/YRI_samples.txt \
$DATA_DIR/genotypes/chr*.hg19.impute2.gz \
$DATA_DIR/genotypes/chr*.hg19.impute2_haps.gz
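#
# If your genotypes are in VCF format instead, snp2h5 can be run with
# --format vcf on the VCF files directly. For example (the chr*.vcf.gz
# filenames here are placeholders for your own VCF files):
#
# $WASP/snp2h5/snp2h5 --chrom $DATA_DIR/chromInfo.hg19.txt \
# --format vcf \
# --snp_index $DATA_DIR/snp_index.h5 \
# --snp_tab $DATA_DIR/snp_tab.h5 \
# --haplotype $DATA_DIR/haps.h5 \
# chr*.vcf.gz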
#
# Convert FASTA files to HDF5 format.
# Note that the HDF5 sequence files are only used for the GC-content
# correction step of the CHT. This step can be omitted if GC-content
# correction is not used.
#
# The chr*.fa.gz files are the fasta files for the reference genome
# and can be downloaded from a genome browser (e.g.:
# http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/)
#
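# For the hg19 assembly used in this example, downloading a single
# chromosome might look like this (URL pattern follows the UCSC
# download server noted above):
#
# wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/chromosomes/chr22.fa.gz
#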
$WASP/snp2h5/fasta2h5 --chrom $DATA_DIR/chromInfo.hg19.txt \
--seq $DATA_DIR/seq.h5 \
/data/external_public/reference_genomes/hg19/chr*.fa.gz
# loop over all individuals in samples file
H3K27AC_SAMPLES_FILE=$DATA_DIR/H3K27ac/samples.txt
ALL_SAMPLES_FILE=$DATA_DIR/genotypes/YRI_samples.txt
for INDIVIDUAL in $(cat $H3K27AC_SAMPLES_FILE)
do
echo $INDIVIDUAL
#
# read BAM files for this individual and write read counts to
# HDF5 files
#
python $WASP/CHT/bam2h5.py --chrom $DATA_DIR/chromInfo.hg19.txt \
--snp_index $DATA_DIR/snp_index.h5 \
--snp_tab $DATA_DIR/snp_tab.h5 \
--haplotype $DATA_DIR/haps.h5 \
--individual $INDIVIDUAL \
--ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
--alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
--other_as_counts $DATA_DIR/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \
--read_counts $DATA_DIR/H3K27ac/read_counts.$INDIVIDUAL.h5 \
$DATA_DIR/H3K27ac/$INDIVIDUAL.chr*.keep.rmdup.bam
done
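#
# Optional sanity check on one of the resulting HDF5 count files.
# This sketch assumes the h5py Python package is installed; NA18505
# is a placeholder sample name:
#
# python -c "import h5py; f = h5py.File('$DATA_DIR/H3K27ac/ref_as_counts.NA18505.h5', 'r'); print(list(f.keys()))"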
#
# Make a list of target regions in ChIP-seq peaks and associated SNPs
# to test with the CHT (written to chr22.peaks.txt.gz). The provided
# get_target_regions.py script can be used to identify target regions
# and test SNPs that match specific criteria (e.g. minimum number of
# heterozygous individuals and total number of allele-specific reads
# in target region). The file containing target regions and test SNPs can
# also be generated by the user (for example if a specific set of
# target regions and test SNPs are to be tested).
#
python $WASP/CHT/get_target_regions.py \
--target_region_size 2000 \
--min_as_count 10 \
--min_read_count 100 \
--min_het_count 1 \
--min_minor_allele_count 1 \
--chrom $DATA_DIR/chromInfo.hg19.txt \
--read_count_dir $DATA_DIR/H3K27ac \
--individuals $H3K27AC_SAMPLES_FILE \
--samples $ALL_SAMPLES_FILE \
--snp_tab $DATA_DIR/snp_tab.h5 \
--snp_index $DATA_DIR/snp_index.h5 \
--haplotype $DATA_DIR/haps.h5 \
--output_file $DATA_DIR/H3K27ac/chr22.peaks.txt.gz
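#
# Optionally inspect the resulting list of target regions and test
# SNPs (plain text, gzip-compressed):
#
# zcat $DATA_DIR/H3K27ac/chr22.peaks.txt.gz | head
#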
for INDIVIDUAL in $(cat $H3K27AC_SAMPLES_FILE)
do
#
# create CHT input file for this individual
#
python $WASP/CHT/extract_haplotype_read_counts.py \
--chrom $DATA_DIR/chromInfo.hg19.txt \
--snp_index $DATA_DIR/snp_index.h5 \
--snp_tab $DATA_DIR/snp_tab.h5 \
--geno_prob $DATA_DIR/geno_probs.h5 \
--haplotype $DATA_DIR/haps.h5 \
--samples $ALL_SAMPLES_FILE \
--individual $INDIVIDUAL \
--ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
--alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
--other_as_counts $DATA_DIR/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \
--read_counts $DATA_DIR/H3K27ac/read_counts.$INDIVIDUAL.h5 \
$DATA_DIR/H3K27ac/chr22.peaks.txt.gz \
| gzip > $DATA_DIR/H3K27ac/haplotype_read_counts.$INDIVIDUAL.txt.gz
done
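#
# One CHT input file should now exist per individual; a quick count
# to confirm the loop produced them all:
#
# ls $DATA_DIR/H3K27ac/haplotype_read_counts.*.txt.gz | wc -l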
#
# Adjust read counts in CHT files by modeling the
# relationship between read depth and GC content & peakiness
# in each sample.
# (first make files containing lists of input and output files)
#
IN_FILE=$DATA_DIR/H3K27ac/input_files.txt
OUT_FILE=$DATA_DIR/H3K27ac/output_files.txt
ls $DATA_DIR/H3K27ac/haplotype_read_counts* | grep -v adjusted > $IN_FILE
sed 's/\.txt/.adjusted.txt/' $IN_FILE > $OUT_FILE
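#
# Optional check that the input and output lists pair up line by line:
#
# paste $IN_FILE $OUT_FILE
#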
python $WASP/CHT/update_total_depth.py --seq $DATA_DIR/seq.h5 $IN_FILE $OUT_FILE
#
# Adjust heterozygote probabilities in CHT files to account for
# possible genotyping errors. Total counts of reference and
# alternative alleles are used to adjust the probability. In
# this example we just provide the same H3K27ac read counts; however,
# you could also use read counts combined across many different
# experiments or (perhaps ideally) from DNA sequencing.
#
for INDIVIDUAL in $(cat $H3K27AC_SAMPLES_FILE)
do
IN_FILE=$DATA_DIR/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.txt.gz
OUT_FILE=$DATA_DIR/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.hetp.txt.gz
python $WASP/CHT/update_het_probs.py \
--ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
--alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
$IN_FILE $OUT_FILE
done
CHT_IN_FILE=$DATA_DIR/H3K27ac/cht_input_file.txt
ls $DATA_DIR/H3K27ac/haplotype_read_counts*.adjusted.hetp.txt.gz > $CHT_IN_FILE
#
# Estimate overdispersion parameters for allele-specific test (beta binomial)
#
OUT_FILE=$DATA_DIR/H3K27ac/cht_as_coef.txt
python $WASP/CHT/fit_as_coefficients.py $CHT_IN_FILE $OUT_FILE
#
# Estimate overdispersion parameters for association test (beta-negative binomial)
#
OUT_FILE=$DATA_DIR/H3K27ac/cht_bnb_coef.txt
python $WASP/CHT/fit_bnb_coefficients.py --min_counts 50 --min_as_counts 10 $CHT_IN_FILE $OUT_FILE
#
# run combined haplotype test
#
OUT_FILE=$DATA_DIR/H3K27ac/cht_results.txt
python $WASP/CHT/combined_test.py --min_as_counts 10 \
--bnb_disp $DATA_DIR/H3K27ac/cht_bnb_coef.txt \
--as_disp $DATA_DIR/H3K27ac/cht_as_coef.txt \
$CHT_IN_FILE $OUT_FILE
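#
# The results file is plain text; a quick look at the first rows:
#
# head $OUT_FILE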
#
# Optionally, principal component loadings can be used as covariates
# by the CHT. An example of how to perform PCA and obtain principal
# component loadings is provided in the file example_data/H3K27ac/get_PCs.R
# Note that we only recommend using PCs as covariates when sample
# sizes are fairly large (e.g. > 30 individuals).
#
# Example of how to get PC loadings
# Rscript $DATA_DIR/H3K27ac/get_PCs.R > $DATA_DIR/H3K27ac/pcs.txt
#
# Using the first 2 PC loadings in the CHT:
# OUT_FILE=$DATA_DIR/H3K27ac/cht_results.PCs.txt
# python $WASP/CHT/combined_test.py --min_as_counts 10 \
# --bnb_disp $DATA_DIR/H3K27ac/cht_bnb_coef.txt \
# --as_disp $DATA_DIR/H3K27ac/cht_as_coef.txt \
# --num_pcs 2 --pc_file $DATA_DIR/H3K27ac/pcs.txt \
# $CHT_IN_FILE $OUT_FILE
#