-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind-cm-genes.sh
executable file
·41 lines (35 loc) · 1.05 KB
/
find-cm-genes.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
NOW=$(date +"%Y-%m-%d")
OUTDIR="$NOW-genes-1000bp-updown"
# Extract the gene names from the tblout files and return the sequence using
# ENSEMBL
for tblout in $NOW-nhmmer-vs-cdna-longest/*.tblout; do
SPECIES=$(echo $(basename $tblout) | cut -f1 -d.);
cat $tblout | \
grep -v "#" | \
awk '{print $18}' | \
cut -f2 -d":" | \
sort | uniq | ./gseq-flank
done
# Move the results into a folder
mkdir $OUTDIR
mv *.1000bp.updown.fa $OUTDIR
# Now run nhmmer for each gene against the CM promoter model and determine
# if a CM hit is present
for i in $OUTDIR/*.1000bp.updown.fa; do
GENE=$(echo $(basename $i) | cut -f1 -d.);
nhmmer --dfamtblout "$GENE.dfamtblout" \
--tblout "$GENE.tblout" \
-E 1e-10 \
--cpu=22 \
./$NOW-CM-promoter/$NOW-human-NBPF-CM-promoter-900.hmm \
$i;
done
# Move the results into a folder
mv *.dfamtblout $OUTDIR
mv *.tblout $OUTDIR
# Print the CM counts for each gene
for i in $OUTDIR/*.tblout; do
GENE=$(echo $(basename $i) | cut -f1 -d.);
GENECOUNT=$(cat $i| grep -v "#" | wc -l)
echo $GENE $GENECOUNT
done