forked from petercombs/HybridSliceSeq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
GetMapStats.py
163 lines (149 loc) · 6.59 KB
/
GetMapStats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from __future__ import division, print_function
import pandas as pd
import gzip
import multiprocessing as mp
from itertools import repeat
from os import path
from glob import glob
from pysam import Samfile
from collections import defaultdict
def parse_args():
from argparse import ArgumentParser
parser = ArgumentParser(description=("Take a collection of fpkm_tracking "
"files and makes a summary table "
"with all of the data as a TSV file"))
parser.add_argument('--confidence', '-c', default=False,
dest='conf', action='store_true',
help="Include confidence intervals")
parser.add_argument('--params', '-p', default=False,
dest='has_params',
help="Parameters file including renaming conventions: "
'Directory in column "Label", stage in column "Stage"')
parser.add_argument('--key', '-k', default='gene_short_name',
help='The column to combine on (FBgn in tracking_id)')
parser.add_argument('--mapped-bamfile', '-b', default='assigned_dmelR.bam',
help='The bam file to look in for mapped reads')
parser.add_argument('--in-subdirectory', default=None,
help='Subdirectory in '
'basedir/sample/subdirectory/genes.fpkm_tracking')
parser.add_argument('--out-basename', '-o', dest='basefile',
default='summary',
help='The base of the output filename to which'
' modifiers may be appended depending on options'
'(defaults to "summary")')
parser.add_argument('--count-unique', '-u', default=False,
action='store_true',
help='When removing samples, use the number of unique '
'reads, not total number of mappings')
parser.add_argument('--count-all', '-a', default=False,
action='store_true',
help='Count all reads in FASTQ files. This will be '
'more accurate, but also potentially slower than using '
'the heuristic that all but the last files are capped at '
'4M reads')
parser.add_argument('--translate-labels', '-T', default=False,
action='store_true')
parser.add_argument('basedir',
help='The directory containing directories, which '
'contain genes.fpkm_tracking files')
args = parser.parse_args()
return args
def get_stagenum(name, series, dir):
if not [c for c in name if c.isdigit()]:
return 0
# Slice until the first digit
name_base = name[:[i for i, c in enumerate(name) if c.isdigit()][0]]
dir = {'+': 1, '?': 1, '-': -1}[dir]
return (sorted({ix for ix in series if ix.startswith(name_base)})[::dir]
.index(name)) + 1
def mp_count_reads(args):
return count_reads(*args)
def count_reads(bamname, has_carrier=False, count_unique=False, count_all=False,
force_count=False):
mapfile = path.splitext(bamname)[0]+'.mapstats'
if path.exists(mapfile) and not force_count:
return list(pd.read_table(mapfile).ix[0])
sf = Samfile(bamname)
u_mapped = pd.np.nan
if count_unique:
u_mapped = 0
for read in sf:
u_mapped += not read.is_secondary
reads = 0
rfs = [entry for entry in
sf.header['PG'][0]['CL'].split()
if entry.endswith('.gz') or entry.endswith('.fastq')][0]
rfs = rfs.split(',')
rfs_by_dir = defaultdict(list)
for rf in rfs:
rfs_by_dir[path.dirname(rf)].append(rf)
reads = pd.np.nan
if not has_carrier:
reads = 0
for rfs in rfs_by_dir.values():
rfs = sorted(rfs)
if count_all:
for rf in rfs:
for i, line in enumerate(gzip.open(rfs[-1])):
pass
reads += (i+1)//4
else:
reads += 4e6 * (len(rfs) - 1)
for i, line in enumerate(gzip.open(rfs[-1])):
pass
reads += (i+1)//4
return u_mapped, sf.mapped, reads
if __name__ == "__main__":
args = parse_args()
if args.in_subdirectory:
fnames = glob(path.join(args.basedir, '*', args.in_subdirectory,
args.mapped_bamfile))
else:
fnames = glob(path.join(args.basedir, '*', args.mapped_bamfile))
has_carrier = defaultdict(bool)
if args.has_params:
params = (pd.read_table(args.has_params,
comment='#',
converters={'Label': str},
#keep_default_na=False, na_values='---',
)
)
params = params.dropna(how='any')
for dirname in params.Label:
has_carrier[dirname] = (set(params.ix[params.Label == dirname,
'CarrierSpecies'])
!= set(['---']))
has_carrier[dirname] = False
else:
has_carrier = defaultdict(bool)
pool = mp.Pool(30)
fdirs = [fname.split('/')[1] for fname in fnames]
res = pool.map(mp_count_reads,
zip(fnames,
[has_carrier[dirname] for dirname in fdirs],
repeat(args.count_unique),
repeat(args.count_all)))
if args.has_params:
new_fdirs = []
for dirname in fdirs:
indices = params.Label.index[params.Label == dirname]
if len(indices):
i = indices[0]
new_dirname = "{genotype}_cyc{stage}_sl{num:02}".format(
genotype=params.ix[i, 'SampleGenotype'],
stage=params.ix[i, 'Stage'],
num=get_stagenum(dirname, params.Label,
params.ix[i, 'Direction']))
else:
new_dirname = dirname
new_fdirs.append(new_dirname)
df = pd.DataFrame(index=fdirs,
data=pd.np.array(res),
columns=['UniqueMapped', 'AllMapped', 'RawReads'],
)
if args.translate_labels:
df['LongName'] = new_fdirs
df.sort_index().to_csv(path.join(args.basedir,
'map_stats.tsv'),
sep='\t',
)