-
Notifications
You must be signed in to change notification settings - Fork 23
/
gather-counts.py
59 lines (46 loc) · 1.67 KB
/
gather-counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#! /usr/bin/env python
"""
This script gathers & converts Salmon output counts into something that
edgeR can read ("counts files").
Run it in a directory above all of your Salmon output directories, and
it will create a bunch of '.counts' files that you can load into R.
See https://github.com/ngs-docs/2015-nov-adv-rna/ for background info.
C. Titus Brown, 11/2015
"""
import os, os.path
import sys
import csv
def process_quant_file(root, filename, outname):
    """
    Convert one quant.sf file into a two-column, tab-separated .counts file.

    The input is read from os.path.join(root, filename) and is expected to
    hold five tab-separated columns per data line (name, length, effective
    length, TPM, count).  The output file 'outname' starts with the header
    "transcript\\tcount" followed by one "<name>\\t<value>" line per
    transcript.

    NOTE(review): despite the "count" header, the value written is the TPM
    column converted to float, not the count column -- confirm that this is
    what the downstream analysis expects.

    root     -- directory that contains the quant file
    filename -- name of the quant file inside 'root'
    outname  -- path of the .counts file to create (overwritten if present)

    Raises ValueError if a non-blank data line does not have exactly five
    tab-separated fields or if its TPM field is not a number.
    """
    sys.stderr.write('Loading counts from: %s %s\n' % (root, filename))

    full_file = os.path.join(root, filename)

    # 'with' guarantees both files are closed (and the output flushed) even
    # if a malformed line raises part-way through.  The input is opened
    # first so a missing quant file does not leave an empty output behind.
    with open(full_file) as infp, open(outname, 'w') as outfp:
        outfp.write("transcript\tcount\n")

        for line in infp:
            # Skip the column-header line.  Caveat: this also skips any
            # transcript whose name happens to start with 'Name'.
            if line.startswith('Name'):
                continue

            record = line.strip()
            if not record:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue

            name, length, eff_length, tpm, count = record.split('\t')
            outfp.write("%s\t%s\n" % (name, float(tpm)))
def main():
    """
    Find all the quant.sf files below the current directory and convert
    each one into a properly named .counts file.

    Here, "proper name" means "directory.counts", where 'directory' is the
    name of the directory that holds the quant file.  The .counts files are
    written to the current working directory.  Directories that share a
    base name therefore map to the same output file.

    When done, the sorted output file names are printed to stdout, each in
    double quotes and separated by ",\\n".
    """
    quantlist = []

    start_dir = '.'
    sys.stderr.write('Starting in: %s\n' % os.path.abspath(start_dir))

    for root, _dirs, files in os.walk(start_dir):
        for filename in files:
            if filename.endswith('quant.sf'):
                # Resolve to an absolute path first so that a quant file
                # sitting directly in the start directory is named after
                # that directory instead of producing '..counts'.
                dirname = os.path.basename(os.path.abspath(root))
                outname = dirname + '.counts'

                process_quant_file(root, filename, outname)
                quantlist.append(outname)

                # One quant file per directory is enough; move on to the
                # next directory.
                break

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(",\n".join(['"%s"' % name for name in sorted(quantlist)]))
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()