# analyzeHits

#!/usr/bin/python
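'''
Takes a list of hit counts and produces some statistics.

INPUT: the stream from countHits, one record per line with four
whitespace-separated columns (read below as reg, org, genes, others):
ActR 1A42 0 537
ActR 2011 1 600
ActR 3841 0 644
ActR 5A14 1 733
[...]
'''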
def getOptions(argv=None):
    '''
    Parse the command line options.

    argv: optional list of argument strings. When None (the default)
          argparse falls back to sys.argv[1:], so existing callers that
          pass nothing behave exactly as before.

    Returns the argparse namespace with the attributes near, nonear,
    genes, params and imin.
    '''
    import argparse

    # create the top-level parser
    description = "Takes a list of hits count and produces some stats"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-n', metavar='nearfile', action='store',
                        dest='near',
                        default=None,
                        help='Near genomes file')
    parser.add_argument('-N', '--no-near', action="store_true",
                        default=False,
                        dest='nonear',
                        help='Consider the near genomes but don\'t plot them')
    parser.add_argument('-G', '--genes', action="store_true",
                        default=False,
                        dest='genes',
                        help='Count the genes')
    parser.add_argument('-p', metavar='paramsfile', action='store',
                        dest='params',
                        default='params.txt',
                        help='Motif parameters file')
    parser.add_argument('-i', action="store",
                        dest='imin',
                        type=float,
                        default=22.0,
                        help='Imin: minimum information content')
    return parser.parse_args(argv)

# Parse the command line once; the rest of the script reads from `options`
options = getOptions()

# Entries of the near genomes file (one per line, stripped); the set stays
# empty when -n is not given
near = set()
if options.near is not None:
    # 'with' guarantees the file is closed once it has been read
    with open(options.near) as nearfile:
        for l in nearfile:
            near.add(l.strip())

# Maps the first column of the parameters file to its second column,
# converted to float
dparam = {}
with open(options.params) as paramsfile:
    for l in paramsfile:
        fields = l.split()
        if not fields:
            # A blank line (e.g. a trailing newline) carries no record
            continue
        reg, i = fields
        dparam[reg] = float(i)
    
import sys
import numpy as np

# reg -> list of (genes, others, genes/(genes+others)) tuples.
# d collects the records whose org is not in `near`, dother those that are.
d = {}
dother = {}

for line in sys.stdin:
    fields = line.split()
    if not fields:
        # Skip blank lines instead of failing on the unpacking below
        continue
    reg, org, genes, others = fields
    genes = float(genes)
    others = float(others)

    # Both groups are filled the same way, only the destination differs
    target = dother if org in near else d

    total = others + genes
    if total:
        target.setdefault(reg, []).append((genes, others, genes/total))
    elif options.genes:
        # No ratio can be computed without hits: such records are kept
        # (with a NaN ratio) only when counting genes, dropped otherwise
        target.setdefault(reg, []).append((genes, others, np.nan))

import math

# Every plotted mean (both groups); max(g) is used later to scale the y axis
g = set()


def _summarize(hits):
    '''
    Print one tab-delimited row per key of hits and return the plot data.

    hits maps a name to a list of (genes, others, ratio) tuples.
    Each printed row holds: name, mean genes, mean others, the plotted
    mean and the dparam value of that name.

    Returns three parallel lists, ordered by sorted name: the dparam
    values, the plotted means and their errors (std / sqrt(n)).
    The plotted quantity is the gene count when options.genes is set,
    the ratio otherwise. Each plotted mean is also added to g.
    '''
    col = 0 if options.genes else 2
    infos, means, errors = [], [], []
    for reg in sorted(hits):
        info = dparam[reg]
        rows = hits[reg]

        values = np.array([x[col] for x in rows])
        m = values.mean()
        s = values.std() / math.sqrt(len(rows))

        g.add(m)
        infos.append(info)
        means.append(m)
        errors.append(s)

        # Parenthesised single-argument print gives the same output with
        # the Python 2 statement and the Python 3 function
        print('\t'.join([reg,
                         str(np.array([x[0] for x in rows]).mean()),
                         str(np.array([x[1] for x in rows]).mean()),
                         str(m),
                         str(info)]))
    return infos, means, errors


# Print a tab-delimited table
ireg, regz, err = _summarize(d)

if options.near is not None:
    # Blank line between the two tables
    print('')
    n_ireg, n_regz, n_err = _summarize(dother)

import matplotlib.pyplot as plt
from scipy import stats
                        
# Plot
# Create the plot area
plt.figure(figsize=(8,8))
ax = plt.subplot(111)

# The near genomes are drawn only when a near file was given and -N was not
plot_near = options.near is not None and not options.nonear

# Interpolation!
slope, intercept, r_value, p_value, std_err = stats.linregress(ireg, regz)
line = slope*np.array(ireg)+intercept
ax.plot(ireg,line,'r-')
ax.fill_between(ireg, line-std_err, line+std_err)

# y coordinate of the annotation
if options.genes:
    ipsilon = max(g)-(max(g)/float(10))
else:
    ipsilon = 0.15

if plot_near:
    slope, intercept, n_r_value, n_p_value, std_err = stats.linregress(n_ireg, n_regz)
    line = slope*np.array(n_ireg)+intercept
    ax.plot(n_ireg,line,'b-')
    ax.fill_between(n_ireg, line-std_err, line+std_err)
    ax.annotate("m - r: %.3f - p: %.3f\nn - r: %.3f - p: %.3f"%(r_value, p_value, n_r_value, n_p_value), (80, ipsilon))
else:
    ax.annotate("r: %.3f - p: %.3f"%(r_value, p_value), (80, ipsilon))

# Reg
ax.errorbar(ireg, regz, yerr=err,fmt='or',label='meliloti')
# Near
if plot_near:
    ax.errorbar(n_ireg, n_regz, yerr=n_err,fmt='^b',label='near')

# Make it nice
ax.set_xlabel('I (information content, in bits)')
if options.genes:
    ax.set_ylabel('Gene hits')
    ax.legend(loc='upper left')
else:
    ax.set_ylabel('Gene hits / Total hits')
    ax.legend(loc='best')
ax.minorticks_on()
# The on/off flag is passed positionally: its keyword name is not the same
# in every matplotlib release
ax.grid(True, which='major', linestyle='-', color='gray', axis='y')
ax.grid(True, which='minor', linestyle='--', color='gray', axis='y')

# Fix the y range and mark Imin with a dashed vertical line spanning it
if options.genes:
    ymax = max(g)
else:
    ymax = 1
ax.set_ylim(0,ymax)
ax.vlines(options.imin, 0, ymax, color='k', linestyles='dashed', lw=3)

# Save the figure
# This is done before show(): once the window opened by show() has been
# closed the figure may be gone, and savefig would write an empty image
plt.savefig('genehits.png')
plt.savefig('genehits.pdf')

plt.show()