# regDist

#!/usr/bin/python

import sys
import networkx as nx

def getOptions():
    """Parse the command line of the script.

    Reads ``sys.argv`` through argparse and returns the resulting namespace,
    which carries the following attributes:

    - ``GML_FILE``: pangenome regulatory network file (positional)
    - ``babudist`` (-B): True to compute the Babu et al., 2006 distance
    - ``downstreamdist`` (-D): True to compute the downstream genes distance
    - ``semanticdist`` (-S): True to use gene pairs semantic similarities
    - ``semdir`` (-s): semantic similarity directory, None when not given

    The program is terminated by argparse (SystemExit) on a malformed command
    line, and also when -S is requested without -s, since the semantic metric
    cannot run without its directory.
    """
    import argparse

    # create the top-level parser
    description = ("Org-to-org distance on pangenome scale regulatory networks")
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('GML_FILE', action='store',
                            help='Pangenome regulatory network')
    parser.add_argument('-B', '--babu-dist', action="store_true",
            default=False,
            dest='babudist',
            help='Compute the distances as Babu et al., 2006 (doi:10.1016/j.jmb.2006.02.019)')
    parser.add_argument('-D', '--downstream-dist', action="store_true",
            default=False,
            dest='downstreamdist',
            help='Compute the distances considering all the downstream regulated genes')

    # '--semnatic-dist' is the original (misspelled) flag: it is kept as an
    # alias so that existing command lines keep working.
    # NOTE(review): very short abbreviations such as '--sem' now match both
    # spellings and argparse may report them as ambiguous.
    parser.add_argument('-S', '--semantic-dist', '--semnatic-dist',
            action="store_true",
            default=False,
            dest='semanticdist',
            help='Compute the distances loading gene pairs semantic similarities')
    parser.add_argument('-s', metavar='semanticDir', action='store',
            dest='semdir',
            default=None,
            help='Semantic similarity directory (regID.*)')

    options = parser.parse_args()

    # Without a directory the semantic metric can only crash later on
    if options.semanticdist and options.semdir is None:
        parser.error('the semantic distance (-S) requires the semantic '
                     'similarity directory (-s)')

    return options

# Everything below is driven by the parsed command line
options = getOptions()

# The three alternative metrics are mutually exclusive; selecting none of
# them falls back to the presence/absence distance
metric = [options.babudist, options.downstreamdist, options.semanticdist]
if metric.count(True) > 1:
    print('Please select only one metric')
    sys.exit(1)

infile = options.GML_FILE
n = nx.read_gml(infile)

# Collect every org listed in the whitespace-separated 'orgs' attribute of
# the network nodes
orgs = set()
for x in n:
    orgs.update(n.node[x]['orgs'].split())

if not (options.babudist or options.downstreamdist or options.semanticdist):
    from scipy.spatial.distance import jaccard

    # Default metric: every org gets a 0/1 profile with one entry per feature
    # (regulators first, then regulatory links); orgs are then compared with
    # the Jaccard distance between their profiles.
    d = {}
    # Labels of the features that entered the profiles
    r = set()

    # Features, part one: presence of each regulator
    for x in n:
        if n.node[x]['kind'] != 'regulator':
            continue
        orgz = set(n.node[x]['orgs'].split())
        for o in orgs:
            d.setdefault(o, []).append(1 if o in orgz else 0)
        r.add(x)

    # Features, part two: presence of each regulatory link
    for a, b in n.edges():
        if n[a][b]['kind'] != 'regulated':
            continue

        edge_tokens = n[a][b]['orgs'].split()
        gene_tokens = n.node[b]['orgs'].split()
        eo = set(edge_tokens)
        reo = set(gene_tokens)

        # The link cannot be found in orgs where its target gene is absent
        if eo > reo:
            raise ValueError('Found a regulator edge with more orgs than the regulated gene (%s --> %s)'%(a, b))

        # Plug & play gene: the link is there wherever the gene is, so it
        # adds nothing beyond the gene presence and is left out
        if len(edge_tokens) == len(gene_tokens) and eo == reo:
            continue

        for o in orgs:
            d.setdefault(o, []).append(1 if o in eo else 0)
        r.add( '%s-%s'%(a,b) )

    # Print the distance matrix (tab-separated, orgs in sorted order)
    ordered = sorted(orgs)
    print('\t'.join( [''] + ordered ))
    for o in ordered:
        print('\t'.join( [str(o)] + [str(jaccard(d[o], d[x])) for x in ordered] ))

elif options.babudist:
    from itertools import combinations

    d = {}
    
    for o in orgs:
        d[o] = {}
        d[o][o] = 0.
    
    for a,b in combinations(orgs, 2):
        d[a] = d.get(a, {})
        d[b] = d.get(b, {})
        
        core = len(
                filter(lambda (x, y):
                      n[x][y]['kind'] == 'regulated' and
                      a in n[x][y]['orgs'].split() and
                      b in n[x][y]['orgs'].split(),
                                                n.edges())
                                                          )
        total = len(
                filter(lambda (x, y):
                      n[x][y]['kind'] == 'regulated' and
                      (a in n[x][y]['orgs'].split() or
                      b in n[x][y]['orgs'].split()),
                                                n.edges())
                                                          )
        
        dist = 1 - (float(core) / float(total))
        
        d[a][b] = dist
        d[b][a] = dist
    
    # Print the distance matrix
    print '\t'.join( [''] + sorted(orgs) )
    for o in sorted(orgs):
        print '\t'.join( [str(o)] + [str(d[o][x]) for x in sorted(orgs)] )

elif options.downstreamdist:
    import numpy as np
    from itertools import combinations
    from regtools.regnet import getDownstream

    d = {}
    
    for o in orgs:
        d[o] = {}
        d[o][o] = 0.
    
    for a,b in combinations(orgs, 2):
        d[a] = d.get(a, {})
        d[b] = d.get(b, {})

    # Analysis has to be done on a regulator basis, since we are not considering
    # the single regulatory links, but rather the overall downstream genes
    downstream = {}
    for x in filter(lambda x: n.node[x]['kind'] == 'regulator', n):
        downstream[x] = downstream.get(x, {})
        for o in orgs:
            # Is the regulator even present in this org?
            if o not in n.node[x]['orgs'].split():
                downstream[x][o] = set([])
                continue
        
            down = getDownstream(n, x, o)
            
            downstream[x][o] = set(down)
    
    # Beware of corner cases
    # 1. No regulator in both orgs
    # 2. No downstream genes in both orgs
    for a,b in combinations(orgs, 2):
        dists = []
        
        for x in filter(lambda x: n.node[x]['kind'] == 'regulator', n):
            core = len( downstream[x][a].intersection(downstream[x][b]) )
            
            total = len( downstream[x][a].union(downstream[x][b]) )
        
            if total == 0:
                dist = 1
            else:
                dist = 1 - (float(core) / float(total))
                
            dists.append(dist)
        
        d[a][b] = np.array(dists).mean()
        d[b][a] = np.array(dists).mean()
        
    # Print the distance matrix
    print '\t'.join( [''] + sorted(orgs) )
    for o in sorted(orgs):
        print '\t'.join( [str(o)] + [str(d[o][x]) for x in sorted(orgs)] )

elif options.semanticdist:
    import os
    import numpy as np
    from itertools import combinations
    from regtools.regnet import getDownstream
    
    semantic = os.listdir(options.semdir)

    d = {}
    
    for o in orgs:
        d[o] = {}
        d[o][o] = 0.
    
    for a,b in combinations(orgs, 2):
        d[a] = d.get(a, {})
        d[a][b] = []
        d[b] = d.get(b, {})
        d[b][a] = []
    
    # Analysis has to be done on a regulator basis, since we are not considering
    # the single regulatory links, but rather the overall downstream genes
    downstream = {}
    for x in filter(lambda x: n.node[x]['kind'] == 'regulator', n):
        downstream[x] = downstream.get(x, {})
        for o in orgs:
            # Is the regulator even present in this org?
            if o not in n.node[x]['orgs'].split():
                downstream[x][o] = set([])
                continue
        
            down = getDownstream(n, x, o)
            
            downstream[x][o] = set([n.node[y]['label'] for y in down])

    for x in filter(lambda x: n.node[x]['kind'] == 'regulator', n):
        # Read the semantic similarity file
        sem = {}
        for l in open(os.path.join(options.semdir,
                                   str(x))):
            s = l.strip().split('\t')
            sem[s[0]] = sem.get(s[1], {})
            sem[s[0]][s[1]] = float(s[2])
            sem[s[1]] = sem.get(s[0], {})
            sem[s[1]][s[0]] = float(s[2])

        for a,b in combinations(orgs, 2):
            ag = downstream[x][a]
            bg = downstream[x][b]

            dist = []
            for i in ag:
                if i not in sem:continue
                for j in bg:
                    if j not in sem:continue
                    dist.append(sem[i][j])

            if len(dist) == 0:
                continue

            d[a][b].append( np.array(dist).mean() )
            d[b][a].append( np.array(dist).mean() )

    # Print the distance matrix
    print '\t'.join( [''] + sorted(orgs) )
    for o in sorted(orgs):
        print '\t'.join( [str(o)] + [str(np.array(d[o][x]).mean()) for x in sorted(orgs)] )