Merge pull request #3 from stjude/madetunj

converted from python2 to python3
stjude · Jan 10, 2020 · cb4ce74 · cb4ce74
2 parents c14e6a4 + 49cef09
commit cb4ce74
Show file tree

Hide file tree

Showing 6 changed files with 118 additions and 120 deletions.
diff --git a/README.md b/README.md
@@ -36,7 +36,7 @@ CLONED using SOURCETREE from: https://bitbucket.org/young_computation/rose/src/m
 	* samtools
 	* R version > 3.4
 	* bedtools > 2
-	* python2
+	* python3
 
 3) USAGE
 

diff --git a/bin/ROSE_bamToGFF.py b/bin/ROSE_bamToGFF.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 #bamToGFF.py
 
 #script to grab reads from a bam that align to a .gff file
@@ -12,7 +12,7 @@
 
 import os
 
-from string import join,upper,maketrans
+import string
 
 
 
@@ -41,16 +41,16 @@ def mapBamToGFF(bamFile,gff,sense = 'both',extension = 200,floor = 0,rpm = False
     else:
         MMR = 1
 
-    print('using a MMR value of %s' % (MMR))
+    print(('using a MMR value of %s' % (MMR)))
 
-    senseTrans = maketrans('-+.','+-+')
+    #senseTrans = maketrans('-+.','+-+') #deprecated
 
     if ROSE_utils.checkChrStatus(bamFile) == 1:
-      print "has chr"
+      print("has chr")
       hasChrFlag = 1
       #sys.exit();
     else:
-      print "does not have chr"
+      print("does not have chr")
       hasChrFlag = 0
       #sys.exit()
 
@@ -67,10 +67,10 @@ def mapBamToGFF(bamFile,gff,sense = 'both',extension = 200,floor = 0,rpm = False
     for line in gff:
         line = line[0:9]
         if ticker%100 == 0:
-            print ticker
+            print(ticker)
         ticker+=1
         if not hasChrFlag:
-	  line[0] = re.sub(r"chr",r"",line[0])
+            line[0] = re.sub(r"chr",r"",line[0])
         gffLocus = ROSE_utils.Locus(line[0],int(line[3]),int(line[4]),line[6],line[1])
         #print line[0]
         #sys.exit()
@@ -86,11 +86,11 @@ def mapBamToGFF(bamFile,gff,sense = 'both',extension = 200,floor = 0,rpm = False
                 locus = ROSE_utils.Locus(locus.chr(),locus.start()-extension,locus.end(),locus.sense(),locus.ID())
             extendedReads.append(locus)
         if gffLocus.sense() == '+' or gffLocus.sense == '.':
-            senseReads = filter(lambda x:x.sense() == '+' or x.sense() == '.',extendedReads)
-            antiReads = filter(lambda x:x.sense() == '-',extendedReads)
+            senseReads = [x for x in extendedReads if x.sense() == '+' or x.sense() == '.']
+            antiReads = [x for x in extendedReads if x.sense() == '-']
         else:
-            senseReads = filter(lambda x:x.sense() == '-' or x.sense() == '.',extendedReads)
-            antiReads = filter(lambda x:x.sense() == '+',extendedReads)
+            senseReads = [x for x in extendedReads if x.sense() == '-' or x.sense() == '.']
+            antiReads = [x for x in extendedReads if x.sense() == '+']
 
         senseHash = defaultdict(int)
         antiHash = defaultdict(int)
@@ -107,12 +107,12 @@ def mapBamToGFF(bamFile,gff,sense = 'both',extension = 200,floor = 0,rpm = False
                     antiHash[x]+=1
 
         #now apply flooring and filtering for coordinates
-        keys = ROSE_utils.uniquify(senseHash.keys()+antiHash.keys())
+        keys = ROSE_utils.uniquify(list(senseHash.keys())+list(antiHash.keys()))
         if floor > 0:
 
-            keys = filter(lambda x: (senseHash[x]+antiHash[x]) > floor,keys)
+            keys = [x for x in keys if (senseHash[x]+antiHash[x]) > floor]
         #coordinate filtering
-        keys = filter(lambda x: gffLocus.start() < x < gffLocus.end(),keys)
+        keys = [x for x in keys if gffLocus.start() < x < gffLocus.end()]
 
 
         #setting up the output table
@@ -131,15 +131,15 @@ def mapBamToGFF(bamFile,gff,sense = 'both',extension = 200,floor = 0,rpm = False
 
             while n <nBins:
                 n+=1
-                binKeys = filter(lambda x: i < x < i+binSize,keys)
+                binKeys = [x for x in keys if i < x < i+binSize]
                 binDen = float(sum([senseHash[x]+antiHash[x] for x in binKeys]))/binSize
                 clusterLine+=[round(binDen/MMR,4)]
                 i = i+binSize
         else:
             i = gffLocus.end()
             while n < nBins:
                 n+=1
-                binKeys = filter(lambda x: i-binSize < x < i,keys)
+                binKeys = [x for x in keys if i-binSize < x < i]
                 binDen = float(sum([senseHash[x]+antiHash[x] for x in binKeys]))/binSize
                 clusterLine+=[round(binDen/MMR,4)]
                 i = i-binSize
@@ -192,7 +192,7 @@ def main():
         bamFile = options.bam
         fullPath = os.path.abspath(bamFile)
         bamName = fullPath.split('/')[-1].split('.')[0]
-        pathFolder = join(fullPath.split('/')[0:-1],'/')
+        pathFolder = '/'.join(fullPath.split('/')[0:-1])
         fileList = os.listdir(pathFolder)
         hasBai = False
         for fileName in fileList:

diff --git a/bin/ROSE_callSuper.R b/bin/ROSE_callSuper.R
@@ -141,8 +141,8 @@ if(wceName == 'NONE'){
 }else{
 	plot(length(rankBy_vector):1,rankBy_vector[signalOrder], col='red',xlab=paste(rankBy_factor,'_enhancers'),ylab=paste(rankBy_factor,' Signal','- ',wceName),pch=19,cex=2)
 }
-abline(h=cutoff_options$absolute,color='grey',lty=2)
-abline(v=length(rankBy_vector)-length(superEnhancerRows),color='grey',lty=2)
+abline(h=cutoff_options$absolute,col='grey',lty=2)
+abline(v=length(rankBy_vector)-length(superEnhancerRows),col='grey',lty=2)
 lines(length(rankBy_vector):1,rankBy_vector[signalOrder],lwd=4, col='red')
 text(0,0.8*max(rankBy_vector),paste(' Cutoff used: ',cutoff_options$absolute,'\n','Super-Enhancers identified: ',length(superEnhancerRows)),pos=4)
 

diff --git a/bin/ROSE_geneMapper.py b/bin/ROSE_geneMapper.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 #130428
 
 #ROSE_geneMapper.py
@@ -10,14 +10,11 @@
 
 import sys
 
-
-
 import ROSE_utils
 
-
 import os
 
-from string import upper,join
+import string
 
 from collections import defaultdict
 
@@ -33,9 +30,9 @@ def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True
     '''
     maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
     '''
-    print "Herp"
+    print("Herp")
     startDict = ROSE_utils.makeStartDict(annotFile)
-    print "Derp"
+    print("Derp")
     enhancerTable = ROSE_utils.parseTable(enhancerFile,'\t')
 
 
@@ -46,7 +43,7 @@ def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True
         transcribedTable = ROSE_utils.parseTable(transcribedFile,'\t')
         transcribedGenes = [line[1] for line in transcribedTable]
     else:
-        transcribedGenes = startDict.keys()
+        transcribedGenes = list(startDict.keys())
 
     print('MAKING TRANSCRIPT COLLECTION')
     transcribedCollection = ROSE_utils.makeTranscriptCollection(annotFile,0,0,500,transcribedGenes)
@@ -132,24 +129,24 @@ def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True
             #print distList.index(min(distList))
             #print min(distList)
             #print len(distList)
-	    #print len(allEnhancerGenes[distList.index(min(distList))])
-	    #print line
-	    #print len(startDict[allEnhancerGenes[distList.index(min(distList))]])
+	        #print len(allEnhancerGenes[distList.index(min(distList))])
+	        #print line
+	        #print len(startDict[allEnhancerGenes[distList.index(min(distList))]])
             closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']
 
         #NOW WRITE THE ROW FOR THE ENHANCER TABLE
         newEnhancerLine = line[0:6]
         if byRefseq:
-            newEnhancerLine.append(join(ROSE_utils.uniquify([x for x in overlappingGenes]),','))
-            newEnhancerLine.append(join(ROSE_utils.uniquify([x for x in proximalGenes]),','))
+            newEnhancerLine.append(','.join(ROSE_utils.uniquify([x for x in overlappingGenes])))
+            newEnhancerLine.append(','.join(ROSE_utils.uniquify([x for x in proximalGenes])))
             #print newEnhancerLine
             #print len(allEnhancerGenes)
             #print distList
             closestGene = allEnhancerGenes[distList.index(min(distList))]
             newEnhancerLine.append(closestGene)
         else:
-            newEnhancerLine.append(join(ROSE_utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),','))
-            newEnhancerLine.append(join(ROSE_utils.uniquify([startDict[x]['name'] for x in proximalGenes]),','))
+            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in overlappingGenes])))
+            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in proximalGenes])))
             closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']
             newEnhancerLine.append(closestGene)
 
@@ -187,7 +184,7 @@ def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True
         proxEnhancers = geneDict['proximal'][refID] + geneDict['overlapping'][refID]
 
 
-        newLine = [geneName,refID,join(proxEnhancers,',')]
+        newLine = [geneName,refID,','.join(proxEnhancers)]
         geneToEnhancerTable.append(newLine)
 
     #re-sort enhancerToGeneTable
@@ -245,12 +242,12 @@ def main():
     if options.out:
         outFolder = ROSE_utils.formatFolder(options.out,True)
     else:
-        outFolder = join(enhancerFile.split('/')[0:-1],'/') + '/'
+        outFolder = '/'.join(enhancerFile.split('/')[0:-1]) + '/'
 
 
     #GETTING THE GENOME
     genome = options.genome
-    print('USING %s AS THE GENOME' % genome)
+    print(('USING %s AS THE GENOME' % genome))
 
 
     #GETTING THE CORRECT ANNOT FILE
@@ -263,7 +260,7 @@ def main():
         'MM10':'%s/annotation/mm10_refseq.ucsc' % (cwd),
         }
 
-    annotFile = genomeDict[upper(genome)]
+    annotFile = genomeDict[genome.upper()]
 
     #GETTING THE TRANSCRIBED LIST
     if options.geneList:

diff --git a/bin/ROSE_main.py b/bin/ROSE_main.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 
 #mapEnhancerFromFactor.py
 '''
@@ -19,7 +19,7 @@
 
 import os
 
-from string import upper,join
+import string
 
 from collections import defaultdict
 
@@ -46,7 +46,7 @@ def regionStitching(inputGFF,stitchWindow,tssWindow,annotFile,removeTSS=True):
         #this loop makes a locus centered around +/- tssWindow of transcribed genes
         #then adds it to the list tssLoci
         tssLoci = []
-        for geneID in startDict.keys():
+        for geneID in list(startDict.keys()):
             tssLoci.append(ROSE_utils.makeTSSLocus(geneID,startDict,tssWindow,tssWindow))
 
 
@@ -67,7 +67,7 @@ def regionStitching(inputGFF,stitchWindow,tssWindow,annotFile,removeTSS=True):
                 boundCollection.remove(locus)
                 debugOutput.append([locus.__str__(),locus.ID(),'CONTAINED'])
                 removeTicker+=1
-        print('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' % (removeTicker))
+        print(('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' % (removeTicker)))
 
     #boundCollection is now all enriched region loci that don't overlap an active TSS
     stitchedCollection = boundCollection.stitchCollection(stitchWindow,'both')
@@ -77,7 +77,7 @@ def regionStitching(inputGFF,stitchWindow,tssWindow,annotFile,removeTSS=True):
         #with the original loci that were there
         fixedLoci = []
         tssLoci = []
-        for geneID in startDict.keys():
+        for geneID in list(startDict.keys()):
             tssLoci.append(ROSE_utils.makeTSSLocus(geneID,startDict,50,50))
 
 
@@ -101,8 +101,8 @@ def regionStitching(inputGFF,stitchWindow,tssWindow,annotFile,removeTSS=True):
             else:
                 fixedLoci.append(stitchedLocus)
 
-        print('REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' % (removeTicker))
-        print('ADDED BACK %s ORIGINAL LOCI' % (originalTicker))
+        print(('REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' % (removeTicker)))
+        print(('ADDED BACK %s ORIGINAL LOCI' % (originalTicker)))
         fixedCollection = ROSE_utils.LocusCollection(fixedLoci,50)
         return fixedCollection,debugOutput
     else:
@@ -163,16 +163,16 @@ def mapCollection(stitchedCollection,referenceCollection,bamFileList,mappedFolde
 
         bamFileName = bamFile.split('/')[-1]
 
-        print('GETTING MAPPING DATA FOR  %s' % bamFile)
+        print(('GETTING MAPPING DATA FOR  %s' % bamFile))
         #assumes standard convention for naming enriched region gffs
 
         #opening up the mapped GFF
-        print('OPENING %s%s_%s_MAPPED.gff' % (mappedFolder,refName,bamFileName))
+        print(('OPENING %s%s_%s_MAPPED.gff' % (mappedFolder,refName,bamFileName)))
 
         mappedGFF =ROSE_utils.parseTable('%s%s_%s_MAPPED.gff' % (mappedFolder,refName,bamFileName),'\t')        
 
         signalDict = defaultdict(float)
-        print('MAKING SIGNAL DICT FOR %s' % (bamFile))
+        print(('MAKING SIGNAL DICT FOR %s' % (bamFile)))
         mappedLoci = []
         for line in mappedGFF[1:]:
 
@@ -268,13 +268,13 @@ def main():
         ROSE_utils.bedToGFF(options.input,inputGFFFile)
     elif options.input.split('.')[-1] =='gff':
         #COPY THE INPUT GFF TO THE GFF FOLDER
-	inputGFFFile = options.input
+        inputGFFFile = options.input
         os.system('cp %s %s' % (inputGFFFile,gffFolder))        
 
     else:
         print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
         #COPY THE INPUT GFF TO THE GFF FOLDER
-	inputGFFFile = options.input
+        inputGFFFile = options.input
         os.system('cp %s %s' % (inputGFFFile,gffFolder))        
 
 
@@ -301,13 +301,13 @@ def main():
         removeTSS = False
 
     #GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
-    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
+    print(('USING %s AS THE INPUT GFF' % (inputGFFFile)))
     inputName = inputGFFFile.split('/')[-1].split('.')[0]
 
 
     #GETTING THE GENOME
     genome = options.genome
-    print('USING %s AS THE GENOME' % genome)
+    print(('USING %s AS THE GENOME' % genome))
 
 
     #GETTING THE CORRECT ANNOT FILE
@@ -320,7 +320,7 @@ def main():
         'MM10':'%s/annotation/mm10_refseq.ucsc' % (cwd),
         }
 
-    annotFile = genomeDict[upper(genome)]
+    annotFile = genomeDict[genome.upper()]
 
     #MAKING THE START DICT
     print('MAKING START DICT')
@@ -354,19 +354,19 @@ def main():
     #WRITING DEBUG OUTPUT TO DISK
 
     if debug:
-        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
+        print(('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile)))
         ROSE_utils.unParseTable(debugOutput,debugOutFile,'\t')
 
     #WRITE THE GFF TO DISK
-    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
+    print(('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile)))
     ROSE_utils.unParseTable(stitchedGFF,stitchedGFFFile,'\t')
 
 
 
     #SETTING UP THE OVERALL OUTPUT FILE
     outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
 
-    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))
+    print(('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1)))
 
     #MAPPING TO THE NON STITCHED (ORIGINAL GFF)
     #MAPPING TO THE STITCHED GFF
@@ -413,7 +413,7 @@ def main():
         '''
         outputDone = True
         if ticker%6 == 0:
-            print(ticker*5)
+            print((ticker*5))
         ticker +=1
         #CHANGE THIS PARAMETER TO ALLOW MORE TIME TO MAP
         if ticker == 144:
@@ -442,7 +442,7 @@ def main():
         if outputDone == True:
             break
         time.sleep(300)
-    print('MAPPING TOOK %s MINUTES' % (ticker*5))
+    print(('MAPPING TOOK %s MINUTES' % (ticker*5)))
 
     print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
     #CALCULATE DENSITY BY REGION