Merge pull request #32 from yhoogstrate/refactor

Refactored code
yhoogstrate · Apr 2, 2016 · 6d6a030 · 6d6a030
2 parents e93667e + 9f3ce57
commit 6d6a030
Show file tree

Hide file tree

Showing 70 changed files with 3,344 additions and 164 deletions.
diff --git a/.gitignore b/.gitignore
@@ -39,4 +39,6 @@ nosetests.xml
 .pydevproject
 
 # test output files
-test_Functional.*
+#/test_*
+#/tests/bak/
+output_fuma.txt
diff --git a/Changelog b/Changelog
@@ -1,3 +1,51 @@
+2016-04-01  Youri Hoogstrate
+
+	* Version 3.0.0: The core has been rewritten because it needed to use much
+	  less memory for a large number of datasets. Initially the code created
+	  sub datasets, because it was expected to export them time-wise and it was
+	  very handy for running unit tests and for creating the summary output.
+	  This resulted in a very high memory consumption for a large number of
+	  experiments (not with respect to the number of total Fusion genes).
+	  The rewritten code consumes memory in relation to the total number of
+	  Fusion objects. However, for the summary output we still use the legacy
+	  code and for the list output we make use of the new code.
+
+	  FuMa now starts with a n*n (num Fusion objects in all experiments)
+	  triangular matrix in which it compares all fusions with any other fusion
+	  gene. If they are considered identical, a MergedFusion object will be
+	  stored for the next iteration. Otherwise, at the end of the iteration,
+	  all non matched fusion genes will be exported to file.
+
+	  For the remaining MergedFusion genes, FuMa will create an m (number of
+	  MergedFusion objects) * n square matrix and check whether the Fusion
+	  genes match the MergedFusion genes. Again, if they are identical,
+	  they will be kept for the next iteration (these MergedFusion objects
+	  will contain 3, 4 or more original Fusion objects each) and those that
+	  are not being matched will be exported to file. For those that will be
+	  kept for the next iteration, 'duplicates' will be removed. If no matched
+	  objects remain, FuMa is finished.
+
+	  Because of this update, for analysis with a low number of samples and a
+	  high number of fusion genes, FuMa may have become (quite) a bit slower. 
+	  However, we believe the cost of some extra running time is far more
+	  acceptable than the exponential memory requirements.
+
+
+		Important:
+		We have also found and resolved a small bug. In older versions of FuMa,
+		indexing was chromosome-name based. Therefore two fusion genes were
+		only matched when they were annotated on the same chr name. If you
+		would have a fusion gene A-B (both on chr1) and fusion A-B (both on
+		chr2), the old versions would consider these distinct whereas the new
+		version of FuMa considers these identical.
+
+		Important 2:
+		We have found another minor bug. In rare situations where no fusion
+		gene was matched, the original fusion genes were not reported, so
+		that the number of input files did not equal the number of
+		output files (test_OverlapComplex 08_b and 09_05 and many in test 10).
+		This bug has been resolved in v3.
+
 2016-03-16  Youri Hoogstrate
 
 	* Version 2.12.3: Bugfix.

diff --git a/bin/fuma b/bin/fuma
@@ -27,6 +27,7 @@ import fuma
 
 from fuma.ParseBED import ParseBED
 from fuma.OverlapComplex import OverlapComplex
+from fuma.ComparisonTriangle import ComparisonTriangle
 
 from fuma.Readers import *
 
@@ -207,18 +208,25 @@ if __name__ == "__main__":
 			samples[sample_name].annotate_genes(gene_annotations[reference_name])
 			samples[sample_name].remove_duplicates(args)
 
-	o = OverlapComplex()
-
-	for sample_name in sample_names:
-		o.add_experiment(samples[sample_name])
-
 	if(args.format == "summary"):
+		o = OverlapComplex()
+
+		for sample_name in sample_names:
+			o.add_experiment(samples[sample_name])
+
 		o.overlay_fusions(True,False,args)
 		o.export_summary(args.output)
 	else:
-		if(args.output == "-"):
-			o.overlay_fusions(False,sys.stdout,args)# Exports content of the datasets
-		else:
-			fh = open(args.output,"w")
-			o.overlay_fusions(True,fh,args)# Exports content of the datasets << check if sparse can be enabled?
-			fh.close()
+		o = ComparisonTriangle(args)
+
+		for sample_name in sample_names:
+			o.add_experiment(samples[sample_name])
+
+		o.overlay_fusions()
+
+		#if(args.output == "-"):
+		#	o.overlay_fusions(False,sys.stdout,args)# Exports content of the datasets
+		#else:
+		#	fh = open(args.output,"w")
+		#	o.overlay_fusions(True,fh,args)# Exports content of the datasets << check if sparse can be enabled?
+		#	fh.close()
diff --git a/fuma/CLI.py b/fuma/CLI.py
@@ -102,7 +102,7 @@ def CLI(argv=None):
 
 	parser.add_argument("-g","--long-gene-size",default=200000,type=int,help="Gene-name based matching is more sensitive to long genes. This is the gene size used to mark fusion genes spanning a 'long gene' as reported the output. Use 0 to disable this feature.")
 
-	parser.add_argument("-o","--output",help="output filename; '-' for stdout",default="overlap/")
+	parser.add_argument("-o","--output",help="output filename; '-' for stdout",default="output_fuma.txt")
 
 	if(argv == None):
 		return parser.parse_args()