From d1dc5ccfaebcc9a4fade9136428d5e5cf9c761d5 Mon Sep 17 00:00:00 2001 From: Megan Shand Date: Tue, 25 Jun 2024 14:41:03 -0400 Subject: [PATCH 1/2] reducing memory required with merging intervals with first name only --- .../htsjdk/samtools/util/IntervalList.java | 16 ++++++- .../samtools/util/IntervalListTest.java | 44 +++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/src/main/java/htsjdk/samtools/util/IntervalList.java b/src/main/java/htsjdk/samtools/util/IntervalList.java index e1f8735260..1f163f073e 100644 --- a/src/main/java/htsjdk/samtools/util/IntervalList.java +++ b/src/main/java/htsjdk/samtools/util/IntervalList.java @@ -888,20 +888,29 @@ public Interval next() { private Interval getNext() { Interval next; + int start = current == null ? -1 : current.getStart(); while (inputIntervals.hasNext()) { next = inputIntervals.next(); if (current == null) { toBeMerged.add(next); current = new MutableFeature(next); + start = next.getStart(); currentStrandNegative = next.isNegativeStrand(); } else if (current.overlaps(next) || (combineAbuttingIntervals && current.withinDistanceOf(next,1))) { if (enforceSameStrands && currentStrandNegative != next.isNegativeStrand()) { throw new SAMException("Strands were not equal for: " + current.toString() + " and " + next.toString()); } - toBeMerged.add(next); + if (concatenateNames) { + toBeMerged.add(next); + } current.end = Math.max(current.getEnd(), next.getEnd()); } else { // Emit merged/unique interval + if (!concatenateNames) { + if (start!=-1) { + toBeMerged.add(new Interval(current.contig, start, current.getEnd())); + } + } final Interval retVal = merge(toBeMerged, concatenateNames); toBeMerged.clear(); current.setAll(next); @@ -911,6 +920,11 @@ private Interval getNext() { } } // Emit merged/unique interval + if (!concatenateNames) { + if (start!=-1) { + toBeMerged.add(new Interval(current.contig, start, current.getEnd())); + } + } final Interval retVal = merge(toBeMerged, concatenateNames); toBeMerged.clear(); current = null; diff --git a/src/test/java/htsjdk/samtools/util/IntervalListTest.java b/src/test/java/htsjdk/samtools/util/IntervalListTest.java index 7010773599..a3bc82b342 100644 --- a/src/test/java/htsjdk/samtools/util/IntervalListTest.java +++ b/src/test/java/htsjdk/samtools/util/IntervalListTest.java @@ -702,4 +702,48 @@ public static Object[][] brokenFiles() { public void testBreaks(final Path brokenIntervalFile){ IntervalList.fromPath(brokenIntervalFile); } + + @Test + public void testLargeIteratorMerge() { + final IntervalList intervals = new IntervalList(this.fileHeader); + intervals.add(new Interval("1", 1, 2, false, "foo")); + for (int i = 2; i < 100000; i++) { + intervals.add(new Interval("1", i, i + 1, false, "bar")); + } + final Interval merged = new IntervalList.IntervalMergerIterator(intervals.iterator(), true, false, false).next(); + Assert.assertEquals(merged, new Interval("1", 1, 100000, false, "foo")); + } + + @DataProvider + public static Object[][] lessMemForMergeWithNoNames() { + String contig = "1"; + Interval interval1 = new Interval(contig, 1, 100); + Interval interval2 = new Interval(contig, 101, 200); + Interval interval3 = new Interval(contig, 301, 400); + Interval overlapInterval = new Interval(contig, 350, 450); + Interval interval4 = new Interval(contig, 401, 500); + Interval combined1 = new Interval(contig, 1, 200); + Interval combined2 = new Interval(contig, 301, 500); + return new Object[][]{ + {Arrays.asList(interval1), Arrays.asList(interval1)}, + {Arrays.asList(interval1, interval2), Arrays.asList(combined1)}, + {Arrays.asList(interval1, interval2, interval3), Arrays.asList(combined1, interval3)}, + {Arrays.asList(interval1, interval2, interval3, interval4), Arrays.asList(combined1, combined2)}, + {Arrays.asList(interval1, interval2, interval3, overlapInterval, interval4), Arrays.asList(combined1, combined2)}, + }; + } + + @Test(dataProvider = "lessMemForMergeWithNoNames") + public void testLessMemForMergeWithNoNames(final List intervals, final List expected) { + final IntervalList intervalList = new IntervalList(this.fileHeader); + intervalList.addall(intervals); + + final IntervalList.IntervalMergerIterator firstNameMergerIterator = new IntervalList.IntervalMergerIterator(intervals.iterator(), true, false, false); + Collection firstNameMerged = CollectionUtil.makeCollection(firstNameMergerIterator); + Assert.assertEquals(firstNameMerged, expected); + + final IntervalList.IntervalMergerIterator concatNameMergerIterator = new IntervalList.IntervalMergerIterator(intervals.iterator(), true, false, true); + Collection concatNameMerged = CollectionUtil.makeCollection(concatNameMergerIterator); + Assert.assertEquals(concatNameMerged, expected); + } } From 793ca5612a0bd0029a030527d34ae1186bd6c291 Mon Sep 17 00:00:00 2001 From: Megan Shand Date: Tue, 2 Jul 2024 15:14:59 -0400 Subject: [PATCH 2/2] cleaner and better tests --- .../htsjdk/samtools/util/IntervalList.java | 29 +++++------ .../samtools/util/IntervalListTest.java | 49 +++++++++++++------ 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/src/main/java/htsjdk/samtools/util/IntervalList.java b/src/main/java/htsjdk/samtools/util/IntervalList.java index 1f163f073e..24fb8c4230 100644 --- a/src/main/java/htsjdk/samtools/util/IntervalList.java +++ b/src/main/java/htsjdk/samtools/util/IntervalList.java @@ -864,6 +864,7 @@ public static class IntervalMergerIterator implements Iterator { MutableFeature current = null; boolean currentStrandNegative = false; + String currentFirstName = null; public IntervalMergerIterator(Iterator intervals, final boolean combineAbuttingIntervals, final boolean enforceSameStrand, final boolean concatenateNames) { this.inputIntervals = intervals; @@ -888,14 +889,15 @@ public Interval next() { private Interval getNext() { Interval next; - int start = current == null ? -1 : current.getStart(); while (inputIntervals.hasNext()) { next = inputIntervals.next(); if (current == null) { - toBeMerged.add(next); + if (concatenateNames) { + toBeMerged.add(next); + } current = new MutableFeature(next); - start = next.getStart(); currentStrandNegative = next.isNegativeStrand(); + currentFirstName = next.getName(); } else if (current.overlaps(next) || (combineAbuttingIntervals && current.withinDistanceOf(next,1))) { if (enforceSameStrands && currentStrandNegative != next.isNegativeStrand()) { throw new SAMException("Strands were not equal for: " + current.toString() + " and " + next.toString()); @@ -906,26 +908,21 @@ private Interval getNext() { current.end = Math.max(current.getEnd(), next.getEnd()); } else { // Emit merged/unique interval - if (!concatenateNames) { - if (start!=-1) { - toBeMerged.add(new Interval(current.contig, start, current.getEnd())); - } - } - final Interval retVal = merge(toBeMerged, concatenateNames); + final Interval retVal = concatenateNames ? merge(toBeMerged, concatenateNames) : + new Interval(current.getContig(), current.getStart(), current.getEnd(), currentStrandNegative, currentFirstName); toBeMerged.clear(); current.setAll(next); currentStrandNegative = next.isNegativeStrand(); - toBeMerged.add(next); + currentFirstName = next.getName(); + if (concatenateNames) { + toBeMerged.add(next); + } return retVal; } } // Emit merged/unique interval - if (!concatenateNames) { - if (start!=-1) { - toBeMerged.add(new Interval(current.contig, start, current.getEnd())); - } - } - final Interval retVal = merge(toBeMerged, concatenateNames); + final Interval retVal = concatenateNames ? merge(toBeMerged, concatenateNames) : + new Interval(current.getContig(), current.getStart(), current.getEnd(), currentStrandNegative, currentFirstName); toBeMerged.clear(); current = null; return retVal; diff --git a/src/test/java/htsjdk/samtools/util/IntervalListTest.java b/src/test/java/htsjdk/samtools/util/IntervalListTest.java index a3bc82b342..924c6c6feb 100644 --- a/src/test/java/htsjdk/samtools/util/IntervalListTest.java +++ b/src/test/java/htsjdk/samtools/util/IntervalListTest.java @@ -711,39 +711,56 @@ public void testLargeIteratorMerge() { intervals.add(new Interval("1", i, i + 1, false, "bar")); } final Interval merged = new IntervalList.IntervalMergerIterator(intervals.iterator(), true, false, false).next(); - Assert.assertEquals(merged, new Interval("1", 1, 100000, false, "foo")); + Assert.assertEquals(merged, new Interval("1", 1, 100000)); + Assert.assertEquals(merged.getName(), "foo"); } @DataProvider public static Object[][] lessMemForMergeWithNoNames() { String contig = "1"; - Interval interval1 = new Interval(contig, 1, 100); - Interval interval2 = new Interval(contig, 101, 200); - Interval interval3 = new Interval(contig, 301, 400); - Interval overlapInterval = new Interval(contig, 350, 450); - Interval interval4 = new Interval(contig, 401, 500); - Interval combined1 = new Interval(contig, 1, 200); - Interval combined2 = new Interval(contig, 301, 500); + Interval interval1 = new Interval(contig, 1, 100, false, "foo"); + Interval interval2 = new Interval(contig, 101, 200, false, "bar"); + Interval interval3 = new Interval(contig, 301, 400, false, "baz"); + Interval overlapInterval = new Interval(contig, 350, 450, false, "overlap"); + Interval interval4 = new Interval(contig, 401, 500, false, "qux"); + Interval combined1NoConcat = new Interval(contig, 1, 200, false, "foo"); + Interval combined2NoConcat = new Interval(contig, 301, 500, false, "baz"); + Interval combined1WithConcat = new Interval(contig, 1, 200, false, "foo|bar"); + Interval combined2WithConcat = new Interval(contig, 301, 500, false, "baz|qux"); + Interval combined2WithConcatAndOverlap = new Interval(contig, 301, 500, false, "baz|overlap|qux"); return new Object[][]{ - {Arrays.asList(interval1), Arrays.asList(interval1)}, - {Arrays.asList(interval1, interval2), Arrays.asList(combined1)}, - {Arrays.asList(interval1, interval2, interval3), Arrays.asList(combined1, interval3)}, - {Arrays.asList(interval1, interval2, interval3, interval4), Arrays.asList(combined1, combined2)}, - {Arrays.asList(interval1, interval2, interval3, overlapInterval, interval4), Arrays.asList(combined1, combined2)}, + {Collections.emptyList(), Collections.emptyList(), Collections.emptyList()}, + {Arrays.asList(interval1), Arrays.asList(interval1), Arrays.asList(interval1)}, + {Arrays.asList(interval1, interval2), Arrays.asList(combined1NoConcat), Arrays.asList(combined1WithConcat)}, + {Arrays.asList(interval1, interval2, interval3), Arrays.asList(combined1NoConcat, interval3), Arrays.asList(combined1WithConcat, interval3)}, + {Arrays.asList(interval1, interval2, interval3, interval4), Arrays.asList(combined1NoConcat, combined2NoConcat), Arrays.asList(combined1WithConcat, combined2WithConcat)}, + {Arrays.asList(interval1, interval2, interval3, overlapInterval, interval4), Arrays.asList(combined1NoConcat, combined2NoConcat), Arrays.asList(combined1WithConcat, combined2WithConcatAndOverlap)} }; } @Test(dataProvider = "lessMemForMergeWithNoNames") - public void testLessMemForMergeWithNoNames(final List intervals, final List expected) { + public void testLessMemForMergeWithNoNames(final List intervals, final List expectedNoConcat, final List expectedWithConcat) { final IntervalList intervalList = new IntervalList(this.fileHeader); intervalList.addall(intervals); final IntervalList.IntervalMergerIterator firstNameMergerIterator = new IntervalList.IntervalMergerIterator(intervals.iterator(), true, false, false); Collection firstNameMerged = CollectionUtil.makeCollection(firstNameMergerIterator); - Assert.assertEquals(firstNameMerged, expected); + Assert.assertEquals(firstNameMerged, expectedNoConcat); + List firstNameMergedList = new ArrayList<>(firstNameMerged); + for(int i=0; i concatNameMerged = CollectionUtil.makeCollection(concatNameMergerIterator); - Assert.assertEquals(concatNameMerged, expected); + Assert.assertEquals(concatNameMerged, expectedWithConcat); + List allNamesMergedList = new ArrayList<>(concatNameMerged); + for(int i=0; i