Skip to content

Commit

Permalink
reducing memory required in IntervalMergerIterator when not concattin…
Browse files Browse the repository at this point in the history
…g names (#1711)

* Reduce the memory required when merging intervals while keeping only the first name.
  This works around an issue when using a large interval list that completely covers the genome (like a GVCF), where you would end up storing the entire, potentially very large, list in memory in order to compute an inordinately long name.
* cleaner and better tests
  • Loading branch information
meganshand authored Jul 3, 2024
1 parent 127f3de commit 9e89d3c
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 5 deletions.
21 changes: 16 additions & 5 deletions src/main/java/htsjdk/samtools/util/IntervalList.java
Original file line number Diff line number Diff line change
Expand Up @@ -864,6 +864,7 @@ public static class IntervalMergerIterator implements Iterator<Interval> {

MutableFeature current = null;
boolean currentStrandNegative = false;
String currentFirstName = null;

public IntervalMergerIterator(Iterator<Interval> intervals, final boolean combineAbuttingIntervals, final boolean enforceSameStrand, final boolean concatenateNames) {
this.inputIntervals = intervals;
Expand Down Expand Up @@ -891,27 +892,37 @@ private Interval getNext() {
while (inputIntervals.hasNext()) {
next = inputIntervals.next();
if (current == null) {
toBeMerged.add(next);
if (concatenateNames) {
toBeMerged.add(next);
}
current = new MutableFeature(next);
currentStrandNegative = next.isNegativeStrand();
currentFirstName = next.getName();
} else if (current.overlaps(next) || (combineAbuttingIntervals && current.withinDistanceOf(next,1))) {
if (enforceSameStrands && currentStrandNegative != next.isNegativeStrand()) {
throw new SAMException("Strands were not equal for: " + current.toString() + " and " + next.toString());
}
toBeMerged.add(next);
if (concatenateNames) {
toBeMerged.add(next);
}
current.end = Math.max(current.getEnd(), next.getEnd());
} else {
// Emit merged/unique interval
final Interval retVal = merge(toBeMerged, concatenateNames);
final Interval retVal = concatenateNames ? merge(toBeMerged, concatenateNames) :
new Interval(current.getContig(), current.getStart(), current.getEnd(), currentStrandNegative, currentFirstName);
toBeMerged.clear();
current.setAll(next);
currentStrandNegative = next.isNegativeStrand();
toBeMerged.add(next);
currentFirstName = next.getName();
if (concatenateNames) {
toBeMerged.add(next);
}
return retVal;
}
}
// Emit merged/unique interval
final Interval retVal = merge(toBeMerged, concatenateNames);
final Interval retVal = concatenateNames ? merge(toBeMerged, concatenateNames) :
new Interval(current.getContig(), current.getStart(), current.getEnd(), currentStrandNegative, currentFirstName);
toBeMerged.clear();
current = null;
return retVal;
Expand Down
61 changes: 61 additions & 0 deletions src/test/java/htsjdk/samtools/util/IntervalListTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -702,4 +702,65 @@ public static Object[][] brokenFiles() {
public void testBreaks(final Path brokenIntervalFile){
// Loads the supplied interval file through IntervalList.fromPath; the returned list is discarded.
// NOTE(review): this method's annotation lies outside the visible hunk -- presumably it wires in the
// brokenFiles() data provider and declares the exception expected from the load; confirm there.
IntervalList.fromPath(brokenIntervalFile);
}

/**
 * Feeds a long run of consecutive intervals through an {@code IntervalMergerIterator} with name
 * concatenation switched off, and checks that the first merged interval spans the whole run and
 * carries the name of the first input interval.
 */
@Test
public void testLargeIteratorMerge() {
    final IntervalList chained = new IntervalList(this.fileHeader);

    // The leading interval supplies the name expected on the merged result.
    chained.add(new Interval("1", 1, 2, false, "foo"));

    // Followed by [2,3], [3,4], ..., [99999,100000], every one of them named "bar".
    int start = 2;
    while (start < 100000) {
        chained.add(new Interval("1", start, start + 1, false, "bar"));
        start++;
    }

    // Arguments: combineAbuttingIntervals = true, enforceSameStrand = false, concatenateNames = false.
    final IntervalList.IntervalMergerIterator merger =
            new IntervalList.IntervalMergerIterator(chained.iterator(), true, false, false);
    final Interval merged = merger.next();

    Assert.assertEquals(merged, new Interval("1", 1, 100000));
    // The name is asserted on its own.
    // NOTE(review): presumably Interval equality does not take the name into account -- confirm.
    Assert.assertEquals(merged.getName(), "foo");
}

/**
 * Supplies the cases for {@code testLessMemForMergeWithNoNames}. Each row holds, in order: the input
 * intervals, the merged intervals expected when names are not concatenated, and the merged intervals
 * expected when names are concatenated.
 */
@DataProvider
public static Object[][] lessMemForMergeWithNoNames() {
    final String chr = "1";

    // Input intervals.
    final Interval foo = new Interval(chr, 1, 100, false, "foo");
    final Interval bar = new Interval(chr, 101, 200, false, "bar");
    final Interval baz = new Interval(chr, 301, 400, false, "baz");
    final Interval overlap = new Interval(chr, 350, 450, false, "overlap");
    final Interval qux = new Interval(chr, 401, 500, false, "qux");

    // Expected merge results when only the first name is kept.
    final Interval fooBarFirstName = new Interval(chr, 1, 200, false, "foo");
    final Interval bazQuxFirstName = new Interval(chr, 301, 500, false, "baz");

    // Expected merge results when every contributing name is joined with '|'.
    final Interval fooBarAllNames = new Interval(chr, 1, 200, false, "foo|bar");
    final Interval bazQuxAllNames = new Interval(chr, 301, 500, false, "baz|qux");
    final Interval bazOverlapQuxAllNames = new Interval(chr, 301, 500, false, "baz|overlap|qux");

    return new Object[][]{
            // no input at all
            {Collections.emptyList(), Collections.emptyList(), Collections.emptyList()},
            // a single interval passes through unchanged
            {Arrays.asList(foo), Arrays.asList(foo), Arrays.asList(foo)},
            // two intervals collapse into one
            {Arrays.asList(foo, bar), Arrays.asList(fooBarFirstName), Arrays.asList(fooBarAllNames)},
            // a merged pair followed by a stand-alone interval
            {Arrays.asList(foo, bar, baz),
                    Arrays.asList(fooBarFirstName, baz),
                    Arrays.asList(fooBarAllNames, baz)},
            // two merged pairs
            {Arrays.asList(foo, bar, baz, qux),
                    Arrays.asList(fooBarFirstName, bazQuxFirstName),
                    Arrays.asList(fooBarAllNames, bazQuxAllNames)},
            // second group additionally contains an interval overlapping its neighbours
            {Arrays.asList(foo, bar, baz, overlap, qux),
                    Arrays.asList(fooBarFirstName, bazQuxFirstName),
                    Arrays.asList(fooBarAllNames, bazOverlapQuxAllNames)}
    };
}

/**
 * Runs the same input through an {@code IntervalMergerIterator} twice -- once with name concatenation
 * off and once with it on -- and checks both the merged intervals and their names against the
 * expectations supplied by the data provider.
 *
 * @param intervals          intervals handed to the merger
 * @param expectedNoConcat   expected output when names are not concatenated
 * @param expectedWithConcat expected output when names are concatenated
 */
@Test(dataProvider = "lessMemForMergeWithNoNames")
public void testLessMemForMergeWithNoNames(final List<Interval> intervals, final List<Interval> expectedNoConcat, final List<Interval> expectedWithConcat) {
    // Build an IntervalList from the inputs; the mergers below read from the raw list's iterator.
    final IntervalList intervalList = new IntervalList(this.fileHeader);
    intervalList.addall(intervals);

    // Pass 1: concatenateNames = false.
    final Collection<Interval> keptFirstName = CollectionUtil.makeCollection(
            new IntervalList.IntervalMergerIterator(intervals.iterator(), true, false, false));
    Assert.assertEquals(keptFirstName, expectedNoConcat);
    // Names are compared explicitly, position by position.
    int position = 0;
    for (final Interval merged : keptFirstName) {
        Assert.assertEquals(merged.getName(), expectedNoConcat.get(position).getName());
        position++;
    }

    // Pass 2: concatenateNames = true.
    final Collection<Interval> keptAllNames = CollectionUtil.makeCollection(
            new IntervalList.IntervalMergerIterator(intervals.iterator(), true, false, true));
    Assert.assertEquals(keptAllNames, expectedWithConcat);
    position = 0;
    for (final Interval merged : keptAllNames) {
        Assert.assertEquals(merged.getName(), expectedWithConcat.get(position).getName());
        position++;
    }
}
}

0 comments on commit 9e89d3c

Please sign in to comment.