From f2bc5377d7548f9f18283852ff42727a398efd53 Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Fri, 27 Oct 2023 20:38:43 -0400 Subject: [PATCH] Add more test cases --- .../datawave/query/model/FieldIndexHole.java | 33 +--- .../query/util/AllFieldMetadataHelper.java | 178 +++++++++++------- .../datawave/query/util/MetadataHelper.java | 13 +- .../util/AllFieldMetadataHelperTest.java | 175 ++++++++++++----- 4 files changed, 253 insertions(+), 146 deletions(-) diff --git a/src/main/java/datawave/query/model/FieldIndexHole.java b/src/main/java/datawave/query/model/FieldIndexHole.java index 361ceeb..a83d85f 100644 --- a/src/main/java/datawave/query/model/FieldIndexHole.java +++ b/src/main/java/datawave/query/model/FieldIndexHole.java @@ -1,14 +1,17 @@ package datawave.query.model; import java.util.Collection; +import java.util.Comparator; import java.util.Date; import java.util.Objects; import java.util.SortedSet; import java.util.StringJoiner; -import java.util.TreeSet; +import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; +import com.google.common.collect.ImmutableSortedSet; + /** * This class represents a set of calculated field index holes for a given fieldName and datatype. A field index hole is effectively a date where a frequency * row was seen, but an index and/or reversed indexed row was not. @@ -17,11 +20,15 @@ public class FieldIndexHole { private final String fieldName; private final String datatype; - private final SortedSet> dateRanges = new TreeSet<>(); + private final SortedSet> dateRanges; - public FieldIndexHole(String fieldName, String dataType) { + public FieldIndexHole(String fieldName, String dataType, Collection> holes) { this.fieldName = fieldName; this.datatype = dataType; + // Ensure the date range set is immutable. 
+ ImmutableSortedSet.Builder> builder = new ImmutableSortedSet.Builder<>(Comparator.naturalOrder()); + holes.forEach(p -> builder.add(new ImmutablePair<>(p.getLeft(), p.getRight()))); + dateRanges = builder.build(); } /** @@ -52,26 +59,6 @@ public SortedSet> getDateRanges() { return dateRanges; } - /** - * Add a collection of field index hole date ranges to this {@link FieldIndexHole}. - * - * @param dateRanges - * the date ranges - */ - public void addDateRanges(Collection> dateRanges) { - this.dateRanges.addAll(dateRanges); - } - - /** - * Add a field index hole date range to this {@link FieldIndexHole}. - * - * @param dateRange - * the date range - */ - public void addDateRange(Pair dateRange) { - this.dateRanges.add(dateRange); - } - @Override public boolean equals(Object o) { if (this == o) { diff --git a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java index f910f1c..1783a20 100644 --- a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java +++ b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java @@ -45,12 +45,12 @@ import com.google.common.base.Preconditions; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.HashMultimap; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; import com.google.common.collect.Maps; import com.google.common.collect.Multimap; import com.google.common.collect.Multimaps; import com.google.common.collect.Sets; -import com.tdunning.math.stats.Sort; import datawave.data.ColumnFamilyConstants; import datawave.data.type.Type; @@ -1059,29 +1059,31 @@ public Set loadDatatypes() throws TableNotFoundException { } /** - * Fetches results from {@link #metadataTableName} and calculates the set of field index holes that exists for all indexed entries. 
+ * Fetches results from {@link #metadataTableName} and calculates the set of field index holes that exists for all indexed entries. The map consists of + * field names to datatypes to field index holes. * * @return a map of field names and datatype pairs to field index holes */ @Cacheable(value = "getFieldIndexHoles", key = "{#root.target.auths,#root.target.metadataTableName}", cacheManager = "metadataHelperCacheManager") - public Map,FieldIndexHole> getFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { + public Map> getFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { return getFieldIndexHoles(ColumnFamilyConstants.COLF_I); } /** - * Fetches results from {@link #metadataTableName} and calculates the set of field index holes that exists for all reversed indexed entries. + * Fetches results from {@link #metadataTableName} and calculates the set of field index holes that exists for all reversed indexed entries. The map + * consists of field names to datatypes to field index holes. * * @return a map of field names and datatype pairs to field index holes */ @Cacheable(value = "getReversedFieldIndexHoles", key = "{#root.target.auths,#root.target.metadataTableName}", cacheManager = "metadataHelperCacheManager") - public Map,FieldIndexHole> getReversedFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { + public Map> getReversedFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { return getFieldIndexHoles(ColumnFamilyConstants.COLF_RI); } /** * Supplies field index hole for {@link #getFieldIndexHoles()} and {@link #getReversedFieldIndexHoles()}. 
*/ - private Map,FieldIndexHole> getFieldIndexHoles(Text indexColumnFamily) throws TableNotFoundException { + private Map> getFieldIndexHoles(Text indexColumnFamily) throws TableNotFoundException { log.debug("cache fault for getFieldIndexHoles(" + this.auths + "," + this.metadataTableName + ")"); Scanner bs = ScannerHelper.createScanner(accumuloClient, metadataTableName, auths); @@ -1093,10 +1095,10 @@ private Map,FieldIndexHole> getFieldIndexHoles(Text indexCol // For all keys in the DatawaveMetadata table. bs.setRange(new Range()); - // We must first scan over all fieldName-datatype combinations and extract the date ranges in which we've seen them. Each date range represents a span - // of time when we saw an event for each day in that date range, from the start to end (inclusive). - Map,SortedSet>> frequencyMap = new HashMap<>(); - Map,SortedSet>> indexMap = new HashMap<>(); + // We must first scan over the fieldName-datatype combinations and extract the date ranges in which we've seen them. Each date range represents a span + // of time when we saw an event for each day in that date range, from the start (inclusive) to end (inclusive). + Map>> frequencyMap = new HashMap<>(); + Map>> indexMap = new HashMap<>(); Calendar calendar = Calendar.getInstance(); String prevFieldName = null; @@ -1104,8 +1106,12 @@ private Map,FieldIndexHole> getFieldIndexHoles(Text indexCol Date prevDate = null; Date startDate = null; Text prevColumnFamily = null; - // Points to the target map object that we add date ranges to. This changes when we see a different column family compared to the previous row. - Map,SortedSet>> dateMap = frequencyMap; + + // Points to the target map object that we add date ranges to. This changes when we see a different column family compared to the previous row. We must + // initially start adding entries to the frequency map. + Map>> dateMap = frequencyMap; + + Map>> fieldIndexHoles = new HashMap<>(); // Scan each row and extract the date ranges. 
for (Entry entry : bs) { @@ -1129,85 +1135,130 @@ private Map,FieldIndexHole> getFieldIndexHoles(Text indexCol continue; } - // If the column family is different, record the last date range, and begin collecting date ranges for the next batch of related rows. + // If the column family is different, record the last date range and begin collecting date ranges for the next batch of related rows. if (!prevColumnFamily.equals(columnFamily)) { - // We've encountered a new fieldName-datatype combination. Add the latest date range seen for the previous fieldName-datatype combination. + // Add the latest date range seen for the previous fieldName-datatype combination. Pair dateRange = Pair.of(startDate, prevDate); - SortedSet> dates = dateMap.computeIfAbsent(Pair.of(prevFieldName, prevDatatype), (k) -> new TreeSet<>()); + SortedSet> dates = dateMap.computeIfAbsent(prevDatatype, (k) -> new TreeSet<>()); dates.add(dateRange); - // Update our tracking variables. - prevFieldName = fieldName; - prevDatatype = datatype; - startDate = date; - prevDate = date; - - // Change which map dateMap points to based on the column family. - if (key.getColumnFamily().equals(ColumnFamilyConstants.COLF_F)) { + // The column family is "f". We have collected the date ranges for all datatypes for the previous field name. Get the field index holes for the + // previously collected data. + if (columnFamily.equals(ColumnFamilyConstants.COLF_F)) { + Multimap> datatypeHoles = getFieldIndexHoles(frequencyMap, indexMap); + fieldIndexHoles.put(prevFieldName, datatypeHoles); + // Clear the date range maps. + frequencyMap.clear(); + indexMap.clear(); + // Set the target date map to the frequency map. dateMap = frequencyMap; } else { + // The current column family is the target index. Add the latest date range seen for the previous datatype. dateMap = indexMap; } + // Update our tracking variables. 
+ prevFieldName = fieldName; + prevDatatype = datatype; + startDate = date; } else { - // We're on the same fieldName-datatype combination as the previous entry. Compare the dates and determine if we need to start a new date range. - if (fieldName.equals(prevFieldName) && datatype.equals(prevDatatype)) { + if (!fieldName.equals(prevFieldName)) { + // Add the latest date range seen for the previous fieldName. + Pair dateRange = Pair.of(startDate, prevDate); + SortedSet> dates = dateMap.computeIfAbsent(prevDatatype, (k) -> new TreeSet<>()); + dates.add(dateRange); + + // We have encountered a new field name and the previous fieldName-datatype combination did not have any corresponding index row entries. + // Add + // the field index holes for the previous field name. + Multimap> datatypeHoles = getFieldIndexHoles(frequencyMap, indexMap); + fieldIndexHoles.put(prevFieldName, datatypeHoles); + // Clear the date range maps. + frequencyMap.clear(); + indexMap.clear(); + // Update our tracking variables. + prevFieldName = fieldName; + prevDatatype = datatype; + startDate = date; + } else if (datatype.equals(prevDatatype)) { + // We are on the same fieldName-datatype combination as the previous row. Determine if we can add a date-range. calendar.setTime(prevDate); calendar.add(Calendar.DATE, 1); - // If the current date is one day after the previous date, it falls within the current date range. Update our tracking variables and - // continue. + // If the current date is not one day after the previous date, it is not a continuous part of the previously tracked date range. Save the + // previous date range and begin a new one. if (!calendar.getTime().equals(date)) { // The current date should not be included in the current date range. Add the current date range, and start a new one. 
Pair dateRange = Pair.of(startDate, prevDate); - SortedSet> dates = dateMap.computeIfAbsent(Pair.of(prevFieldName, prevDatatype), (k) -> new TreeSet<>()); + SortedSet> dates = dateMap.computeIfAbsent(datatype, (k) -> new TreeSet<>()); dates.add(dateRange); - // Update our tracking variables. + // Update the date tracking variables. startDate = date; } } else { - // We've encountered a new fieldName-datatype combination. Add the latest date range seen for the previous fieldName-datatype combination. + // We've encountered a new datatype. Add the latest date range seen for the previous datatype. Pair dateRange = Pair.of(startDate, prevDate); - SortedSet> dates = dateMap.computeIfAbsent(Pair.of(prevFieldName, prevDatatype), (k) -> new TreeSet<>()); + SortedSet> dates = dateMap.computeIfAbsent(prevDatatype, (k) -> new TreeSet<>()); dates.add(dateRange); // Update our tracking variables. - prevFieldName = fieldName; prevDatatype = datatype; startDate = date; } - prevDate = date; } + // Update the previous date and column family. + prevDate = date; prevColumnFamily = columnFamily; } // After there are no more rows, ensure that we record the last date range for the last fieldName-datatype combination that we saw. Pair dateRange = Pair.of(startDate, prevDate); - SortedSet> dates = dateMap.computeIfAbsent(Pair.of(prevFieldName, prevDatatype), (k) -> new TreeSet<>()); + SortedSet> dates = dateMap.computeIfAbsent(prevDatatype, (k) -> new TreeSet<>()); dates.add(dateRange); + // Get the field index holes for the previous field name. + Multimap> datatypeHoles = getFieldIndexHoles(frequencyMap, indexMap); + fieldIndexHoles.put(prevFieldName, datatypeHoles); + + // Create immutable versions of the field index holes, and do not retain any empty collections. 
+ ImmutableMap.Builder> fieldMapBuilder = new ImmutableMap.Builder<>(); + for (String fieldName : fieldIndexHoles.keySet()) { + Multimap> datatypeMap = fieldIndexHoles.get(fieldName); + if (!datatypeMap.isEmpty()) { + ImmutableMap.Builder datatypeMapBuilder = new ImmutableMap.Builder<>(); + for (String datatype : datatypeMap.keySet()) { + FieldIndexHole fieldIndexHole = new FieldIndexHole(fieldName, datatype, datatypeMap.get(datatype)); + datatypeMapBuilder.put(datatype, fieldIndexHole); + } + fieldMapBuilder.put(fieldName, datatypeMapBuilder.build()); + } + } + + // Return the finalized field index holes. + return fieldMapBuilder.build(); + } + + private Multimap> getFieldIndexHoles(Map>> frequencyMap, + Map>> indexMap) { // New tracking variables. - Pair prevFieldNameAndDataType = null; + String prevDataType = null; Pair prevFrequencyDateRange = null; Date holeStartDate = null; - Map,FieldIndexHole> fieldIndexHoles = new HashMap<>(); + Multimap> fieldIndexHoles = HashMultimap.create(); + Calendar calendar = Calendar.getInstance(); - // Now that we have the date ranges for the frequency and index rows, compare the date ranges for each fieldName-datatype combination to identify any - // and all field index holes. Evaluate the date ranges for each fieldName-datatype. - for (Pair fieldNameAndDatatype : frequencyMap.keySet()) { - - // If hole start date is not null, we have a hole left over from the previous fieldName-datatype combination. The index hole spans from the hole + // Compare the date ranges for each datatype to identify any and all field index holes. Evaluate the date ranges for each datatype. + for (String datatype : frequencyMap.keySet()) { + // If holeStartDate is not null, we have a hole left over from the previous datatype combination. The index hole spans from the hole // start date to the end of the last frequency date range. 
if (holeStartDate != null) { - FieldIndexHole indexHole = fieldIndexHoles.computeIfAbsent(prevFieldNameAndDataType, (k) -> new FieldIndexHole(k.getLeft(), k.getRight())); - indexHole.addDateRange(Pair.of(holeStartDate, prevFrequencyDateRange.getRight())); + fieldIndexHoles.put(prevDataType, Pair.of(holeStartDate, prevFrequencyDateRange.getRight())); holeStartDate = null; } // At least one corresponding index row was seen. Compare the date ranges to identify any index holes. - if (indexMap.containsKey(fieldNameAndDatatype)) { - SortedSet> frequencyDates = frequencyMap.get(fieldNameAndDatatype); - - Iterator> indexDatesIterator = indexMap.get(fieldNameAndDatatype).iterator(); + if (indexMap.containsKey(datatype)) { + SortedSet> frequencyDates = frequencyMap.get(datatype); + Iterator> indexDatesIterator = indexMap.get(datatype).iterator(); Pair prevIndexDateRange = null; boolean comparePrevIndexDateRange = false; // Evaluate each date range we saw for frequency rows for the current fieldName-datatype. @@ -1225,9 +1276,7 @@ private Map,FieldIndexHole> getFieldIndexHoles(Text indexCol // If holeStartDate is not null, we have an index hole left over from the previous frequency date range. The index hole spans from the // hole start date to the end of the last frequency date range. if (holeStartDate != null) { - FieldIndexHole indexHole = fieldIndexHoles.computeIfAbsent(fieldNameAndDatatype, - (k) -> new FieldIndexHole(k.getLeft(), k.getRight())); - indexHole.addDateRange(Pair.of(holeStartDate, prevFrequencyDateRange.getRight())); + fieldIndexHoles.put(datatype, Pair.of(holeStartDate, prevFrequencyDateRange.getRight())); holeStartDate = null; } @@ -1244,13 +1293,11 @@ private Map,FieldIndexHole> getFieldIndexHoles(Text indexCol } else { // The index start date is after the frequency start date. Check if we have a hole that partially covers the frequency date range, // or all of it. 
- FieldIndexHole indexHole = fieldIndexHoles.computeIfAbsent(fieldNameAndDatatype, - (k) -> new FieldIndexHole(k.getLeft(), k.getRight())); if (indexStartDate.before(frequencyEndDate)) { // There is an index hole starting on the frequency start date, and ending the day before the index start date. calendar.setTime(indexStartDate); calendar.add(Calendar.DATE, -1); - indexHole.addDateRange(Pair.of(frequencyStartDate, calendar.getTime())); + fieldIndexHoles.put(datatype, Pair.of(frequencyStartDate, calendar.getTime())); if (indexEndDate.before(frequencyEndDate)) { // There is an index hole starting the day after the index end date. We must evaluate the next index date range to determine @@ -1262,7 +1309,7 @@ private Map,FieldIndexHole> getFieldIndexHoles(Text indexCol } else { // The entire frequency date range is an index hole. Add it as such, and continue to the next frequency date range. We want to // compare the current index date range to the next frequency date range as well. - indexHole.addDateRange(frequencyDateRange); + fieldIndexHoles.put(datatype, frequencyDateRange); continue; } } @@ -1290,19 +1337,16 @@ private Map,FieldIndexHole> getFieldIndexHoles(Text indexCol holeStartDate = calendar.getTime(); } } else if (indexStartDate.before(frequencyEndDate)) { - FieldIndexHole indexHole; calendar.setTime(indexStartDate); calendar.add(Calendar.DATE, -1); if (holeStartDate != null) { // If holeStartDate is not null, we've previously identified the start of an index hole that is not the start of the frequency // date range. There is an index hole from holeStartDate to the day before the index start date. 
- indexHole = fieldIndexHoles.computeIfAbsent(fieldNameAndDatatype, (k) -> new FieldIndexHole(k.getLeft(), k.getRight())); - indexHole.addDateRange(Pair.of(holeStartDate, calendar.getTime())); + fieldIndexHoles.put(datatype, Pair.of(holeStartDate, calendar.getTime())); holeStartDate = null; } else { // There is an index hole from the frequency start date to the day before the index start date. - indexHole = fieldIndexHoles.computeIfAbsent(fieldNameAndDatatype, (k) -> new FieldIndexHole(k.getLeft(), k.getRight())); - indexHole.addDateRange(Pair.of(frequencyStartDate, calendar.getTime())); + fieldIndexHoles.put(datatype, Pair.of(frequencyStartDate, calendar.getTime())); } // It's possible for the current index date range to end before the current frequency date range. If so, this indicates a new index @@ -1317,17 +1361,15 @@ private Map,FieldIndexHole> getFieldIndexHoles(Text indexCol } else { // The start of the current index date range occurs after the current frequency date range. There is a hole in the current frequency // date range. - FieldIndexHole indexHole = fieldIndexHoles.computeIfAbsent(fieldNameAndDatatype, - (k) -> new FieldIndexHole(k.getLeft(), k.getRight())); if (holeStartDate == null) { // The entire current frequency date range is an index hole. Add it as such and break out to continue to the next frequency // date range. - indexHole.addDateRange(frequencyDateRange); + fieldIndexHoles.put(datatype, frequencyDateRange); break; } else { // There is an index hole from the recorded hole start date to the end of the frequency date range. Add it as such and break // out to continue to the next frequency date range. - indexHole.addDateRange(Pair.of(holeStartDate, frequencyEndDate)); + fieldIndexHoles.put(datatype, Pair.of(holeStartDate, frequencyEndDate)); holeStartDate = null; // The current index date range is entirely after the current frequency date range. As such, we need to compare the current // index date range to the next frequency date range. 
@@ -1343,18 +1385,16 @@ private Map,FieldIndexHole> getFieldIndexHoles(Text indexCol } else { // No corresponding index rows were seen for any of the frequency rows. Each date range represents an index hole. - FieldIndexHole indexHole = fieldIndexHoles.computeIfAbsent(fieldNameAndDatatype, (k) -> new FieldIndexHole(k.getLeft(), k.getRight())); - indexHole.addDateRanges(frequencyMap.get(fieldNameAndDatatype)); + fieldIndexHoles.putAll(datatype, frequencyMap.get(datatype)); } - // Update the prev fieldName-datatype. - prevFieldNameAndDataType = fieldNameAndDatatype; + // Update the prev datatype. + prevDataType = datatype; } // If we have a non-null hole start date after processing all the date ranges, we have an index hole that ends at the last frequency date range seen // for the last fieldName-datatype combination. if (holeStartDate != null) { - FieldIndexHole indexHole = fieldIndexHoles.computeIfAbsent(prevFieldNameAndDataType, (k) -> new FieldIndexHole(k.getLeft(), k.getRight())); - indexHole.addDateRange(Pair.of(holeStartDate, prevFrequencyDateRange.getRight())); + fieldIndexHoles.put(prevDataType, Pair.of(holeStartDate, prevFrequencyDateRange.getRight())); } return fieldIndexHoles; diff --git a/src/main/java/datawave/query/util/MetadataHelper.java b/src/main/java/datawave/query/util/MetadataHelper.java index 9bae08d..2b5e1d1 100644 --- a/src/main/java/datawave/query/util/MetadataHelper.java +++ b/src/main/java/datawave/query/util/MetadataHelper.java @@ -43,7 +43,6 @@ import org.apache.accumulo.core.security.Authorizations; import org.apache.accumulo.core.security.ColumnVisibility; import org.apache.commons.lang.time.DateUtils; -import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; import org.slf4j.Logger; @@ -1418,20 +1417,20 @@ protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final Stri } /** - * Return the field index holes calculated between all "i" and "f" entries. 
- * + * Return the field index holes calculated between all "i" and "f" entries. The map consists of field names to datatypes to field index holes. + * * @return the field index holes */ - public Map,FieldIndexHole> getFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { + public Map> getFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { return allFieldMetadataHelper.getFieldIndexHoles(); } /** - * Return the field index holes calculated between all "ri" and "f" entries. - * + * Return the field index holes calculated between all "ri" and "f" entries. The map consists of field names to datatypes to field index holes. + * * @return the field index holes */ - public Map,FieldIndexHole> getReversedFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { + public Map> getReversedFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { return allFieldMetadataHelper.getReversedFieldIndexHoles(); } diff --git a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java index 7d8e7ed..50320e1 100644 --- a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java +++ b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java @@ -8,6 +8,7 @@ import java.util.Collection; import java.util.Collections; import java.util.Date; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; @@ -35,6 +36,7 @@ import org.junit.jupiter.params.provider.ValueSource; import com.google.common.collect.Maps; +import com.google.common.collect.Sets; import datawave.accumulo.inmemory.InMemoryAccumuloClient; import datawave.accumulo.inmemory.InMemoryInstance; @@ -99,10 +101,11 @@ private void writeMutations(Collection mutations) { /** * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles()}. 
*/ + @SuppressWarnings("unchecked") @Nested public class FieldIndexHoleTests { - private Supplier,FieldIndexHole>> INDEX_FUNCTION = () -> { + private final Supplier>> INDEX_FUNCTION = () -> { try { return helper.getFieldIndexHoles(); } catch (TableNotFoundException | CharacterCodingException e) { @@ -110,7 +113,7 @@ public class FieldIndexHoleTests { } }; - private Supplier,FieldIndexHole>> REVERSED_INDEX_FUNCTION = () -> { + private final Supplier>> REVERSED_INDEX_FUNCTION = () -> { try { return helper.getReversedFieldIndexHoles(); } catch (TableNotFoundException | CharacterCodingException e) { @@ -118,7 +121,7 @@ public class FieldIndexHoleTests { } }; - private Supplier,FieldIndexHole>> getIndexHoleFunction(String cf) { + private Supplier>> getIndexHoleFunction(String cf) { return cf.equals("i") ? INDEX_FUNCTION : REVERSED_INDEX_FUNCTION; } @@ -145,8 +148,7 @@ void testNoFieldIndexHoles(String cf) { writeMutations(mutationCreator.getMutations()); // Verify that no index holes were found. 
- Map,FieldIndexHole> fieldIndexHoles = getIndexHoleFunction(cf).get(); - System.out.println(fieldIndexHoles); + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); Assertions.assertTrue(fieldIndexHoles.isEmpty()); } @@ -162,11 +164,11 @@ void testFieldIndexHoleForEntireFrequencyDateRange(String cf) { mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105"); writeMutations(mutationCreator.getMutations()); - Map,FieldIndexHole> fieldIndexHoles = getIndexHoleFunction(cf).get(); - Assertions.assertEquals(1, fieldIndexHoles.size()); - FieldIndexHole expectedHole = new FieldIndexHole("NAME", "wiki"); - expectedHole.addDateRange(dateRange("20200101", "20200105")); - Assertions.assertEquals(expectedHole, fieldIndexHoles.get(Pair.of("NAME", "wiki"))); + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } /** @@ -181,12 +183,12 @@ void testFieldIndexHoleForStartOfFrequencyDateRange(String cf) { mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105"); mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105"); writeMutations(mutationCreator.getMutations()); - - Map,FieldIndexHole> fieldIndexHoles = getIndexHoleFunction(cf).get(); - Assertions.assertEquals(1, fieldIndexHoles.size()); - FieldIndexHole expectedHole = new FieldIndexHole("NAME", "wiki"); - expectedHole.addDateRange(dateRange("20200101", "20200103")); - Assertions.assertEquals(expectedHole, fieldIndexHoles.get(Pair.of("NAME", "wiki"))); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } /** @@ -202,11 +204,11 @@ void 
testFieldIndexHoleForEndOfFrequencyDateRange(String cf) { mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105"); writeMutations(mutationCreator.getMutations()); - Map,FieldIndexHole> fieldIndexHoles = getIndexHoleFunction(cf).get(); - Assertions.assertEquals(1, fieldIndexHoles.size()); - FieldIndexHole expectedHole = new FieldIndexHole("NAME", "wiki"); - expectedHole.addDateRange(dateRange("20200103", "20200105")); - Assertions.assertEquals(expectedHole, fieldIndexHoles.get(Pair.of("NAME", "wiki"))); + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } /** @@ -222,12 +224,12 @@ void testFieldIndexHoleForMiddleOfFrequencyDateRange(String cf) { mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105"); mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105"); writeMutations(mutationCreator.getMutations()); - - Map,FieldIndexHole> fieldIndexHoles = getIndexHoleFunction(cf).get(); - Assertions.assertEquals(1, fieldIndexHoles.size()); - FieldIndexHole expectedHole = new FieldIndexHole("NAME", "wiki"); - expectedHole.addDateRange(dateRange("20200104", "20200106")); - Assertions.assertEquals(expectedHole, fieldIndexHoles.get(Pair.of("NAME", "wiki"))); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } /** @@ -242,15 +244,16 @@ void testMultipleFieldIndexHolesInFrequencyDateRange(String cf) { mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200113"); mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200117", "20200118"); 
writeMutations(mutationCreator.getMutations()); - - Map,FieldIndexHole> fieldIndexHoles = getIndexHoleFunction(cf).get(); - Assertions.assertEquals(1, fieldIndexHoles.size()); - FieldIndexHole expectedHole = new FieldIndexHole("NAME", "wiki"); - expectedHole.addDateRange(dateRange("20200101", "20200103")); - expectedHole.addDateRange(dateRange("20200107", "20200109")); - expectedHole.addDateRange(dateRange("20200114", "20200116")); - expectedHole.addDateRange(dateRange("20200119", "20200120")); - Assertions.assertEquals(expectedHole, fieldIndexHoles.get(Pair.of("NAME", "wiki"))); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } /** @@ -266,11 +269,12 @@ void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo(String cf) { mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200101", "20200105"); writeMutations(mutationCreator.getMutations()); - Map,FieldIndexHole> fieldIndexHoles = getIndexHoleFunction(cf).get(); - Assertions.assertEquals(1, fieldIndexHoles.size()); - FieldIndexHole expectedHole = new FieldIndexHole("NAME", "wiki"); - expectedHole.addDateRange(dateRange("20200103", "20200105")); - Assertions.assertEquals(expectedHole, fieldIndexHoles.get(Pair.of("NAME", "wiki"))); + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } /** @@ -286,13 +290,90 @@ void testFieldIndexHoleSpanningMultipleFrequencyDateRanges(String cf) { mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200113", "20200115"); 
writeMutations(mutationCreator.getMutations()); - Map,FieldIndexHole> fieldIndexHoles = getIndexHoleFunction(cf).get(); - Assertions.assertEquals(1, fieldIndexHoles.size()); - FieldIndexHole expectedHole = new FieldIndexHole("NAME", "wiki"); - expectedHole.addDateRange(dateRange("20200104", "20200105")); - expectedHole.addDateRange(dateRange("20200110", "20200112")); - Assertions.assertEquals(expectedHole, fieldIndexHoles.get(Pair.of("NAME", "wiki"))); + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200105"), dateRange("20200110", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where everything is an index hole. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles(String cf) { + MutationCreator mutationCreator = new MutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115"); + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125"); + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328"); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } + + /** + * Test against data where we have a number of index holes that span just a day. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles(String cf) { + MutationCreator mutationCreator = new MutationCreator(); + // Index holes for NAME-wiki on 20200103 and 20200105. + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104"); + // Index holes for NAME-csv on 20200110 and 20200113. + mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115"); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112"); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115"); + // Index hole for EVENT_DATE-wiki on 20200122. + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125"); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121"); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125"); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328"); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220"); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302"); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315"); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328"); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + } + + private Map> createFieldIndexHoleMap(FieldIndexHole... holes) { + Map> fieldIndexHoles = new HashMap<>(); + for (FieldIndexHole hole : holes) { + Map datatypeMap = fieldIndexHoles.computeIfAbsent(hole.getFieldName(), k -> new HashMap<>()); + datatypeMap.put(hole.getDatatype(), hole); + } + return fieldIndexHoles; + } + + @SafeVarargs + private FieldIndexHole createFieldIndexHole(String field, String datatype, Pair... dateRanges) { + return new FieldIndexHole(field, datatype, Sets.newHashSet(dateRanges)); } private Pair dateRange(String start, String end) {