From 316dccdee9c4a644a7f7076cab49c66bf59726bd Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Mon, 11 Mar 2024 12:53:03 -0400 Subject: [PATCH 01/10] Add aggregator for frequency metadata rows Create the aggregator FrequencyMetadataAggregator. This aggregator will support compaction of the "f", "i", and "ri" columns in the metadata table, collapsing the counts for dates into a single entry for each unique row, data type, and column visibility grouping. Update MetadataHelper and AllFieldsMetadataHelper such that methods scanning over either the "f", "i", and/or "ri" columns are able to handle entries that either have the original format created upon ingest, or the aggregated format generated from the aggregator. Required for datawave/issues/716. --- .../FrequencyMetadataAggregator.java | 414 +++ .../query/model/DateFrequencyMap.java | 198 ++ .../java/datawave/query/model/Frequency.java | 80 + .../query/util/AllFieldMetadataHelper.java | 107 +- .../datawave/query/util/MetadataHelper.java | 182 +- .../FrequencyMetadataAggregatorTest.java | 459 ++++ .../util/AllFieldMetadataHelperTest.java | 2351 ++++++++++++++--- .../query/util/MetadataHelperTest.java | 372 ++- .../java/datawave/query/util/TestUtils.java | 132 + .../resources/MarkingFunctionsContext.xml | 23 + 10 files changed, 3755 insertions(+), 563 deletions(-) create mode 100644 src/main/java/datawave/iterators/FrequencyMetadataAggregator.java create mode 100644 src/main/java/datawave/query/model/DateFrequencyMap.java create mode 100644 src/main/java/datawave/query/model/Frequency.java create mode 100644 src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java create mode 100644 src/test/java/datawave/query/util/TestUtils.java create mode 100644 src/test/resources/MarkingFunctionsContext.xml diff --git a/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java new file mode 100644 index 00000000..c513d820 --- /dev/null +++ 
b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java @@ -0,0 +1,414 @@ +package datawave.iterators; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +import org.apache.accumulo.core.data.ByteSequence; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.IteratorEnvironment; +import org.apache.accumulo.core.iterators.LongCombiner; +import org.apache.accumulo.core.iterators.OptionDescriber; +import org.apache.accumulo.core.iterators.SortedKeyValueIterator; +import org.apache.accumulo.core.iterators.WrappingIterator; +import org.apache.accumulo.core.security.ColumnVisibility; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableUtils; +import org.apache.log4j.Logger; + +import datawave.marking.MarkingFunctions; +import datawave.query.model.DateFrequencyMap; + +/** + * Aggregates entries in the metadata table for the "f", "i", and "ri" columns. When initially ingested, entries for these columns have a column qualifier with + * the format {@code \0}, and a value containing a possibly partial frequency count for the date in the column qualifier. Entries with the + * same row, column family, datatype, and column visibility will be aggregated into a single entry where the column qualifier consists of the datatype and the value + * consists of an encoded {@link DateFrequencyMap} with the dates and counts seen. Additionally, this aggregator will handle the case where we have a previously + * aggregated entry and freshly ingested rows that need to be aggregated together.
+ *
+ * This iterator supports the following options: + *
    + *
  • {@value COMBINE_VISIBILITIES}: Defaults to false. If true, entries will be aggregated by row, column family, and datatype only, and the column visibility + * will be a combination of all column visibilities seen for the row/column family/datatype combo. This option is meant to be used when scanning only, and not + * for compaction.
  • + *
+ */ +public class FrequencyMetadataAggregator extends WrappingIterator implements OptionDescriber { + + public static final String COMBINE_VISIBILITIES = "FrequencyMetadataAggregator.COMBINE_VISIBILITIES"; + + private static final Logger log = Logger.getLogger(FrequencyMetadataAggregator.class); + private static final String NULL_BYTE = "\0"; + private static final MarkingFunctions markingFunctions = MarkingFunctions.Factory.createMarkingFunctions(); + + private SortedKeyValueIterator source; + private boolean combineVisibilities; + private Key topKey; + private Value topValue; + + private final TreeMap cache; + private final Map visibilityToDateFrequencies; + private final Map visibilityToMaxTimestamp; + + private Text currentRow; + private Text currentColumnFamily; + private String currentDatatype; + private String currentDate; + private ColumnVisibility currentVisibility; + private long currentTimestamp; + private boolean isCurrentAggregated; + + public FrequencyMetadataAggregator() { + cache = new TreeMap<>(); + visibilityToDateFrequencies = new HashMap<>(); + visibilityToMaxTimestamp = new HashMap<>(); + } + + public FrequencyMetadataAggregator(FrequencyMetadataAggregator other, IteratorEnvironment env) { + this(); + source = other.getSource().deepCopy(env); + combineVisibilities = other.combineVisibilities; + cache.putAll(other.cache); + } + + @Override + public SortedKeyValueIterator deepCopy(IteratorEnvironment env) { + return new FrequencyMetadataAggregator(this, env); + } + + @Override + public IteratorOptions describeOptions() { + Map options = new HashMap<>(); + options.put(COMBINE_VISIBILITIES, "Boolean value denoting whether to combine entries with different visibilities. 
Defaults to false."); + + return new IteratorOptions(getClass().getSimpleName(), "An iterator used to collapse frequency columns in the metadata table", options, null); + } + + @Override + public boolean validateOptions(Map options) { + // Check if entries with different column visibilities should be combined. + if (options.containsKey(COMBINE_VISIBILITIES)) { + combineVisibilities = Boolean.parseBoolean(options.get(COMBINE_VISIBILITIES)); + if (log.isTraceEnabled()) { + log.trace("combine visibilities: " + combineVisibilities); + } + } + + return true; + } + + @Override + public void init(SortedKeyValueIterator source, Map options, IteratorEnvironment env) throws IOException { + if (!validateOptions(options)) { + throw new IllegalArgumentException("Invalid options given: " + options); + } + + this.source = source; + } + + @Override + public void seek(Range range, Collection columnFamilies, boolean inclusive) throws IOException { + log.trace("seeking"); + + source.seek(range, columnFamilies, inclusive); + + // Establish the first top key. + next(); + + if (log.isTraceEnabled()) { + log.trace("first top key after seek: " + topKey); + } + } + + @Override + public Key getTopKey() { + return topKey; + } + + @Override + public Value getTopValue() { + return topValue; + } + + @Override + public boolean hasTop() { + return topKey != null; + } + + @Override + public void next() throws IOException { + log.trace("Fetching next"); + if (!popCache()) { + log.trace("No entries in cache"); + if (source.hasTop()) { + log.trace("Source has top, updating cache"); + updateCache(); + } else { + log.trace("Source does not have top"); + } + popCache(); + } + } + + /** + * Set {@link #topKey} and {@link #topValue} to the next available entry in the cache. Returns true if the cache was not empty, or false otherwise. 
+ */ + private boolean popCache() { + topKey = null; + topValue = null; + + if (!cache.isEmpty()) { + Map.Entry entry = cache.pollFirstEntry(); + topKey = entry.getKey(); + topValue = entry.getValue(); + return true; + } + return false; + } + + /** + * Reset all current tracking variables. + */ + private void resetCurrent() { + currentRow = null; + currentColumnFamily = null; + currentDatatype = null; + currentDate = null; + currentVisibility = null; + currentTimestamp = 0L; + isCurrentAggregated = false; + visibilityToDateFrequencies.clear(); + visibilityToMaxTimestamp.clear(); + } + + /** + * Iterate over the source entries, aggregate all entries for the next row/column family/datatype combination, and add them to the cache. + */ + private void updateCache() throws IOException { + log.trace("Updating cache"); + + resetCurrent(); + + while (true) { + // If the source does not have any more entries, wrap up the last batch of entries. + if (!source.hasTop()) { + log.trace("Source does not have top"); + wrapUpCurrent(); + return; + } + + Key key = source.getTopKey(); + if (log.isTraceEnabled()) { + log.trace("updateCache examining key " + key); + } + + // If the current entry has a different row, column family, or datatype from the previous entry, wrap up and return the current + // batch of entries. + if (differsFromPrev(key)) { + wrapUpCurrent(); + return; + } + + // Aggregate the current entry. + aggregateCurrent(); + + // Advance to the next entry from the source. + source.next(); + } + } + + /** + * Return true if the current entry is not the first entry seen in the current call to {@link #updateCache()} and has a different row, column family, or + * datatype from the previous entry, or false otherwise. + */ + private boolean differsFromPrev(Key key) { + // Update the current row if null. + if (currentRow == null) { + currentRow = key.getRow(); + if (log.isTraceEnabled()) { + log.trace("Set current row to " + currentRow); + } + // Check if we're on a new field. 
+ } else if (!currentRow.equals(key.getRow())) { + if (log.isTraceEnabled()) { + log.trace("Next row " + key.getRow() + " differs from prev " + currentRow); + } + return true; + } + + // Update the current column family if null. + if (currentColumnFamily == null) { + currentColumnFamily = key.getColumnFamily(); + if (log.isTraceEnabled()) { + log.trace("Set current column family to " + currentColumnFamily); + } + // Check if we're on a new column family. + } else if (!currentColumnFamily.equals(key.getColumnFamily())) { + if (log.isTraceEnabled()) { + log.trace("Next column family " + key.getColumnFamily() + " differs from prev " + currentColumnFamily); + } + return true; + } + + String columnQualifier = key.getColumnQualifier().toString(); + int separatorPos = columnQualifier.indexOf(NULL_BYTE); + // If a null byte was not found, the column qualifier has the format and is from a previously aggregated entry. Otherwise, the column + // qualifier has the format \0 and is from a non-aggregated entry. + isCurrentAggregated = separatorPos == -1; + String datatype; + if (isCurrentAggregated) { + datatype = columnQualifier; + } else { + datatype = columnQualifier.substring(0, separatorPos); + currentDate = columnQualifier.substring((separatorPos + 1)); + if (log.isTraceEnabled()) { + log.trace("Set current date to " + currentDate); + } + } + + // Update the current datatype if null. + if (currentDatatype == null) { + currentDatatype = datatype; + if (log.isTraceEnabled()) { + log.trace("Set current datatype to " + currentDatatype); + } + // Check if we're on a new datatype. + } else if (!currentDatatype.equals(datatype)) { + if (log.isTraceEnabled()) { + log.trace("Next datatype " + datatype + " differs from prev " + currentDatatype); + } + return true; + } + + // Update the current visibility and timestamp. 
+ currentVisibility = new ColumnVisibility(key.getColumnVisibility()); + currentTimestamp = key.getTimestamp(); + return false; + } + + /** + * Aggregate the current entry. + */ + private void aggregateCurrent() { + Value value = source.getTopValue(); + // Fetch the date-frequency map for the current column visibility, creating one if not present. + DateFrequencyMap dateFrequencies = visibilityToDateFrequencies.computeIfAbsent(currentVisibility, (k) -> new DateFrequencyMap()); + + // If the current entry has an aggregated value, parse it as such and merge it with the date-frequency map. + if (isCurrentAggregated) { + try { + DateFrequencyMap entryMap = new DateFrequencyMap(value.get()); + dateFrequencies.incrementAll(entryMap); + } catch (IOException e) { + Key key = source.getTopKey(); + log.error("Failed to parse date frequency map from value for key " + key, e); + throw new IllegalArgumentException("Failed to parse date frequency map from value for key " + key, e); + } + } else { + // If the current entry does not have an aggregated value, it has a count for a specific date. Increment the count for the date in the map. + long count = LongCombiner.VAR_LEN_ENCODER.decode(value.get()); + dateFrequencies.increment(currentDate, count); + } + + // If the current timestamp is later than the previously tracked timestamp for the current column visibility, update the tracked timestamp. + if (visibilityToMaxTimestamp.containsKey(currentVisibility)) { + long prevTimestamp = visibilityToMaxTimestamp.get(currentVisibility); + if (prevTimestamp < currentTimestamp) { + visibilityToMaxTimestamp.put(currentVisibility, currentTimestamp); + } + } else { + visibilityToMaxTimestamp.put(currentVisibility, currentTimestamp); + } + } + + /** + * Create the entries to be returned by {@link #next()} and add them to the cache. 
+ */ + private void wrapUpCurrent() { + if (log.isTraceEnabled()) { + log.trace("Wrapping up for row: " + currentRow + ", cf: " + currentColumnFamily + ", cq: " + currentDatatype); + } + + cache.putAll(buildTopEntries()); + resetCurrent(); + } + + /** + * Build and return a sorted map of the key-value entries that should be made available to be returned by {@link #next()}. + */ + private Map buildTopEntries() { + if (log.isTraceEnabled()) { + log.trace("buildTopKeys, currentRow: " + currentRow); + log.trace("buildTopKeys, currentColumnFamily: " + currentColumnFamily); + log.trace("buildTopKeys, currentDatatype: " + currentDatatype); + } + + Text columnQualifier = new Text(currentDatatype); + + // If we are combining all entries regardless of column visibility, we will end up with one entry to return. + if (combineVisibilities) { + // Combine the visibilities and frequencies, and find the latest timestamp. + ColumnVisibility combined = combineAllVisibilities(); + long latestTimestamp = getLatestTimestamp(); + DateFrequencyMap combinedFrequencies = combineAllDateFrequencies(); + + // Return the single key-value pair. + Key key = new Key(currentRow, currentColumnFamily, columnQualifier, combined, latestTimestamp); + Value value = new Value(WritableUtils.toByteArray(combinedFrequencies)); + return Collections.singletonMap(key, value); + } else { + Map entries = new HashMap<>(); + // Create a key-value pair for each distinct column visibility. 
+ for (Map.Entry entry : visibilityToDateFrequencies.entrySet()) { + ColumnVisibility visibility = entry.getKey(); + long timestamp = visibilityToMaxTimestamp.get(visibility); + Key key = new Key(currentRow, currentColumnFamily, columnQualifier, visibility, timestamp); + Value value = new Value(WritableUtils.toByteArray(entry.getValue())); + entries.put(key, value); + } + return entries; + } + } + + /** + * Return a {@link ColumnVisibility} that is the combination of all visibilities present in {@link #visibilityToDateFrequencies}. + */ + private ColumnVisibility combineAllVisibilities() { + Set visibilities = visibilityToDateFrequencies.keySet(); + try { + return markingFunctions.combine(visibilities); + } catch (MarkingFunctions.Exception e) { + log.error("Failed to combine visibilities " + visibilities); + throw new IllegalArgumentException("Failed to combine visibilities " + visibilities, e); + } + } + + /** + * Return the latest timestamp present in {@link #visibilityToMaxTimestamp}. + */ + private long getLatestTimestamp() { + long max = 0L; + for (long timestamp : visibilityToMaxTimestamp.values()) { + max = Math.max(max, timestamp); + } + return max; + } + + /** + * Return a {@link DateFrequencyMap} that contains all date counts present in {@link #visibilityToDateFrequencies}. 
+ */ + private DateFrequencyMap combineAllDateFrequencies() { + DateFrequencyMap combined = new DateFrequencyMap(); + for (DateFrequencyMap map : visibilityToDateFrequencies.values()) { + combined.incrementAll(map); + } + return combined; + + } +} diff --git a/src/main/java/datawave/query/model/DateFrequencyMap.java b/src/main/java/datawave/query/model/DateFrequencyMap.java new file mode 100644 index 00000000..56b29d6c --- /dev/null +++ b/src/main/java/datawave/query/model/DateFrequencyMap.java @@ -0,0 +1,198 @@ +package datawave.query.model; + +import java.io.ByteArrayInputStream; +import java.io.DataInput; +import java.io.DataInputStream; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableUtils; + +public class DateFrequencyMap implements Writable { + + // TODO - Should we use the YearMonthDay class instead as the key here? + private final TreeMap dateToFrequencies; + + public DateFrequencyMap() { + this.dateToFrequencies = new TreeMap<>(); + } + + public DateFrequencyMap(byte[] bytes) throws IOException { + this(); + ByteArrayInputStream in = new ByteArrayInputStream(bytes); + DataInputStream dataIn = new DataInputStream(in); + readFields(dataIn); + dataIn.close(); + } + + /** + * Associates the given frequency with the given date in this {@link DateFrequencyMap}. If the map previously contained a mapping for the given date, the + * old frequency is replaced by the new frequency. + * + * @param date + * the date + * @param frequency + * the frequency + */ + public void put(String date, long frequency) { + put(date, new Frequency(frequency)); + } + + /** + * Associates the given frequency with the given date in this {@link DateFrequencyMap}. 
If the map previously contained a mapping for the given date, the + * old frequency is replaced by the new frequency. + * + * @param date + * the date + * @param frequency + * the frequency + */ + public void put(String date, Frequency frequency) { + dateToFrequencies.put(date, frequency); + } + + /** + * Increments the frequency associated with the given date by the given addend. If a mapping does not previously exist for the date, a new mapping will be + * added with the given addend as the frequency. + * + * @param date + * the date + * @param addend + * the addend + */ + public void increment(String date, long addend) { + dateToFrequencies.computeIfAbsent(date, (k) -> new Frequency()).increment(addend); + } + + /** + * Increment all frequencies in this {@link DateFrequencyMap} by the frequencies in the given map. If the given map contains mappings for dates not present + * in this map, those mappings will be added to this map. + * + * @param map + * the map + */ + public void incrementAll(DateFrequencyMap map) { + for (Map.Entry entry : map.dateToFrequencies.entrySet()) { + increment(entry.getKey(), entry.getValue().getValue()); + } + } + + /** + * Return the frequency associated with the given date, or null if no such mapping exists. + * + * @param date + * the date + * @return the count + */ + public Frequency get(String date) { + return dateToFrequencies.get(date); + } + + /** + * Return whether this map contains a mapping for the given date. + * + * @param date + * the date + * @return true if a mapping exists for the given date, or false otherwise + */ + public boolean contains(String date) { + return dateToFrequencies.containsKey(date); + } + + /** + * Clear all mappings in this {@link DateFrequencyMap}. + */ + public void clear() { + this.dateToFrequencies.clear(); + } + + /** + * Returns a {@link Set} view of the mappings contained within this map, sorted in ascending by order. 
+ * + * @return a {@link Set} view of the mappings + */ + public Set> entrySet() { + return this.dateToFrequencies.entrySet(); + } + + /** + * Returns a view of the portion of this {@link DateFrequencyMap}'s underlying map whose keys range from startDate (inclusive) to endDate (inclusive). + * + * @param startDate + * the start date + * @param endDate + * the end date + * @return the map view + */ + public SortedMap subMap(String startDate, String endDate) { + return dateToFrequencies.subMap(startDate, true, endDate, true); + } + + /** + * Returns the earliest date in this {@link DateFrequencyMap}. + * + * @return the earliest date + */ + public String earliestDate() { + return dateToFrequencies.firstKey(); + } + + @Override + public void write(DataOutput dataOutput) throws IOException { + // Write the map's size. + WritableUtils.writeVInt(dataOutput, dateToFrequencies.size()); + + // Write each entry. + for (Map.Entry entry : dateToFrequencies.entrySet()) { + WritableUtils.writeString(dataOutput, entry.getKey()); + entry.getValue().write(dataOutput); + } + } + + @Override + public void readFields(DataInput dataInput) throws IOException { + // Clear the map. + this.dateToFrequencies.clear(); + + // Read how many entries to expect. + int entries = WritableUtils.readVInt(dataInput); + + // Read each entry. + for (int i = 0; i < entries; i++) { + // Read the date key. + String date = WritableUtils.readString(dataInput); + // Read the frequency value. 
+ Frequency value = new Frequency(); + value.readFields(dataInput); + this.dateToFrequencies.put(date, value); + } + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + DateFrequencyMap that = (DateFrequencyMap) o; + return Objects.equals(dateToFrequencies, that.dateToFrequencies); + } + + @Override + public int hashCode() { + return Objects.hash(dateToFrequencies); + } + + @Override + public String toString() { + return dateToFrequencies.toString(); + } +} diff --git a/src/main/java/datawave/query/model/Frequency.java b/src/main/java/datawave/query/model/Frequency.java new file mode 100644 index 00000000..1b1aac97 --- /dev/null +++ b/src/main/java/datawave/query/model/Frequency.java @@ -0,0 +1,80 @@ +package datawave.query.model; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Objects; + +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.io.WritableUtils; + +/** + * Represents a frequency count. + */ +public class Frequency implements WritableComparable { + + // The value. + private long value; + + public Frequency() {} + + public Frequency(long value) { + this.value = value; + } + + /** + * Return the value of this {@link Frequency}. + * + * @return the frequency + */ + public long getValue() { + return value; + } + + /** + * Increment the value of this {@link Frequency} by the given addend. 
+ * + * @param addend + * the addend to add + */ + public void increment(long addend) { + this.value += addend; + } + + @Override + public void write(DataOutput dataOutput) throws IOException { + WritableUtils.writeVLong(dataOutput, value); + } + + @Override + public void readFields(DataInput dataInput) throws IOException { + value = WritableUtils.readVLong(dataInput); + } + + @Override + public int compareTo(Frequency o) { + return Long.compare(this.value, o.value); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Frequency frequency = (Frequency) o; + return value == frequency.value; + } + + @Override + public int hashCode() { + return Objects.hash(value); + } + + @Override + public String toString() { + return Long.toString(value); + } +} diff --git a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java index d844fe77..873197b7 100644 --- a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java +++ b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java @@ -30,6 +30,7 @@ import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.user.RegExFilter; +import org.apache.accumulo.core.iterators.user.SummingCombiner; import org.apache.accumulo.core.security.Authorizations; import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.io.Text; @@ -56,7 +57,9 @@ import datawave.data.type.Type; import datawave.query.composite.CompositeMetadata; import datawave.query.composite.CompositeMetadataHelper; +import datawave.query.model.DateFrequencyMap; import datawave.query.model.FieldIndexHole; +import datawave.query.model.Frequency; import datawave.security.util.AuthorizationsMinimizer; import datawave.security.util.ScannerHelper; import datawave.util.time.DateHelper; @@ -692,26 +695,35 @@ protected 
HashMap getCountsByFieldInDayWithTypes(Entry (aggregated entries) and/or \0 (non-aggregated entries). + // Filter out any non-aggregated entries that does not have the date in the column qualifier. IteratorSetting cqRegex = new IteratorSetting(50, RegExFilter.class); - RegExFilter.setRegexs(cqRegex, null, null, ".*\u0000" + date, null, false); + // Allow any entries that do not contain the null byte delimiter, or contain it with the target date directly afterwards. + RegExFilter.setRegexs(cqRegex, null, null, "^((?!\u0000).)*$|^(.*\u0000" + date + ")$", null, false); scanner.addScanIterator(cqRegex); - final Text holder = new Text(); final HashMap datatypeToCounts = Maps.newHashMap(); - for (Entry countEntry : scanner) { - ByteArrayInputStream bais = new ByteArrayInputStream(countEntry.getValue().get()); - DataInputStream inputStream = new DataInputStream(bais); - - Long sum = WritableUtils.readVLong(inputStream); - - countEntry.getKey().getColumnQualifier(holder); - int offset = holder.find(NULL_BYTE); - - Preconditions.checkArgument(-1 != offset, "Could not find nullbyte separator in column qualifier for: " + countEntry.getKey()); - - String datatype = Text.decode(holder.getBytes(), 0, offset); + for (Entry entry : scanner) { + Text colq = entry.getKey().getColumnQualifier(); + int nullBytePos = colq.find(NULL_BYTE); - datatypeToCounts.put(datatype, sum); + // If the null byte is not present in the colq, this is an aggregated entry. The colq consists solely of the datatype, and the value is a + // DateFrequencyMap. + if (nullBytePos == -1) { + String datatype = Text.decode(colq.getBytes()); + DateFrequencyMap map = new DateFrequencyMap(entry.getValue().get()); + // If a count is present for the target date, merge in the sum. + if (map.contains(date)) { + long count = map.get(date).getValue(); + datatypeToCounts.merge(datatype, count, Long::sum); + } + } else { + // If the null byte is present, this is an entry that hasn't been compacted yet. 
The colq consists of the datatype and date, and the value is a + // long. + String datatype = Text.decode(colq.getBytes(), 0, nullBytePos); + Long count = SummingCombiner.VAR_LEN_ENCODER.decode(entry.getValue().get()); + datatypeToCounts.merge(datatype, count, Long::sum); + } } return datatypeToCounts; @@ -1187,16 +1199,18 @@ private static class FieldIndexHoleFinder { * * @return the field index holes * @throws IOException + * if an exception occurs when decoding a {@link Value} */ - Map> findHoles() throws IOException { + private Map> findHoles() throws IOException { String prevFieldName = null; Text prevColumnFamily = null; String currFieldName; String currDatatype; Text currColumnFamily; - Date currDate; - Long currCount; + Date currDate = null; + Value currentValue; + boolean isCurrentAggregated; for (Map.Entry entry : scanner) { // Parse the current row. @@ -1206,24 +1220,31 @@ Map> findHoles() throws IOException { String cq = key.getColumnQualifier().toString(); int offset = cq.indexOf(NULL_BYTE); - currDatatype = cq.substring(0, offset); + isCurrentAggregated = offset == -1; + // If the current entry is an aggregated entry, the colq consists solely of the datatype. + if (isCurrentAggregated) { + currDatatype = cq; + } else { + // Otherwise, the colq consists of the datatype and a date. + currDatatype = cq.substring(0, offset); + currDate = DateHelper.parse(cq.substring((offset + 1))); + } // Check if the current field and datatype are part of the fields and datatypes we want to retrieve field index holes for. 
if (!isPartOfTarget(currFieldName, currDatatype)) { continue; } - currDate = DateHelper.parse(cq.substring((offset + 1))); - - ByteArrayInputStream byteStream = new ByteArrayInputStream(entry.getValue().get()); - DataInputStream inputStream = new DataInputStream(byteStream); - currCount = WritableUtils.readVLong(inputStream); + currentValue = entry.getValue(); // If this is the very first entry we've looked at, update our tracking variables, add the current entry to the target map, and continue to the - // next - // entry. + // next entry. if (prevFieldName == null) { - addToTargetMap(currDatatype, currDate, currCount); + if (isCurrentAggregated) { + addToTargetMap(currDatatype, currentValue); + } else { + addToTargetMap(currDatatype, currDate, currentValue); + } prevFieldName = currFieldName; prevColumnFamily = currColumnFamily; @@ -1237,8 +1258,7 @@ Map> findHoles() throws IOException { // In both cases, record the last entry, and begin collecting date ranges for the next batch of related rows. if (!prevColumnFamily.equals(currColumnFamily)) { // The column family is "f". We have collected the date ranges for all datatypes for the previous field name. Get the field index holes for - // the - // previously collected data. + // the previously collected data. if (currColumnFamily.equals(ColumnFamilyConstants.COLF_F)) { // Find and add all field index holes for the current frequency and index entries. findFieldIndexHoles(prevFieldName); @@ -1250,9 +1270,6 @@ Map> findHoles() throws IOException { // The current column family is the target index column family. Set the target map to the index map. this.targetMap = indexMap; } - - // Add the current entry to the target entry map. - addToTargetMap(currDatatype, currDate, currCount); } else { // The column family is the same. We have two possible scenarios: // - A row with a field that is different to the previous field. 
@@ -1264,14 +1281,16 @@ Map> findHoles() throws IOException { findFieldIndexHoles(prevFieldName); // Clear the entry maps. clearEntryMaps(); - // Add the current entry to the target entry map. - addToTargetMap(currDatatype, currDate, currCount); - } else { - // The current row has the same field. Add the current entry to the target map. - addToTargetMap(currDatatype, currDate, currCount); } } + // Add the current entry to the target entry map. + if (isCurrentAggregated) { + addToTargetMap(currDatatype, currentValue); + } else { + addToTargetMap(currDatatype, currDate, currentValue); + } + // Set the values for our prev entry to the current entry. prevFieldName = currFieldName; prevColumnFamily = currColumnFamily; @@ -1294,9 +1313,19 @@ private boolean isPartOfTarget(String field, String datatype) { /** * Add the current date and count to the current target map for the current datatype. */ - private void addToTargetMap(String datatype, Date date, Long count) { + private void addToTargetMap(String datatype, Date date, Value value) { + Long count = SummingCombiner.VAR_LEN_ENCODER.decode(value.get()); SortedMap datesToCounts = targetMap.computeIfAbsent(datatype, (k) -> new TreeMap<>()); - datesToCounts.put(date, count); + datesToCounts.merge(date, count, Long::sum); + } + + private void addToTargetMap(String datatype, Value value) throws IOException { + DateFrequencyMap map = new DateFrequencyMap(value.get()); + SortedMap datesToCounts = targetMap.computeIfAbsent(datatype, (k) -> new TreeMap<>()); + for (Entry entry : map.entrySet()) { + Date date = DateHelper.parse(entry.getKey()); + datesToCounts.merge(date, entry.getValue().getValue(), Long::sum); + } } /** diff --git a/src/main/java/datawave/query/util/MetadataHelper.java b/src/main/java/datawave/query/util/MetadataHelper.java index 50d791c7..c67a881b 100644 --- a/src/main/java/datawave/query/util/MetadataHelper.java +++ b/src/main/java/datawave/query/util/MetadataHelper.java @@ -1,6 +1,7 @@ package 
datawave.query.util; import java.io.ByteArrayInputStream; +import java.io.Console; import java.io.DataInputStream; import java.io.IOException; import java.nio.charset.CharacterCodingException; @@ -43,6 +44,7 @@ import org.apache.accumulo.core.security.Authorizations; import org.apache.accumulo.core.security.ColumnVisibility; import org.apache.commons.lang3.time.DateUtils; +import org.apache.hadoop.fs.shell.Concat; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; import org.slf4j.Logger; @@ -70,9 +72,11 @@ import datawave.iterators.filter.EdgeMetadataCQStrippingIterator; import datawave.marking.MarkingFunctions; import datawave.query.composite.CompositeMetadata; +import datawave.query.model.DateFrequencyMap; import datawave.query.model.Direction; import datawave.query.model.FieldIndexHole; import datawave.query.model.FieldMapping; +import datawave.query.model.Frequency; import datawave.query.model.ModelKeyParser; import datawave.query.model.QueryModel; import datawave.security.util.AuthorizationsMinimizer; @@ -1099,34 +1103,42 @@ public Set getContentFields(Set ingestTypeFilter) throws TableNo } /** - * Sum all of the frequency counts for a field between a start and end date (inclusive) + * Return the sum of all frequency counts for a field between a start and end date (inclusive). * * @param fieldName + * the field name * @param begin + * the start date * @param end - * @return + * the end date + * @return the sum * @throws TableNotFoundException + * if the metadata table could not be found */ public long getCardinalityForField(String fieldName, Date begin, Date end) throws TableNotFoundException { return getCardinalityForField(fieldName, null, begin, end); } /** - * Sum all of the frequency counts for a field in a datatype between a start and end date (inclusive) + * Return the sum of all frequency counts for a field in a datatype between a start and end date (inclusive). 
* * @param fieldName + * the field name * @param datatype + * the datatype * @param begin + * the start date * @param end - * @return + * the end date + * @return the sum * @throws TableNotFoundException + * if the metadata table could not be found */ public long getCardinalityForField(String fieldName, String datatype, Date begin, Date end) throws TableNotFoundException { log.trace("getCardinalityForField from table: " + metadataTableName); Text row = new Text(fieldName.toUpperCase()); - // Get all the rows in DatawaveMetadata for the field, only in the 'f' - // colfam + // Get all the rows in DatawaveMetadata for the field, only in the 'f' column family. Scanner bs = ScannerHelper.createScanner(accumuloClient, metadataTableName, auths); Key startKey = new Key(row); @@ -1138,10 +1150,13 @@ public long getCardinalityForField(String fieldName, String datatype, Date begin for (Entry entry : bs) { Text colq = entry.getKey().getColumnQualifier(); + // Check for the presence of a null byte in the colq. If present, we have a non-aggregated entry with a Long value. If not present, we have an + // aggregated entry with a DateFrequencyMap value. int index = colq.find(NULL_BYTE); + + // If a null byte is present in the colq, this is a non-aggregated entry. if (index != -1) { - // If we were given a non-null datatype - // Ensure that we process records only on that type + // If a datatype was specified, sum the count only if the current datatype matches. if (null != datatype) { try { String type = Text.decode(colq.getBytes(), 0, index); @@ -1154,14 +1169,12 @@ public long getCardinalityForField(String fieldName, String datatype, Date begin } } - // Parse the date to ensure that we want this record - String dateStr = "null"; - Date date; + // Parse the date to ensure that we want this record. 
+ String dateStr = null; + try { + dateStr = Text.decode(colq.getBytes(), index + 1, colq.getLength() - (index + 1)); - date = DateHelper.parse(dateStr); - // Add the provided count if we fall within begin and end, - // inclusive + Date date = DateHelper.parse(dateStr); + // Add the provided count if we fall within begin and end, inclusively. if (date.compareTo(begin) >= 0 && date.compareTo(end) <= 0) { count += SummingCombiner.VAR_LEN_ENCODER.decode(entry.getValue().get()); } @@ -1172,11 +1185,26 @@ public long getCardinalityForField(String fieldName, String datatype, Date begin } catch (DateTimeParseException e) { log.warn("Could not convert date string: " + dateStr); } + } else { + // If a datatype was specified, sum the counts only if the current datatype matches. + if (datatype != null) { + String type = colq.toString(); + if (!type.equals(datatype)) { + continue; + } + } + try { + DateFrequencyMap map = new DateFrequencyMap(entry.getValue().get()); + // Fetch all entries within the target date range and sum the counts. + long sum = map.subMap(DateHelper.format(begin), DateHelper.format(end)).values().stream().mapToLong(Frequency::getValue).sum(); + count += sum; + } catch (IOException e) { + log.debug("Could not convert the value to a " + DateFrequencyMap.class.getSimpleName()); + } } } bs.close(); - return count; } @@ -1311,11 +1339,14 @@ protected HashMap getCountsByFieldInDayWithTypes(String fieldName, scanner.fetchColumnFamily(ColumnFamilyConstants.COLF_F); scanner.setRange(Range.exact(fieldName)); + // It's possible to find rows with column qualifiers in the format <datatype> (aggregated entries) and/or <datatype>\0<date> (non-aggregated + // entries). + // Filter out any non-aggregated entries that do not have the date in the column qualifier.
IteratorSetting cqRegex = new IteratorSetting(50, RegExFilter.class); - RegExFilter.setRegexs(cqRegex, null, null, ".*\u0000" + date, null, false); + // Allow any entries that do not contain the null byte delimiter, or contain it with the target date directly afterwards. + RegExFilter.setRegexs(cqRegex, null, null, "^((?!\u0000).)*$|^(.*\u0000" + date + ")$", null, false); scanner.addScanIterator(cqRegex); - final Text holder = new Text(); for (Entry entry : scanner) { // if this is the real connector, and wrapped connector is not null, it means // that we didn't get a hit in the cache. So, we will update the cache with the @@ -1324,19 +1355,27 @@ protected HashMap getCountsByFieldInDayWithTypes(String fieldName, writer = updateCache(entry, writer, wrappedClient); } - ByteArrayInputStream bais = new ByteArrayInputStream(entry.getValue().get()); - DataInputStream inputStream = new DataInputStream(bais); - - Long sum = WritableUtils.readVLong(inputStream); + Text colq = entry.getKey().getColumnQualifier(); + int nullBytePos = colq.find(NULL_BYTE); - entry.getKey().getColumnQualifier(holder); - int offset = holder.find(NULL_BYTE); - - Preconditions.checkArgument(-1 != offset, "Could not find nullbyte separator in column qualifier for: " + entry.getKey()); - - String datatype = Text.decode(holder.getBytes(), 0, offset); - - datatypeToCounts.put(datatype, sum); + // If the null byte is not present in the colq, this is an aggregated entry. The colq consists solely of the datatype, and the value is a + // DateFrequencyMap. + if (nullBytePos == -1) { + String datatype = Text.decode(colq.getBytes()); + DateFrequencyMap map = new DateFrequencyMap(entry.getValue().get()); + // If a count is present for the target date, merge in the sum. + if (map.contains(date)) { + long count = map.get(date).getValue(); + datatypeToCounts.merge(datatype, count, Long::sum); + } + } else { + // If the null byte is present, this is an entry that hasn't been compacted yet. 
The colq consists of the datatype and date, and the value + // is a + // long. + String datatype = Text.decode(colq.getBytes(), 0, nullBytePos); + Long count = SummingCombiner.VAR_LEN_ENCODER.decode(entry.getValue().get()); + datatypeToCounts.merge(datatype, count, Long::sum); + } } } finally { if (writer != null) { @@ -1368,8 +1407,11 @@ public Date getEarliestOccurrenceOfFieldWithType(String fieldName, final String return date; } - protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final String dataType, AccumuloClient client, WrappedAccumuloClient wrappedClient) { - String dateString = null; + protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final String dataTypeFilter, AccumuloClient client, + WrappedAccumuloClient wrappedClient) { + String prevDatatype = null; + boolean prevEntryAggregated = false; + String earliestDate = null; BatchWriter writer = null; try { @@ -1377,37 +1419,74 @@ protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final Stri scanner.fetchColumnFamily(ColumnFamilyConstants.COLF_F); scanner.setRange(Range.exact(fieldName)); - // if a type was specified, add a regex filter for it - if (dataType != null) { + // It's possible to find rows with column qualifiers in the format <datatype> (aggregated entries) and/or <datatype>\0<date> + // (non-aggregated entries). Filter out any entries that do not match the datatype filter. + if (dataTypeFilter != null) { IteratorSetting cqRegex = new IteratorSetting(50, RegExFilter.class); - RegExFilter.setRegexs(cqRegex, null, null, dataType + "\u0000.*", null, false); + // Allow any entries that match the datatype exactly, or contain it with a null byte afterwards.
+ RegExFilter.setRegexs(cqRegex, null, null, "^" + dataTypeFilter + "$|^(" + dataTypeFilter + "\u0000.*" + ")$", null, false); scanner.addScanIterator(cqRegex); } try { - final Text holder = new Text(); for (Entry entry : scanner) { - // if this is the real connector, and wrapped connector is not null, it means - // that we didn't get a hit in the cache. So, we will update the cache with the - // entries from the real table + // if this is the real connector, and wrapped connector is not null, it means that we didn't get a hit in the cache. So, we will update the + // cache with the entries from the real table. if (wrappedClient != null && client == wrappedClient.getReal()) { writer = updateCache(entry, writer, wrappedClient); } - entry.getKey().getColumnQualifier(holder); - int startPos = holder.find(NULL_BYTE) + 1; + String colq = entry.getKey().getColumnQualifier().toString(); + int nullBytePos = colq.indexOf(NULL_BYTE); - if (0 == startPos) { - log.trace("Could not find nullbyte separator in column qualifier for: " + entry.getKey()); - } else if ((holder.getLength() - startPos) <= 0) { - log.trace("Could not find date to parse in column qualifier for: " + entry.getKey()); - } else { + // If the null byte is not present in the colq, this is an aggregated entry. The colq consists solely of the datatype, and the value is a + // DateFrequencyMap. + if (nullBytePos == -1) { + // If a datatype filter was not specified, track/update the current datatype. + if (dataTypeFilter == null) { + if (prevDatatype == null || !prevDatatype.equals(colq)) { + prevDatatype = colq; + } + } try { - dateString = Text.decode(holder.getBytes(), startPos, holder.getLength() - startPos); + // The value is a DateFrequencyMap. Fetch the earliest date from it. 
+ DateFrequencyMap map = new DateFrequencyMap(entry.getValue().get()); + String earliestKey = map.earliestDate(); + // If the earliest date has not been set yet, or the previous earliest date value is later than the current date, update it. + if (earliestDate == null || earliestKey.compareTo(earliestDate) < 0) { + earliestDate = earliestKey; + } + } catch (IOException e) { + log.trace("Could not parse DateFrequencyMap from value for " + entry.getKey()); + } + // Mark that we saw an aggregated entry last. + prevEntryAggregated = true; + } else { + // If a datatype filter was specified, we only need to check the date of the first non-aggregated entry we see. + if (dataTypeFilter != null) { + String date = colq.substring((nullBytePos + 1)); + // If the earliest date has not been set yet, or the previous earliest date from an aggregated entry is later than the current date, + // we have found the earliest date. + if (earliestDate == null || date.compareTo(earliestDate) < 0) { + earliestDate = date; + } + break; - } catch (CharacterCodingException e) { - log.trace("Unable to decode date string for: " + entry.getKey().getColumnQualifier()); + } else { + // If a datatype filter was not specified, we need to check for a possible earlier date if either the previous entry was an aggregated + // entry, or if the current datatype differs from the previous datatype. + String datatype = colq.substring(0, nullBytePos); + if (prevEntryAggregated || prevDatatype == null || !prevDatatype.equals(datatype)) { + String date = colq.substring((nullBytePos + 1)); + // If the earliest date has not been set yet, or the previous earliest date value is later than the current date, update it. + if (earliestDate == null || date.compareTo(earliestDate) < 0) { + earliestDate = date; + } + } + // Update the last datatype seen. + prevDatatype = datatype; + } + // Mark that we saw a non-aggregated entry last. 
+ prevEntryAggregated = false; } } } finally { @@ -1425,11 +1504,11 @@ protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final Stri } } + // Parse and return the date. Date date = null; - if (dateString != null) { - date = DateHelper.parse(dateString); + if (earliestDate != null) { + date = DateHelper.parse(earliestDate); } - return date; } @@ -1649,5 +1728,4 @@ public static void basicIterator(AccumuloClient client, String tableName, Collec public String getMetadataTableName() { return metadataTableName; } - } diff --git a/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java b/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java new file mode 100644 index 00000000..4ed7094c --- /dev/null +++ b/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java @@ -0,0 +1,459 @@ +package datawave.iterators; + +import static datawave.data.ColumnFamilyConstants.COLF_F; +import static datawave.data.ColumnFamilyConstants.COLF_I; +import static datawave.data.ColumnFamilyConstants.COLF_RI; +import static datawave.query.util.TestUtils.createDateFrequencyMap; + +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.IteratorSetting; +import org.apache.accumulo.core.client.Scanner; +import org.apache.accumulo.core.client.TableNotFoundException; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Mutation; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.LongCombiner; +import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.core.security.ColumnVisibility; +import 
org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableUtils; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import datawave.accumulo.inmemory.InMemoryAccumuloClient; +import datawave.accumulo.inmemory.InMemoryInstance; +import datawave.query.model.DateFrequencyMap; +import datawave.query.util.TestUtils; +import datawave.security.util.ScannerHelper; + +public class FrequencyMetadataAggregatorTest { + + private static final String TABLE_METADATA = "metadata"; + private static final String[] AUTHS = {"FOO", "BAR", "COB"}; + private static final Set AUTHS_SET = Collections.singleton(new Authorizations(AUTHS)); + private static final String NULL_BYTE = "\0"; + + private AccumuloClient accumuloClient; + private Boolean combineColumnVisibilities; + private final List> expected = new ArrayList<>(); + private final List mutations = new ArrayList<>(); + + @BeforeAll + static void beforeAll() throws URISyntaxException { + File dir = new File(Objects.requireNonNull(ClassLoader.getSystemClassLoader().getResource(".")).toURI()); + File targetDir = dir.getParentFile(); + System.setProperty("hadoop.home.dir", targetDir.getAbsolutePath()); + } + + @BeforeEach + public void setUp() throws Exception { + accumuloClient = new InMemoryAccumuloClient("root", new InMemoryInstance(FrequencyMetadataAggregatorTest.class.toString())); + if (!accumuloClient.tableOperations().exists(TABLE_METADATA)) { + accumuloClient.tableOperations().create(TABLE_METADATA); + } + } + + @AfterEach + public void tearDown() throws Exception { + accumuloClient.tableOperations().deleteRows(TABLE_METADATA, null, null); + combineColumnVisibilities = null; + expected.clear(); + } + + /** + * Verify that aggregation of entries for the columns "f", "i", and "ri" in their non-aggregated format (e.g. 
when they're initially ingested) are + * aggregated correctly. + */ + @Test + void testDifferingColumnFamilies() throws TableNotFoundException, IOException { + // "f" rows. + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000004L, "20200102", 2L); // Latest timestamp. + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200103", 3L); + + // "i" rows. + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000004L, "20200102", 2L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000003L, "20200103", 3L); + + // "ri" rows. + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000003L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000004L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000000L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000004L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000005L, "20200102", 2L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000001L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000003L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000004L, "20200103", 3L); + + expect("NAME", COLF_F, "csv", "FOO", 1500000004L, createDateFrequencyMap("20200101", 4L, "20200102", 10L, "20200103", 12L)); + expect("NAME", COLF_I, "csv", "FOO", 1500000004L, createDateFrequencyMap("20200101", 3L, "20200102", 8L, "20200103", 9L)); + expect("NAME", COLF_RI, "csv", "FOO", 1500000005L, createDateFrequencyMap("20200101", 5L, "20200102", 12L, "20200103", 15L)); + + assertResults(); + } + + /** + * Verify that entries with the same name, column family, and column visibility are separated by their datatype. + */ + @Test + void testDifferingDatatypes() throws TableNotFoundException, IOException { + // Datatype "csv". + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000004L, "20200102", 2L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200103", 3L); + + // Datatype "wiki". + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000000L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000001L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000002L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000003L, "20200101", 3L); // Latest timestamp. + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000000L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000001L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000002L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000003L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000000L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000001L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000002L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000003L, "20200103", 2L); + + // Datatype "text". + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000000L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000001L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000002L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000015L, "20200102", 3L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000000L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000001L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000002L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000003L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000000L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000001L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000002L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000003L, "20200104", 4L); + + expect("NAME", COLF_F, "csv", "FOO", 1500000004L, createDateFrequencyMap("20200101", 4L, "20200102", 10L, "20200103", 12L)); + expect("NAME", COLF_F, "text", "FOO", 1500000015L, createDateFrequencyMap("20200102", 12L, "20200103", 4L, "20200104", 16L)); + expect("NAME", COLF_F, "wiki", "FOO", 1500000003L, createDateFrequencyMap("20200101", 12L, "20200102", 4L, "20200103", 8L)); + + assertResults(); + } + + /** + * Verify that when entries for the same field, column family, datatype, and date are aggregated, that the aggregated entries are still separated by their + * column visibility by default. + */ + @Test + public void testDifferingColumnVisibilities() throws TableNotFoundException, IOException { + // Column visibility "FOO". 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000004L, "20200102", 2L); // Latest timestamp. + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200103", 3L); + + // Column visibility "BAR". + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000000L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000001L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000002L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000003L, "20200101", 3L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000000L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000001L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000002L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000003L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000000L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000001L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000002L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000003L, "20200103", 2L); + + // Column visibility "COB". + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000000L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000001L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000002L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000015L, "20200102", 3L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000000L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000001L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000002L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000003L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000000L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000001L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000002L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000003L, "20200104", 4L); + + expect("NAME", COLF_F, "csv", "BAR", 1500000003L, createDateFrequencyMap("20200101", 12L, "20200102", 4L, "20200103", 8L)); + expect("NAME", COLF_F, "csv", "COB", 1500000015L, createDateFrequencyMap("20200102", 12L, "20200103", 4L, "20200104", 16L)); + expect("NAME", COLF_F, "csv", "FOO", 1500000004L, createDateFrequencyMap("20200101", 4L, "20200102", 10L, "20200103", 12L)); + + assertResults(); + } + + /** + * Verify that when the iterator option {@link FrequencyMetadataAggregator#COMBINE_VISIBILITIES} is set to true, entries with same field, column family, + * datatype, and date are aggregated and their column visibilities are combined. + */ + @Test + public void testCombiningColumnVisibilities() throws TableNotFoundException, IOException { + // Column visibility "FOO". 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000004L, "20200102", 2L); // Latest timestamp. + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200103", 3L); + + // Column visibility "BAR". + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000000L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000001L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000002L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000003L, "20200101", 3L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000000L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000001L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000002L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000003L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000000L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000001L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000002L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000003L, "20200103", 2L); + + // Column visibility "COB". + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000000L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000001L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000002L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000015L, "20200102", 3L); // Latest timestamp. + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000000L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000001L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000002L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000003L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000000L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000001L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000002L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000003L, "20200104", 4L); + + // Enable the option to combine visibilities. 
+ givenCombineColumnVisibilitiesIsTrue(); + + expect("NAME", COLF_F, "csv", "BAR&COB&FOO", 1500000015L, createDateFrequencyMap("20200101", 16L, "20200102", 26L, "20200103", 24L, "20200104", 16L)); + + assertResults(); + } + + /** + * Verify that aggregating non-aggregated entries into a previously-aggregated row works correctly. + */ + @Test + void testAggregatedAndNonAggregatedEntries() throws TableNotFoundException, IOException { + // Aggregated entry. + givenAggregatedRow("NAME", COLF_F, "csv", "FOO", 1499999999L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + + // Non-aggregated entry. + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000004L, "20200102", 2L); // Latest timestamp. + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200103", 3L); + + expect("NAME", COLF_F, "csv", "FOO", 1500000004L, createDateFrequencyMap("20191225", 40L, "20200101", 19L, "20200102", 30L, "20200103", 12L)); + + assertResults(); + } + + /** + * Verify that entries not requiring any aggregation are not modified. 
+ */ + @Test + void testNoAggregationNeeded() throws TableNotFoundException, IOException { + givenAggregatedRow("NAME", COLF_F, "csv", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + givenAggregatedRow("NAME", COLF_I, "csv", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + givenAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + givenAggregatedRow("NAME", COLF_F, "text", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); + givenAggregatedRow("NAME", COLF_I, "text", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); + givenAggregatedRow("NAME", COLF_RI, "text", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); + givenAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); + givenAggregatedRow("NAME", COLF_I, "wiki", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); + givenAggregatedRow("NAME", COLF_RI, "wiki", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); + givenAggregatedRow("GENDER", COLF_F, "attr", "BAR", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + givenAggregatedRow("GENDER", COLF_I, "attr", "BAR", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + givenAggregatedRow("GENDER", COLF_RI, "attr", "BAR", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + givenAggregatedRow("GENDER", COLF_F, "attr", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + givenAggregatedRow("GENDER", COLF_I, "attr", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + 
givenAggregatedRow("GENDER", COLF_RI, "attr", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + + expect("GENDER", COLF_F, "attr", "BAR", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_F, "attr", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_I, "attr", "BAR", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_I, "attr", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_RI, "attr", "BAR", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_RI, "attr", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("NAME", COLF_F, "csv", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + expect("NAME", COLF_F, "text", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); + expect("NAME", COLF_F, "wiki", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); + expect("NAME", COLF_I, "csv", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + expect("NAME", COLF_I, "text", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); + expect("NAME", COLF_I, "wiki", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); + expect("NAME", COLF_RI, "csv", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + expect("NAME", COLF_RI, "text", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); + expect("NAME", COLF_RI, "wiki", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); + + 
assertResults(); + } + + /** + * Test aggregation over a more diverse dataset of mixed aggregated and non-aggregated rows. + */ + @Test + void testDiverseDataset() throws TableNotFoundException, IOException { + givenAggregatedRow("AGE", COLF_F, "num", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "num", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("GENDER", COLF_F, "text", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_F, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_I, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + givenNonAggregatedRow("AGE", COLF_F, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "var", "BAR", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new datatype. + givenNonAggregatedRow("GENDER", COLF_F, "text", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new colvis. 
+ givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_I, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + + expect("AGE", COLF_F, "lifetime", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "num", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "var", "BAR", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("AGE", COLF_I, "lifetime", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_I, "num", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("GENDER", COLF_F, "text", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("GENDER", COLF_F, "text", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", COLF_F, "attr", "FOO", 1500000004L, createDateFrequencyMap("20200101", 3L)); + expect("JOB", COLF_I, "attr", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("NAME", COLF_F, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("NAME", COLF_I, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + assertResults(); + } + + private void assertResults() throws TableNotFoundException, IOException { + TestUtils.writeMutations(accumuloClient, TABLE_METADATA, mutations); + Scanner scanner = createScanner(); + 
List> actual = new ArrayList<>(); + for (Map.Entry entry : scanner) { + actual.add(new AbstractMap.SimpleEntry<>(entry.getKey(), new DateFrequencyMap(entry.getValue().get()))); + } + Assertions.assertEquals(expected, actual); + } + + private Scanner createScanner() throws TableNotFoundException { + Scanner scanner = ScannerHelper.createScanner(accumuloClient, TABLE_METADATA, AUTHS_SET); + scanner.setRange(new Range()); + + scanner.fetchColumnFamily(COLF_F); + scanner.fetchColumnFamily(COLF_I); + scanner.fetchColumnFamily(COLF_RI); + + IteratorSetting iteratorSetting = new IteratorSetting(10, FrequencyMetadataAggregator.class); + if (combineColumnVisibilities != null) { + iteratorSetting.addOption(FrequencyMetadataAggregator.COMBINE_VISIBILITIES, String.valueOf(combineColumnVisibilities)); + } + scanner.addScanIterator(iteratorSetting); + + return scanner; + } + + private void givenCombineColumnVisibilitiesIsTrue() { + this.combineColumnVisibilities = true; + } + + private void givenNonAggregatedRow(String row, Text colf, String datatype, String colv, long timestamp, String date, long count) { + givenMutation(row, colf, datatype + NULL_BYTE + date, colv, timestamp, new Value(LongCombiner.VAR_LEN_ENCODER.encode(count))); + } + + private void givenAggregatedRow(String row, Text colf, String datatype, String colv, long timestamp, DateFrequencyMap map) { + givenMutation(row, colf, datatype, colv, timestamp, new Value(WritableUtils.toByteArray(map))); + } + + private void givenMutation(String row, Text colf, String colq, String colv, long timestamp, Value value) { + Mutation mutation = new Mutation(row); + mutation.put(colf, new Text(colq), new ColumnVisibility(colv), timestamp, value); + this.mutations.add(mutation); + } + + private void expect(String row, Text colf, String colq, String colv, long timestamp, DateFrequencyMap map) { + expected.add(new AbstractMap.SimpleEntry<>(new Key(new Text(row), colf, new Text(colq), new ColumnVisibility(colv), timestamp), map)); + } 
+} diff --git a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java index 0e0e3feb..94f2f90f 100644 --- a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java +++ b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java @@ -1,11 +1,17 @@ package datawave.query.util; +import static datawave.data.ColumnFamilyConstants.COLF_F; +import static datawave.query.util.TestUtils.createDateFrequencyMap; +import static datawave.query.util.TestUtils.createRangedDateFrequencyMap; +import static datawave.query.util.TestUtils.getDatesInRange; +import static org.apache.accumulo.core.iterators.LongCombiner.VAR_LEN_ENCODER; + import java.io.File; import java.io.IOException; import java.net.URISyntaxException; +import java.util.AbstractMap; import java.util.ArrayList; import java.util.Calendar; -import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.HashMap; @@ -19,21 +25,20 @@ import org.apache.accumulo.core.client.AccumuloClient; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; -import org.apache.accumulo.core.client.BatchWriter; -import org.apache.accumulo.core.client.BatchWriterConfig; -import org.apache.accumulo.core.client.MutationsRejectedException; import org.apache.accumulo.core.client.TableExistsException; import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.accumulo.core.data.Mutation; import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.iterators.user.SummingCombiner; import org.apache.accumulo.core.security.Authorizations; import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import 
org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; @@ -43,6 +48,7 @@ import datawave.accumulo.inmemory.InMemoryAccumuloClient; import datawave.accumulo.inmemory.InMemoryInstance; import datawave.query.composite.CompositeMetadataHelper; +import datawave.query.model.DateFrequencyMap; import datawave.query.model.FieldIndexHole; import datawave.util.time.DateHelper; @@ -54,6 +60,8 @@ class AllFieldMetadataHelperTest { private AccumuloClient accumuloClient; private AllFieldMetadataHelper helper; + private final List mutations = new ArrayList<>(); + @BeforeAll static void beforeAll() throws URISyntaxException { File dir = new File(Objects.requireNonNull(ClassLoader.getSystemClassLoader().getResource(".")).toURI()); @@ -70,6 +78,7 @@ void setUp() throws AccumuloSecurityException, AccumuloException, TableExistsExc if (!accumuloClient.tableOperations().exists(TABLE_METADATA)) { accumuloClient.tableOperations().create(TABLE_METADATA); } + final Set allMetadataAuths = Collections.emptySet(); final Set auths = Collections.singleton(new Authorizations(AUTHS)); TypeMetadataHelper typeMetadataHelper = new TypeMetadataHelper(Maps.newHashMap(), allMetadataAuths, accumuloClient, TABLE_METADATA, auths, false); @@ -83,32 +92,134 @@ void setUp() throws AccumuloSecurityException, AccumuloException, TableExistsExc @AfterEach void tearDown() throws AccumuloException, TableNotFoundException, AccumuloSecurityException { accumuloClient.tableOperations().deleteRows(TABLE_METADATA, null, null); + this.mutations.clear(); } /** * Write the given mutations to the metadata table. 
*/ - private void writeMutations(Collection mutations) { - BatchWriterConfig config = new BatchWriterConfig(); - config.setMaxMemory(0); - try (BatchWriter writer = accumuloClient.createBatchWriter(TABLE_METADATA, config)) { - writer.addMutations(mutations); - writer.flush(); - } catch (MutationsRejectedException | TableNotFoundException e) { - throw new RuntimeException(e); - } + private void writeMutations() { + TestUtils.writeMutations(accumuloClient, TABLE_METADATA, mutations); + } + + private void givenNonAggregatedFrequencyRows(String row, String colf, String datatype, String startDate, String endDate, long count) { + givenNonAggregatedFrequencyRows(row, new Text(colf), datatype, startDate, endDate, count); + } + + private void givenNonAggregatedFrequencyRows(String row, Text colf, String datatype, String startDate, String endDate, long count) { + Mutation mutation = new Mutation(row); + Value value = new Value(VAR_LEN_ENCODER.encode(count)); + List dates = TestUtils.getDatesInRange(startDate, endDate); + dates.forEach((date) -> mutation.put(colf, new Text(datatype + NULL_BYTE + date), value)); + givenMutation(mutation); + } + + private void givenAggregatedFrequencyRow(String row, String colf, String datatype, DateFrequencyMap map) { + givenAggregatedFrequencyRow(row, new Text(colf), datatype, map); + } + + private void givenAggregatedFrequencyRow(String row, Text colf, String datatype, DateFrequencyMap map) { + Mutation mutation = new Mutation(row); + Value value = new Value(WritableUtils.toByteArray(map)); + mutation.put(colf, new Text(datatype), value); + givenMutation(mutation); + } + + private void givenMutation(Mutation mutation) { + this.mutations.add(mutation); } /** - * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and - * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)}. + * Tests for {@link AllFieldMetadataHelper#getCountsByFieldInDayWithTypes(Map.Entry)}. 
*/ @Nested - public class FieldIndexHoleTests { + public class CountsByFieldInDayWithTypesTests { + + /** + * Test against a table that has only non-aggregated entries as matches. + */ + @Test + void testNonAggregatedEntriesOnly() throws TableNotFoundException, IOException { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 2L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200120", 3L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200101", "20200102", 3L); // Does not contain target date. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 1L); + expected.put("wiki", 2L); + expected.put("maze", 3L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes(new AbstractMap.SimpleEntry<>("NAME", "20200110")); + + Assertions.assertEquals(expected, actual); + } + + /** + * Test against a table that has only aggregated entries as matches. + */ + @Test + void testAggregatedEntriesOnly() throws TableNotFoundException, IOException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); // Does not contain target date. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 5L); + expected.put("wiki", 15L); + expected.put("maze", 55L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes(new AbstractMap.SimpleEntry<>("NAME", "20200102")); + + Assertions.assertEquals(expected, actual); + } + + /** + * Test against a table that has both aggregated and non-aggregated entries as matches. + */ + @Test + void testMixedEntryFormats() throws TableNotFoundException, IOException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); // Should get summed into previous. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200115", "20200120", 3L); // Does not have entry for 20200102, should not get + // incremented. + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); // Does not contain target date. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200103", "20200105", 3L); // Does not contain target date. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 6L); + expected.put("wiki", 15L); + expected.put("maze", 55L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes(new AbstractMap.SimpleEntry<>("NAME", "20200102")); + + Assertions.assertEquals(expected, actual); + } + } + + public abstract class AbstractFieldIndexHoleTests { - private Set fields = new HashSet<>(); - private Set datatypes = new HashSet<>(); - private double minimumThreshold = 1.0d; + protected Set fields = new HashSet<>(); + protected Set datatypes = new HashSet<>(); + protected double minimumThreshold = 1.0d; protected final Supplier>> INDEX_FUNCTION = () -> { try { @@ -137,6 +248,44 @@ void tearDown() { givenMinimumThreshold(1.0d); } + protected void givenFields(String... fields) { + this.fields = Sets.newHashSet(fields); + } + + protected void givenDatatypes(String... datatypes) { + this.datatypes = Sets.newHashSet(datatypes); + } + + protected void givenMinimumThreshold(double minimumThreshold) { + this.minimumThreshold = minimumThreshold; + } + + protected Map> createFieldIndexHoleMap(FieldIndexHole... 
holes) { + Map> fieldIndexHoles = new HashMap<>(); + for (FieldIndexHole hole : holes) { + Map datatypeMap = fieldIndexHoles.computeIfAbsent(hole.getFieldName(), k -> new HashMap<>()); + datatypeMap.put(hole.getDatatype(), hole); + } + return fieldIndexHoles; + } + + @SafeVarargs + protected final FieldIndexHole createFieldIndexHole(String field, String datatype, Pair... dateRanges) { + return new FieldIndexHole(field, datatype, Sets.newHashSet(dateRanges)); + } + + protected Pair dateRange(String start, String end) { + return Pair.of(DateHelper.parse(start), DateHelper.parse(end)); + } + } + + /** + * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and + * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)} where the metadata table contains non-aggregated entries only. + */ + @Nested + public class FieldIndexHoleTestsForNonAggregatedEntries extends AbstractFieldIndexHoleTests { + /** * Test against data that has no field index holes. */ @@ -144,20 +293,19 @@ void tearDown() { @ValueSource(strings = {"i", "ri"}) void testNoFieldIndexHoles(String cf) { // Create a series of frequency rows over date ranges, each with a matching index row for each date. 
- FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("EVENT_DATE", "csv", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "csv", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200101", "20200120", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 1L); + 
givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200101", "20200120", 1L); + writeMutations(); // Verify that no index holes were found. Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); @@ -170,11 +318,10 @@ void testNoFieldIndexHoles(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -189,12 +336,11 @@ void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200105", 1L); // Make the index counts a value that will not meet the threshold. 
- mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200105", 1L); // Make the index counts a value that will not meet the threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -202,19 +348,18 @@ void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { // @formatter:off Assertions.assertEquals(expected, fieldIndexHoles); } - + /** * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType combination based on date gaps. 
*/ @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -230,13 +375,12 @@ void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105", 5L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -244,19 +388,18 @@ void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { // @formatter:off Assertions.assertEquals(expected, fieldIndexHoles); } - + /** * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on date gaps. 
*/ @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -272,13 +415,12 @@ void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200105", 1L); // Will not meet threshold. 
- mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -286,20 +428,19 @@ void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { // @formatter:off Assertions.assertEquals(expected, fieldIndexHoles); } - + /** * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. 
*/ @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForMiddleOfFrequencyDateRange_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -315,14 +456,13 @@ void testFieldIndexHoleForMiddleOfFrequencyDateRange_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForMiddleOfFrequencyDateRange_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200106", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 5L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200106", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -338,14 +478,13 @@ void testFieldIndexHoleForMiddleOfFrequencyDateRange_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200106", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 5L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200106", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -360,12 +499,11 @@ void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200106", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200113", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200117", "20200118", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200106", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200113", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200117", "20200118", 1L); + writeMutations(); Map> 
fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -384,16 +522,15 @@ void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200106", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200109", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200113", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200114", "20200116", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200117", "20200118", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200119", "20200120", 1L); // Will not meet threshold. - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200106", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200109", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200113", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200114", "20200116", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200117", "20200118", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200119", "20200120", 1L); // Will not meet threshold. + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -413,12 +550,11 @@ void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 1L); - mutationCreator.addFrequencyMutations("ZETA", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 1L); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -435,13 +571,12 @@ void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String c @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", 
"20200103", "20200105", 1L); // Will not meet threshold. - mutationCreator.addFrequencyMutations("ZETA", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -457,12 +592,11 @@ void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200110", "20200115", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200113", "20200115", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200110", "20200115", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200113", "20200115", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -478,14 +612,13 @@ void 
testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105", 1L); // Will not meet threshold. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200112", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200113", "20200115", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200112", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200113", "20200115", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -501,12 +634,11 @@ void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testAllDatesAreIndexHoles_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 1L); - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 1L); - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 1L); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -525,16 +657,15 @@ void testAllDatesAreIndexHoles_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testAllDatesAreIndexHoles_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200105", 1L); // Will not meet threshold. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200115", 1L); // Will not meet threshold. 
- mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200125", 1L); // Will not meet threshold. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200328", 1L); // Will not meet threshold. - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200115", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200125", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200328", 1L); // Will not meet threshold. + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -553,26 +684,25 @@ void testAllDatesAreIndexHoles_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testSingularDayIndexHoles_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. 
- mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 1L); // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 1L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 1L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
- mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 1L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 1L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 1L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 1L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 1L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 1L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 1L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 1L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -592,34 +722,33 @@ void testSingularDayIndexHoles_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testSingularDayIndexHoles_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200113", "20200113", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200113", "20200113", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200303", "20200303", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -639,31 +768,30 @@ void testSingularDayIndexHoles_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testMixedDateGapsAndThresholdIndexHoles(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -685,31 +813,30 @@ void testMixedDateGapsAndThresholdIndexHoles(String cf) { void testMinimumThresholdPercentageBelow100(String cf) { givenMinimumThreshold(0.75); // Index count must meet 75% of frequency count to not be considered field index hole. - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 100L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 75L); // Meets 75% threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 100L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 100L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 75L); // Meets 75% threshold. 
+ givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 100L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 74L); // Will not meet threshold. // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 100L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 74L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 75L); // Meets 75% threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 100L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 100L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 75L); // Meets 75% threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 100L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 100L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 98L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 74L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 75L); // Meets 75% threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 100L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 98L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 75L); // Meets 75% threshold. // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
- mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 100L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 100L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 74L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 90L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 75L); // Meets 75% threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 74L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 99L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 100L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 100L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 90L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 75L); // Meets 75% threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 99L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -732,31 +859,30 @@ void testOneFieldSpecified(String cf) { // Retrieve field index holes for field NAME. givenFields("NAME"); - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. 
- mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -776,39 +902,38 @@ void testMultipleFieldsSpecified(String cf) { // Retrieve field index holes for fields URI and EVENT_DATE. givenFields("URI", "EVENT_DATE"); - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for ALPHA-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("ALPHA", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200113", "20200113", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200303", "20200303", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); // Index hole for ZETA-wiki on 20200122. - mutationCreator.addFrequencyMutations("ZETA", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ZETA", "wiki", "20200123", "20200125", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200123", "20200125", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -829,55 +954,54 @@ void testDatatypesSpecified(String cf) { // Retrieve field index holes for datatypes wiki and csv. givenDatatypes("wiki", "csv"); - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200105", "20200105", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for ALPHA-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("ALPHA", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200113", "20200113", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index hole for EVENT_DATE-maze on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200303", "20200303", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); // Index hole for ZETA-csv on 20200122. - mutationCreator.addFrequencyMutations("ZETA", "csv", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200123", "20200125", 5L); // Index hole for ZETA-imdb on 20200122. 
- mutationCreator.addFrequencyMutations("ZETA", "imdb", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200123", "20200125", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "imdb", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200123", "20200125", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -901,55 +1025,54 @@ void testFieldsAndDatatypesSpecified(String cf) { // Retrieve field index holes for datatypes wiki and csv. givenDatatypes("wiki", "csv"); - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200103", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for ALPHA-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("ALPHA", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200113", "20200113", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index hole for EVENT_DATE-maze on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
- mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200303", "20200303", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); // Index hole for ZETA-csv on 20200122. - mutationCreator.addFrequencyMutations("ZETA", "csv", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200123", "20200125", 5L); // Index hole for ZETA-imdb on 20200122. - mutationCreator.addFrequencyMutations("ZETA", "imdb", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200123", "20200125", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "imdb", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200123", "20200125", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -959,86 +1082,1458 @@ void testFieldsAndDatatypesSpecified(String cf) { // @formatter:on Assertions.assertEquals(expected, fieldIndexHoles); } + } + + /** + * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and + * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)} where the metadata table contains aggregated entries only. + */ + @Nested + public class FieldIndexHoleTestsForAggregatedEntries extends AbstractFieldIndexHoleTests { - private void givenFields(String... fields) { - this.fields = Sets.newHashSet(fields); + /** + * Test against data that has no field index holes. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testNoFieldIndexHoles(String cf) { + // Create a series of frequency rows over date ranges, each with a matching index row for each date. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + writeMutations(); + + // Verify that no index holes were found. + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + Assertions.assertTrue(fieldIndexHoles.isEmpty()); } - private void givenDatatypes(String... datatypes) { - this.datatypes = Sets.newHashSet(datatypes); + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } - private void givenMinimumThreshold(double minimumThreshold) { - this.minimumThreshold = minimumThreshold; + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on the threshold requirement. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); // Make the index counts a value that will not meet the threshold. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } - protected Map> createFieldIndexHoleMap(FieldIndexHole... holes) { - Map> fieldIndexHoles = new HashMap<>(); - for (FieldIndexHole hole : holes) { - Map datatypeMap = fieldIndexHoles.computeIfAbsent(hole.getFieldName(), k -> new HashMap<>()); - datatypeMap.put(hole.getDatatype(), hole); - } - return fieldIndexHoles; + /** + * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType combination based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200104", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } - @SafeVarargs - protected final FieldIndexHole createFieldIndexHole(String field, String datatype, Pair... 
dateRanges) { - return new FieldIndexHole(field, datatype, Sets.newHashSet(dateRanges)); + /** + * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType combination based on the + * threshold requirement. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200104", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } - protected Pair dateRange(String start, String end) { - return Pair.of(DateHelper.parse(start), DateHelper.parse(end)); + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } - } - - /** - * Helper class for creating mutations in bulk for field index hole tests. - */ - private static class FieldIndexHoleMutationCreator { - private final List mutations = new ArrayList<>(); - - private void addFrequencyMutations(String fieldName, String datatype, String startDate, String endDate, long count) { - List dates = getDatesInRange(startDate, endDate); - dates.forEach(date -> addMutation(fieldName, "f", datatype, date, count)); + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on the + * threshold requirement. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addIndexMutations(String cf, String fieldName, String datatype, String startDate, String endDate, long count) { - List dates = getDatesInRange(startDate, endDate); - dates.forEach(date -> addMutation(fieldName, cf, datatype, date, count)); + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200107", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } - private List getDatesInRange(String startDateStr, String endDateStr) { - Date startDate = DateHelper.parse(startDateStr); - Date endDate = DateHelper.parse(endDateStr); - - List dates = new ArrayList<>(); - dates.add(startDateStr); - - Calendar calendar = Calendar.getInstance(); - calendar.setTime(startDate); - while (true) { - calendar.add(Calendar.DAY_OF_MONTH, 1); - Date date = calendar.getTime(); - if (date.before(endDate) || date.equals(endDate)) { - dates.add(DateHelper.format(date)); - } else { - break; - } - } + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200104", "20200106", 1L, "20200107", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); - return dates; + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addMutation(String row, String columnFamily, String datatype, String date, long count) { - Mutation mutation = new Mutation(row); - mutation.put(columnFamily, datatype + NULL_BYTE + date, new Value(SummingCombiner.VAR_LEN_ENCODER.encode(count))); - mutations.add(mutation); + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on both date + * gaps and the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200105", "20200106", 1L, "20200107", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } - private List getMutations() { - return mutations; + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200104", "20200106", 1L, "20200110", "20200113", 1L, "20200117", "20200118", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } - } - + + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200104", "20200106", 5L, "20200107", + "20200109", 1L, "20200110", "20200113", 5L, "20200114", "20200116", 1L, "20200117", "20200118", 5L, "20200119", "20200120", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency 
range right before a new fieldName-datatype combination based on + * date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination based on + * the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200105", 1L)); + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L, "20200110", "20200115", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200113", "20200115", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L, "20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200104", "20200105", 1L, "20200110", + "20200112", 1L, "20200113", "20200115", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where everything is an index hole based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where 
everything is an index hole based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); // Will not meet threshold. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); // Will not meet threshold. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); // Will not meet threshold. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); // Will not meet threshold. + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_dateGaps(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L, "20200104", "20200104", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200111", "20200112", 1L, "20200114", "20200115", 1L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200121", 1L, "20200123", "20200125", 1L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 1L, "20200222", "20200302", 1L, "20200304", + "20200315", 1L, "20200317", "20200328", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based 
on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_threshold(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on both dates and the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMixedDateGapsAndThresholdIndexHoles(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200104", "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", + createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying a minimum percentage threshold other than the default of 1.0. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMinimumThresholdPercentageBelow100(String cf) { + givenMinimumThreshold(0.75); // Index count must meet 75% of frequency count to not be considered field index hole. + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 100L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 75L, "20200104", "20200104", 100L, "20200105", "20200105", 74L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 100L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", + createRangedDateFrequencyMap("20200110", "20200110", 74L, "20200111", "20200112", 75L, "20200114", "20200115", 100L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 100L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 98L, "20200122", "20200122", 74L, "20200123", "20200125", 75L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 100L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 100L, "20200221", "20200221", 74L, "20200222", + "20200302", 90L, "20200304", "20200315", 75L, "20200316", "20200316", 74L, "20200317", "20200328", 99L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying one field to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testOneFieldSpecified(String cf) { + // Retrieve field index holes for field NAME. + givenFields("NAME"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200104", "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", + createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying multiple fields to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldsSpecified(String cf) { + // Retrieve field index holes for fields URI and EVENT_DATE. + givenFields("URI", "EVENT_DATE"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. 
+ givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-wiki on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying datatypes. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testDatatypesSpecified(String cf) { + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-maze on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for EVENT_DATE-maze on 20200122. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-csv on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for ZETA-imdb on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "imdb", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "imdb", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ALPHA", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying fields and datatypes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldsAndDatatypesSpecified(String cf) { + // Retrieve field index holes for fields NAME and ZETA. + givenFields("NAME", "ZETA"); + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-maze on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for EVENT_DATE-maze on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-csv on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for ZETA-imdb on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "imdb", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "imdb", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + } + + /** + * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and + * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)} where the metadata table contains both aggregated and non-aggregated entries. + */ + @Nested + public class FieldIndexHoleTestsForMixedAggregatedAndNonAggregatedEntries extends AbstractFieldIndexHoleTests { + + /** + * Test against data that has no field index holes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testNoFieldIndexHoles(String cf) { + // Create a series of frequency rows over date ranges, each with a matching index row for each date. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "csv", "20200101", "20200120", 1L); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200111", "20200120", 1L); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", createRangedDateFrequencyMap("20200101", "20200114", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200115", "20200120", 1L); + writeMutations(); + + // Verify that no index holes were found. + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + Assertions.assertTrue(fieldIndexHoles.isEmpty()); + } + + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200104", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on the threshold requirement. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index 
hole at the start of a frequency date range for a given fieldName-dataType combination based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200104", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200104", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType combination based on the + * threshold requirement. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200105", 5L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on the + * threshold requirement. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200104", "20200106", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 5L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on both date + * gaps and the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200107", 5L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200108", "20200110", 5L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200105", "20200106", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 5L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200114", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200115", "20200120", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200104", "20200106", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200113", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200117", "20200118", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200104", "20200106", 5L, "20200107", + "20200109", 1L, "20200110", "20200113", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200114", "20200116", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200117", "20200118", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200119", "20200120", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination based on + * date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination based on + * the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200110", "20200115", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200113", "20200115", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L, "20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200104", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200112", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200113", "20200115", 5L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where everything is an index hole based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles_dateGaps(String cf) { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 1L); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where everything is an index hole based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200105", 1L); // Will not meet threshold. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); // Will not meet threshold. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200328", 1L); // Will not meet threshold. + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_dateGaps(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 1L); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200111", "20200112", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 1L); + // Index hole for EVENT_DATE-wiki on 20200122. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200121", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 1L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 1L, "20200222", "20200302", 1L, "20200304", + "20200315", 1L, "20200317", "20200328", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_threshold(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on both dates and the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMixedDateGapsAndThresholdIndexHoles(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); + // Index holes for NAME-csv on 20200110 and 20200113. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", + createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying a minimum percentage threshold other than the default of 1.0. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMinimumThresholdPercentageBelow100(String cf) { + givenMinimumThreshold(0.75); // Index count must meet 75% of frequency count to not be considered field index hole. + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 100L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 75L, "20200104", "20200104", 100L, "20200105", "20200105", 74L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 100L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 74L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 75L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 100L); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 100L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 98L, "20200122", "20200122", 74L, "20200123", "20200125", 75L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 100L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 100L, "20200221", "20200221", 74L, "20200222", + "20200302", 90L, "20200304", "20200315", 75L, "20200316", "20200316", 74L, "20200317", "20200328", 99L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying one field to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testOneFieldSpecified(String cf) { + // Retrieve field index holes for field NAME. + givenFields("NAME"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200104", "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. 
+ givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying multiple fields to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldsSpecified(String cf) { + // Retrieve field index holes for fields URI and EVENT_DATE. + givenFields("URI", "EVENT_DATE"); + + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L)); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-wiki on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200121", 5L)); + givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200122", "20200122", 1L); + givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200123", "20200125", 5L); + + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying datatypes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testDatatypesSpecified(String cf) { + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); + + // Index holes for NAME-maze on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. 
+ givenNonAggregatedFrequencyRows("ALPHA", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200110", "20200110", 1L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for EVENT_DATE-maze on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200123", "20200125", 5L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-csv on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for ZETA-imdb on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "imdb", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "imdb", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ALPHA", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying fields and datatypes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldsAndDatatypesSpecified(String cf) { + // Retrieve field index holes for fields NAME and ZETA. + givenFields("NAME", "ZETA"); + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); + // Index holes for NAME-maze on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for EVENT_DATE-maze on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-csv on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for ZETA-imdb on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "imdb", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "imdb", createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L)); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200123", "20200125", 5L); + + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + } } diff --git a/src/test/java/datawave/query/util/MetadataHelperTest.java b/src/test/java/datawave/query/util/MetadataHelperTest.java index 405922c3..720a7815 100644 --- a/src/test/java/datawave/query/util/MetadataHelperTest.java +++ b/src/test/java/datawave/query/util/MetadataHelperTest.java @@ -1,30 +1,38 @@ package datawave.query.util; -import static org.junit.Assert.assertEquals; +import static datawave.data.ColumnFamilyConstants.COLF_F; +import static datawave.query.util.TestUtils.createDateFrequencyMap; +import static org.apache.accumulo.core.iterators.LongCombiner.VAR_LEN_ENCODER; import java.io.File; +import java.io.IOException; import java.net.URISyntaxException; +import java.util.AbstractMap; import java.util.ArrayList; import java.util.Collections; +import java.util.Date; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Set; import 
org.apache.accumulo.core.client.AccumuloClient; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; -import org.apache.accumulo.core.client.BatchWriter; -import org.apache.accumulo.core.client.BatchWriterConfig; -import org.apache.accumulo.core.client.MutationsRejectedException; import org.apache.accumulo.core.client.TableExistsException; import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.accumulo.core.data.Mutation; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.security.Authorizations; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableUtils; +import org.apache.http.auth.AUTH; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; import com.google.common.collect.Maps; @@ -33,14 +41,22 @@ import datawave.accumulo.inmemory.InMemoryAccumuloClient; import datawave.accumulo.inmemory.InMemoryInstance; import datawave.query.composite.CompositeMetadataHelper; +import datawave.query.model.DateFrequencyMap; +import datawave.util.time.DateHelper; +import datawave.webservice.common.connection.WrappedAccumuloClient; public class MetadataHelperTest { - private static final String TABLE_METADATA = "testMetadataTable"; + private static final String TABLE_METADATA = "metadata"; private static final String[] AUTHS = {"FOO"}; + private static final Set AUTHORIZATIONS = Collections.singleton(new Authorizations(AUTHS)); + private static final String NULL_BYTE = "\0"; + private AccumuloClient accumuloClient; private MetadataHelper helper; + private final List mutations = new ArrayList<>(); + @BeforeAll static void beforeAll() throws URISyntaxException { File dir = new File(Objects.requireNonNull(ClassLoader.getSystemClassLoader().getResource(".")).toURI()); 
@@ -54,67 +70,335 @@ public void setup() throws TableNotFoundException, AccumuloException, TableExist if (!accumuloClient.tableOperations().exists(TABLE_METADATA)) { accumuloClient.tableOperations().create(TABLE_METADATA); } - helper = new MetadataHelper(createAllFieldMetadataHelper(), Collections.emptySet(), accumuloClient, TABLE_METADATA, Collections.emptySet(), + + helper = new MetadataHelper(createAllFieldMetadataHelper(), Collections.emptySet(), accumuloClient, TABLE_METADATA, AUTHORIZATIONS, Collections.emptySet()); } private AllFieldMetadataHelper createAllFieldMetadataHelper() { final Set allMetadataAuths = Collections.emptySet(); - final Set auths = Collections.singleton(new Authorizations(AUTHS)); - TypeMetadataHelper tmh = new TypeMetadataHelper(Maps.newHashMap(), allMetadataAuths, accumuloClient, TABLE_METADATA, auths, false); - CompositeMetadataHelper cmh = new CompositeMetadataHelper(accumuloClient, TABLE_METADATA, auths); - return new AllFieldMetadataHelper(tmh, cmh, accumuloClient, TABLE_METADATA, auths, allMetadataAuths); + TypeMetadataHelper tmh = new TypeMetadataHelper(Maps.newHashMap(), allMetadataAuths, accumuloClient, TABLE_METADATA, AUTHORIZATIONS, false); + CompositeMetadataHelper cmh = new CompositeMetadataHelper(accumuloClient, TABLE_METADATA, AUTHORIZATIONS); + return new AllFieldMetadataHelper(tmh, cmh, accumuloClient, TABLE_METADATA, AUTHORIZATIONS, allMetadataAuths); } @AfterEach void tearDown() throws AccumuloException, TableNotFoundException, AccumuloSecurityException { accumuloClient.tableOperations().delete(TABLE_METADATA); + this.mutations.clear(); } - @Test - public void testSingleFieldFilter() throws TableNotFoundException { - writeMutation("rowA", "t", "dataTypeA", new Value("value")); - - Assertions.assertEquals(Collections.singleton("rowA"), helper.getAllFields(Collections.singleton("dataTypeA"))); - Assertions.assertEquals(Collections.singleton("rowA"), helper.getAllFields(null)); - 
Assertions.assertEquals(Collections.singleton("rowA"), helper.getAllFields(Collections.emptySet())); + /** + * Write the given mutations to the metadata table. + */ + private void writeMutations() { + TestUtils.writeMutations(accumuloClient, TABLE_METADATA, mutations); + } + + private void givenMutation(Mutation mutation) { + this.mutations.add(mutation); } - @Test - public void testMultipleFieldFilter() throws TableNotFoundException { - writeMutation("rowA", "t", "dataTypeA", new Value("value")); - writeMutation("rowB", "t", "dataTypeB", new Value("value")); + private void givenMutation(String row, String columnFamily, String columnQualifier, Value value) throws TableNotFoundException { + Mutation mutation = new Mutation(row); + mutation.put(columnFamily, columnQualifier, value); + givenMutation(mutation); + } + + private void givenNonAggregatedFrequencyRows(String row, String colf, String datatype, String startDate, String endDate, long count) { + givenNonAggregatedFrequencyRows(row, new Text(colf), datatype, startDate, endDate, count); + } + + private void givenNonAggregatedFrequencyRows(String row, Text colf, String datatype, String startDate, String endDate, long count) { + Mutation mutation = new Mutation(row); + Value value = new Value(VAR_LEN_ENCODER.encode(count)); + List dates = TestUtils.getDatesInRange(startDate, endDate); + dates.forEach((date) -> mutation.put(colf, new Text(datatype + NULL_BYTE + date), value)); + givenMutation(mutation); + } + + private void givenAggregatedFrequencyRow(String row, String colf, String datatype, DateFrequencyMap map) { + givenAggregatedFrequencyRow(row, new Text(colf), datatype, map); + } + + private void givenAggregatedFrequencyRow(String row, Text colf, String datatype, DateFrequencyMap map) { + Mutation mutation = new Mutation(row); + Value value = new Value(WritableUtils.toByteArray(map)); + mutation.put(colf, new Text(datatype), value); + givenMutation(mutation); + } + + /** + * Tests for {@link 
MetadataHelper#getAllFields(Set)}. + */ + @Nested + public class GetAllFieldsTest { + @Test + public void testSingleFieldFilter() throws TableNotFoundException { + givenMutation("rowA", "t", "dataTypeA", new Value("value")); + + writeMutations(); + + Assertions.assertEquals(Collections.singleton("rowA"), helper.getAllFields(Collections.singleton("dataTypeA"))); + Assertions.assertEquals(Collections.singleton("rowA"), helper.getAllFields(null)); + Assertions.assertEquals(Collections.singleton("rowA"), helper.getAllFields(Collections.emptySet())); + } - Assertions.assertEquals(Collections.singleton("rowB"), helper.getAllFields(Collections.singleton("dataTypeB"))); - Assertions.assertEquals(Sets.newHashSet("rowA", "rowB"), helper.getAllFields(null)); - Assertions.assertEquals(Sets.newHashSet("rowA", "rowB"), helper.getAllFields(Collections.emptySet())); + @Test + public void testMultipleFieldFilter() throws TableNotFoundException { + givenMutation("rowA", "t", "dataTypeA", new Value("value")); + givenMutation("rowB", "t", "dataTypeB", new Value("value")); + + writeMutations(); + + Assertions.assertEquals(Collections.singleton("rowB"), helper.getAllFields(Collections.singleton("dataTypeB"))); + Assertions.assertEquals(Sets.newHashSet("rowA", "rowB"), helper.getAllFields(null)); + Assertions.assertEquals(Sets.newHashSet("rowA", "rowB"), helper.getAllFields(Collections.emptySet())); + } + + @Test + public void testMultipleFieldFilter2() throws TableNotFoundException { + givenMutation("rowA", "t", "dataTypeA", new Value("value")); + givenMutation("rowB", "t", "dataTypeB", new Value("value")); + givenMutation("rowC", "t", "dataTypeC", new Value("value")); + + writeMutations(); + + Assertions.assertEquals(Collections.singleton("rowB"), helper.getAllFields(Collections.singleton("dataTypeB"))); + Assertions.assertEquals(Sets.newHashSet("rowA", "rowB", "rowC"), helper.getAllFields(null)); + Assertions.assertEquals(Sets.newHashSet("rowA", "rowB", "rowC"), 
helper.getAllFields(Collections.emptySet())); + } } - @Test - public void testMultipleFieldFilter2() throws TableNotFoundException { - writeMutation("rowA", "t", "dataTypeA", new Value("value")); - writeMutation("rowB", "t", "dataTypeB", new Value("value")); - writeMutation("rowC", "t", "dataTypeC", new Value("value")); + /** + * Tests for {@link MetadataHelper#getCardinalityForField(String, Date, Date)} and + * {@link MetadataHelper#getCardinalityForField(String, String, Date, Date)}. + */ + @Nested + public class GetCardinalityForFieldTests { - Assertions.assertEquals(Collections.singleton("rowB"), helper.getAllFields(Collections.singleton("dataTypeB"))); - Assertions.assertEquals(Sets.newHashSet("rowA", "rowB", "rowC"), helper.getAllFields(null)); - Assertions.assertEquals(Sets.newHashSet("rowA", "rowB", "rowC"), helper.getAllFields(Collections.emptySet())); + /** + * Test against a table that has only non-aggregated entries as matches. + */ + @Test + void testNonAggregatedEntriesOnly() throws TableNotFoundException { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200111", "20200120", 1L); // 5 entries within date range. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 1L); // 12 entries within date range. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200110", 1L); // 7 entries within date range. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200101", "20200102", 1L); // No entries within date range. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 1L); // Field does not match. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 1L); // Field does not match. 
+ + writeMutations(); + + Assertions.assertEquals(24L, helper.getCardinalityForField("NAME", DateHelper.parse("20200104"), DateHelper.parse("20200115"))); + Assertions.assertEquals(12L, helper.getCardinalityForField("NAME", "wiki", DateHelper.parse("20200104"), DateHelper.parse("20200115"))); + } + + /** + * Test against a table that has only aggregated entries as matches. + */ + @Test + void testAggregatedEntriesOnly() throws TableNotFoundException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L, "20200104", 3L, + "20200105", 3L, "20200106", 3L, "20200107", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L, "20200107", 3L, + "20200108", 3L, "20200111", 3L, "20200113", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L, "20200111", 3L, + "20200114", 3L, "20200115", 3L, "20200116", 3L, "20200120", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L, "20200120", 3L)); // Does + // not + // contain + // target + // date. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); // Field does not + // match. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); // Field does not + // match. + writeMutations(); + + Assertions.assertEquals(33L, helper.getCardinalityForField("NAME", DateHelper.parse("20200104"), DateHelper.parse("20200115"))); + Assertions.assertEquals(12L, helper.getCardinalityForField("NAME", "wiki", DateHelper.parse("20200104"), DateHelper.parse("20200115"))); + } + + /** + * Test against a table that has both aggregated and non-aggregated entries as matches. 
+ */ + @Test + void testMixedEntryFormats() throws TableNotFoundException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L, "20200104", 3L, + "20200105", 3L, "20200106", 3L, "20200107", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L, "20200107", 3L, + "20200108", 3L, "20200111", 3L, "20200113", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200113", "20200120", 3L); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L, "20200120", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200103", "20200105", 3L); + // Following does not match field. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Assertions.assertEquals(51L, helper.getCardinalityForField("NAME", DateHelper.parse("20200104"), DateHelper.parse("20200115"))); + Assertions.assertEquals(21L, helper.getCardinalityForField("NAME", "wiki", DateHelper.parse("20200104"), DateHelper.parse("20200115"))); + } } - private void writeMutation(String row, String columnFamily, String columnQualifier, Value value) throws TableNotFoundException { - Mutation mutation = new Mutation(row); - mutation.put(columnFamily, columnQualifier, value); - writeMutation(mutation); + /** + * Tests for {@link MetadataHelper#getCountsByFieldInDayWithTypes(String, String, AccumuloClient, WrappedAccumuloClient)} (Map.Entry)}. + */ + @Nested + public class CountsByFieldInDayWithTypesTests { + + /** + * Test against a table that has only non-aggregated entries as matches. + */ + @Test + void testNonAggregatedEntriesOnly() throws TableNotFoundException, IOException { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 2L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200120", 3L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200101", "20200102", 3L); // Does not contain target date. 
+ givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 1L); + expected.put("wiki", 2L); + expected.put("maze", 3L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes("NAME", "20200110", accumuloClient, null); + + Assertions.assertEquals(expected, actual); + } + + /** + * Test against a table that has only aggregated entries as matches. + */ + @Test + void testAggregatedEntriesOnly() throws TableNotFoundException, IOException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); // Does not contain target date. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 5L); + expected.put("wiki", 15L); + expected.put("maze", 55L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes("NAME", "20200102", accumuloClient, null); + + Assertions.assertEquals(expected, actual); + } + + /** + * Test against a table that has both aggregated and non-aggregated entries as matches. + */ + @Test + void testMixedEntryFormats() throws TableNotFoundException, IOException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); // Should get summed into previous. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200115", "20200120", 3L); // Does not have entry for 20200102, should not get + // incremented. + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); // Does not contain target date. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200103", "20200105", 3L); // Does not contain target date. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 6L); + expected.put("wiki", 15L); + expected.put("maze", 55L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes("NAME", "20200102", accumuloClient, null); + + Assertions.assertEquals(expected, actual); + } } - private void writeMutation(Mutation m) throws TableNotFoundException { - BatchWriterConfig config = new BatchWriterConfig(); - config.setMaxMemory(0); - try (BatchWriter writer = accumuloClient.createBatchWriter(TABLE_METADATA, config)) { - writer.addMutation(m); - writer.flush(); - } catch (MutationsRejectedException e) { - throw new RuntimeException(e); + /** + * Tests for {@link MetadataHelper#getEarliestOccurrenceOfFieldWithType(String, String, AccumuloClient, WrappedAccumuloClient)}. + */ + @Nested + public class GetEarliestOccurrenceOfFieldWithTypeTests { + + /** + * Test against a table that has only non-aggregated entries as matches. 
+ */ + @Test + void testNonAggregatedEntriesOnly() { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200103", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 2L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200105", "20200120", 3L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200107", "20200102", 3L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Assertions.assertEquals(DateHelper.parse("20200101"), helper.getEarliestOccurrenceOfFieldWithType("NAME", null, accumuloClient, null)); + Assertions.assertEquals(DateHelper.parse("20200105"), helper.getEarliestOccurrenceOfFieldWithType("NAME", "maze", accumuloClient, null)); + } + + /** + * Test against a table that has only aggregated entries as matches. 
+ */ + @Test + void testAggregatedEntriesOnly() { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200113", 1L, "20200115", 5L, "20200116", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200111", 1L, "20200112", 15L, "20200113", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200102", 1L, "20200104", 55L, "20200105", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + writeMutations(); + + Assertions.assertEquals(DateHelper.parse("20200101"), helper.getEarliestOccurrenceOfFieldWithType("NAME", null, accumuloClient, null)); + Assertions.assertEquals(DateHelper.parse("20200102"), helper.getEarliestOccurrenceOfFieldWithType("NAME", "maze", accumuloClient, null)); + } + + /** + * Test against a table that has both aggregated and non-aggregated entries as matches. 
+ */ + @Test + void testMixedEntryFormats() { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200111", 1L, "20200112", 5L, "20200113", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200111", "20200120", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200111", 1L, "20200112", 15L, "20200113", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200111", 1L, "20200112", 55L, "20200113", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200103", "20200120", 3L); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200111", 1L, "20200113", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200101", "20200115", 3L); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Assertions.assertEquals(DateHelper.parse("20200101"), helper.getEarliestOccurrenceOfFieldWithType("NAME", null, accumuloClient, null)); + Assertions.assertEquals(DateHelper.parse("20200103"), helper.getEarliestOccurrenceOfFieldWithType("NAME", "maze", accumuloClient, null)); } } } diff --git a/src/test/java/datawave/query/util/TestUtils.java b/src/test/java/datawave/query/util/TestUtils.java new file mode 100644 index 00000000..40cb31a9 --- /dev/null +++ 
b/src/test/java/datawave/query/util/TestUtils.java @@ -0,0 +1,132 @@ +package datawave.query.util; + +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Collection; +import java.util.Date; +import java.util.List; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.BatchWriter; +import org.apache.accumulo.core.client.BatchWriterConfig; +import org.apache.accumulo.core.client.MutationsRejectedException; +import org.apache.accumulo.core.client.TableNotFoundException; +import org.apache.accumulo.core.data.Mutation; + +import datawave.query.model.DateFrequencyMap; +import datawave.util.time.DateHelper; + +public class TestUtils { + + private static final String NULL_BYTE = "\0"; + + /** + * Write the given mutations to the specified table via the accumulo client. + */ + public static void writeMutations(AccumuloClient client, String tableName, Collection mutations) { + BatchWriterConfig config = new BatchWriterConfig(); + config.setMaxMemory(0); + try (BatchWriter writer = client.createBatchWriter(tableName, config)) { + writer.addMutations(mutations); + writer.flush(); + } catch (MutationsRejectedException | TableNotFoundException e) { + throw new RuntimeException(e); + } + } + + /** + * Return the set of dates contained within the start and end date + * + * @param startDateStr + * @param endDateStr + * @return + */ + public static List getDatesInRange(String startDateStr, String endDateStr) { + Date startDate = DateHelper.parse(startDateStr); + Date endDate = DateHelper.parse(endDateStr); + + List dates = new ArrayList<>(); + dates.add(startDateStr); + + Calendar calendar = Calendar.getInstance(); + calendar.setTime(startDate); + while (true) { + calendar.add(Calendar.DAY_OF_MONTH, 1); + Date date = calendar.getTime(); + if (date.before(endDate) || date.equals(endDate)) { + dates.add(DateHelper.format(date)); + } else { + break; + } + } + + return dates; + } + + /** + * Create and a return a {@link 
DateFrequencyMap} map with the specified dates and counts. The args are expected to be alternating String dates and long + * counts. For example: + * + *
+     * {
+     *     @code
+     *     DateFrequencyMap map = createDateFrequencyMap("20200101", 12L, "20200102", 34L, "20200103", 55L);
+     * }
+     * 
+ * + * will result in a map with a count of 12 for the date 01-01-2020, 34 for the date 01-02-2020, and 55 for the date 01-03-2020. + * + * @param entries + * the entries + * @return the date frequency map + */ + public static DateFrequencyMap createDateFrequencyMap(Object... entries) { + DateFrequencyMap map = new DateFrequencyMap(); + int lastEntry = entries.length - 1; + for (int i = 0; i < lastEntry; i++) { + String date = (String) entries[i]; + i++; + long count = (Long) entries[i]; + map.put(date, count); + } + return map; + } + + /** + * Create and a return a {@link DateFrequencyMap} map with counts for the specified date ranges. The args are expected to be alternating String date ranges + * and long counts. For example: + * + *
+     *  {@code
+     * DateFrequencyMap map = createRangedDateFrequencyMap("20200101", "20200105", 12L, "20200106", "20200110", 34L, "20200111", "20200115", 55L);
+     * }
+     * 
+ * + * will result in a map with a count of 12 for the dates 01-01-2020 to 01-05-2020, 34 for the dates 01-06-2020 to 01-10-2020, and 55 for the dates + * 01-11-2020 to 01-15-2020. + * + * @param entries + * the entries + * @return the date frequency map + */ + public static DateFrequencyMap createRangedDateFrequencyMap(Object... entries) { + DateFrequencyMap map = new DateFrequencyMap(); + int lastEntry = entries.length - 1; + for (int i = 0; i < lastEntry; i++) { + String startDate = (String) entries[i]; + i++; + String endDate = (String) entries[i]; + i++; + long count = (Long) entries[i]; + List dates = getDatesInRange(startDate, endDate); + for (String date : dates) { + map.put(date, count); + } + } + return map; + } + + private TestUtils() { + throw new UnsupportedOperationException(); + } +} diff --git a/src/test/resources/MarkingFunctionsContext.xml b/src/test/resources/MarkingFunctionsContext.xml new file mode 100644 index 00000000..6496e900 --- /dev/null +++ b/src/test/resources/MarkingFunctionsContext.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + From 560ad96df3678e33c1acd04232214c988b8f7606 Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Fri, 12 Apr 2024 14:35:24 -0400 Subject: [PATCH 02/10] Shorten COMBINE_VISIBILITIES value --- .../java/datawave/iterators/FrequencyMetadataAggregator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java index c513d820..4fe18447 100644 --- a/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java +++ b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java @@ -41,7 +41,7 @@ */ public class FrequencyMetadataAggregator extends WrappingIterator implements OptionDescriber { - public static final String COMBINE_VISIBILITIES = "FrequencyMetadataAggregator.COMBINE_VISIBILITIES"; + public static final String COMBINE_VISIBILITIES = 
"COMBINE_VISIBILITIES"; private static final Logger log = Logger.getLogger(FrequencyMetadataAggregator.class); private static final String NULL_BYTE = "\0"; From 20030aab7d5566fb42b2d4de5b1ac2ba444c547f Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Tue, 16 Apr 2024 02:28:48 -0400 Subject: [PATCH 03/10] Add columns option --- .../FrequencyMetadataAggregator.java | 183 ++++++++++-------- .../FrequencyMetadataAggregatorTest.java | 101 ++++++++-- 2 files changed, 187 insertions(+), 97 deletions(-) diff --git a/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java index 4fe18447..6830f944 100644 --- a/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java +++ b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java @@ -4,6 +4,7 @@ import java.util.Collection; import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; @@ -17,38 +18,37 @@ import org.apache.accumulo.core.iterators.OptionDescriber; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; import org.apache.accumulo.core.iterators.WrappingIterator; +import org.apache.accumulo.core.iteratorsImpl.conf.ColumnSet; import org.apache.accumulo.core.security.ColumnVisibility; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; import org.apache.log4j.Logger; +import com.google.common.base.Splitter; + import datawave.marking.MarkingFunctions; import datawave.query.model.DateFrequencyMap; +import datawave.util.StringUtils; /** * Aggregates entries in the metadata table for the "f", "i", and "ri" columns. When initially ingested, entries for these columns have a column qualifier with * the format {@code \0}, and a value containing a possibly partial frequency count for the date in the column qualifier. 
Entries with the * same row, column family, datatype, and column family will be aggregated into a single entry where the column qualifier consists of the datatype and the value * consists of an encoded {@link DateFrequencyMap} with the dates and counts seen. Additionally, this aggregator will handle the case where we have a previously - * aggregated entry and freshly ingested rows that need to be aggregated together.
- *
- * This iterator supports the following options: - *
    - *
  • {@value COMBINE_VISIBILITIES}: Defaults to false. If true, entries will be aggregated by row, column family, and datatype only, and the column visibility - * will be a combination of all column visibilities seen for the row/column family/datatype combo. This option is meant to be used when scanning only, and not - * for compaction.
  • - *
+ * aggregated entry and freshly ingested rows that need to be aggregated together. */ public class FrequencyMetadataAggregator extends WrappingIterator implements OptionDescriber { - public static final String COMBINE_VISIBILITIES = "COMBINE_VISIBILITIES"; + public static final String COMBINE_VISIBILITIES_OPTION = "COMBINE_VISIBILITIES"; + public static final String COLUMNS_OPTION = "columns"; private static final Logger log = Logger.getLogger(FrequencyMetadataAggregator.class); private static final String NULL_BYTE = "\0"; private static final MarkingFunctions markingFunctions = MarkingFunctions.Factory.createMarkingFunctions(); - private SortedKeyValueIterator source; private boolean combineVisibilities; + private ColumnSet columns; + private Key topKey; private Value topValue; @@ -56,8 +56,9 @@ public class FrequencyMetadataAggregator extends WrappingIterator implements Opt private final Map visibilityToDateFrequencies; private final Map visibilityToMaxTimestamp; - private Text currentRow; - private Text currentColumnFamily; + private final Key workKey = new Key(); + private final Text currentRow = new Text(); + private final Text currentColumnFamily = new Text(); private String currentDatatype; private String currentDate; private ColumnVisibility currentVisibility; @@ -70,33 +71,45 @@ public FrequencyMetadataAggregator() { visibilityToMaxTimestamp = new HashMap<>(); } - public FrequencyMetadataAggregator(FrequencyMetadataAggregator other, IteratorEnvironment env) { - this(); - source = other.getSource().deepCopy(env); - combineVisibilities = other.combineVisibilities; - cache.putAll(other.cache); - } - @Override public SortedKeyValueIterator deepCopy(IteratorEnvironment env) { - return new FrequencyMetadataAggregator(this, env); + FrequencyMetadataAggregator copy = new FrequencyMetadataAggregator(); + copy.setSource(getSource().deepCopy(env)); + copy.combineVisibilities = combineVisibilities; + return copy; } @Override public IteratorOptions describeOptions() 
{ Map options = new HashMap<>(); - options.put(COMBINE_VISIBILITIES, "Boolean value denoting whether to combine entries with different visibilities. Defaults to false."); - + options.put(COMBINE_VISIBILITIES_OPTION, "Boolean value denoting whether to combine entries with different visibilities. Defaults to false."); + options.put(COLUMNS_OPTION, "[:]{,[:]} escape non-alphanum chars using %."); return new IteratorOptions(getClass().getSimpleName(), "An iterator used to collapse frequency columns in the metadata table", options, null); } @Override public boolean validateOptions(Map options) { - // Check if entries with different column visibilities should be combined. - if (options.containsKey(COMBINE_VISIBILITIES)) { - combineVisibilities = Boolean.parseBoolean(options.get(COMBINE_VISIBILITIES)); - if (log.isTraceEnabled()) { - log.trace("combine visibilities: " + combineVisibilities); + if (options.containsKey(COMBINE_VISIBILITIES_OPTION)) { + try { + // noinspection ResultOfMethodCallIgnored + Boolean.parseBoolean(options.get(COMBINE_VISIBILITIES_OPTION)); + } catch (Exception e) { + throw new IllegalArgumentException("Bad boolean for " + COMBINE_VISIBILITIES_OPTION + " option: " + options.get(COMBINE_VISIBILITIES_OPTION)); + } + } + + if (!options.containsKey(COLUMNS_OPTION)) { + throw new IllegalArgumentException("Options must include " + COLUMNS_OPTION); + } + + String encodedColumns = options.get(COLUMNS_OPTION); + if (encodedColumns.isEmpty()) { + throw new IllegalArgumentException("Empty columns specified for " + COLUMNS_OPTION); + } + + for (String columns : Splitter.on(",").split(encodedColumns)) { + if (!ColumnSet.isValidEncoding(columns)) { + throw new IllegalArgumentException("invalid column encoding " + encodedColumns); } } @@ -105,54 +118,68 @@ public boolean validateOptions(Map options) { @Override public void init(SortedKeyValueIterator source, Map options, IteratorEnvironment env) throws IOException { - if (!validateOptions(options)) { - throw new 
IllegalArgumentException("Invalid options given: " + options); - } + super.init(source, options, env); + + combineVisibilities = options.containsKey(COMBINE_VISIBILITIES_OPTION) && Boolean.parseBoolean(options.get(COMBINE_VISIBILITIES_OPTION)); + columns = new ColumnSet(List.of(StringUtils.split(options.get(COLUMNS_OPTION), ","))); - this.source = source; + if (log.isTraceEnabled()) { + log.trace("Option " + COMBINE_VISIBILITIES_OPTION + ": " + combineVisibilities); + log.trace("Option " + COLUMNS_OPTION + ": " + columns); + } } @Override public void seek(Range range, Collection columnFamilies, boolean inclusive) throws IOException { - log.trace("seeking"); - - source.seek(range, columnFamilies, inclusive); - - // Establish the first top key. - next(); - - if (log.isTraceEnabled()) { - log.trace("first top key after seek: " + topKey); - } + super.seek(range, columnFamilies, inclusive); + findTop(); } @Override public Key getTopKey() { - return topKey; + return topKey == null ? super.getTopKey() : topKey; } @Override public Value getTopValue() { - return topValue; + return topValue == null ? super.getTopValue() : topValue; } @Override public boolean hasTop() { - return topKey != null; + return topKey != null || super.hasTop(); } @Override public void next() throws IOException { log.trace("Fetching next"); + // If topKey is not null, the last call to next() popped an entry from the cache. Reset to null. If any more entries remain in the cache, they will be + // popped in findTop(). + if (topKey != null) { + topKey = null; + topValue = null; + } else { + // If topKey is null, the last call to next() did not pop an entry from the cache. Advance to the next from the source. We will determine if + // aggregation is needed in findTop(). + super.next(); + } + + findTop(); + } + + private void findTop() throws IOException { + log.trace("Finding top"); + // Attempt to pop an entry from the cache. If no entries remain, evaluate the next key for potential aggregation. 
if (!popCache()) { - log.trace("No entries in cache"); - if (source.hasTop()) { - log.trace("Source has top, updating cache"); - updateCache(); - } else { - log.trace("Source does not have top"); + if (super.hasTop()) { + workKey.set(super.getTopKey()); + // Check if the current key contains a column marked for aggregation, and is not deleted. If so, rebuild the cache with the relevant aggregated + // entries. + if (columns.contains(workKey) && !workKey.isDeleted()) { + updateCache(); + popCache(); + } } - popCache(); } } @@ -160,9 +187,7 @@ public void next() throws IOException { * Set {@link #topKey} and {@link #topValue} to the next available entry in the cache. Returns true if the cache was not empty, or false otherwise. */ private boolean popCache() { - topKey = null; - topValue = null; - + log.trace("Popping cache"); if (!cache.isEmpty()) { Map.Entry entry = cache.pollFirstEntry(); topKey = entry.getKey(); @@ -176,8 +201,8 @@ private boolean popCache() { * Reset all current tracking variables. */ private void resetCurrent() { - currentRow = null; - currentColumnFamily = null; + currentRow.clear(); + currentColumnFamily.clear(); currentDatatype = null; currentDate = null; currentVisibility = null; @@ -197,40 +222,42 @@ private void updateCache() throws IOException { while (true) { // If the source does not have any more entries, wrap up the last batch of entries. - if (!source.hasTop()) { + if (!super.hasTop()) { log.trace("Source does not have top"); wrapUpCurrent(); return; } - Key key = source.getTopKey(); + workKey.set(super.getTopKey()); if (log.isTraceEnabled()) { - log.trace("updateCache examining key " + key); + log.trace("updateCache examining key " + workKey); } // If the current entry has a different row, column family, or datatype from the previous entry, wrap up and return the current // batch of entries. - if (differsFromPrev(key)) { + if (!partOfCurrentAggregation(workKey)) { wrapUpCurrent(); return; } - // Aggregate the current entry. 
- aggregateCurrent(); + // Aggregate the current entry only if it is not deleted. + if (!workKey.isDeleted()) { + // Aggregate the current entry. + aggregateCurrent(); + } // Advance to the next entry from the source. - source.next(); + super.next(); } } /** - * Return true if the current entry is not the first entry seen in the current call to {@link #updateCache()} and has a different row, column family, or - * datatype from the previous entry, or false otherwise. + * Return true if the current entry has the same row, column family, and datatype from the previous entry, or false otherwise. */ - private boolean differsFromPrev(Key key) { + private boolean partOfCurrentAggregation(Key key) { // Update the current row if null. - if (currentRow == null) { - currentRow = key.getRow(); + if (currentRow.getLength() == 0) { + currentRow.set(key.getRow()); if (log.isTraceEnabled()) { log.trace("Set current row to " + currentRow); } @@ -239,12 +266,12 @@ private boolean differsFromPrev(Key key) { if (log.isTraceEnabled()) { log.trace("Next row " + key.getRow() + " differs from prev " + currentRow); } - return true; + return false; } // Update the current column family if null. 
- if (currentColumnFamily == null) { - currentColumnFamily = key.getColumnFamily(); + if (currentColumnFamily.getLength() == 0) { + currentColumnFamily.set(key.getColumnFamily()); if (log.isTraceEnabled()) { log.trace("Set current column family to " + currentColumnFamily); } @@ -253,7 +280,7 @@ private boolean differsFromPrev(Key key) { if (log.isTraceEnabled()) { log.trace("Next column family " + key.getColumnFamily() + " differs from prev " + currentColumnFamily); } - return true; + return false; } String columnQualifier = key.getColumnQualifier().toString(); @@ -283,20 +310,20 @@ private boolean differsFromPrev(Key key) { if (log.isTraceEnabled()) { log.trace("Next datatype " + datatype + " differs from prev " + currentDatatype); } - return true; + return false; } // Update the current visibility and timestamp. currentVisibility = new ColumnVisibility(key.getColumnVisibility()); currentTimestamp = key.getTimestamp(); - return false; + return true; } /** * Aggregate the current entry. */ private void aggregateCurrent() { - Value value = source.getTopValue(); + Value value = super.getTopValue(); // Fetch the date-frequency map for the current column visibility, creating one if not present. 
DateFrequencyMap dateFrequencies = visibilityToDateFrequencies.computeIfAbsent(currentVisibility, (k) -> new DateFrequencyMap()); @@ -306,7 +333,7 @@ private void aggregateCurrent() { DateFrequencyMap entryMap = new DateFrequencyMap(value.get()); dateFrequencies.incrementAll(entryMap); } catch (IOException e) { - Key key = source.getTopKey(); + Key key = super.getTopKey(); log.error("Failed to parse date frequency map from value for key " + key, e); throw new IllegalArgumentException("Failed to parse date frequency map from value for key " + key, e); } @@ -335,14 +362,14 @@ private void wrapUpCurrent() { log.trace("Wrapping up for row: " + currentRow + ", cf: " + currentColumnFamily + ", cq: " + currentDatatype); } - cache.putAll(buildTopEntries()); + cache.putAll(buildCacheEntries()); resetCurrent(); } /** * Build and return a sorted map of the key-value entries that should be made available to be returned by {@link #next()}. */ - private Map buildTopEntries() { + private Map buildCacheEntries() { if (log.isTraceEnabled()) { log.trace("buildTopKeys, currentRow: " + currentRow); log.trace("buildTopKeys, currentColumnFamily: " + currentColumnFamily); diff --git a/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java b/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java index 4ed7094c..a9e0d471 100644 --- a/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java +++ b/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java @@ -1,7 +1,10 @@ package datawave.iterators; +import static datawave.data.ColumnFamilyConstants.COLF_DESC; +import static datawave.data.ColumnFamilyConstants.COLF_E; import static datawave.data.ColumnFamilyConstants.COLF_F; import static datawave.data.ColumnFamilyConstants.COLF_I; +import static datawave.data.ColumnFamilyConstants.COLF_N; import static datawave.data.ColumnFamilyConstants.COLF_RI; import static datawave.query.util.TestUtils.createDateFrequencyMap; @@ -50,7 +53,7 @@ public 
class FrequencyMetadataAggregatorTest { private AccumuloClient accumuloClient; private Boolean combineColumnVisibilities; - private final List> expected = new ArrayList<>(); + private final List> expected = new ArrayList<>(); private final List mutations = new ArrayList<>(); @BeforeAll @@ -80,7 +83,7 @@ public void tearDown() throws Exception { * aggregated correctly. */ @Test - void testDifferingColumnFamilies() throws TableNotFoundException, IOException { + void testDifferingColumnFamilies() throws TableNotFoundException { // "f" rows. givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); @@ -137,7 +140,7 @@ void testDifferingColumnFamilies() throws TableNotFoundException, IOException { * Verify that entries with the same name, column family, and column visibility are separated by their datatype. */ @Test - void testDifferingDatatypes() throws TableNotFoundException, IOException { + void testDifferingDatatypes() throws TableNotFoundException { // Datatype "csv". givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); @@ -193,7 +196,7 @@ void testDifferingDatatypes() throws TableNotFoundException, IOException { * column visibility by default. */ @Test - public void testDifferingColumnVisibilities() throws TableNotFoundException, IOException { + public void testDifferingColumnVisibilities() throws TableNotFoundException { // Column visibility "FOO". 
givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); @@ -245,11 +248,11 @@ public void testDifferingColumnVisibilities() throws TableNotFoundException, IOE } /** - * Verify that when the iterator option {@link FrequencyMetadataAggregator#COMBINE_VISIBILITIES} is set to true, entries with same field, column family, - * datatype, and date are aggregated and their column visibilities are combined. + * Verify that when the iterator option {@link FrequencyMetadataAggregator#COMBINE_VISIBILITIES_OPTION} is set to true, entries with the same field, column + * family, datatype, and date are aggregated and their column visibilities are combined. */ @Test - public void testCombiningColumnVisibilities() throws TableNotFoundException, IOException { + public void testCombiningColumnVisibilities() throws TableNotFoundException { // Column visibility "FOO". givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); @@ -305,7 +308,7 @@ public void testCombiningColumnVisibilities() throws TableNotFoundException, IOE * Verify that aggregating non-aggregated entries into a previously-aggregated row works correctly. */ @Test - void testAggregatedAndNonAggregatedEntries() throws TableNotFoundException, IOException { + void testAggregatedAndNonAggregatedEntries() throws TableNotFoundException { // Aggregated entry. givenAggregatedRow("NAME", COLF_F, "csv", "FOO", 1499999999L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); @@ -333,7 +336,7 @@ void testAggregatedAndNonAggregatedEntries() throws TableNotFoundException, IOEx * Verify that entries not requiring any aggregation are not modified.
*/ @Test - void testNoAggregationNeeded() throws TableNotFoundException, IOException { + void testNoAggregationNeeded() throws TableNotFoundException { givenAggregatedRow("NAME", COLF_F, "csv", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); givenAggregatedRow("NAME", COLF_I, "csv", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); givenAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); @@ -373,7 +376,7 @@ void testNoAggregationNeeded() throws TableNotFoundException, IOException { * Test aggregation over a more diverse dataset of mixed aggregated and non-aggregated rows. */ @Test - void testDiverseDataset() throws TableNotFoundException, IOException { + void testDiverseDataset() throws TableNotFoundException { givenAggregatedRow("AGE", COLF_F, "num", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); givenAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); givenAggregatedRow("AGE", COLF_I, "num", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); @@ -408,12 +411,71 @@ void testDiverseDataset() throws TableNotFoundException, IOException { assertResults(); } - private void assertResults() throws TableNotFoundException, IOException { + /** + * Verify that scanning over a table with columns that are not to be aggregated result in them being unchanged. 
+ */ + @Test + void testMixedColumns() throws TableNotFoundException { + givenAggregatedRow("AGE", COLF_F, "num", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "num", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("GENDER", COLF_F, "text", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_F, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_I, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + givenNonAggregatedRow("AGE", COLF_F, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "var", "BAR", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new datatype. + givenNonAggregatedRow("GENDER", COLF_F, "text", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new colvis. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. 
+ givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_I, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + + // Non frequency rows that should not be affected by aggregation. + givenMutation("AGE", COLF_E, "num", "BAR", 1400000005L, new Value()); + givenMutation("AGE", COLF_DESC, "num", "BAR", 1400000005L, new Value("age_num description")); + givenMutation("AGE", COLF_E, "lifetime", "BAR", 1400000005L, new Value()); + givenMutation("AGE", COLF_DESC, "lifetime", "BAR", 1400000005L, new Value("age_lifetime description")); + givenMutation("AGE", COLF_DESC, "var", "BAR", 1400000005L, new Value("age_var description")); + givenMutation("JOB", COLF_E, "attr", "BAR", 1400000005L, new Value()); + givenMutation("JOB", COLF_DESC, "attr", "BAR", 1400000005L, new Value("job_attr description")); + givenMutation("GENDER", COLF_DESC, "text", "BAR", 1400000005L, new Value("gender_text description")); + givenMutation("JOB", new Text("m"), "attr", "BAR", 1400000005L, new Value()); + + expect("AGE", COLF_DESC, "lifetime", "BAR", 1400000005L, new Value("age_lifetime description")); + expect("AGE", COLF_DESC, "num", "BAR", 1400000005L, new Value("age_num description")); + expect("AGE", COLF_DESC, "var", "BAR", 1400000005L, new Value("age_var description")); + expect("AGE", COLF_E, "lifetime", "BAR", 1400000005L, new Value()); + expect("AGE", COLF_E, "num", "BAR", 1400000005L, new Value()); + expect("AGE", COLF_F, "lifetime", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "num", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "var", 
"BAR", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("AGE", COLF_I, "lifetime", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_I, "num", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("GENDER", COLF_DESC, "text", "BAR", 1400000005L, new Value("gender_text description")); + expect("GENDER", COLF_F, "text", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("GENDER", COLF_F, "text", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", COLF_DESC, "attr", "BAR", 1400000005L, new Value("job_attr description")); + expect("JOB", COLF_E, "attr", "BAR", 1400000005L, new Value()); + expect("JOB", COLF_F, "attr", "FOO", 1500000004L, createDateFrequencyMap("20200101", 3L)); + expect("JOB", COLF_I, "attr", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", new Text("m"), "attr", "BAR", 1400000005L, new Value()); + expect("NAME", COLF_F, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("NAME", COLF_I, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + assertResults(); + } + + private void assertResults() throws TableNotFoundException { TestUtils.writeMutations(accumuloClient, TABLE_METADATA, mutations); Scanner scanner = createScanner(); - List> actual = new ArrayList<>(); + List> actual = new ArrayList<>(); for (Map.Entry entry : scanner) { - actual.add(new AbstractMap.SimpleEntry<>(entry.getKey(), new DateFrequencyMap(entry.getValue().get()))); + actual.add(new AbstractMap.SimpleEntry<>(entry.getKey(), entry.getValue())); } Assertions.assertEquals(expected, actual); } @@ -422,14 +484,11 @@ private Scanner createScanner() throws TableNotFoundException { Scanner scanner = ScannerHelper.createScanner(accumuloClient, TABLE_METADATA, AUTHS_SET); scanner.setRange(new Range()); - 
scanner.fetchColumnFamily(COLF_F); - scanner.fetchColumnFamily(COLF_I); - scanner.fetchColumnFamily(COLF_RI); - IteratorSetting iteratorSetting = new IteratorSetting(10, FrequencyMetadataAggregator.class); if (combineColumnVisibilities != null) { - iteratorSetting.addOption(FrequencyMetadataAggregator.COMBINE_VISIBILITIES, String.valueOf(combineColumnVisibilities)); + iteratorSetting.addOption(FrequencyMetadataAggregator.COMBINE_VISIBILITIES_OPTION, String.valueOf(combineColumnVisibilities)); } + iteratorSetting.addOption(FrequencyMetadataAggregator.COLUMNS_OPTION, "f,i,ri"); scanner.addScanIterator(iteratorSetting); return scanner; @@ -454,6 +513,10 @@ private void givenMutation(String row, Text colf, String colq, String colv, long } private void expect(String row, Text colf, String colq, String colv, long timestamp, DateFrequencyMap map) { - expected.add(new AbstractMap.SimpleEntry<>(new Key(new Text(row), colf, new Text(colq), new ColumnVisibility(colv), timestamp), map)); + expect(row, colf, colq, colv, timestamp, new Value(WritableUtils.toByteArray(map))); + } + + private void expect(String row, Text colf, String colq, String colv, long timestamp, Value value) { + expected.add(new AbstractMap.SimpleEntry<>(new Key(new Text(row), colf, new Text(colq), new ColumnVisibility(colv), timestamp), value)); } } From 5fef9dff3f6e2fc576a673bd3f343ba0f319127c Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Tue, 16 Apr 2024 11:16:43 -0400 Subject: [PATCH 04/10] Ensure deleted keys are not dropped when aggregating --- .../java/datawave/iterators/FrequencyMetadataAggregator.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java index 6830f944..d2a726cb 100644 --- a/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java +++ b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java @@ -244,6 +244,9 @@ private void 
updateCache() throws IOException { if (!workKey.isDeleted()) { // Aggregate the current entry. aggregateCurrent(); + } else { + // Add the deleted entry to the cache so that it is available for scanning, but do not include it as part of the aggregation. + cache.put(super.getTopKey(), super.getTopValue()); } // Advance to the next entry from the source. From 11a0a69fc8f394c808ccdc30f82102a3f065b37c Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Tue, 16 Apr 2024 18:54:01 -0400 Subject: [PATCH 05/10] Modify seek --- .../FrequencyMetadataAggregator.java | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java index d2a726cb..8cf44682 100644 --- a/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java +++ b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java @@ -11,9 +11,11 @@ import org.apache.accumulo.core.data.ByteSequence; import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.PartialKey; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.IteratorEnvironment; +import org.apache.accumulo.core.iterators.IteratorUtil; import org.apache.accumulo.core.iterators.LongCombiner; import org.apache.accumulo.core.iterators.OptionDescriber; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; @@ -131,8 +133,23 @@ public void init(SortedKeyValueIterator source, Map op @Override public void seek(Range range, Collection columnFamilies, boolean inclusive) throws IOException { - super.seek(range, columnFamilies, inclusive); + // Do not seek to the middle of a value that should be combined. 
+ Range seekRange = IteratorUtil.maximizeStartKeyTimeStamp(range); + + super.seek(seekRange, columnFamilies, inclusive); findTop(); + + if (range.getStartKey() != null) { + while (hasTop() && getTopKey().equals(range.getStartKey(), PartialKey.ROW_COLFAM_COLQUAL_COLVIS) + && getTopKey().getTimestamp() > range.getStartKey().getTimestamp()) { + // Value has a more recent timestamp, pass it up. + next(); + } + } + + while (hasTop() && range.beforeStartKey(getTopKey())) { + next(); + } } @Override From 68e67e4891fe069ba8bf50f811b49a742f7340a7 Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Mon, 3 Jun 2024 19:01:51 -0400 Subject: [PATCH 06/10] Remove bad import --- src/test/java/datawave/query/util/MetadataHelperTest.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/test/java/datawave/query/util/MetadataHelperTest.java b/src/test/java/datawave/query/util/MetadataHelperTest.java index 720a7815..53a465ac 100644 --- a/src/test/java/datawave/query/util/MetadataHelperTest.java +++ b/src/test/java/datawave/query/util/MetadataHelperTest.java @@ -7,7 +7,6 @@ import java.io.File; import java.io.IOException; import java.net.URISyntaxException; -import java.util.AbstractMap; import java.util.ArrayList; import java.util.Collections; import java.util.Date; @@ -27,7 +26,6 @@ import org.apache.accumulo.core.security.Authorizations; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; -import org.apache.http.auth.AUTH; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; From 8933a7ea05cf067917ea8d09eb96f87ccaff6507 Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Tue, 3 Sep 2024 05:12:18 -0400 Subject: [PATCH 07/10] Ensure index markers and legacy formats are not aggregated --- .../FrequencyMetadataAggregatorTest.java | 99 ++++++++++++++++++- src/test/resources/log4j.properties | 2 + 2 files changed, 98 insertions(+), 3 deletions(-) diff --git 
a/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java b/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java index 4d1d3b59..00487149 100644 --- a/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java +++ b/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java @@ -411,6 +411,101 @@ void testDiverseDataset() throws TableNotFoundException { assertResults(); } + /** + * Test aggregation over a dataset that contains index markers. + */ + @Test + void testIndexMarkers() throws TableNotFoundException { + givenAggregatedRow("AGE", COLF_F, "num", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "num", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("GENDER", COLF_F, "text", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_F, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_I, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + givenNonAggregatedRow("AGE", COLF_F, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. 
+ givenNonAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "var", "BAR", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new datatype. + givenNonAggregatedRow("GENDER", COLF_F, "text", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new colvis. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_I, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + + // Entries with index markers that should not be aggregated. 
+ givenMutation("AGE", COLF_I, "num" + NULL_BYTE + "20191230" + NULL_BYTE + "true", "BAR", 1400000005L, new Value()); + givenMutation("JOB", COLF_RI, "attr" + NULL_BYTE + "20190530" + NULL_BYTE + "false", "FOO", 1500000004L, new Value()); + givenMutation("NAME", COLF_I, "attr" + NULL_BYTE + "20171201" + NULL_BYTE + "true", "BAR", 1500000004L, new Value()); + + + expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "var" + NULL_BYTE + "AGGREGATED", "BAR", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_I, "num" + NULL_BYTE + "20191230" + NULL_BYTE + "true", "BAR", 1400000005L, new Value()); + expect("AGE", COLF_I, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 3L)); + expect("JOB", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", COLF_RI, "attr" + NULL_BYTE + "20190530" + NULL_BYTE + "false", "FOO", 1500000004L, new Value()); + expect("NAME", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("NAME", COLF_I, "attr" + 
NULL_BYTE + "20171201" + NULL_BYTE + "true", "BAR", 1500000004L, new Value()); + expect("NAME", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + assertResults(); + } + + /** + * Test aggregation over a dataset that contains legacy formats. + */ + @Test + void testLegacyFormats() throws TableNotFoundException { + givenAggregatedRow("AGE", COLF_F, "num", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "num", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("GENDER", COLF_F, "text", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_F, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_I, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + givenNonAggregatedRow("AGE", COLF_F, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. 
+ givenNonAggregatedRow("AGE", COLF_F, "var", "BAR", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new datatype. + givenNonAggregatedRow("GENDER", COLF_F, "text", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new colvis. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_I, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + + // Entries with legacy formats that should not be aggregated. + givenMutation("AGE", COLF_I, "num", "BAR", 1400000005L, new Value()); + givenMutation("JOB", COLF_RI, "attr" + NULL_BYTE + "FakeTypeClassName", "FOO", 1500000004L, new Value()); + + + expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "var" + NULL_BYTE + "AGGREGATED", "BAR", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_I, "num", "BAR", 1400000005L, new Value()); + expect("AGE", COLF_I, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, 
createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 3L)); + expect("JOB", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", COLF_RI, "attr" + NULL_BYTE + "FakeTypeClassName", "FOO", 1500000004L, new Value()); + expect("NAME", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("NAME", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + assertResults(); + } + + /** * Verify that scanning over a table with columns that are not to be aggregated result in them being unchanged. */ @@ -476,10 +571,8 @@ private void assertResults() throws TableNotFoundException { List> actual = new ArrayList<>(); for (Map.Entry entry : scanner) { actual.add(new AbstractMap.SimpleEntry<>(entry.getKey(), entry.getValue())); - System.out.println("Key: '" + entry.getKey() + "'"); + System.out.println("Key: " + entry.getKey()); } - System.out.println("Expected:"); - expected.forEach(e -> System.out.println("Key: '" + e.getKey() + "'")); Assertions.assertEquals(expected, actual); } diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties index cacd01b4..a83eb239 100644 --- a/src/test/resources/log4j.properties +++ b/src/test/resources/log4j.properties @@ -4,3 +4,5 @@ log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender log4j.appender.CONSOLE.Threshold=INFO log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-5p [%C{1}:%M] %m%n + +log4j.logger.datawave.iterators.FrequencyMetadataAggregatorTest=trace From 8cf9be7a50e560336ef252ba843e50d877055b32 Mon Sep 
17 00:00:00 2001 From: Laura Schanno Date: Tue, 3 Sep 2024 05:12:40 -0400 Subject: [PATCH 08/10] Code formatting --- .../FrequencyMetadataAggregator.java | 2 +- .../query/util/AllFieldMetadataHelper.java | 6 +-- .../datawave/query/util/MetadataHelper.java | 5 +- .../FrequencyMetadataAggregatorTest.java | 54 ++++++++++++------- .../query/util/MetadataHelperTest.java | 2 +- 5 files changed, 42 insertions(+), 27 deletions(-) diff --git a/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java index 463b0c60..2589acad 100644 --- a/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java +++ b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java @@ -10,7 +10,6 @@ import java.util.Set; import java.util.TreeMap; -import datawave.util.time.DateHelper; import org.apache.accumulo.core.data.ByteSequence; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.PartialKey; @@ -33,6 +32,7 @@ import datawave.marking.MarkingFunctions; import datawave.query.model.DateFrequencyMap; import datawave.util.StringUtils; +import datawave.util.time.DateHelper; /** * Aggregates entries in the metadata table for the "f", "i", and "ri" columns. 
When initially ingested, entries for these columns have a column qualifier with diff --git a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java index 2d6c0cf4..04d4796b 100644 --- a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java +++ b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java @@ -24,7 +24,6 @@ import java.util.TreeSet; import java.util.concurrent.ExecutionException; -import datawave.iterators.FrequencyMetadataAggregator; import org.apache.accumulo.core.client.AccumuloClient; import org.apache.accumulo.core.client.IteratorSetting; import org.apache.accumulo.core.client.Scanner; @@ -60,6 +59,7 @@ import datawave.data.ColumnFamilyConstants; import datawave.data.type.Type; import datawave.data.type.TypeFactory; +import datawave.iterators.FrequencyMetadataAggregator; import datawave.query.composite.CompositeMetadata; import datawave.query.composite.CompositeMetadataHelper; import datawave.query.model.DateFrequencyMap; @@ -857,7 +857,8 @@ protected HashMap getCountsByFieldInDayWithTypes(Entry (aggregated entries) and/or \0 (non-aggregated entries). + // It's possible to find rows with column qualifiers in the format (aggregated entries) and/or \0 (non-aggregated + // entries). // Filter out any non-aggregated entries that does not have the date in the column qualifier. IteratorSetting cqRegex = new IteratorSetting(50, RegExFilter.class); // Allow any entries that do not contain the null byte delimiter, or contain it with the target date directly afterwards. @@ -1638,7 +1639,6 @@ private void addToTargetMap(String datatype, DateFrequencyMap aggregatedCounts) } } - /** * Add the current date and count to the current target map for the given datatype. 
*/ diff --git a/src/main/java/datawave/query/util/MetadataHelper.java b/src/main/java/datawave/query/util/MetadataHelper.java index 1ae7ebf5..ed5a5f67 100644 --- a/src/main/java/datawave/query/util/MetadataHelper.java +++ b/src/main/java/datawave/query/util/MetadataHelper.java @@ -26,7 +26,6 @@ import java.util.stream.Collectors; import java.util.stream.StreamSupport; -import datawave.iterators.FrequencyMetadataAggregator; import org.apache.accumulo.core.client.AccumuloClient; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; @@ -74,6 +73,7 @@ import datawave.data.MetadataCardinalityCounts; import datawave.data.type.Type; import datawave.iterators.EdgeMetadataCombiner; +import datawave.iterators.FrequencyMetadataAggregator; import datawave.iterators.MetadataFColumnSeekingFilter; import datawave.iterators.filter.EdgeMetadataCQStrippingIterator; import datawave.marking.MarkingFunctions; @@ -1754,7 +1754,8 @@ public Date getEarliestOccurrenceOfFieldWithType(String fieldName, final String * a wrapped AccumuloClient * @return the earliest date the field is found, or null otherwise */ - protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final String datatypeFilter, AccumuloClient client, WrappedAccumuloClient wrappedClient) { + protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final String datatypeFilter, AccumuloClient client, + WrappedAccumuloClient wrappedClient) { String earliestDate = null; String prevDatatype = null; boolean skipToAggregated = false; diff --git a/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java b/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java index 00487149..4e3d1598 100644 --- a/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java +++ b/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java @@ -299,7 +299,8 @@ public void testCombiningColumnVisibilities() throws 
TableNotFoundException { // Enable to option to combine visibilities. givenCombineColumnVisibilitiesIsTrue(); - expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "BAR&COB&FOO", 1500000015L, createDateFrequencyMap("20200101", 16L, "20200102", 26L, "20200103", 24L, "20200104", 16L)); + expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "BAR&COB&FOO", 1500000015L, + createDateFrequencyMap("20200101", 16L, "20200102", 26L, "20200103", 24L, "20200104", 16L)); assertResults(); } @@ -327,7 +328,8 @@ void testAggregatedAndNonAggregatedEntries() throws TableNotFoundException { givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200103", 3L); givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200103", 3L); - expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 40L, "20200101", 19L, "20200102", 30L, "20200103", 12L)); + expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 40L, "20200101", 19L, "20200102", 30L, "20200103", 12L)); assertResults(); } @@ -353,19 +355,26 @@ void testNoAggregationNeeded() throws TableNotFoundException { givenAggregatedRow("GENDER", COLF_I, "attr", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); givenAggregatedRow("GENDER", COLF_RI, "attr", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); - expect("GENDER", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); - expect("GENDER", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); - expect("GENDER", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); - expect("GENDER", COLF_I, "attr" + 
NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); - expect("GENDER", COLF_RI, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); - expect("GENDER", COLF_RI, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999995L, + createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, + createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999995L, + createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, + createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_RI, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999995L, + createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_RI, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, + createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); expect("NAME", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); expect("NAME", COLF_F, "wiki" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); expect("NAME", COLF_I, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); expect("NAME", 
COLF_I, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); expect("NAME", COLF_I, "wiki" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); - expect("NAME", COLF_RI, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + expect("NAME", COLF_RI, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, + createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); expect("NAME", COLF_RI, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); expect("NAME", COLF_RI, "wiki" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); @@ -396,10 +405,12 @@ void testDiverseDataset() throws TableNotFoundException { givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. givenNonAggregatedRow("JOB", COLF_I, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. 
- expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("AGE", COLF_F, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("AGE", COLF_F, "var" + NULL_BYTE + "AGGREGATED", "BAR", 1500000004L, createDateFrequencyMap("20200101", 1L)); - expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("AGE", COLF_I, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); @@ -440,11 +451,12 @@ void testIndexMarkers() throws TableNotFoundException { givenMutation("JOB", COLF_RI, "attr" + NULL_BYTE + "20190530" + NULL_BYTE + "false", "FOO", 1500000004L, new Value()); givenMutation("NAME", COLF_I, "attr" + NULL_BYTE + "20171201" + NULL_BYTE + "true", "BAR", 1500000004L, new Value()); - - expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("AGE", COLF_F, "num" + NULL_BYTE + "AGGREGATED", "FOO", 
1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("AGE", COLF_F, "var" + NULL_BYTE + "AGGREGATED", "BAR", 1500000004L, createDateFrequencyMap("20200101", 1L)); - expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("AGE", COLF_I, "num" + NULL_BYTE + "20191230" + NULL_BYTE + "true", "BAR", 1400000005L, new Value()); expect("AGE", COLF_I, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); @@ -487,11 +499,12 @@ void testLegacyFormats() throws TableNotFoundException { givenMutation("AGE", COLF_I, "num", "BAR", 1400000005L, new Value()); givenMutation("JOB", COLF_RI, "attr" + NULL_BYTE + "FakeTypeClassName", "FOO", 1500000004L, new Value()); - - expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("AGE", COLF_F, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("AGE", COLF_F, "var" + NULL_BYTE + "AGGREGATED", "BAR", 1500000004L, createDateFrequencyMap("20200101", 1L)); - expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + 
createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("AGE", COLF_I, "num", "BAR", 1400000005L, new Value()); expect("AGE", COLF_I, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); @@ -505,7 +518,6 @@ void testLegacyFormats() throws TableNotFoundException { assertResults(); } - /** * Verify that scanning over a table with columns that are not to be aggregated result in them being unchanged. */ @@ -546,10 +558,12 @@ void testMixedColumns() throws TableNotFoundException { expect("AGE", COLF_DESC, "var", "BAR", 1400000005L, new Value("age_var description")); expect("AGE", COLF_E, "lifetime", "BAR", 1400000005L, new Value()); expect("AGE", COLF_E, "num", "BAR", 1400000005L, new Value()); - expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("AGE", COLF_F, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("AGE", COLF_F, "var" + NULL_BYTE + "AGGREGATED", "BAR", 1500000004L, createDateFrequencyMap("20200101", 1L)); - expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("AGE", COLF_I, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); expect("GENDER", 
COLF_DESC, "text", "BAR", 1400000005L, new Value("gender_text description")); expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); diff --git a/src/test/java/datawave/query/util/MetadataHelperTest.java b/src/test/java/datawave/query/util/MetadataHelperTest.java index a99ad105..ef5dc457 100644 --- a/src/test/java/datawave/query/util/MetadataHelperTest.java +++ b/src/test/java/datawave/query/util/MetadataHelperTest.java @@ -16,7 +16,6 @@ import java.util.Objects; import java.util.Set; -import datawave.iterators.FrequencyMetadataAggregator; import org.apache.accumulo.core.client.AccumuloClient; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; @@ -39,6 +38,7 @@ import datawave.accumulo.inmemory.InMemoryAccumuloClient; import datawave.accumulo.inmemory.InMemoryInstance; +import datawave.iterators.FrequencyMetadataAggregator; import datawave.query.composite.CompositeMetadataHelper; import datawave.query.model.DateFrequencyMap; import datawave.util.time.DateHelper; From d9261ae6ff36bc8e0e20e0580dda6c2537d86be6 Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Tue, 3 Sep 2024 05:45:04 -0400 Subject: [PATCH 09/10] Add back tests --- .../query/util/AllFieldMetadataHelper.java | 1 - .../datawave/query/util/MetadataHelper.java | 3 - .../util/AllFieldMetadataHelperTest.java | 2450 +++++++++++++---- 3 files changed, 1971 insertions(+), 483 deletions(-) diff --git a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java index 04d4796b..554ef841 100644 --- a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java +++ b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java @@ -32,7 +32,6 @@ import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.user.RegExFilter; 
-import org.apache.accumulo.core.iterators.user.SummingCombiner; import org.apache.accumulo.core.security.Authorizations; import org.apache.commons.lang3.builder.ToStringBuilder; import org.apache.commons.lang3.tuple.Pair; diff --git a/src/main/java/datawave/query/util/MetadataHelper.java b/src/main/java/datawave/query/util/MetadataHelper.java index ed5a5f67..f5f1f15a 100644 --- a/src/main/java/datawave/query/util/MetadataHelper.java +++ b/src/main/java/datawave/query/util/MetadataHelper.java @@ -1,7 +1,6 @@ package datawave.query.util; import java.io.ByteArrayInputStream; -import java.io.Console; import java.io.DataInputStream; import java.io.IOException; import java.nio.charset.CharacterCodingException; @@ -17,7 +16,6 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; -import java.util.SortedMap; import java.util.SortedSet; import java.util.TimeZone; import java.util.TreeSet; @@ -47,7 +45,6 @@ import org.apache.accumulo.core.security.Authorizations; import org.apache.accumulo.core.security.ColumnVisibility; import org.apache.commons.lang3.time.DateUtils; -import org.apache.hadoop.fs.shell.Concat; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; import org.slf4j.Logger; diff --git a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java index 70b45c29..14e08cbd 100644 --- a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java +++ b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java @@ -1,10 +1,15 @@ package datawave.query.util; +import static datawave.data.ColumnFamilyConstants.COLF_F; +import static datawave.query.util.TestUtils.createDateFrequencyMap; +import static datawave.query.util.TestUtils.createRangedDateFrequencyMap; +import static org.apache.accumulo.core.iterators.LongCombiner.VAR_LEN_ENCODER; + import java.io.File; import java.io.IOException; import java.net.URISyntaxException; +import 
java.util.AbstractMap; import java.util.ArrayList; -import java.util.Calendar; import java.util.Collection; import java.util.Collections; import java.util.Date; @@ -26,14 +31,16 @@ import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.accumulo.core.data.Mutation; import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.iterators.user.SummingCombiner; import org.apache.accumulo.core.security.Authorizations; import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; @@ -43,7 +50,9 @@ import datawave.accumulo.inmemory.InMemoryAccumuloClient; import datawave.accumulo.inmemory.InMemoryInstance; import datawave.data.type.LcNoDiacriticsType; +import datawave.iterators.FrequencyMetadataAggregator; import datawave.query.composite.CompositeMetadataHelper; +import datawave.query.model.DateFrequencyMap; import datawave.query.model.FieldIndexHole; import datawave.util.time.DateHelper; @@ -55,6 +64,8 @@ class AllFieldMetadataHelperTest { private AccumuloClient accumuloClient; private AllFieldMetadataHelper helper; + private final List mutations = new ArrayList<>(); + @BeforeAll static void beforeAll() throws URISyntaxException { File dir = new File(Objects.requireNonNull(ClassLoader.getSystemClassLoader().getResource(".")).toURI()); @@ -101,15 +112,150 @@ private void writeMutations(Collection mutations) { } /** - * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and - * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)}. + * Write the given mutations to the metadata table. 
+ */ + private void writeMutations() { + TestUtils.writeMutations(accumuloClient, TABLE_METADATA, mutations); + } + + private void givenNonAggregatedFrequencyRows(String row, String colf, String datatype, String startDate, String endDate, long count) { + givenNonAggregatedFrequencyRows(row, new Text(colf), datatype, startDate, endDate, count); + } + + private void givenNonAggregatedFrequencyRows(String row, Text colf, String datatype, String startDate, String endDate, long count) { + Mutation mutation = new Mutation(row); + Value value = new Value(VAR_LEN_ENCODER.encode(count)); + List dates = TestUtils.getDatesInRange(startDate, endDate); + dates.forEach((date) -> mutation.put(colf, new Text(datatype + NULL_BYTE + date), value)); + givenMutation(mutation); + } + + private void givenIndexMarkerMutation(String row, String colf, String datatype, String date, boolean indexed) { + Mutation mutation = new Mutation(row); + mutation.put(colf, datatype + NULL_BYTE + date + NULL_BYTE + indexed, new Value()); + mutations.add(mutation); + } + + private void givenIndexMarkerMutation(String row, String colf, String datatype, String date) { + Mutation mutation = new Mutation(row); + mutation.put(colf, datatype, DateHelper.parse(date).getTime(), new Value()); + mutations.add(mutation); + } + + private void givenIndexMarkerMutation(String row, String colf, String datatype, String date, Class typeClass) { + Mutation mutation = new Mutation(row); + mutation.put(colf, datatype + NULL_BYTE + typeClass.getName(), DateHelper.parse(date).getTime(), new Value()); + mutations.add(mutation); + } + + private void givenAggregatedFrequencyRow(String row, String colf, String datatype, DateFrequencyMap map) { + givenAggregatedFrequencyRow(row, new Text(colf), datatype, map); + } + + private void givenAggregatedFrequencyRow(String row, Text colf, String datatype, DateFrequencyMap map) { + Mutation mutation = new Mutation(row); + Value value = new Value(WritableUtils.toByteArray(map)); + 
mutation.put(colf, new Text(datatype + NULL_BYTE + FrequencyMetadataAggregator.AGGREGATED), value); + givenMutation(mutation); + } + + private void givenMutation(Mutation mutation) { + this.mutations.add(mutation); + } + + /** + * Tests for {@link AllFieldMetadataHelper#getCountsByFieldInDayWithTypes(Map.Entry)}. */ @Nested - public class FieldIndexHoleTests { + public class CountsByFieldInDayWithTypesTests { + + /** + * Test against a table that has only non-aggregated entries as matches. + */ + @Test + void testNonAggregatedEntriesOnly() throws TableNotFoundException, IOException { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 2L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200120", 3L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200101", "20200102", 3L); // Does not contain target date. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 1L); + expected.put("wiki", 2L); + expected.put("maze", 3L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes(new AbstractMap.SimpleEntry<>("NAME", "20200110")); + + Assertions.assertEquals(expected, actual); + } - private Set fields = new HashSet<>(); - private Set datatypes = new HashSet<>(); - private double minimumThreshold = 1.0d; + /** + * Test against a table that has only aggregated entries as matches. 
+ */ + @Test + void testAggregatedEntriesOnly() throws TableNotFoundException, IOException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); // Does not contain target date. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 5L); + expected.put("wiki", 15L); + expected.put("maze", 55L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes(new AbstractMap.SimpleEntry<>("NAME", "20200102")); + + Assertions.assertEquals(expected, actual); + } + + /** + * Test against a table that has both aggregated and non-aggregated entries as matches. + */ + @Test + void testMixedEntryFormats() throws TableNotFoundException, IOException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); // Should get summed into previous. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200115", "20200120", 3L); // Does not have entry for 20200102, should not be incremented. + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); // Does not contain target date. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200103", "20200105", 3L); // Does not contain target date. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 6L); + expected.put("wiki", 15L); + expected.put("maze", 55L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes(new AbstractMap.SimpleEntry<>("NAME", "20200102")); + + Assertions.assertEquals(expected, actual); + } + } + + /** + * Base class for field index hole tests. 
+ */ + public abstract class AbstractFieldIndexHoleTests { + + protected Set fields = new HashSet<>(); + protected Set datatypes = new HashSet<>(); + protected double minimumThreshold = 1.0d; protected final Supplier>> INDEX_FUNCTION = () -> { try { @@ -138,6 +284,44 @@ void tearDown() { givenMinimumThreshold(1.0d); } + protected void givenFields(String... fields) { + this.fields = Sets.newHashSet(fields); + } + + protected void givenDatatypes(String... datatypes) { + this.datatypes = Sets.newHashSet(datatypes); + } + + protected void givenMinimumThreshold(double minimumThreshold) { + this.minimumThreshold = minimumThreshold; + } + + protected Map> createFieldIndexHoleMap(FieldIndexHole... holes) { + Map> fieldIndexHoles = new HashMap<>(); + for (FieldIndexHole hole : holes) { + Map datatypeMap = fieldIndexHoles.computeIfAbsent(hole.getFieldName(), k -> new HashMap<>()); + datatypeMap.put(hole.getDatatype(), hole); + } + return fieldIndexHoles; + } + + @SafeVarargs + protected final FieldIndexHole createFieldIndexHole(String field, String datatype, Pair... dateRanges) { + return new FieldIndexHole(field, datatype, Sets.newHashSet(dateRanges)); + } + + protected Pair dateRange(String start, String end) { + return Pair.of(DateHelper.parse(start), DateHelper.parse(end)); + } + } + + /** + * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and + * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)}. + */ + @Nested + public class FieldIndexHoleTestsForNonAggregatedEntries extends AbstractFieldIndexHoleTests { + /** * Test against data that has no field index holes. */ @@ -145,20 +329,19 @@ void tearDown() { @ValueSource(strings = {"i", "ri"}) void testNoFieldIndexHoles(String cf) { // Create a series of frequency rows over date ranges, each with a matching index row for each date. 
- FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("EVENT_DATE", "csv", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "csv", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200101", "20200120", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 1L); + 
givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200101", "20200120", 1L); + writeMutations(); // Verify that no index holes were found. Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); @@ -171,11 +354,10 @@ void testNoFieldIndexHoles(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -190,12 +372,11 @@ void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200105", 1L); // Make the index counts a value that will not meet the threshold. 
- mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200105", 1L); // Make the index counts a value that will not meet the threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -210,12 +391,11 @@ void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -231,13 +411,12 @@ void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) 
void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105", 5L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -252,12 +431,11 @@ void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + 
givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -273,13 +451,12 @@ void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200105", 1L); // Will not meet threshold. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -295,14 +472,13 @@ void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleWithNotIndexedMarker(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L); - mutationCreator.addIndexMarkerMutation(cf, "NAME", "wiki", "20200109", false); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200109", false); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -319,13 +495,12 @@ void testFieldIndexHoleWithNotIndexedMarker(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleWithIndexedMarker(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - 
mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 1L); - mutationCreator.addIndexMarkerMutation(cf, "NAME", "wiki", "20200103", true); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", true); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -342,13 +517,12 @@ void testFieldIndexHoleWithIndexedMarker(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleWithIndexedMarkerSansDate(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 1L); - mutationCreator.addIndexMarkerMutation(cf, "NAME", "wiki", "20200103"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103"); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", 
"20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -365,13 +539,12 @@ void testFieldIndexHoleWithIndexedMarkerSansDate(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleWithIndexedMarkerOldTypeFormat(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 1L); - mutationCreator.addIndexMarkerMutation(cf, "NAME", "wiki", "20200103", LcNoDiacriticsType.class); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", LcNoDiacriticsType.class); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -383,14 +556,13 @@ void testFieldIndexHoleWithIndexedMarkerOldTypeFormat(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleWithIndexedMarkerAndMissingFrequency(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200102", 1L); - mutationCreator.addIndexMarkerMutation(cf, "NAME", "wiki", "20200103", true); - 
mutationCreator.addFrequencyMutations("NAME", "wiki", "20200104", "20200110", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200102", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", true); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200104", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -405,24 +577,23 @@ void testFieldIndexHoleWithIndexedMarkerAndMissingFrequency(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testMixedDateGapsAndNonIndexedFields(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Not indexed nor covers full range for NAME - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -441,14 +612,13 @@ void testMixedDateGapsAndNonIndexedFields(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200106", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 5L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200106", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -463,12 +633,11 @@ void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200106", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200113", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200117", "20200118", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200106", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200113", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200117", "20200118", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -487,16 +656,15 @@ void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200106", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200109", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200113", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200114", "20200116", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200117", "20200118", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200119", "20200120", 1L); // Will not meet threshold. - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200106", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200109", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200113", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200114", "20200116", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200117", "20200118", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200119", "20200120", 1L); // Will not meet threshold. 
+ writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -516,12 +684,11 @@ void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 1L); - mutationCreator.addFrequencyMutations("ZETA", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 1L); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -538,13 +705,12 @@ void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String c @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200105", 1L); // Will not meet threshold. 
- mutationCreator.addFrequencyMutations("ZETA", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -560,12 +726,11 @@ void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200110", "20200115", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200113", "20200115", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200110", "20200115", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200113", "20200115", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -581,14 +746,13 @@ void 
testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105", 1L); // Will not meet threshold. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200112", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200113", "20200115", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200112", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200113", "20200115", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -604,12 +768,11 @@ void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testAllDatesAreIndexHoles_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 1L); - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 1L); - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 1L); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -628,16 +791,15 @@ void testAllDatesAreIndexHoles_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testAllDatesAreIndexHoles_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200105", 1L); // Will not meet threshold. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200115", 1L); // Will not meet threshold. 
- mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200125", 1L); // Will not meet threshold. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200328", 1L); // Will not meet threshold. - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200115", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200125", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200328", 1L); // Will not meet threshold. + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -656,26 +818,25 @@ void testAllDatesAreIndexHoles_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testSingularDayIndexHoles_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. 
- mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 1L); // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 1L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 1L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
- mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 1L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 1L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 1L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 1L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 1L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 1L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 1L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 1L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -695,34 +856,33 @@ void testSingularDayIndexHoles_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testSingularDayIndexHoles_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200113", "20200113", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200113", "20200113", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200303", "20200303", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -742,31 +902,30 @@ void testSingularDayIndexHoles_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testMixedDateGapsAndThresholdIndexHoles(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -788,31 +947,30 @@ void testMixedDateGapsAndThresholdIndexHoles(String cf) { void testMinimumThresholdPercentageBelow100(String cf) { givenMinimumThreshold(0.75); // Index count must meet 75% of frequency count to not be considered field index hole. - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 100L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 75L); // Meets 75% threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 100L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 100L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 75L); // Meets 75% threshold. 
+ givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 100L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 74L); // Will not meet threshold. // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 100L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 74L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 75L); // Meets 75% threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 100L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 100L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 75L); // Meets 75% threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 100L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 100L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 98L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 74L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 75L); // Meets 75% threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 100L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 98L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 75L); // Meets 75% threshold. // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
- mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 100L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 100L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 74L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 90L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 75L); // Meets 75% threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 74L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 99L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 100L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 100L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 90L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 75L); // Meets 75% threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 99L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -835,31 +993,30 @@ void testOneFieldSpecified(String cf) { // Retrieve field index holes for field NAME. givenFields("NAME"); - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. 
- mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -879,39 +1036,38 @@ void testMultipleFieldsSpecified(String cf) { // Retrieve field index holes for fields URI and EVENT_DATE. givenFields("URI", "EVENT_DATE"); - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for ALPHA-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("ALPHA", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200113", "20200113", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200303", "20200303", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); // Index hole for ZETA-wiki on 20200122. - mutationCreator.addFrequencyMutations("ZETA", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ZETA", "wiki", "20200123", "20200125", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200123", "20200125", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -932,55 +1088,54 @@ void testDatatypesSpecified(String cf) { // Retrieve field index holes for datatypes wiki and csv. givenDatatypes("wiki", "csv"); - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200105", "20200105", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for ALPHA-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("ALPHA", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200113", "20200113", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index hole for EVENT_DATE-maze on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200303", "20200303", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); // Index hole for ZETA-csv on 20200122. - mutationCreator.addFrequencyMutations("ZETA", "csv", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200123", "20200125", 5L); // Index hole for ZETA-imdb on 20200122. 
- mutationCreator.addFrequencyMutations("ZETA", "imdb", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200123", "20200125", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "imdb", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200123", "20200125", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -1004,55 +1159,54 @@ void testFieldsAndDatatypesSpecified(String cf) { // Retrieve field index holes for datatypes wiki and csv. givenDatatypes("wiki", "csv"); - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200103", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for ALPHA-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("ALPHA", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200113", "20200113", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index hole for EVENT_DATE-maze on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
- mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200303", "20200303", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); // Index hole for ZETA-csv on 20200122. - mutationCreator.addFrequencyMutations("ZETA", "csv", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200123", "20200125", 5L); // Index hole for ZETA-imdb on 20200122. - mutationCreator.addFrequencyMutations("ZETA", "imdb", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200123", "20200125", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "imdb", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200123", "20200125", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -1062,120 +1216,1458 @@ void testFieldsAndDatatypesSpecified(String cf) { // @formatter:on Assertions.assertEquals(expected, fieldIndexHoles); } - - private void givenFields(String... fields) { - this.fields = Sets.newHashSet(fields); - } - - private void givenDatatypes(String... datatypes) { - this.datatypes = Sets.newHashSet(datatypes); - } - - private void givenMinimumThreshold(double minimumThreshold) { - this.minimumThreshold = minimumThreshold; - } - - protected Map> createFieldIndexHoleMap(FieldIndexHole... 
holes) { - Map> fieldIndexHoles = new HashMap<>(); - for (FieldIndexHole hole : holes) { - Map datatypeMap = fieldIndexHoles.computeIfAbsent(hole.getFieldName(), k -> new HashMap<>()); - datatypeMap.put(hole.getDatatype(), hole); - } - return fieldIndexHoles; - } - - @SafeVarargs - protected final FieldIndexHole createFieldIndexHole(String field, String datatype, Pair... dateRanges) { - return new FieldIndexHole(field, datatype, Sets.newHashSet(dateRanges)); - } - - protected Pair dateRange(String start, String end) { - return Pair.of(DateHelper.parse(start), DateHelper.parse(end)); - } } /** - * Helper class for creating mutations in bulk for field index hole tests. + * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and + * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)} where the metadata table contains aggregated entries only. */ - private static class FieldIndexHoleMutationCreator { - - private final List mutations = new ArrayList<>(); + @Nested + public class FieldIndexHoleTestsForAggregatedEntries extends AbstractFieldIndexHoleTests { - private void addFrequencyMutations(String fieldName, String datatype, String startDate, String endDate, long count) { - List dates = getDatesInRange(startDate, endDate); - dates.forEach(date -> addMutation(fieldName, "f", datatype, date, count)); + /** + * Test against data that has no field index holes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testNoFieldIndexHoles(String cf) { + // Create a series of frequency rows over date ranges, each with a matching index row for each date. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + writeMutations(); + + // Verify that no index holes were found. + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + Assertions.assertTrue(fieldIndexHoles.isEmpty()); } - private void addIndexMutations(String cf, String fieldName, String datatype, String startDate, String endDate, long count) { - List dates = getDatesInRange(startDate, endDate); - dates.forEach(date -> addMutation(fieldName, cf, datatype, date, count)); + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addIndexMarkerMutation(String cf, String fieldName, String datatype, String endDate, boolean indexed) { - addMutation(fieldName, cf, datatype, endDate, indexed); + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on the threshold requirement. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); // Make the index counts a value that will not meet the threshold. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addIndexMarkerMutation(String cf, String fieldName, String datatype, String endDate) { - addMutation(fieldName, cf, datatype, endDate); + /** + * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType combination based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200104", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addIndexMarkerMutation(String cf, String fieldName, String datatype, String endDate, Class typeClass) { - addMutation(fieldName, cf, datatype, endDate, typeClass); + /** + * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType 
combination based on the + * threshold requirement. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200104", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private List getDatesInRange(String startDateStr, String endDateStr) { - Date startDate = DateHelper.parse(startDateStr); - Date endDate = DateHelper.parse(endDateStr); - - List dates = new ArrayList<>(); - dates.add(startDateStr); - - Calendar calendar = Calendar.getInstance(); - calendar.setTime(startDate); - while (true) { - calendar.add(Calendar.DAY_OF_MONTH, 1); - Date date = calendar.getTime(); - if (date.before(endDate) || date.equals(endDate)) { - dates.add(DateHelper.format(date)); - } else { - break; - } - } + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); - return dates; + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addMutation(String row, String columnFamily, String datatype, String date, long count) { - Mutation mutation = new Mutation(row); - mutation.put(columnFamily, datatype + NULL_BYTE + date, new Value(SummingCombiner.VAR_LEN_ENCODER.encode(count))); - mutations.add(mutation); + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on the + * threshold requirement. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addMutation(String row, String columnFamily, String datatype, String date, boolean indexed) { - Mutation mutation = new Mutation(row); - mutation.put(columnFamily, datatype + NULL_BYTE + date + NULL_BYTE + indexed, new Value()); - mutations.add(mutation); + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200107", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addMutation(String row, String columnFamily, String datatype, String date) { - Mutation mutation = new Mutation(row); - mutation.put(columnFamily, datatype, getTimestamp(date), new Value()); - mutations.add(mutation); + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200104", "20200106", 1L, "20200107", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addMutation(String row, String columnFamily, String datatype, String date, Class type) { - Mutation mutation = new Mutation(row); - mutation.put(columnFamily, datatype + NULL_BYTE + type.getName(), getTimestamp(date), new Value()); - mutations.add(mutation); + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on both date + * gaps and the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200105", "20200106", 1L, "20200107", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private long getTimestamp(String date) { - return DateHelper.parse(date).getTime(); + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200104", "20200106", 1L, "20200110", "20200113", 1L, "20200117", "20200118", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } - private List getMutations() { - return mutations; - } - } - + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200104", "20200106", 5L, "20200107", + "20200109", 1L, "20200110", "20200113", 5L, "20200114", "20200116", 1L, "20200117", "20200118", 5L, "20200119", "20200120", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination based on + * date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination based on + * the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200105", 1L)); + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L, "20200110", "20200115", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200113", "20200115", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L, "20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200104", "20200105", 1L, "20200110", + "20200112", 1L, "20200113", "20200115", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where everything is an index hole based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where 
everything is an index hole based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); // Will not meet threshold. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); // Will not meet threshold. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); // Will not meet threshold. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); // Will not meet threshold. + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_dateGaps(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L, "20200104", "20200104", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200111", "20200112", 1L, "20200114", "20200115", 1L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200121", 1L, "20200123", "20200125", 1L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 1L, "20200222", "20200302", 1L, "20200304", + "20200315", 1L, "20200317", "20200328", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based 
on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_threshold(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on both dates and the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMixedDateGapsAndThresholdIndexHoles(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200104", "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", + createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying a minimum percentage threshold other than the default of 1.0. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMinimumThresholdPercentageBelow100(String cf) { + givenMinimumThreshold(0.75); // Index count must meet 75% of frequency count to not be considered field index hole. + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 100L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 75L, "20200104", "20200104", 100L, "20200105", "20200105", 74L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 100L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", + createRangedDateFrequencyMap("20200110", "20200110", 74L, "20200111", "20200112", 75L, "20200114", "20200115", 100L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 100L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 98L, "20200122", "20200122", 74L, "20200123", "20200125", 75L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 100L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 100L, "20200221", "20200221", 74L, "20200222", + "20200302", 90L, "20200304", "20200315", 75L, "20200316", "20200316", 74L, "20200317", "20200328", 99L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying one field to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testOneFieldSpecified(String cf) { + // Retrieve field index holes for field NAME. + givenFields("NAME"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200104", "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", + createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying multiple fields to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldsSpecified(String cf) { + // Retrieve field index holes for fields URI and EVENT_DATE. + givenFields("URI", "EVENT_DATE"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. 
+ givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-wiki on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying datatypes. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testDatatypesSpecified(String cf) { + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-maze on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for EVENT_DATE-maze on 20200122. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-csv on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for ZETA-imdb on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "imdb", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "imdb", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ALPHA", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying fields and datatypes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldsAndDatatypesSpecified(String cf) { + // Retrieve field index holes for fields NAME and ZETA. + givenFields("NAME", "ZETA"); + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-maze on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for EVENT_DATE-maze on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-csv on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for ZETA-imdb on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "imdb", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "imdb", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + } + + /** + * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and + * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)} where the metadata table contains both aggregated and non-aggregated entries. + */ + @Nested + public class FieldIndexHoleTestsForMixedAggregatedAndNonAggregatedEntries extends AbstractFieldIndexHoleTests { + + /** + * Test against data that has no field index holes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testNoFieldIndexHoles(String cf) { + // Create a series of frequency rows over date ranges, each with a matching index row for each date. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "csv", "20200101", "20200120", 1L); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200111", "20200120", 1L); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", createRangedDateFrequencyMap("20200101", "20200114", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200115", "20200120", 1L); + writeMutations(); + + // Verify that no index holes were found. + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + Assertions.assertTrue(fieldIndexHoles.isEmpty()); + } + + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200104", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on the threshold requirement. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index 
hole at the start of a frequency date range for a given fieldName-dataType combination based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200104", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200104", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType combination based on the + * threshold requirement. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200105", 5L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on the + * threshold requirement. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200104", "20200106", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 5L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on both date + * gaps and the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200107", 5L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200108", "20200110", 5L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200105", "20200106", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 5L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200114", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200115", "20200120", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200104", "20200106", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200113", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200117", "20200118", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200104", "20200106", 5L, "20200107", + "20200109", 1L, "20200110", "20200113", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200114", "20200116", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200117", "20200118", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200119", "20200120", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination based on + * date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination based on + * the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200110", "20200115", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200113", "20200115", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L, "20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200104", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200112", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200113", "20200115", 5L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where everything is an index hole based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles_dateGaps(String cf) { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 1L); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where everything is an index hole based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200105", 1L); // Will not meet threshold. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); // Will not meet threshold. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200328", 1L); // Will not meet threshold. + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_dateGaps(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 1L); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200111", "20200112", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 1L); + // Index hole for EVENT_DATE-wiki on 20200122. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200121", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 1L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 1L, "20200222", "20200302", 1L, "20200304", + "20200315", 1L, "20200317", "20200328", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_threshold(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on both dates and the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMixedDateGapsAndThresholdIndexHoles(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); + // Index holes for NAME-csv on 20200110 and 20200113. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", + createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying a minimum percentage threshold other than the default of 1.0. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMinimumThresholdPercentageBelow100(String cf) { + givenMinimumThreshold(0.75); // Index count must meet 75% of frequency count to not be considered field index hole. + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 100L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 75L, "20200104", "20200104", 100L, "20200105", "20200105", 74L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 100L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 74L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 75L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 100L); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 100L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 98L, "20200122", "20200122", 74L, "20200123", "20200125", 75L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 100L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 100L, "20200221", "20200221", 74L, "20200222", + "20200302", 90L, "20200304", "20200315", 75L, "20200316", "20200316", 74L, "20200317", "20200328", 99L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying one field to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testOneFieldSpecified(String cf) { + // Retrieve field index holes for field NAME. + givenFields("NAME"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200104", "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. 
+ givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying multiple fields to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldsSpecified(String cf) { + // Retrieve field index holes for fields URI and EVENT_DATE. + givenFields("URI", "EVENT_DATE"); + + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L)); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-wiki on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200121", 5L)); + givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200122", "20200122", 1L); + givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200123", "20200125", 5L); + + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying datatypes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testDatatypesSpecified(String cf) { + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. 
+ givenNonAggregatedFrequencyRows("ALPHA", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200110", "20200110", 1L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for EVENT_DATE-maze on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200123", "20200125", 5L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-csv on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for ZETA-imdb on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "imdb", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "imdb", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ALPHA", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying fields and datatypes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldsAndDatatypesSpecified(String cf) { + // Retrieve field index holes for fields NAME and ZETA. + givenFields("NAME", "ZETA"); + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for EVENT_DATE-maze on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-csv on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for ZETA-imdb on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "imdb", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "imdb", createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L)); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200123", "20200125", 5L); + + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + } } From 30f049e89568de2823ff5e8dc400c648234bcf72 Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Tue, 3 Sep 2024 06:08:34 -0400 Subject: [PATCH 10/10] Add additional index marker tests --- .../query/util/AllFieldMetadataHelper.java | 14 +- .../util/AllFieldMetadataHelperTest.java | 212 ++++++++++++++++++ src/test/resources/log4j.properties | 2 - 3 files changed, 218 insertions(+), 10 deletions(-) diff --git a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java index 554ef841..a3f67a5e 100644 --- a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java +++ b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java @@ -1301,15 +1301,12 @@ public Map> getReversedFieldIndexHoles(Set> getFieldIndexHoles(Text targetColumnFamily, Set fields, Set datatypes, double minThreshold) throws TableNotFoundException, IOException { - // create local 
copies to avoid side effects - fields = new HashSet<>(fields); - datatypes = new HashSet<>(datatypes); - // Handle null fields if given. if (fields == null) { fields = Collections.emptySet(); } else { - // Ensure null is not present as an entry. + // Ensure null is not present as an entry in a local copy. + fields = new HashSet<>(fields); fields.remove(null); } @@ -1317,15 +1314,16 @@ private Map> getFieldIndexHoles(Text targetCol if (datatypes == null) { datatypes = Collections.emptySet(); } else { - // Ensure null is not present as an entry. + // Ensure null is not present as an entry in a local copy. + datatypes = new HashSet<>(datatypes); datatypes.remove(null); } // remove fields that are not indexed at all by the specified datatypes Multimap indexedFieldMap = (targetColumnFamily == ColumnFamilyConstants.COLF_I ? loadIndexedFields() : loadReverseIndexedFields()); - Set indexedFields = new HashSet<>(); + Set indexedFields; if (datatypes.isEmpty()) { - indexedFields.addAll(indexedFieldMap.values()); + indexedFields = new HashSet<>(indexedFieldMap.values()); } else { indexedFields = new HashSet<>(); for (String datatype : datatypes) { diff --git a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java index 14e08cbd..e146574c 100644 --- a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java +++ b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java @@ -1366,6 +1366,112 @@ void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { Assertions.assertEquals(expected, fieldIndexHoles); } + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. + * This uses a negative index marker. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithNotIndexedMarker(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200109", false); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200109"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. + * This uses a positive index marker. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarker(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", true); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ * This uses a positive index marker derived from an older date-less format + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarkerSansDate(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103"); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ * This uses a positive index marker derived from an older date-less format with type class + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarkerOldTypeFormat(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", LcNoDiacriticsType.class); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarkerAndMissingFrequency(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", true); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200104", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", 
dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + /** * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. */ @@ -2081,6 +2187,112 @@ void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { Assertions.assertEquals(expected, fieldIndexHoles); } + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. + * This uses a negative index marker. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithNotIndexedMarker(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200109", false); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200109"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. + * This uses a positive index marker. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarker(String cf) { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", true); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ * This uses a positive index marker derived from an older date-less format + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarkerSansDate(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103"); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ * This uses a positive index marker derived from an older date-less format with type class + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarkerOldTypeFormat(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", LcNoDiacriticsType.class); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarkerAndMissingFrequency(String cf) { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200102", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", true); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200104", "20200110", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + /** * Test against data that has 
a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. */ diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties index a83eb239..cacd01b4 100644 --- a/src/test/resources/log4j.properties +++ b/src/test/resources/log4j.properties @@ -4,5 +4,3 @@ log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender log4j.appender.CONSOLE.Threshold=INFO log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-5p [%C{1}:%M] %m%n - -log4j.logger.datawave.iterators.FrequencyMetadataAggregatorTest=trace