diff --git a/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java
new file mode 100644
index 0000000..2589aca
--- /dev/null
+++ b/src/main/java/datawave/iterators/FrequencyMetadataAggregator.java
@@ -0,0 +1,491 @@
package datawave.iterators;

import java.io.IOException;
import java.time.format.DateTimeParseException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.accumulo.core.data.ByteSequence;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.PartialKey;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.iterators.IteratorEnvironment;
import org.apache.accumulo.core.iterators.IteratorUtil;
import org.apache.accumulo.core.iterators.LongCombiner;
import org.apache.accumulo.core.iterators.OptionDescriber;
import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
import org.apache.accumulo.core.iterators.WrappingIterator;
import org.apache.accumulo.core.iteratorsImpl.conf.ColumnSet;
import org.apache.accumulo.core.security.ColumnVisibility;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.log4j.Logger;

import com.google.common.base.Splitter;

import datawave.marking.MarkingFunctions;
import datawave.query.model.DateFrequencyMap;
import datawave.util.StringUtils;
import datawave.util.time.DateHelper;

/**
 * Aggregates entries in the metadata table for the "f", "i", and "ri" columns. When initially ingested, entries for these columns have a column qualifier with
 * the format {@code <datatype>\0<date>}, and a value containing a possibly partial frequency count for the date in the column qualifier. Entries with the same
 * row, column family, and datatype will be aggregated into a single entry where the column qualifier consists of the datatype and the value consists of an
 * encoded {@link DateFrequencyMap} with the dates and counts seen. Additionally, this aggregator will handle the case where we have a previously aggregated
 * entry and freshly ingested rows that need to be aggregated together.
 */
public class FrequencyMetadataAggregator extends WrappingIterator implements OptionDescriber {

    public static final String COMBINE_VISIBILITIES_OPTION = "COMBINE_VISIBILITIES";
    public static final String COLUMNS_OPTION = "columns";
    public static final String AGGREGATED = "AGGREGATED";

    private static final Logger log = Logger.getLogger(FrequencyMetadataAggregator.class);
    private static final String NULL_BYTE = "\0";
    private static final MarkingFunctions markingFunctions = MarkingFunctions.Factory.createMarkingFunctions();

    // NOTE(review): generic type parameters below were reconstructed from usage; the extraction that produced this
    // patch stripped all <...> tokens. Verify against the original commit (index 2589aca).
    private boolean combineVisibilities;
    private ColumnSet columns;

    private Key topKey;
    private Value topValue;

    // Aggregated entries staged to be returned, sorted by key.
    private final TreeMap<Key,Value> cache;
    // Per-visibility aggregation state for the row/family/datatype batch currently being consumed.
    private final Map<ColumnVisibility,DateFrequencyMap> visibilityToDateFrequencies;
    private final Map<ColumnVisibility,Long> visibilityToMaxTimestamp;

    private final Key workKey = new Key();
    private final Text currentRow = new Text();
    private final Text currentColumnFamily = new Text();
    private String currentDatatype;
    private String currentDate;
    private ColumnVisibility currentVisibility;
    private long currentTimestamp;
    private boolean isCurrentAggregated;

    public FrequencyMetadataAggregator() {
        cache = new TreeMap<>();
        visibilityToDateFrequencies = new HashMap<>();
        visibilityToMaxTimestamp = new HashMap<>();
    }

    @Override
    public SortedKeyValueIterator<Key,Value> deepCopy(IteratorEnvironment env) {
        FrequencyMetadataAggregator copy = new FrequencyMetadataAggregator();
        copy.setSource(getSource().deepCopy(env));
        copy.combineVisibilities = combineVisibilities;
        // Bug fix: the column set must be carried over as well, otherwise the copy NPEs in findTop()
        // at columns.contains(workKey).
        copy.columns = columns;
        return copy;
    }

    @Override
    public IteratorOptions describeOptions() {
        Map<String,String> options = new HashMap<>();
        options.put(COMBINE_VISIBILITIES_OPTION, "Boolean value denoting whether to combine entries with different visibilities. Defaults to false.");
        options.put(COLUMNS_OPTION, "<col fam>[:<col qual>]{,<col fam>[:<col qual>]} escape non-alphanum chars using %<hex>.");
        return new IteratorOptions(getClass().getSimpleName(), "An iterator used to collapse frequency columns in the metadata table", options, null);
    }

    @Override
    public boolean validateOptions(Map<String,String> options) {
        if (options.containsKey(COMBINE_VISIBILITIES_OPTION)) {
            try {
                // NOTE(review): Boolean.parseBoolean never throws, so this catch is unreachable; kept to preserve
                // the original validation flow.
                // noinspection ResultOfMethodCallIgnored
                Boolean.parseBoolean(options.get(COMBINE_VISIBILITIES_OPTION));
            } catch (Exception e) {
                throw new IllegalArgumentException("Bad boolean for " + COMBINE_VISIBILITIES_OPTION + " option: " + options.get(COMBINE_VISIBILITIES_OPTION));
            }
        }

        if (!options.containsKey(COLUMNS_OPTION)) {
            throw new IllegalArgumentException("Options must include " + COLUMNS_OPTION);
        }

        String encodedColumns = options.get(COLUMNS_OPTION);
        if (encodedColumns.isEmpty()) {
            throw new IllegalArgumentException("Empty columns specified for " + COLUMNS_OPTION);
        }

        for (String columns : Splitter.on(",").split(encodedColumns)) {
            if (!ColumnSet.isValidEncoding(columns)) {
                throw new IllegalArgumentException("invalid column encoding " + encodedColumns);
            }
        }

        return true;
    }

    @Override
    public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
        super.init(source, options, env);

        combineVisibilities = options.containsKey(COMBINE_VISIBILITIES_OPTION) && Boolean.parseBoolean(options.get(COMBINE_VISIBILITIES_OPTION));
        columns = new ColumnSet(List.of(StringUtils.split(options.get(COLUMNS_OPTION), ",")));

        if (log.isTraceEnabled()) {
            log.trace("Option " + COMBINE_VISIBILITIES_OPTION + ": " + combineVisibilities);
            log.trace("Option " + COLUMNS_OPTION + ": " + columns);
        }
    }

    @Override
    public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
        // Do not seek to the middle of a value that should be combined.
        Range seekRange = IteratorUtil.maximizeStartKeyTimeStamp(range);

        super.seek(seekRange, columnFamilies, inclusive);
        findTop();

        if (range.getStartKey() != null) {
            while (hasTop() && getTopKey().equals(range.getStartKey(), PartialKey.ROW_COLFAM_COLQUAL_COLVIS)
                            && getTopKey().getTimestamp() > range.getStartKey().getTimestamp()) {
                // Value has a more recent timestamp, pass it up.
                next();
            }
        }

        while (hasTop() && range.beforeStartKey(getTopKey())) {
            next();
        }
    }

    @Override
    public Key getTopKey() {
        return topKey == null ? super.getTopKey() : topKey;
    }

    @Override
    public Value getTopValue() {
        return topValue == null ? super.getTopValue() : topValue;
    }

    @Override
    public boolean hasTop() {
        return topKey != null || super.hasTop();
    }

    @Override
    public void next() throws IOException {
        log.trace("Fetching next");
        // If topKey is not null, the last call to next() popped an entry from the cache. Reset to null. If any more entries remain in the cache, they will be
        // popped in findTop().
        if (topKey != null) {
            topKey = null;
            topValue = null;
        } else {
            // If topKey is null, the last call to next() did not pop an entry from the cache. Advance to the next from the source. We will determine if
            // aggregation is needed in findTop().
            super.next();
        }

        findTop();
    }

    private void findTop() throws IOException {
        log.trace("Finding top");
        // Attempt to pop an entry from the cache. If no entries remain, evaluate the next key for potential aggregation.
        if (!popCache()) {
            if (super.hasTop()) {
                workKey.set(super.getTopKey());
                // Check if the current key contains a column marked for aggregation, and is not deleted. If so, rebuild the cache with the relevant aggregated
                // entries.
                if (columns.contains(workKey) && !workKey.isDeleted()) {
                    updateCache();
                    popCache();
                }
            }
        }
    }

    /**
     * Set {@link #topKey} and {@link #topValue} to the next available entry in the cache. Returns true if the cache was not empty, or false otherwise.
     */
    private boolean popCache() {
        log.trace("Popping cache");
        if (!cache.isEmpty()) {
            Map.Entry<Key,Value> entry = cache.pollFirstEntry();
            topKey = entry.getKey();
            topValue = entry.getValue();
            return true;
        }
        return false;
    }

    /**
     * Reset all current tracking variables.
     */
    private void resetCurrent() {
        currentRow.clear();
        currentColumnFamily.clear();
        currentDatatype = null;
        currentDate = null;
        currentVisibility = null;
        currentTimestamp = 0L;
        isCurrentAggregated = false;
        visibilityToDateFrequencies.clear();
        visibilityToMaxTimestamp.clear();
    }

    /**
     * Iterate over the source entries, aggregate all entries for the next row/column family/datatype combination, and add them to the cache.
     */
    private void updateCache() throws IOException {
        log.trace("Updating cache");

        resetCurrent();

        while (true) {
            // If the source does not have any more entries, wrap up the last batch of entries.
            if (!super.hasTop()) {
                log.trace("Source does not have top");
                wrapUpCurrent();
                return;
            }

            workKey.set(super.getTopKey());
            if (log.isTraceEnabled()) {
                log.trace("updateCache examining key " + workKey);
            }

            // If the current entry has a different row, column family, or datatype from the previous entry, wrap up and return the current
            // batch of entries.
            if (!partOfCurrentAggregation(workKey)) {
                wrapUpCurrent();
                return;
            }

            // Aggregate the current entry only if it is not deleted.
            if (!workKey.isDeleted()) {
                // Aggregate the current entry.
                aggregateCurrent();
            } else {
                // Add the deleted entry to the cache so that it is available for scanning, but do not include it as part of the aggregation.
                cache.put(super.getTopKey(), super.getTopValue());
            }

            // Advance to the next entry from the source.
            super.next();
        }
    }

    /**
     * Return true if the current entry has the same row, column family, and datatype as the previous entry, or false otherwise.
     */
    private boolean partOfCurrentAggregation(Key key) {
        // Update the current row if null.
        if (currentRow.getLength() == 0) {
            currentRow.set(key.getRow());
            if (log.isTraceEnabled()) {
                log.trace("Set current row to " + currentRow);
            }
            // Check if we're on a new field.
        } else if (!currentRow.equals(key.getRow())) {
            if (log.isTraceEnabled()) {
                log.trace("Next row " + key.getRow() + " differs from prev " + currentRow);
            }
            return false;
        }

        // Update the current column family if null.
        if (currentColumnFamily.getLength() == 0) {
            currentColumnFamily.set(key.getColumnFamily());
            if (log.isTraceEnabled()) {
                log.trace("Set current column family to " + currentColumnFamily);
            }
            // Check if we're on a new column family.
        } else if (!currentColumnFamily.equals(key.getColumnFamily())) {
            if (log.isTraceEnabled()) {
                log.trace("Next column family " + key.getColumnFamily() + " differs from prev " + currentColumnFamily);
            }
            return false;
        }

        String columnQualifier = key.getColumnQualifier().toString();
        int separatorPos = columnQualifier.indexOf(NULL_BYTE);

        // If a null byte is not present, this is an entry with a legacy format and should not be aggregated.
        if (separatorPos == -1) {
            if (log.isTraceEnabled()) {
                log.trace("Found column qualifier that does not contain null byte: " + columnQualifier);
            }
            return false;
        }

        String datatype = columnQualifier.substring(0, separatorPos);
        String remainder = columnQualifier.substring((separatorPos + 1));

        // If a second null byte is present, this is an entry with an index boundary marker in the format {@code <datatype>\0<date>\0<boolean>} and should not
        // be aggregated.
        if (remainder.contains(NULL_BYTE)) {
            if (log.isTraceEnabled()) {
                log.trace("Found index boundary marker: " + columnQualifier);
            }
            return false;
        }

        // This is an aggregated entry.
        if (remainder.equals(AGGREGATED)) {
            isCurrentAggregated = true;
        } else {
            // The remainder should typically be a date, but in rare cases may be a legacy format with the type class name instead of the date, and cannot be
            // aggregated if so. Check if the remainder can be parsed as a date.
            try {
                DateHelper.parse(remainder);
            } catch (DateTimeParseException e) {
                if (log.isTraceEnabled()) {
                    log.trace("Found unparseable date: " + columnQualifier);
                }
                return false;
            }
            // Cleanup: reuse the already-computed remainder rather than re-substringing the qualifier.
            currentDate = remainder;
            if (log.isTraceEnabled()) {
                log.trace("Set current date to " + currentDate);
            }
        }

        // Update the current datatype if null.
        if (currentDatatype == null) {
            currentDatatype = datatype;
            if (log.isTraceEnabled()) {
                log.trace("Set current datatype to " + currentDatatype);
            }
            // Check if we're on a new datatype.
        } else if (!currentDatatype.equals(datatype)) {
            if (log.isTraceEnabled()) {
                log.trace("Next datatype " + datatype + " differs from prev " + currentDatatype);
            }
            return false;
        }

        // Update the current visibility and timestamp.
        currentVisibility = new ColumnVisibility(key.getColumnVisibility());
        currentTimestamp = key.getTimestamp();
        return true;
    }

    /**
     * Aggregate the current entry.
     */
    private void aggregateCurrent() {
        Value value = super.getTopValue();
        // Fetch the date-frequency map for the current column visibility, creating one if not present.
        DateFrequencyMap dateFrequencies = visibilityToDateFrequencies.computeIfAbsent(currentVisibility, (k) -> new DateFrequencyMap());

        // If the current entry has an aggregated value, parse it as such and merge it with the date-frequency map.
        if (isCurrentAggregated) {
            try {
                DateFrequencyMap entryMap = new DateFrequencyMap(value.get());
                dateFrequencies.incrementAll(entryMap);
            } catch (IOException e) {
                Key key = super.getTopKey();
                log.error("Failed to parse date frequency map from value for key " + key, e);
                throw new IllegalArgumentException("Failed to parse date frequency map from value for key " + key, e);
            }
        } else {
            // If the current entry does not have an aggregated value, it has a count for a specific date. Increment the count for the date in the map.
            long count = LongCombiner.VAR_LEN_ENCODER.decode(value.get());
            dateFrequencies.increment(currentDate, count);
        }

        // If the current timestamp is later than the previously tracked timestamp for the current column visibility, update the tracked timestamp.
        visibilityToMaxTimestamp.merge(currentVisibility, currentTimestamp, Math::max);
    }

    /**
     * Create the entries to be returned by {@link #next()} and add them to the cache.
     */
    private void wrapUpCurrent() {
        if (log.isTraceEnabled()) {
            log.trace("Wrapping up for row: " + currentRow + ", cf: " + currentColumnFamily + ", cq: " + currentDatatype);
        }

        cache.putAll(buildCacheEntries());
        resetCurrent();
    }

    /**
     * Build and return a map of the key-value entries that should be made available to be returned by {@link #next()}. (The returned map itself need not be
     * sorted; entries are sorted when merged into {@link #cache}.)
     */
    private Map<Key,Value> buildCacheEntries() {
        if (log.isTraceEnabled()) {
            log.trace("buildTopKeys, currentRow: " + currentRow);
            log.trace("buildTopKeys, currentColumnFamily: " + currentColumnFamily);
            log.trace("buildTopKeys, currentDatatype: " + currentDatatype);
        }

        Text columnQualifier = new Text(currentDatatype + NULL_BYTE + AGGREGATED);

        // If we are combining all entries regardless of column visibility, we will end up with one entry to return.
        if (combineVisibilities) {
            // Combine the visibilities and frequencies, and find the latest timestamp.
            ColumnVisibility combined = combineAllVisibilities();
            long latestTimestamp = getLatestTimestamp();
            DateFrequencyMap combinedFrequencies = combineAllDateFrequencies();

            // Return the single key-value pair.
            Key key = new Key(currentRow, currentColumnFamily, columnQualifier, combined, latestTimestamp);
            Value value = new Value(WritableUtils.toByteArray(combinedFrequencies));
            return Collections.singletonMap(key, value);
        } else {
            Map<Key,Value> entries = new HashMap<>();
            // Create a key-value pair for each distinct column visibility.
            for (Map.Entry<ColumnVisibility,DateFrequencyMap> entry : visibilityToDateFrequencies.entrySet()) {
                ColumnVisibility visibility = entry.getKey();
                long timestamp = visibilityToMaxTimestamp.get(visibility);
                Key key = new Key(currentRow, currentColumnFamily, columnQualifier, visibility, timestamp);
                Value value = new Value(WritableUtils.toByteArray(entry.getValue()));
                entries.put(key, value);
            }
            return entries;
        }
    }

    /**
     * Return a {@link ColumnVisibility} that is the combination of all visibilities present in {@link #visibilityToDateFrequencies}.
     */
    private ColumnVisibility combineAllVisibilities() {
        Set<ColumnVisibility> visibilities = visibilityToDateFrequencies.keySet();
        try {
            return markingFunctions.combine(visibilities);
        } catch (MarkingFunctions.Exception e) {
            log.error("Failed to combine visibilities " + visibilities);
            throw new IllegalArgumentException("Failed to combine visibilities " + visibilities, e);
        }
    }

    /**
     * Return the latest timestamp present in {@link #visibilityToMaxTimestamp}.
     */
    private long getLatestTimestamp() {
        long max = 0L;
        for (long timestamp : visibilityToMaxTimestamp.values()) {
            max = Math.max(max, timestamp);
        }
        return max;
    }

    /**
     * Return a {@link DateFrequencyMap} that contains all date counts present in {@link #visibilityToDateFrequencies}.
     */
    private DateFrequencyMap combineAllDateFrequencies() {
        DateFrequencyMap combined = new DateFrequencyMap();
        for (DateFrequencyMap map : visibilityToDateFrequencies.values()) {
            combined.incrementAll(map);
        }
        return combined;
    }
}
diff --git a/src/main/java/datawave/query/model/DateFrequencyMap.java b/src/main/java/datawave/query/model/DateFrequencyMap.java
new file mode 100644
index 0000000..56b29d6
--- /dev/null
+++ b/src/main/java/datawave/query/model/DateFrequencyMap.java
@@ -0,0 +1,198 @@
package datawave.query.model;

import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;

/**
 * A {@link Writable} map of date strings to {@link Frequency} counts, kept sorted by date.
 */
public class DateFrequencyMap implements Writable {

    // TODO - Should we use the YearMonthDay class instead as the key here?
+ private final TreeMap dateToFrequencies; + + public DateFrequencyMap() { + this.dateToFrequencies = new TreeMap<>(); + } + + public DateFrequencyMap(byte[] bytes) throws IOException { + this(); + ByteArrayInputStream in = new ByteArrayInputStream(bytes); + DataInputStream dataIn = new DataInputStream(in); + readFields(dataIn); + dataIn.close(); + } + + /** + * Associates the given frequency with the given date in this {@link DateFrequencyMap}. If the map previously contained a mapping for the given date, the + * old frequency is replaced by the new frequency. + * + * @param date + * the date + * @param frequency + * the frequency + */ + public void put(String date, long frequency) { + put(date, new Frequency(frequency)); + } + + /** + * Associates the given frequency with the given date in this {@link DateFrequencyMap}. If the map previously contained a mapping for the given date, the + * old frequency is replaced by the new frequency. + * + * @param date + * the date + * @param frequency + * the frequency + */ + public void put(String date, Frequency frequency) { + dateToFrequencies.put(date, frequency); + } + + /** + * Increments the frequency associated with the given date by the given addend. If a mapping does not previously exist for the date, a new mapping will be + * added with the given addend as the frequency. + * + * @param date + * the date + * @param addend + * the addend + */ + public void increment(String date, long addend) { + dateToFrequencies.computeIfAbsent(date, (k) -> new Frequency()).increment(addend); + } + + /** + * Increment all frequencies in this {@link DateFrequencyMap} by the frequencies in the given map. If the given map contains mappings for dates not present + * in this map, those mappings will be added to this map. 
+ * + * @param map + * the map + */ + public void incrementAll(DateFrequencyMap map) { + for (Map.Entry entry : map.dateToFrequencies.entrySet()) { + increment(entry.getKey(), entry.getValue().getValue()); + } + } + + /** + * Return the frequency associated with the given date, or null if no such mapping exists. + * + * @param date + * the date + * @return the count + */ + public Frequency get(String date) { + return dateToFrequencies.get(date); + } + + /** + * Return whether this map contains a mapping for the given date. + * + * @param date + * the date + * @return true if a mapping exists for the given date, or false otherwise + */ + public boolean contains(String date) { + return dateToFrequencies.containsKey(date); + } + + /** + * Clear all mappings in this {@link DateFrequencyMap}. + */ + public void clear() { + this.dateToFrequencies.clear(); + } + + /** + * Returns a {@link Set} view of the mappings contained within this map, sorted in ascending by order. + * + * @return a {@link Set} view of the mappings + */ + public Set> entrySet() { + return this.dateToFrequencies.entrySet(); + } + + /** + * Returns a view of the portion of this {@link DateFrequencyMap}'s underlying map whose keys range from startDate (inclusive) to endDate (inclusive). + * + * @param startDate + * the start date + * @param endDate + * the end date + * @return the map view + */ + public SortedMap subMap(String startDate, String endDate) { + return dateToFrequencies.subMap(startDate, true, endDate, true); + } + + /** + * Returns the earliest date in this {@link DateFrequencyMap}. + * + * @return the earliest date + */ + public String earliestDate() { + return dateToFrequencies.firstKey(); + } + + @Override + public void write(DataOutput dataOutput) throws IOException { + // Write the map's size. + WritableUtils.writeVInt(dataOutput, dateToFrequencies.size()); + + // Write each entry. 
+ for (Map.Entry entry : dateToFrequencies.entrySet()) { + WritableUtils.writeString(dataOutput, entry.getKey()); + entry.getValue().write(dataOutput); + } + } + + @Override + public void readFields(DataInput dataInput) throws IOException { + // Clear the map. + this.dateToFrequencies.clear(); + + // Read how many entries to expect. + int entries = WritableUtils.readVInt(dataInput); + + // Read each entry. + for (int i = 0; i < entries; i++) { + // Read the date key. + String date = WritableUtils.readString(dataInput); + // Read the frequency value. + Frequency value = new Frequency(); + value.readFields(dataInput); + this.dateToFrequencies.put(date, value); + } + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + DateFrequencyMap that = (DateFrequencyMap) o; + return Objects.equals(dateToFrequencies, that.dateToFrequencies); + } + + @Override + public int hashCode() { + return Objects.hash(dateToFrequencies); + } + + @Override + public String toString() { + return dateToFrequencies.toString(); + } +} diff --git a/src/main/java/datawave/query/model/Frequency.java b/src/main/java/datawave/query/model/Frequency.java new file mode 100644 index 0000000..1b1aac9 --- /dev/null +++ b/src/main/java/datawave/query/model/Frequency.java @@ -0,0 +1,80 @@ +package datawave.query.model; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Objects; + +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.io.WritableUtils; + +/** + * Represents a frequency count. + */ +public class Frequency implements WritableComparable { + + // The value. + private long value; + + public Frequency() {} + + public Frequency(long value) { + this.value = value; + } + + /** + * Return the value of this {@link Frequency}. 
+ * + * @return the frequency + */ + public long getValue() { + return value; + } + + /** + * Increment the value of this {@link Frequency} by the given addend. + * + * @param addend + * the addend to add + */ + public void increment(long addend) { + this.value += addend; + } + + @Override + public void write(DataOutput dataOutput) throws IOException { + WritableUtils.writeVLong(dataOutput, value); + } + + @Override + public void readFields(DataInput dataInput) throws IOException { + value = WritableUtils.readVLong(dataInput); + } + + @Override + public int compareTo(Frequency o) { + return Long.compare(this.value, o.value); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Frequency frequency = (Frequency) o; + return value == frequency.value; + } + + @Override + public int hashCode() { + return Objects.hash(value); + } + + @Override + public String toString() { + return Long.toString(value); + } +} diff --git a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java index 32bb1e8..d533a61 100644 --- a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java +++ b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java @@ -58,9 +58,12 @@ import datawave.data.ColumnFamilyConstants; import datawave.data.type.Type; import datawave.data.type.TypeFactory; +import datawave.iterators.FrequencyMetadataAggregator; import datawave.query.composite.CompositeMetadata; import datawave.query.composite.CompositeMetadataHelper; +import datawave.query.model.DateFrequencyMap; import datawave.query.model.FieldIndexHole; +import datawave.query.model.Frequency; import datawave.security.util.AuthorizationsMinimizer; import datawave.security.util.ScannerHelper; import datawave.util.time.DateHelper; @@ -130,7 +133,7 @@ public AllFieldMetadataHelper(TypeMetadataHelper typeMetadataHelper, CompositeMe 
/** * Get the datatype from a key's column qualifier - * + * * @param k * the key * @return the datatype @@ -146,7 +149,7 @@ protected String getDatatype(Key k) { /** * Get the field name from a composite key - * + * * @param k * the key * @return the field name @@ -166,7 +169,7 @@ protected String getCompositeFieldName(Key k) { /** * Get the authorizations used by this helper - * + * * @return the authorizations */ public Set getAuths() { @@ -175,7 +178,7 @@ public Set getAuths() { /** * Get the full user authorizations used by this helper - * + * * @return the full user authorizations */ public Set getFullUserAuths() { @@ -184,7 +187,7 @@ public Set getFullUserAuths() { /** * Get the metadata table name - * + * * @return the metadata table name */ public String getMetadataTableName() { @@ -193,7 +196,7 @@ public String getMetadataTableName() { /** * Get the {@link TypeMetadataHelper} - * + * * @return the TypeMetadataHelper */ public TypeMetadataHelper getTypeMetadataHelper() { @@ -606,7 +609,7 @@ public TypeMetadata getTypeMetadata() throws TableNotFoundException { /** * Get the {@link TypeMetadata} for a particular set of ingest types - * + * * @param ingestTypeFilter * the set of ingest types used to filter the scan * @return the {@link TypeMetadata} for a particular set of ingest types @@ -630,7 +633,7 @@ public CompositeMetadata getCompositeMetadata() throws TableNotFoundException { /** * Get the {@link CompositeMetadata} for the specified ingest types - * + * * @param ingestTypeFilter * the set of ingest types used to filter the scan * @return the CompositeMetadata @@ -815,7 +818,7 @@ public Set getContentFields(Set ingestTypeFilter) throws TableNo /** * Get the set of ingest types that exist in the database - * + * * @param ingestTypeFilter * a set of ingest types * @return the set of ingest types that exist @@ -848,30 +851,39 @@ protected HashMap getCountsByFieldInDayWithTypes(Entry datatypeToCounts; + try (Scanner scanner = 
ScannerHelper.createScanner(accumuloClient, metadataTableName, auths)) { scanner.fetchColumnFamily(ColumnFamilyConstants.COLF_F); scanner.setRange(Range.exact(fieldName)); + // It's possible to find rows with column qualifiers in the format (aggregated entries) and/or \0 (non-aggregated + // entries). + // Filter out any non-aggregated entries that does not have the date in the column qualifier. IteratorSetting cqRegex = new IteratorSetting(50, RegExFilter.class); - RegExFilter.setRegexs(cqRegex, null, null, ".*\u0000" + date, null, false); + // Allow any entries that do not contain the null byte delimiter, or contain it with the target date directly afterwards. + RegExFilter.setRegexs(cqRegex, null, null, "^(.*\u0000" + FrequencyMetadataAggregator.AGGREGATED + ")$|^(.*\u0000" + date + ")$", null, false); scanner.addScanIterator(cqRegex); - final Text holder = new Text(); datatypeToCounts = Maps.newHashMap(); for (Entry countEntry : scanner) { - ByteArrayInputStream bais = new ByteArrayInputStream(countEntry.getValue().get()); - DataInputStream inputStream = new DataInputStream(bais); + String colq = countEntry.getKey().getColumnQualifier().toString(); + int offset = colq.indexOf(NULL_BYTE); + String datatype = colq.substring(0, offset); - Long sum = WritableUtils.readVLong(inputStream); - - countEntry.getKey().getColumnQualifier(holder); - int offset = holder.find(NULL_BYTE); - - Preconditions.checkArgument(-1 != offset, "Could not find nullbyte separator in column qualifier for: " + countEntry.getKey()); - - String datatype = Text.decode(holder.getBytes(), 0, offset); + String remainder = colq.substring((offset + 1)); + if (remainder.equals(FrequencyMetadataAggregator.AGGREGATED)) { + DateFrequencyMap countMap = new DateFrequencyMap(countEntry.getValue().get()); + if (countMap.contains(date)) { + long count = countMap.get(date).getValue(); + datatypeToCounts.merge(datatype, count, Long::sum); + } + } else { + ByteArrayInputStream bais = new 
ByteArrayInputStream(countEntry.getValue().get()); + DataInputStream inputStream = new DataInputStream(bais); + Long count = WritableUtils.readVLong(inputStream); + datatypeToCounts.merge(datatype, count, Long::sum); + } - datatypeToCounts.put(datatype, sum); } } @@ -1289,15 +1301,12 @@ public Map> getReversedFieldIndexHoles(Set> getFieldIndexHoles(Text targetColumnFamily, Set fields, Set datatypes, double minThreshold) throws TableNotFoundException, IOException { - // create local copies to avoid side effects - fields = new HashSet<>(fields); - datatypes = new HashSet<>(datatypes); - // Handle null fields if given. if (fields == null) { fields = Collections.emptySet(); } else { - // Ensure null is not present as an entry. + // Ensure null is not present as an entry in a local copy. + fields = new HashSet<>(fields); fields.remove(null); } @@ -1305,15 +1314,16 @@ private Map> getFieldIndexHoles(Text targetCol if (datatypes == null) { datatypes = Collections.emptySet(); } else { - // Ensure null is not present as an entry. + // Ensure null is not present as an entry in a local copy. + datatypes = new HashSet<>(datatypes); datatypes.remove(null); } // remove fields that are not indexed at all by the specified datatypes Multimap indexedFieldMap = (targetColumnFamily == ColumnFamilyConstants.COLF_I ? 
loadIndexedFields() : loadReverseIndexedFields()); - Set indexedFields = new HashSet<>(); + Set indexedFields; if (datatypes.isEmpty()) { - indexedFields.addAll(indexedFieldMap.values()); + indexedFields = new HashSet<>(indexedFieldMap.values()); } else { indexedFields = new HashSet<>(); for (String datatype : datatypes) { @@ -1457,24 +1467,22 @@ private static class FieldIndexHoleFinder { * * @return the field index holes * @throws IOException - * if a value fails to deserialize + * if an exception occurs when deserializing a {@link Value} */ - Map> findHoles() throws IOException { + private Map> findHoles() throws IOException { String prevFieldName = null; Text prevColumnFamily = null; - String currFieldName; - String currDatatype; - Text currColumnFamily; - Date currDate; - long currCount; - Boolean currBoundaryValue; - for (Map.Entry entry : scanner) { // Parse the current row. Key key = entry.getKey(); - currFieldName = key.getRow().toString(); - currColumnFamily = key.getColumnFamily(); + String currFieldName = key.getRow().toString(); + Text currColumnFamily = key.getColumnFamily(); + String currDatatype; + long currCount = 0L; + Date currDate = null; + Boolean currBoundaryValue = null; + DateFrequencyMap currAggregatedCounts = null; String cq = key.getColumnQualifier().toString(); int offset = cq.indexOf(NULL_BYTE); @@ -1490,7 +1498,6 @@ Map> findHoles() throws IOException { currDate = getBaseDate(key.getTimestamp()); log.warn("Found an index entry missing the date, treating as an index marker at " + currDate + " : " + key); currBoundaryValue = true; - currCount = 0; } else { currDatatype = cq.substring(0, offset); @@ -1500,27 +1507,29 @@ Map> findHoles() throws IOException { } String cqRemainder = cq.substring((offset + 1)); - // check for a marker of
\0\0true/false vs just
\0 - // where the boolean denotes that we can assume the field is indexed/no on and before this date - offset = cqRemainder.indexOf(NULL_BYTE); - if (offset >= 0) { - currBoundaryValue = Boolean.valueOf(cqRemainder.substring(offset + 1)); - currDate = DateHelper.parse(cqRemainder.substring(0, offset)); - currCount = 0; + // This is an aggregated entry. + if (cqRemainder.equals(FrequencyMetadataAggregator.AGGREGATED)) { + currAggregatedCounts = new DateFrequencyMap(entry.getValue().get()); } else { - currBoundaryValue = null; - try { - currDate = DateHelper.parse(cqRemainder); - ByteArrayInputStream byteStream = new ByteArrayInputStream(entry.getValue().get()); - DataInputStream inputStream = new DataInputStream(byteStream); - currCount = WritableUtils.readVLong(inputStream); - } catch (DateTimeParseException e) { - // probably the really old type classname format instead of a date. - // we can treat this like an index marker but the ts of the entry denotes the boundary - currDate = getBaseDate(key.getTimestamp()); - log.warn("Found an index entry missing the date, treating as an index marker at " + currDate + " : " + key); - currBoundaryValue = true; - currCount = 0; + // check for a marker of
\0\0true/false vs just
\0 + // where the boolean denotes that we can assume the field is indexed/no on and before this date + offset = cqRemainder.indexOf(NULL_BYTE); + if (offset >= 0) { + currBoundaryValue = Boolean.valueOf(cqRemainder.substring(offset + 1)); + currDate = DateHelper.parse(cqRemainder.substring(0, offset)); + } else { + try { + currDate = DateHelper.parse(cqRemainder); + ByteArrayInputStream byteStream = new ByteArrayInputStream(entry.getValue().get()); + DataInputStream inputStream = new DataInputStream(byteStream); + currCount = WritableUtils.readVLong(inputStream); + } catch (DateTimeParseException e) { + // probably the really old type classname format instead of a date. + // we can treat this like an index marker but the ts of the entry denotes the boundary + currDate = getBaseDate(key.getTimestamp()); + log.warn("Found an index entry missing the date, treating as an index marker at " + currDate + " : " + key); + currBoundaryValue = true; + } } } } @@ -1541,8 +1550,7 @@ Map> findHoles() throws IOException { // In both cases, record the last entry, and begin collecting date ranges for the next batch of related rows. if (!prevColumnFamily.equals(currColumnFamily)) { // The column family is "f". We have collected the date ranges for all datatypes for the previous field name. Get the field index holes for - // the - // previously collected data. + // the previously collected data. if (currColumnFamily.equals(ColumnFamilyConstants.COLF_F)) { // Find and add all field index holes for the current frequency and index entries. findFieldIndexHoles(prevFieldName); @@ -1556,7 +1564,11 @@ Map> findHoles() throws IOException { } // Add the current entry to the target entry map. - addToTargetMap(currDatatype, currDate, currCount, currBoundaryValue); + if (currAggregatedCounts != null) { + addToTargetMap(currDatatype, currAggregatedCounts); + } else { + addToTargetMap(currDatatype, currDate, currCount, currBoundaryValue); + } } else { // The column family is the same. 
We have two possible scenarios: // - A row with a field that is different to the previous field. @@ -1569,10 +1581,18 @@ Map> findHoles() throws IOException { // Clear the entry maps. clearEntryMaps(); // Add the current entry to the target entry map. - addToTargetMap(currDatatype, currDate, currCount, currBoundaryValue); + if (currAggregatedCounts != null) { + addToTargetMap(currDatatype, currAggregatedCounts); + } else { + addToTargetMap(currDatatype, currDate, currCount, currBoundaryValue); + } } else { // The current row has the same field. Add the current entry to the target map. - addToTargetMap(currDatatype, currDate, currCount, currBoundaryValue); + if (currAggregatedCounts != null) { + addToTargetMap(currDatatype, currAggregatedCounts); + } else { + addToTargetMap(currDatatype, currDate, currCount, currBoundaryValue); + } } } @@ -1606,7 +1626,18 @@ private boolean isPartOfTarget(String field, String datatype) { } /** - * Add the current date and count to the current target map for the current datatype. + * Add the current aggregated counts to the current target map for the given datatype. + */ + private void addToTargetMap(String datatype, DateFrequencyMap aggregatedCounts) { + for (Entry entry : aggregatedCounts.entrySet()) { + Date date = DateHelper.parse(entry.getKey()); + FieldCount fieldCount = getFieldCount(targetMap, datatype, date); + fieldCount.increment(entry.getValue().getValue()); + } + } + + /** + * Add the current date and count to the current target map for the given datatype. */ private void addToTargetMap(String datatype, Date date, long count, Boolean boundaryValue) { FieldCount fieldCount = getFieldCount(targetMap, datatype, date); @@ -1621,7 +1652,7 @@ private void addToTargetMap(String datatype, Date date, long count, Boolean boun /** * Return the field count entry from the specified map. A new entry is added to the map if missing - * + * * @param datatype * @param date * @return The field count. 
Never null @@ -1776,7 +1807,7 @@ private Map> getImmutableFieldIndexHoles() { /** * Get a key composed of the accumulo instance ID and the metadata table name - * + * * @param instanceID * the accumulo instance id * @param metadataTableName @@ -1792,7 +1823,7 @@ private static String getKey(String instanceID, String metadataTableName) { /** * Get a key - * + * * @param helper * an instance of an {@link AllFieldMetadataHelper} * @return a key @@ -1803,7 +1834,7 @@ private static String getKey(AllFieldMetadataHelper helper) { /** * ToString - * + * * @return a string */ @Override diff --git a/src/main/java/datawave/query/util/MetadataHelper.java b/src/main/java/datawave/query/util/MetadataHelper.java index 99be9e2..e76cbd7 100644 --- a/src/main/java/datawave/query/util/MetadataHelper.java +++ b/src/main/java/datawave/query/util/MetadataHelper.java @@ -71,13 +71,16 @@ import datawave.data.MetadataCardinalityCounts; import datawave.data.type.Type; import datawave.iterators.EdgeMetadataCombiner; +import datawave.iterators.FrequencyMetadataAggregator; import datawave.iterators.MetadataFColumnSeekingFilter; import datawave.iterators.filter.EdgeMetadataCQStrippingIterator; import datawave.marking.MarkingFunctions; import datawave.query.composite.CompositeMetadata; +import datawave.query.model.DateFrequencyMap; import datawave.query.model.Direction; import datawave.query.model.FieldIndexHole; import datawave.query.model.FieldMapping; +import datawave.query.model.Frequency; import datawave.query.model.ModelKeyParser; import datawave.query.model.QueryModel; import datawave.security.util.AuthorizationsMinimizer; @@ -1225,90 +1228,92 @@ public Set getContentFields(Set ingestTypeFilter) throws TableNo } /** - * Sum the frequency counts for a field between a start and end date (inclusive) + * Return the sum of all frequency counts for a field between a start and end date (inclusive). 
* * @param fieldName - * the field + * the field name * @param begin * the start date * @param end * the end date * @return the total instances of the field in the date range * @throws TableNotFoundException - * if no table exists + * if the metadata table does not exist */ public long getCardinalityForField(String fieldName, Date begin, Date end) throws TableNotFoundException { return getCardinalityForField(fieldName, null, begin, end); } /** - * Sum the frequency counts for a field in a datatype between a start and end date (inclusive) + * Return the sum of all frequency counts for a field in a datatype between a start and end date (inclusive). * * @param fieldName * the field * @param datatype - * the ingest type + * the datatype * @param begin * the start date * @param end * the end date * @return the total instances of the field in the date range * @throws TableNotFoundException - * if no table exists + * if the metadata table does not exist */ public long getCardinalityForField(String fieldName, String datatype, Date begin, Date end) throws TableNotFoundException { log.trace("getCardinalityForField from table: {}", metadataTableName); Text row = new Text(fieldName.toUpperCase()); // Get all the rows in DatawaveMetadata for the field, only in the 'f' colfam - long count; + long count = 0; try (Scanner bs = ScannerHelper.createScanner(accumuloClient, metadataTableName, auths)) { Key startKey = new Key(row); bs.setRange(new Range(startKey, startKey.followingKey(PartialKey.ROW))); bs.fetchColumnFamily(ColumnFamilyConstants.COLF_F); - count = 0; + // If a datatype was specified, add a regex filter to only include entries with the datatype. 
+ if (datatype != null) { + IteratorSetting colqRegex = new IteratorSetting(50, RegExFilter.class); + RegExFilter.setRegexs(colqRegex, null, null, datatype + "\u0000.*", null, false); + bs.addScanIterator(colqRegex); + } for (Entry entry : bs) { Text colq = entry.getKey().getColumnQualifier(); - int index = colq.find(NULL_BYTE); - if (index == -1) { + + String remainder; + try { + remainder = Text.decode(colq.getBytes(), index + 1, colq.getLength() - (index + 1)); + } catch (CharacterCodingException e) { + log.warn("Could not deserialize colqual: {} ", entry.getKey()); continue; } - // If we were given a non-null datatype - // Ensure that we process records only on that type - if (null != datatype) { + // This is an aggregated entry. + if (remainder.equals(FrequencyMetadataAggregator.AGGREGATED)) { try { - String type = Text.decode(colq.getBytes(), 0, index); - if (!type.equals(datatype)) { - continue; - } - } catch (CharacterCodingException e) { - log.warn("Could not deserialize colqual: {}", entry.getKey()); - continue; + DateFrequencyMap map = new DateFrequencyMap(entry.getValue().get()); + // Fetch all entries within the target date range and sum the counts. 
+ long sum = map.subMap(DateHelper.format(begin), DateHelper.format(end)).values().stream().mapToLong(Frequency::getValue).sum(); + count += sum; + } catch (IOException e) { + log.warn("Could not convert the Value to a DateFrequencyMap: {}", entry.getValue()); + log.error("Failed to convert Value to DateFrequencyMap", e); } - } - - // Parse the date to ensure that we want this record - String dateStr = "null"; - Date date; - try { - dateStr = Text.decode(colq.getBytes(), index + 1, colq.getLength() - (index + 1)); - date = DateHelper.parse(dateStr); - // Add the provided count if we fall within begin and end, - // inclusive - if (date.compareTo(begin) >= 0 && date.compareTo(end) <= 0) { - count += SummingCombiner.VAR_LEN_ENCODER.decode(entry.getValue().get()); + } else { + // This is an entry with a count for a single date. + try { + Date date = DateHelper.parse(remainder); + // Add the provided count if we fall within begin and end, inclusively. + if (date.compareTo(begin) >= 0 && date.compareTo(end) <= 0) { + count += SummingCombiner.VAR_LEN_ENCODER.decode(entry.getValue().get()); + } + } catch (ValueFormatException e) { + log.warn("Could not convert the Value to a long: {}", entry.getValue()); + } catch (DateTimeParseException e) { + log.warn("Could not convert date string: {}", remainder); } - } catch (ValueFormatException e) { - log.warn("Could not convert the Value to a long: {}", entry.getValue()); - } catch (CharacterCodingException e) { - log.warn("Could not deserialize colqual: {}", entry.getKey()); - } catch (DateTimeParseException e) { - log.warn("Could not convert date string: {}", dateStr); } } } @@ -1507,8 +1512,11 @@ protected HashMap getCountsByFieldInDayWithTypes(String fieldName, scanner.fetchColumnFamily(ColumnFamilyConstants.COLF_F); scanner.setRange(Range.exact(fieldName)); + // It's possible to find rows with column qualifiers in the format \0AGGREGATED (aggregated entries) and/or \0 + // (non-aggregated entries). 
Filter out any non-aggregated entries that do not have the date in the column qualifier. IteratorSetting cqRegex = new IteratorSetting(50, RegExFilter.class); - RegExFilter.setRegexs(cqRegex, null, null, ".*\u0000" + date, null, false); + // Allow any entries that contain the aggregated marker, or contain the null byte with the target date directly afterwards. + RegExFilter.setRegexs(cqRegex, null, null, "^(.*\u0000" + FrequencyMetadataAggregator.AGGREGATED + ")$|^(.*\u0000" + date + ")$", null, false); scanner.addScanIterator(cqRegex); final Text holder = new Text(); @@ -1520,19 +1528,33 @@ protected HashMap getCountsByFieldInDayWithTypes(String fieldName, writer = updateCache(entry, writer, wrappedClient); } - ByteArrayInputStream bais = new ByteArrayInputStream(entry.getValue().get()); - DataInputStream inputStream = new DataInputStream(bais); - - Long sum = WritableUtils.readVLong(inputStream); - entry.getKey().getColumnQualifier(holder); int offset = holder.find(NULL_BYTE); - Preconditions.checkArgument(-1 != offset, "Could not find nullbyte separator in column qualifier for: " + entry.getKey()); - String datatype = Text.decode(holder.getBytes(), 0, offset); + String remainder; + try { + remainder = Text.decode(holder.getBytes(), offset + 1, holder.getLength() - (offset + 1)); + } catch (CharacterCodingException e) { + log.warn("Could not deserialize colqual: {} ", entry.getKey()); + continue; + } - datatypeToCounts.put(datatype, sum); + // This is an aggregated entry. + if (remainder.equals(FrequencyMetadataAggregator.AGGREGATED)) { + DateFrequencyMap map = new DateFrequencyMap(entry.getValue().get()); + // If a count is present for the target date, merge in the sum. + if (map.contains(date)) { + long count = map.get(date).getValue(); + datatypeToCounts.merge(datatype, count, Long::sum); + } + } else { + // This is an entry with a count for a single date. 
+ ByteArrayInputStream bais = new ByteArrayInputStream(entry.getValue().get()); + DataInputStream inputStream = new DataInputStream(bais); + long count = WritableUtils.readVLong(inputStream); + datatypeToCounts.merge(datatype, count, Long::sum); + } } } finally { if (writer != null) { @@ -1587,7 +1609,7 @@ public Map getCountsForFieldsInDateRange(Set fields, Set createFieldCountRanges(Set fields, SortedSet /** * Deserialize a Value that contains a Long - * + * * @param value * an accumulo Value * @return a long @@ -1731,7 +1753,7 @@ public Date getEarliestOccurrenceOfFieldWithType(String fieldName, final String * * @param fieldName * the field - * @param dataType + * @param datatypeFilter * the datatype * @param client * an AccumuloClient @@ -1739,8 +1761,11 @@ public Date getEarliestOccurrenceOfFieldWithType(String fieldName, final String * a wrapped AccumuloClient * @return the earliest date the field is found, or null otherwise */ - protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final String dataType, AccumuloClient client, WrappedAccumuloClient wrappedClient) { - String dateString = null; + protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final String datatypeFilter, AccumuloClient client, + WrappedAccumuloClient wrappedClient) { + String earliestDate = null; + String prevDatatype = null; + boolean skipToAggregated = false; BatchWriter writer = null; try (Scanner scanner = ScannerHelper.createScanner(client, metadataTableName, auths)) { @@ -1748,13 +1773,14 @@ protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final Stri scanner.setRange(Range.exact(fieldName)); // if a type was specified, add a regex filter for it - if (dataType != null) { + if (datatypeFilter != null) { IteratorSetting cqRegex = new IteratorSetting(50, RegExFilter.class); - RegExFilter.setRegexs(cqRegex, null, null, dataType + "\u0000.*", null, false); + RegExFilter.setRegexs(cqRegex, null, null, datatypeFilter + "\u0000.*", null, 
false); scanner.addScanIterator(cqRegex); } final Text holder = new Text(); + for (Entry entry : scanner) { // if this is the real connector, and wrapped connector is not null, it means // that we didn't get a hit in the cache. So, we will update the cache with the @@ -1764,18 +1790,49 @@ protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final Stri } entry.getKey().getColumnQualifier(holder); - int startPos = holder.find(NULL_BYTE) + 1; + int offset = holder.find(NULL_BYTE); - if (0 == startPos) { - log.trace("Could not find nullbyte separator in column qualifier for: {}", entry.getKey()); - } else if ((holder.getLength() - startPos) <= 0) { - log.trace("Could not find date to parse in column qualifier for: {}", entry.getKey()); - } else { + // Extract the datatype and the remainder of the colq. + String datatype; + String remainder; + try { + datatype = Text.decode(holder.getBytes(), 0, offset); + remainder = Text.decode(holder.getBytes(), (offset + 1), holder.getLength() - (offset + 1)); + } catch (CharacterCodingException e) { + log.trace("Could not deserialize colqual: {} ", entry.getKey()); + continue; + } + + // If this is the first datatype we've seen, or a new datatype (if a datatype filter was not specified) was seen, update the prev datatype seen + // and do not skip to the next aggregated entry. + if (prevDatatype == null || !prevDatatype.equals(datatype)) { + prevDatatype = datatype; + skipToAggregated = false; + } + + // This is an aggregated entry with counts for multiple dates. These entries have the colq format \0AGGREGATED, and will thus be + // sorted after entries with the colq format \0. Check if the earliest date in the aggregated counts map is earlier than + // any dates seen thus far. 
+ if (remainder.equals(FrequencyMetadataAggregator.AGGREGATED)) { try { - dateString = Text.decode(holder.getBytes(), startPos, holder.getLength() - startPos); - break; - } catch (CharacterCodingException e) { - log.trace("Unable to decode date string for: {}", entry.getKey().getColumnQualifier()); + DateFrequencyMap map = new DateFrequencyMap(entry.getValue().get()); + String earliest = map.earliestDate(); + if (earliestDate == null || earliest.compareTo(earliestDate) < 0) { + earliestDate = earliest; + } + } catch (IOException e) { + log.trace("Could not convert the Value to a DateFrequencyMap: {}", entry.getValue()); + log.error("Failed to convert Value to DateFrequencyMap", e); + } + } else { + // If skipToAggregated is false, this is the first entry seen for the current datatype with the colq format \0, and will + // have the earliest date for the current datatype for entries with this colq format. Check if the date is the earliest date seen thus far, + // and then skip ahead to any aggregated entries for the current datatype with the colq format \0. + if (!skipToAggregated) { + if (earliestDate == null || remainder.compareTo(earliestDate) < 0) { + earliestDate = remainder; + } + skipToAggregated = true; } } } @@ -1791,9 +1848,10 @@ protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final Stri } } + // Parse and return the date. 
Date date = null; - if (dateString != null) { - date = DateHelper.parse(dateString); + if (earliestDate != null) { + date = DateHelper.parse(earliestDate); } return date; diff --git a/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java b/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java new file mode 100644 index 0000000..4e3d159 --- /dev/null +++ b/src/test/java/datawave/iterators/FrequencyMetadataAggregatorTest.java @@ -0,0 +1,633 @@ +package datawave.iterators; + +import static datawave.data.ColumnFamilyConstants.COLF_DESC; +import static datawave.data.ColumnFamilyConstants.COLF_E; +import static datawave.data.ColumnFamilyConstants.COLF_F; +import static datawave.data.ColumnFamilyConstants.COLF_I; +import static datawave.data.ColumnFamilyConstants.COLF_N; +import static datawave.data.ColumnFamilyConstants.COLF_RI; +import static datawave.query.util.TestUtils.createDateFrequencyMap; + +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.IteratorSetting; +import org.apache.accumulo.core.client.Scanner; +import org.apache.accumulo.core.client.TableNotFoundException; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Mutation; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.LongCombiner; +import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.core.security.ColumnVisibility; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableUtils; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import 
org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import datawave.accumulo.inmemory.InMemoryAccumuloClient; +import datawave.accumulo.inmemory.InMemoryInstance; +import datawave.query.model.DateFrequencyMap; +import datawave.query.util.TestUtils; +import datawave.security.util.ScannerHelper; + +public class FrequencyMetadataAggregatorTest { + + private static final String TABLE_METADATA = "metadata"; + private static final String[] AUTHS = {"FOO", "BAR", "COB"}; + private static final Set AUTHS_SET = Collections.singleton(new Authorizations(AUTHS)); + private static final String NULL_BYTE = "\0"; + + private AccumuloClient accumuloClient; + private Boolean combineColumnVisibilities; + private final List> expected = new ArrayList<>(); + private final List mutations = new ArrayList<>(); + + @BeforeAll + static void beforeAll() throws URISyntaxException { + File dir = new File(Objects.requireNonNull(ClassLoader.getSystemClassLoader().getResource(".")).toURI()); + File targetDir = dir.getParentFile(); + System.setProperty("hadoop.home.dir", targetDir.getAbsolutePath()); + } + + @BeforeEach + public void setUp() throws Exception { + accumuloClient = new InMemoryAccumuloClient("root", new InMemoryInstance(FrequencyMetadataAggregatorTest.class.toString())); + if (!accumuloClient.tableOperations().exists(TABLE_METADATA)) { + accumuloClient.tableOperations().create(TABLE_METADATA); + } + } + + @AfterEach + public void tearDown() throws Exception { + accumuloClient.tableOperations().deleteRows(TABLE_METADATA, null, null); + combineColumnVisibilities = null; + expected.clear(); + } + + /** + * Verify that aggregation of entries for the columns "f", "i", and "ri" in their non-aggregated format (e.g. when they're initially ingested) are + * aggregated correctly. + */ + @Test + void testDifferingColumnFamilies() throws TableNotFoundException { + // "f" rows. 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000004L, "20200102", 2L); // Latest timestamp. + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200103", 3L); + + // "i" rows. + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000004L, "20200102", 2L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_I, "csv", "FOO", 1500000003L, "20200103", 3L); + + // "ri" rows. + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000003L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000004L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000000L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000004L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000005L, "20200102", 2L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000001L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000003L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1500000004L, "20200103", 3L); + + expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 4L, "20200102", 10L, "20200103", 12L)); + expect("NAME", COLF_I, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 3L, "20200102", 8L, "20200103", 9L)); + expect("NAME", COLF_RI, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1500000005L, createDateFrequencyMap("20200101", 5L, "20200102", 12L, "20200103", 15L)); + + assertResults(); + } + + /** + * Verify that entries with the same name, column family, and column visibility are separated by their datatype. + */ + @Test + void testDifferingDatatypes() throws TableNotFoundException { + // Datatype "csv". + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000004L, "20200102", 2L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200103", 3L); + + // Datatype "wiki". + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000000L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000001L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000002L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000003L, "20200101", 3L); // Latest timestamp. + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000000L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000001L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000002L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000003L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000000L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000001L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000002L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1500000003L, "20200103", 2L); + + // Datatype "text". + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000000L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000001L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000002L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000015L, "20200102", 3L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000000L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000001L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000002L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000003L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000000L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000001L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000002L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "text", "FOO", 1500000003L, "20200104", 4L); + + expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 4L, "20200102", 10L, "20200103", 12L)); + expect("NAME", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1500000015L, createDateFrequencyMap("20200102", 12L, "20200103", 4L, "20200104", 16L)); + expect("NAME", COLF_F, "wiki" + NULL_BYTE + "AGGREGATED", "FOO", 1500000003L, createDateFrequencyMap("20200101", 12L, "20200102", 4L, "20200103", 8L)); + + assertResults(); + } + + /** + * Verify that when entries for the same field, column family, datatype, and date are aggregated, that the aggregated entries are still separated by their + * column visibility by default. + */ + @Test + public void testDifferingColumnVisibilities() throws TableNotFoundException { + // Column visibility "FOO". 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000004L, "20200102", 2L); // Latest timestamp. + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200103", 3L); + + // Column visibility "BAR". + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000000L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000001L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000002L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000003L, "20200101", 3L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000000L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000001L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000002L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000003L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000000L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000001L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000002L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000003L, "20200103", 2L); + + // Column visibility "COB". + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000000L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000001L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000002L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000015L, "20200102", 3L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000000L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000001L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000002L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000003L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000000L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000001L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000002L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000003L, "20200104", 4L); + + expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "BAR", 1500000003L, createDateFrequencyMap("20200101", 12L, "20200102", 4L, "20200103", 8L)); + expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "COB", 1500000015L, createDateFrequencyMap("20200102", 12L, "20200103", 4L, "20200104", 16L)); + expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 4L, "20200102", 10L, "20200103", 12L)); + + assertResults(); + } + + /** + * Verify that when the iterator option {@link FrequencyMetadataAggregator#COMBINE_VISIBILITIES_OPTION} is set to true, entries with same field, column + * family, datatype, and date are aggregated and their column visibilities are combined. + */ + @Test + public void testCombiningColumnVisibilities() throws TableNotFoundException { + // Column visibility "FOO". 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000004L, "20200102", 2L); // Latest timestamp. + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200103", 3L); + + // Column visibility "BAR". + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000000L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000001L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000002L, "20200101", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000003L, "20200101", 3L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000000L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000001L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000002L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000003L, "20200102", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000000L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000001L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000002L, "20200103", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "BAR", 1500000003L, "20200103", 2L); + + // Column visibility "COB". + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000000L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000001L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000002L, "20200102", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000015L, "20200102", 3L); // Latest timestamp. + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000000L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000001L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000002L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000003L, "20200103", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000000L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000001L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000002L, "20200104", 4L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "COB", 1500000003L, "20200104", 4L); + + // Enable the option to combine visibilities. 
+ givenCombineColumnVisibilitiesIsTrue(); + + expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "BAR&COB&FOO", 1500000015L, + createDateFrequencyMap("20200101", 16L, "20200102", 26L, "20200103", 24L, "20200104", 16L)); + + assertResults(); + } + + /** + * Verify that aggregating non-aggregated entries into a previously-aggregated row works correctly. + */ + @Test + void testAggregatedAndNonAggregatedEntries() throws TableNotFoundException { + // Aggregated entry. + givenAggregatedRow("NAME", COLF_F, "csv", "FOO", 1499999999L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + + // Non-aggregated entry. + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200101", 1L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200102", 2L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000004L, "20200102", 2L); // Latest timestamp. 
+ givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000000L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000001L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000002L, "20200103", 3L); + givenNonAggregatedRow("NAME", COLF_F, "csv", "FOO", 1500000003L, "20200103", 3L); + + expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 40L, "20200101", 19L, "20200102", 30L, "20200103", 12L)); + + assertResults(); + } + + /** + * Verify that entries not requiring any aggregation are not modified. + */ + @Test + void testNoAggregationNeeded() throws TableNotFoundException { + givenAggregatedRow("NAME", COLF_F, "csv", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + givenAggregatedRow("NAME", COLF_I, "csv", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + givenAggregatedRow("NAME", COLF_RI, "csv", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + givenAggregatedRow("NAME", COLF_F, "text", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); + givenAggregatedRow("NAME", COLF_I, "text", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); + givenAggregatedRow("NAME", COLF_RI, "text", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); + givenAggregatedRow("NAME", COLF_F, "wiki", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); + givenAggregatedRow("NAME", COLF_I, "wiki", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); + givenAggregatedRow("NAME", COLF_RI, "wiki", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); + givenAggregatedRow("GENDER", COLF_F, "attr", "BAR", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 
11L)); + givenAggregatedRow("GENDER", COLF_I, "attr", "BAR", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + givenAggregatedRow("GENDER", COLF_RI, "attr", "BAR", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + givenAggregatedRow("GENDER", COLF_F, "attr", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + givenAggregatedRow("GENDER", COLF_I, "attr", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + givenAggregatedRow("GENDER", COLF_RI, "attr", "FOO", 1499999995L, createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + + expect("GENDER", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999995L, + createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, + createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999995L, + createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, + createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_RI, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999995L, + createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("GENDER", COLF_RI, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, + createDateFrequencyMap("20191220", 20L, "20191225", 10L, "20191230", 11L)); + expect("NAME", COLF_F, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + expect("NAME", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); + expect("NAME", 
COLF_F, "wiki" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); + expect("NAME", COLF_I, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + expect("NAME", COLF_I, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); + expect("NAME", COLF_I, "wiki" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); + expect("NAME", COLF_RI, "csv" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, + createDateFrequencyMap("20191225", 40L, "20200101", 15L, "20200102", 20L)); + expect("NAME", COLF_RI, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20200101", 20L, "20200102", 10L)); + expect("NAME", COLF_RI, "wiki" + NULL_BYTE + "AGGREGATED", "FOO", 1499999995L, createDateFrequencyMap("20191225", 20L, "20200101", 10L)); + + assertResults(); + } + + /** + * Test aggregation over a more diverse dataset of mixed aggregated and non-aggregated rows. 
+ */ + @Test + void testDiverseDataset() throws TableNotFoundException { + givenAggregatedRow("AGE", COLF_F, "num", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "num", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("GENDER", COLF_F, "text", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_F, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_I, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + givenNonAggregatedRow("AGE", COLF_F, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "var", "BAR", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new datatype. + givenNonAggregatedRow("GENDER", COLF_F, "text", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new colvis. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. 
+ givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_I, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + + expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "var" + NULL_BYTE + "AGGREGATED", "BAR", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_I, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 3L)); + expect("JOB", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("NAME", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("NAME", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + assertResults(); + } + + /** + * Test aggregation over 
a dataset that contains index markers. + */ + @Test + void testIndexMarkers() throws TableNotFoundException { + givenAggregatedRow("AGE", COLF_F, "num", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "num", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("GENDER", COLF_F, "text", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_F, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_I, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + givenNonAggregatedRow("AGE", COLF_F, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "var", "BAR", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new datatype. + givenNonAggregatedRow("GENDER", COLF_F, "text", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new colvis. 
+ givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_I, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + + // Entries with index markers that should not be aggregated. + givenMutation("AGE", COLF_I, "num" + NULL_BYTE + "20191230" + NULL_BYTE + "true", "BAR", 1400000005L, new Value()); + givenMutation("JOB", COLF_RI, "attr" + NULL_BYTE + "20190530" + NULL_BYTE + "false", "FOO", 1500000004L, new Value()); + givenMutation("NAME", COLF_I, "attr" + NULL_BYTE + "20171201" + NULL_BYTE + "true", "BAR", 1500000004L, new Value()); + + expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "var" + NULL_BYTE + "AGGREGATED", "BAR", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_I, "num" + NULL_BYTE + "20191230" + NULL_BYTE + "true", "BAR", 1400000005L, new Value()); + expect("AGE", COLF_I, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("GENDER", 
COLF_F, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 3L)); + expect("JOB", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", COLF_RI, "attr" + NULL_BYTE + "20190530" + NULL_BYTE + "false", "FOO", 1500000004L, new Value()); + expect("NAME", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("NAME", COLF_I, "attr" + NULL_BYTE + "20171201" + NULL_BYTE + "true", "BAR", 1500000004L, new Value()); + expect("NAME", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + assertResults(); + } + + /** + * Test aggregation over a dataset that contains legacy formats. + */ + @Test + void testLegacyFormats() throws TableNotFoundException { + givenAggregatedRow("AGE", COLF_F, "num", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "num", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("GENDER", COLF_F, "text", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_F, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_I, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + givenNonAggregatedRow("AGE", COLF_F, "num", "FOO", 
1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "var", "BAR", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new datatype. + givenNonAggregatedRow("GENDER", COLF_F, "text", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new colvis. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_I, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + + // Entries with legacy formats that should not be aggregated. 
+ givenMutation("AGE", COLF_I, "num", "BAR", 1400000005L, new Value()); + givenMutation("JOB", COLF_RI, "attr" + NULL_BYTE + "FakeTypeClassName", "FOO", 1500000004L, new Value()); + + expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "var" + NULL_BYTE + "AGGREGATED", "BAR", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_I, "num", "BAR", 1400000005L, new Value()); + expect("AGE", COLF_I, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 3L)); + expect("JOB", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", COLF_RI, "attr" + NULL_BYTE + "FakeTypeClassName", "FOO", 1500000004L, new Value()); + expect("NAME", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("NAME", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + assertResults(); + } + + /** + * Verify that scanning over a table with columns that are not to be aggregated results in them being 
unchanged. + */ + @Test + void testMixedColumns() throws TableNotFoundException { + givenAggregatedRow("AGE", COLF_F, "num", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1499999995L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "num", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1499999999L, createDateFrequencyMap("20191225", 1L, "20200101", 1L, "20200102", 1L)); + givenAggregatedRow("GENDER", COLF_F, "text", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_F, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + givenAggregatedRow("NAME", COLF_I, "attr", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + givenNonAggregatedRow("AGE", COLF_F, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "num", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_I, "lifetime", "FOO", 1500000004L, "20200101", 1L); // Should be aggregated into existing aggregated entry. + givenNonAggregatedRow("AGE", COLF_F, "var", "BAR", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new datatype. + givenNonAggregatedRow("GENDER", COLF_F, "text", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new colvis. 
+ givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_F, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + givenNonAggregatedRow("JOB", COLF_I, "attr", "FOO", 1500000004L, "20200101", 1L); // Should result in new aggregated entry because new row. + + // Non frequency rows that should not be affected by aggregation. + givenMutation("AGE", COLF_E, "num", "BAR", 1400000005L, new Value()); + givenMutation("AGE", COLF_DESC, "num", "BAR", 1400000005L, new Value("age_num description")); + givenMutation("AGE", COLF_E, "lifetime", "BAR", 1400000005L, new Value()); + givenMutation("AGE", COLF_DESC, "lifetime", "BAR", 1400000005L, new Value("age_lifetime description")); + givenMutation("AGE", COLF_DESC, "var", "BAR", 1400000005L, new Value("age_var description")); + givenMutation("JOB", COLF_E, "attr", "BAR", 1400000005L, new Value()); + givenMutation("JOB", COLF_DESC, "attr", "BAR", 1400000005L, new Value("job_attr description")); + givenMutation("GENDER", COLF_DESC, "text", "BAR", 1400000005L, new Value("gender_text description")); + givenMutation("JOB", new Text("m"), "attr", "BAR", 1400000005L, new Value()); + + expect("AGE", COLF_DESC, "lifetime", "BAR", 1400000005L, new Value("age_lifetime description")); + expect("AGE", COLF_DESC, "num", "BAR", 1400000005L, new Value("age_num description")); + expect("AGE", COLF_DESC, "var", "BAR", 1400000005L, new Value("age_var description")); + expect("AGE", COLF_E, "lifetime", "BAR", 1400000005L, new Value()); + expect("AGE", COLF_E, "num", "BAR", 1400000005L, new Value()); + expect("AGE", COLF_F, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, 
"20200102", 1L)); + expect("AGE", COLF_F, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_F, "var" + NULL_BYTE + "AGGREGATED", "BAR", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("AGE", COLF_I, "lifetime" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, + createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("AGE", COLF_I, "num" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20191225", 1L, "20200101", 2L, "20200102", 1L)); + expect("GENDER", COLF_DESC, "text", "BAR", 1400000005L, new Value("gender_text description")); + expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("GENDER", COLF_F, "text" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", COLF_DESC, "attr", "BAR", 1400000005L, new Value("job_attr description")); + expect("JOB", COLF_E, "attr", "BAR", 1400000005L, new Value()); + expect("JOB", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 3L)); + expect("JOB", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "FOO", 1500000004L, createDateFrequencyMap("20200101", 1L)); + expect("JOB", new Text("m"), "attr", "BAR", 1400000005L, new Value()); + expect("NAME", COLF_F, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + expect("NAME", COLF_I, "attr" + NULL_BYTE + "AGGREGATED", "BAR", 1499999999L, createDateFrequencyMap("20200101", 1L, "20200102", 1L)); + + assertResults(); + } + + private void assertResults() throws TableNotFoundException { + TestUtils.writeMutations(accumuloClient, TABLE_METADATA, mutations); + Scanner scanner = createScanner(); + List> actual = new ArrayList<>(); + for (Map.Entry entry : scanner) { + actual.add(new 
AbstractMap.SimpleEntry<>(entry.getKey(), entry.getValue())); + System.out.println("Key: " + entry.getKey()); + } + + Assertions.assertEquals(expected, actual); + } + + private Scanner createScanner() throws TableNotFoundException { + Scanner scanner = ScannerHelper.createScanner(accumuloClient, TABLE_METADATA, AUTHS_SET); + scanner.setRange(new Range()); + + IteratorSetting iteratorSetting = new IteratorSetting(10, FrequencyMetadataAggregator.class); + if (combineColumnVisibilities != null) { + iteratorSetting.addOption(FrequencyMetadataAggregator.COMBINE_VISIBILITIES_OPTION, String.valueOf(combineColumnVisibilities)); + } + iteratorSetting.addOption(FrequencyMetadataAggregator.COLUMNS_OPTION, "f,i,ri"); + scanner.addScanIterator(iteratorSetting); + + return scanner; + } + + private void givenCombineColumnVisibilitiesIsTrue() { + this.combineColumnVisibilities = true; + } + + private void givenNonAggregatedRow(String row, Text colf, String datatype, String colv, long timestamp, String date, long count) { + givenMutation(row, colf, datatype + NULL_BYTE + date, colv, timestamp, new Value(LongCombiner.VAR_LEN_ENCODER.encode(count))); + } + + private void givenAggregatedRow(String row, Text colf, String datatype, String colv, long timestamp, DateFrequencyMap map) { + givenMutation(row, colf, datatype + NULL_BYTE + FrequencyMetadataAggregator.AGGREGATED, colv, timestamp, new Value(WritableUtils.toByteArray(map))); + } + + private void givenMutation(String row, Text colf, String colq, String colv, long timestamp, Value value) { + Mutation mutation = new Mutation(row); + mutation.put(colf, new Text(colq), new ColumnVisibility(colv), timestamp, value); + this.mutations.add(mutation); + } + + private void expect(String row, Text colf, String colq, String colv, long timestamp, DateFrequencyMap map) { + expect(row, colf, colq, colv, timestamp, new Value(WritableUtils.toByteArray(map))); + } + + private void expect(String row, Text colf, String colq, String colv, long 
timestamp, Value value) { + expected.add(new AbstractMap.SimpleEntry<>(new Key(new Text(row), colf, new Text(colq), new ColumnVisibility(colv), timestamp), value)); + } +} diff --git a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java index 70b45c2..e146574 100644 --- a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java +++ b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java @@ -1,10 +1,15 @@ package datawave.query.util; +import static datawave.data.ColumnFamilyConstants.COLF_F; +import static datawave.query.util.TestUtils.createDateFrequencyMap; +import static datawave.query.util.TestUtils.createRangedDateFrequencyMap; +import static org.apache.accumulo.core.iterators.LongCombiner.VAR_LEN_ENCODER; + import java.io.File; import java.io.IOException; import java.net.URISyntaxException; +import java.util.AbstractMap; import java.util.ArrayList; -import java.util.Calendar; import java.util.Collection; import java.util.Collections; import java.util.Date; @@ -26,14 +31,16 @@ import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.accumulo.core.data.Mutation; import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.iterators.user.SummingCombiner; import org.apache.accumulo.core.security.Authorizations; import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; @@ -43,7 +50,9 @@ import datawave.accumulo.inmemory.InMemoryAccumuloClient; import datawave.accumulo.inmemory.InMemoryInstance; import 
datawave.data.type.LcNoDiacriticsType; +import datawave.iterators.FrequencyMetadataAggregator; import datawave.query.composite.CompositeMetadataHelper; +import datawave.query.model.DateFrequencyMap; import datawave.query.model.FieldIndexHole; import datawave.util.time.DateHelper; @@ -55,6 +64,8 @@ class AllFieldMetadataHelperTest { private AccumuloClient accumuloClient; private AllFieldMetadataHelper helper; + private final List mutations = new ArrayList<>(); + @BeforeAll static void beforeAll() throws URISyntaxException { File dir = new File(Objects.requireNonNull(ClassLoader.getSystemClassLoader().getResource(".")).toURI()); @@ -101,15 +112,150 @@ private void writeMutations(Collection mutations) { } /** - * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and - * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)}. + * Write the given mutations to the metadata table. + */ + private void writeMutations() { + TestUtils.writeMutations(accumuloClient, TABLE_METADATA, mutations); + } + + private void givenNonAggregatedFrequencyRows(String row, String colf, String datatype, String startDate, String endDate, long count) { + givenNonAggregatedFrequencyRows(row, new Text(colf), datatype, startDate, endDate, count); + } + + private void givenNonAggregatedFrequencyRows(String row, Text colf, String datatype, String startDate, String endDate, long count) { + Mutation mutation = new Mutation(row); + Value value = new Value(VAR_LEN_ENCODER.encode(count)); + List dates = TestUtils.getDatesInRange(startDate, endDate); + dates.forEach((date) -> mutation.put(colf, new Text(datatype + NULL_BYTE + date), value)); + givenMutation(mutation); + } + + private void givenIndexMarkerMutation(String row, String colf, String datatype, String date, boolean indexed) { + Mutation mutation = new Mutation(row); + mutation.put(colf, datatype + NULL_BYTE + date + NULL_BYTE + indexed, new Value()); + mutations.add(mutation); + } + + private void 
givenIndexMarkerMutation(String row, String colf, String datatype, String date) { + Mutation mutation = new Mutation(row); + mutation.put(colf, datatype, DateHelper.parse(date).getTime(), new Value()); + mutations.add(mutation); + } + + private void givenIndexMarkerMutation(String row, String colf, String datatype, String date, Class typeClass) { + Mutation mutation = new Mutation(row); + mutation.put(colf, datatype + NULL_BYTE + typeClass.getName(), DateHelper.parse(date).getTime(), new Value()); + mutations.add(mutation); + } + + private void givenAggregatedFrequencyRow(String row, String colf, String datatype, DateFrequencyMap map) { + givenAggregatedFrequencyRow(row, new Text(colf), datatype, map); + } + + private void givenAggregatedFrequencyRow(String row, Text colf, String datatype, DateFrequencyMap map) { + Mutation mutation = new Mutation(row); + Value value = new Value(WritableUtils.toByteArray(map)); + mutation.put(colf, new Text(datatype + NULL_BYTE + FrequencyMetadataAggregator.AGGREGATED), value); + givenMutation(mutation); + } + + private void givenMutation(Mutation mutation) { + this.mutations.add(mutation); + } + + /** + * Tests for {@link AllFieldMetadataHelper#getCountsByFieldInDayWithTypes(Map.Entry)}. */ @Nested - public class FieldIndexHoleTests { + public class CountsByFieldInDayWithTypesTests { + + /** + * Test against a table that has only non-aggregated entries as matches. + */ + @Test + void testNonAggregatedEntriesOnly() throws TableNotFoundException, IOException { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 2L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200120", 3L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200101", "20200102", 3L); // Does not contain target date. 
+ givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 1L); + expected.put("wiki", 2L); + expected.put("maze", 3L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes(new AbstractMap.SimpleEntry<>("NAME", "20200110")); + + Assertions.assertEquals(expected, actual); + } - private Set fields = new HashSet<>(); - private Set datatypes = new HashSet<>(); - private double minimumThreshold = 1.0d; + /** + * Test against a table that has only aggregated entries as matches. + */ + @Test + void testAggregatedEntriesOnly() throws TableNotFoundException, IOException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); // Does not contain target date. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 5L); + expected.put("wiki", 15L); + expected.put("maze", 55L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes(new AbstractMap.SimpleEntry<>("NAME", "20200102")); + + Assertions.assertEquals(expected, actual); + } + + /** + * Test against a table that has both aggregated and non-aggregated entries as matches. + */ + @Test + void testMixedEntryFormats() throws TableNotFoundException, IOException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); // Should get summed into previous. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200115", "20200120", 3L); // Does not have entry for 20200102, should not be incremented. + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); // Does not contain target date. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200103", "20200105", 3L); // Does not contain target date. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 6L); + expected.put("wiki", 15L); + expected.put("maze", 55L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes(new AbstractMap.SimpleEntry<>("NAME", "20200102")); + + Assertions.assertEquals(expected, actual); + } + } + + /** + * Base class for field index hole tests. + */ + public abstract class AbstractFieldIndexHoleTests { + + protected Set fields = new HashSet<>(); + protected Set datatypes = new HashSet<>(); + protected double minimumThreshold = 1.0d; protected final Supplier>> INDEX_FUNCTION = () -> { try { @@ -138,6 +284,44 @@ void tearDown() { givenMinimumThreshold(1.0d); } + protected void givenFields(String... fields) { + this.fields = Sets.newHashSet(fields); + } + + protected void givenDatatypes(String... datatypes) { + this.datatypes = Sets.newHashSet(datatypes); + } + + protected void givenMinimumThreshold(double minimumThreshold) { + this.minimumThreshold = minimumThreshold; + } + + protected Map> createFieldIndexHoleMap(FieldIndexHole... 
holes) { + Map> fieldIndexHoles = new HashMap<>(); + for (FieldIndexHole hole : holes) { + Map datatypeMap = fieldIndexHoles.computeIfAbsent(hole.getFieldName(), k -> new HashMap<>()); + datatypeMap.put(hole.getDatatype(), hole); + } + return fieldIndexHoles; + } + + @SafeVarargs + protected final FieldIndexHole createFieldIndexHole(String field, String datatype, Pair... dateRanges) { + return new FieldIndexHole(field, datatype, Sets.newHashSet(dateRanges)); + } + + protected Pair dateRange(String start, String end) { + return Pair.of(DateHelper.parse(start), DateHelper.parse(end)); + } + } + + /** + * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and + * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)}. + */ + @Nested + public class FieldIndexHoleTestsForNonAggregatedEntries extends AbstractFieldIndexHoleTests { + /** * Test against data that has no field index holes. */ @@ -145,20 +329,19 @@ void tearDown() { @ValueSource(strings = {"i", "ri"}) void testNoFieldIndexHoles(String cf) { // Create a series of frequency rows over date ranges, each with a matching index row for each date. 
- FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("EVENT_DATE", "csv", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "csv", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200101", "20200120", 1L); - mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200101", "20200120", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 1L); + 
givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200101", "20200120", 1L); + writeMutations(); // Verify that no index holes were found. Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); @@ -171,11 +354,10 @@ void testNoFieldIndexHoles(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -190,12 +372,11 @@ void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200105", 1L); // Make the index counts a value that will not meet the threshold. 
- mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200105", 1L); // Make the index counts a value that will not meet the threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -210,12 +391,11 @@ void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -231,13 +411,12 @@ void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) 
void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105", 5L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -252,12 +431,11 @@ void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + 
givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -273,13 +451,12 @@ void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200105", 1L); // Will not meet threshold. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -295,14 +472,13 @@ void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleWithNotIndexedMarker(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L); - mutationCreator.addIndexMarkerMutation(cf, "NAME", "wiki", "20200109", false); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200109", false); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -319,13 +495,12 @@ void testFieldIndexHoleWithNotIndexedMarker(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleWithIndexedMarker(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - 
mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 1L); - mutationCreator.addIndexMarkerMutation(cf, "NAME", "wiki", "20200103", true); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", true); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -342,13 +517,12 @@ void testFieldIndexHoleWithIndexedMarker(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleWithIndexedMarkerSansDate(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 1L); - mutationCreator.addIndexMarkerMutation(cf, "NAME", "wiki", "20200103"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103"); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", 
"20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -365,13 +539,12 @@ void testFieldIndexHoleWithIndexedMarkerSansDate(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleWithIndexedMarkerOldTypeFormat(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 1L); - mutationCreator.addIndexMarkerMutation(cf, "NAME", "wiki", "20200103", LcNoDiacriticsType.class); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", LcNoDiacriticsType.class); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -383,14 +556,13 @@ void testFieldIndexHoleWithIndexedMarkerOldTypeFormat(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleWithIndexedMarkerAndMissingFrequency(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200102", 1L); - mutationCreator.addIndexMarkerMutation(cf, "NAME", "wiki", "20200103", true); - 
mutationCreator.addFrequencyMutations("NAME", "wiki", "20200104", "20200110", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200102", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", true); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200104", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -405,24 +577,23 @@ void testFieldIndexHoleWithIndexedMarkerAndMissingFrequency(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testMixedDateGapsAndNonIndexedFields(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Not indexed nor covers full range for NAME - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -441,14 +612,13 @@ void testMixedDateGapsAndNonIndexedFields(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200106", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 5L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200106", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:on @@ -463,12 +633,11 @@ void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200106", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200113", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200117", "20200118", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200106", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200113", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200117", "20200118", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -487,16 +656,15 @@ void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200106", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200109", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200113", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200114", "20200116", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200117", "20200118", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200119", "20200120", 1L); // Will not meet threshold. - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200106", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200109", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200113", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200114", "20200116", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200117", "20200118", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200119", "20200120", 1L); // Will not meet threshold. 
+ writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -516,12 +684,11 @@ void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 1L); - mutationCreator.addFrequencyMutations("ZETA", "csv", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200101", "20200105", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 1L); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -538,13 +705,12 @@ void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String c @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200105", 1L); // Will not meet threshold. 
- mutationCreator.addFrequencyMutations("ZETA", "csv", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200101", "20200105", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -560,12 +726,11 @@ void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200110", "20200115", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200113", "20200115", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200110", "20200115", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200113", "20200115", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -581,14 +746,13 @@ void 
testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105", 1L); // Will not meet threshold. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200112", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200113", "20200115", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200112", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200113", "20200115", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -604,12 +768,11 @@ void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testAllDatesAreIndexHoles_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 1L); - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 1L); - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 1L); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -628,16 +791,15 @@ void testAllDatesAreIndexHoles_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testAllDatesAreIndexHoles_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200105", 1L); // Will not meet threshold. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200115", 1L); // Will not meet threshold. 
- mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200125", 1L); // Will not meet threshold. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200328", 1L); // Will not meet threshold. - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200115", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200125", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200328", 1L); // Will not meet threshold. + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -656,26 +818,25 @@ void testAllDatesAreIndexHoles_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testSingularDayIndexHoles_dateGaps(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. 
- mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 1L); // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 1L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 1L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 1L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 1L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 1L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
- mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 1L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 1L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 1L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 1L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 1L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 1L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 1L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 1L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 1L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 1L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -695,34 +856,33 @@ void testSingularDayIndexHoles_dateGaps(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testSingularDayIndexHoles_threshold(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200113", "20200113", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200113", "20200113", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200303", "20200303", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -742,31 +902,30 @@ void testSingularDayIndexHoles_threshold(String cf) { @ParameterizedTest @ValueSource(strings = {"i", "ri"}) void testMixedDateGapsAndThresholdIndexHoles(String cf) { - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -788,31 +947,30 @@ void testMixedDateGapsAndThresholdIndexHoles(String cf) { void testMinimumThresholdPercentageBelow100(String cf) { givenMinimumThreshold(0.75); // Index count must meet 75% of frequency count to not be considered field index hole. - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 100L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 75L); // Meets 75% threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 100L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 100L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 75L); // Meets 75% threshold. 
+ givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 100L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 74L); // Will not meet threshold. // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 100L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 74L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 75L); // Meets 75% threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 100L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 100L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 75L); // Meets 75% threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 100L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 100L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 98L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 74L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 75L); // Meets 75% threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 100L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 98L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 75L); // Meets 75% threshold. // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
- mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 100L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 100L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 74L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 90L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 75L); // Meets 75% threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 74L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 99L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 100L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 100L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 90L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 75L); // Meets 75% threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 74L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 99L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -835,31 +993,30 @@ void testOneFieldSpecified(String cf) { // Retrieve field index holes for field NAME. givenFields("NAME"); - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. 
- mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -879,39 +1036,38 @@ void testMultipleFieldsSpecified(String cf) { // Retrieve field index holes for fields URI and EVENT_DATE. givenFields("URI", "EVENT_DATE"); - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for ALPHA-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("ALPHA", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200113", "20200113", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200303", "20200303", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); // Index hole for ZETA-wiki on 20200122. - mutationCreator.addFrequencyMutations("ZETA", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ZETA", "wiki", "20200123", "20200125", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200123", "20200125", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -932,55 +1088,54 @@ void testDatatypesSpecified(String cf) { // Retrieve field index holes for datatypes wiki and csv. givenDatatypes("wiki", "csv"); - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200105", "20200105", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for ALPHA-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("ALPHA", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200113", "20200113", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index hole for EVENT_DATE-maze on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200303", "20200303", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); // Index hole for ZETA-csv on 20200122. - mutationCreator.addFrequencyMutations("ZETA", "csv", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200123", "20200125", 5L); // Index hole for ZETA-imdb on 20200122. 
- mutationCreator.addFrequencyMutations("ZETA", "imdb", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200123", "20200125", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "imdb", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200123", "20200125", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -1004,55 +1159,54 @@ void testFieldsAndDatatypesSpecified(String cf) { // Retrieve field index holes for datatypes wiki and csv. givenDatatypes("wiki", "csv"); - FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200103", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for NAME-wiki on 20200103 and 20200105. - mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200105", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200102", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200103", "20200103", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200104", "20200104", 5L); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200105", "20200105", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200105", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200103", "20200103", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "maze", "20200105", "20200105", 1L); // Will not meet threshold. // Index holes for ALPHA-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("ALPHA", "csv", "20200110", "20200115", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200110", "20200110", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200111", "20200112", 5L); - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200113", "20200113", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200114", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200110", "20200110", 1L); // Will not meet threshold. 
+ givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); // Index hole for EVENT_DATE-maze on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200123", "20200125", 5L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
- mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", COLF_F, "maze", "20200216", "20200328", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200220", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200221", "20200221", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200222", "20200302", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200303", "20200303", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200304", "20200315", 5L); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200316", "20200316", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200317", "20200328", 5L); // Index hole for ZETA-csv on 20200122. - mutationCreator.addFrequencyMutations("ZETA", "csv", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200122", "20200122", 1L); // Will not meet threshold. 
- mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200123", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200123", "20200125", 5L); // Index hole for ZETA-imdb on 20200122. - mutationCreator.addFrequencyMutations("ZETA", "imdb", "20200120", "20200125", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200120", "20200121", 5L); - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200122", "20200122", 1L); // Will not meet threshold. - mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200123", "20200125", 5L); - writeMutations(mutationCreator.getMutations()); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "imdb", "20200120", "20200125", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200120", "20200121", 5L); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200122", "20200122", 1L); // Will not meet threshold. + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200123", "20200125", 5L); + writeMutations(); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off @@ -1062,120 +1216,1670 @@ void testFieldsAndDatatypesSpecified(String cf) { // @formatter:on Assertions.assertEquals(expected, fieldIndexHoles); } - - private void givenFields(String... fields) { - this.fields = Sets.newHashSet(fields); - } - - private void givenDatatypes(String... datatypes) { - this.datatypes = Sets.newHashSet(datatypes); - } - - private void givenMinimumThreshold(double minimumThreshold) { - this.minimumThreshold = minimumThreshold; - } - - protected Map> createFieldIndexHoleMap(FieldIndexHole... 
holes) { - Map> fieldIndexHoles = new HashMap<>(); - for (FieldIndexHole hole : holes) { - Map datatypeMap = fieldIndexHoles.computeIfAbsent(hole.getFieldName(), k -> new HashMap<>()); - datatypeMap.put(hole.getDatatype(), hole); - } - return fieldIndexHoles; - } - - @SafeVarargs - protected final FieldIndexHole createFieldIndexHole(String field, String datatype, Pair... dateRanges) { - return new FieldIndexHole(field, datatype, Sets.newHashSet(dateRanges)); - } - - protected Pair dateRange(String start, String end) { - return Pair.of(DateHelper.parse(start), DateHelper.parse(end)); - } } /** - * Helper class for creating mutations in bulk for field index hole tests. + * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and + * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)} where the metadata table contains aggregated entries only. */ - private static class FieldIndexHoleMutationCreator { - - private final List mutations = new ArrayList<>(); + @Nested + public class FieldIndexHoleTestsForAggregatedEntries extends AbstractFieldIndexHoleTests { - private void addFrequencyMutations(String fieldName, String datatype, String startDate, String endDate, long count) { - List dates = getDatesInRange(startDate, endDate); - dates.forEach(date -> addMutation(fieldName, "f", datatype, date, count)); + /** + * Test against data that has no field index holes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testNoFieldIndexHoles(String cf) { + // Create a series of frequency rows over date ranges, each with a matching index row for each date. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + writeMutations(); + + // Verify that no index holes were found. + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + Assertions.assertTrue(fieldIndexHoles.isEmpty()); } - private void addIndexMutations(String cf, String fieldName, String datatype, String startDate, String endDate, long count) { - List dates = getDatesInRange(startDate, endDate); - dates.forEach(date -> addMutation(fieldName, cf, datatype, date, count)); + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addIndexMarkerMutation(String cf, String fieldName, String datatype, String endDate, boolean indexed) { - addMutation(fieldName, cf, datatype, endDate, indexed); + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on the threshold requirement. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); // Make the index counts a value that will not meet the threshold. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addIndexMarkerMutation(String cf, String fieldName, String datatype, String endDate) { - addMutation(fieldName, cf, datatype, endDate); + /** + * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType combination based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200104", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addIndexMarkerMutation(String cf, String fieldName, String datatype, String endDate, Class typeClass) { - addMutation(fieldName, cf, datatype, endDate, typeClass); + /** + * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType 
combination based on the + * threshold requirement. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200104", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private List getDatesInRange(String startDateStr, String endDateStr) { - Date startDate = DateHelper.parse(startDateStr); - Date endDate = DateHelper.parse(endDateStr); - - List dates = new ArrayList<>(); - dates.add(startDateStr); - - Calendar calendar = Calendar.getInstance(); - calendar.setTime(startDate); - while (true) { - calendar.add(Calendar.DAY_OF_MONTH, 1); - Date date = calendar.getTime(); - if (date.before(endDate) || date.equals(endDate)) { - dates.add(DateHelper.format(date)); - } else { - break; - } - } + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); - return dates; + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addMutation(String row, String columnFamily, String datatype, String date, long count) { - Mutation mutation = new Mutation(row); - mutation.put(columnFamily, datatype + NULL_BYTE + date, new Value(SummingCombiner.VAR_LEN_ENCODER.encode(count))); - mutations.add(mutation); + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on the + * threshold requirement. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addMutation(String row, String columnFamily, String datatype, String date, boolean indexed) { - Mutation mutation = new Mutation(row); - mutation.put(columnFamily, datatype + NULL_BYTE + date + NULL_BYTE + indexed, new Value()); - mutations.add(mutation); + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. + * This uses a negative index marker. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithNotIndexedMarker(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200109", false); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200109"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addMutation(String row, String columnFamily, String datatype, String date) { - Mutation mutation = new Mutation(row); - mutation.put(columnFamily, datatype, getTimestamp(date), new Value()); - mutations.add(mutation); + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. + * This uses a positive index marker. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarker(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", true); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private void addMutation(String row, String columnFamily, String datatype, String date, Class type) { - Mutation mutation = new Mutation(row); - mutation.put(columnFamily, datatype + NULL_BYTE + type.getName(), getTimestamp(date), new Value()); - mutations.add(mutation); + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ * This uses a positive index marker derived from an older date-less format + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarkerSansDate(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103"); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private long getTimestamp(String date) { - return DateHelper.parse(date).getTime(); + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ * This uses a positive index marker derived from an older date-less format with type class + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarkerOldTypeFormat(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", LcNoDiacriticsType.class); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); } - private List getMutations() { - return mutations; + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarkerAndMissingFrequency(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", true); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200104", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = 
createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200107", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200104", "20200106", 1L, "20200107", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on both date + * gaps and the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200105", "20200106", 1L, "20200107", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200104", "20200106", 1L, "20200110", "20200113", 1L, "20200117", "20200118", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200104", "20200106", 5L, "20200107", + "20200109", 1L, "20200110", "20200113", 5L, "20200114", "20200116", 1L, "20200117", "20200118", 5L, "20200119", "20200120", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range 
right before a new fieldName-datatype combination based on + * date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination based on + * the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200105", 1L)); + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L, "20200110", "20200115", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200113", "20200115", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L, "20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200104", "20200105", 1L, "20200110", + "20200112", 1L, "20200113", "20200115", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where everything is an index hole based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where 
everything is an index hole based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); // Will not meet threshold. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); // Will not meet threshold. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); // Will not meet threshold. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); // Will not meet threshold. + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_dateGaps(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L, "20200104", "20200104", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200111", "20200112", 1L, "20200114", "20200115", 1L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200121", 1L, "20200123", "20200125", 1L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 1L, "20200222", "20200302", 1L, "20200304", + "20200315", 1L, "20200317", "20200328", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based 
on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_threshold(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on both dates and the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMixedDateGapsAndThresholdIndexHoles(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200104", "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", + createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying a minimum percentage threshold other than the default of 1.0. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMinimumThresholdPercentageBelow100(String cf) { + givenMinimumThreshold(0.75); // Index count must meet 75% of frequency count to not be considered field index hole. + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 100L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 75L, "20200104", "20200104", 100L, "20200105", "20200105", 74L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 100L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", + createRangedDateFrequencyMap("20200110", "20200110", 74L, "20200111", "20200112", 75L, "20200114", "20200115", 100L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 100L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 98L, "20200122", "20200122", 74L, "20200123", "20200125", 75L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 100L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 100L, "20200221", "20200221", 74L, "20200222", + "20200302", 90L, "20200304", "20200315", 75L, "20200316", "20200316", 74L, "20200317", "20200328", 99L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying one field to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testOneFieldSpecified(String cf) { + // Retrieve field index holes for field NAME. + givenFields("NAME"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200104", "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", + createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying multiple fields to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldsSpecified(String cf) { + // Retrieve field index holes for fields URI and EVENT_DATE. + givenFields("URI", "EVENT_DATE"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. 
+ givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-wiki on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying datatypes. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testDatatypesSpecified(String cf) { + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for EVENT_DATE-maze on 20200122. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-csv on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for ZETA-imdb on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "imdb", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "imdb", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ALPHA", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying fields and datatypes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldsAndDatatypesSpecified(String cf) { + // Retrieve field index holes for fields NAME and ZETA. + givenFields("NAME", "ZETA"); + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for EVENT_DATE-maze on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-csv on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for ZETA-imdb on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "imdb", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "imdb", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + } + + /** + * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and + * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)} where the metadata table contains both aggregated and non-aggregated entries. + */ + @Nested + public class FieldIndexHoleTestsForMixedAggregatedAndNonAggregatedEntries extends AbstractFieldIndexHoleTests { + + /** + * Test against data that has no field index holes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testNoFieldIndexHoles(String cf) { + // Create a series of frequency rows over date ranges, each with a matching index row for each date. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "csv", "20200101", "20200120", 1L); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200111", "20200120", 1L); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", createRangedDateFrequencyMap("20200101", "20200114", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200115", "20200120", 1L); + writeMutations(); + + // Verify that no index holes were found. + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + Assertions.assertTrue(fieldIndexHoles.isEmpty()); + } + + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200104", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on the threshold requirement. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index 
hole at the start of a frequency date range for a given fieldName-dataType combination based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200104", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200104", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType combination based on the + * threshold requirement. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200105", 5L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on the + * threshold requirement. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. + * This uses a negative index marker. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithNotIndexedMarker(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200103", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200109", false); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200109"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. + * This uses a positive index marker. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarker(String cf) { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200110", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", true); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ * This uses a positive index marker derived from an older date-less format + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarkerSansDate(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103"); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ * This uses a positive index marker derived from an older date-less format with type class + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarkerOldTypeFormat(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", LcNoDiacriticsType.class); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleWithIndexedMarkerAndMissingFrequency(String cf) { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200102", 1L); + givenIndexMarkerMutation("NAME", cf, "wiki", "20200103", true); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200104", "20200110", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200107", "20200110", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200105", 1L); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that 
has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200110", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200104", "20200106", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 5L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200101", "20200105", 5L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on both date + * gaps and the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200107", 5L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200108", "20200110", 5L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200105", "20200106", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200107", "20200110", 5L); + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200114", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200115", "20200120", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200104", "20200106", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200113", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200117", "20200118", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200120", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L, "20200104", "20200106", 5L, "20200107", + "20200109", 1L, "20200110", "20200113", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200114", "20200116", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200117", "20200118", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200119", "20200120", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination based on + * date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenNonAggregatedFrequencyRows("ZETA", COLF_F, "csv", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("ZETA", cf, "csv", "20200101", "20200105", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination based on + * the threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200103", "20200105", 1L); + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200110", "20200115", 1L); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200113", "20200115", 1L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L, "20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200103", 5L, "20200104", "20200105", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200110", "20200112", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200113", "20200115", 5L); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where everything is an index hole based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles_dateGaps(String cf) { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200105", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 1L); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where everything is an index hole based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles_threshold(String cf) { + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200105", 1L); // Will not meet threshold. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); // Will not meet threshold. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200120", "20200125", 5L); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); // Will not meet threshold. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenNonAggregatedFrequencyRows("URI", cf, "maze", "20200216", "20200328", 1L); // Will not meet threshold. + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_dateGaps(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 1L); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 1L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200111", "20200112", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 1L); + // Index hole for EVENT_DATE-wiki on 20200122. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 1L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200121", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 1L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 1L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 1L, "20200222", "20200302", 1L, "20200304", + "20200315", 1L, "20200317", "20200328", 1L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_threshold(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "wiki", "20200123", "20200125", 5L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on both dates and the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMixedDateGapsAndThresholdIndexHoles(String cf) { + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200101", "20200102", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); + // Index holes for NAME-csv on 20200110 and 20200113. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "csv", + createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying a minimum percentage threshold other than the default of 1.0. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMinimumThresholdPercentageBelow100(String cf) { + givenMinimumThreshold(0.75); // Index count must meet 75% of frequency count to not be considered field index hole. + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 100L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 75L, "20200104", "20200104", 100L, "20200105", "20200105", 74L)); + // Index holes for NAME-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 100L)); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 74L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 75L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 100L); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 100L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 98L, "20200122", "20200122", 74L, "20200123", "20200125", 75L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
+ givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 100L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 100L, "20200221", "20200221", 74L, "20200222", + "20200302", 90L, "20200304", "20200315", 75L, "20200316", "20200316", 74L, "20200317", "20200328", 99L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying one field to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testOneFieldSpecified(String cf) { + // Retrieve field index holes for field NAME. + givenFields("NAME"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", + createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200104", "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for NAME-csv on 20200110 and 20200113. 
+ givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200110", "20200110", 1L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying multiple fields to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldsSpecified(String cf) { + // Retrieve field index holes for fields URI and EVENT_DATE. + givenFields("URI", "EVENT_DATE"); + + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L)); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-wiki on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "wiki", createRangedDateFrequencyMap("20200120", "20200121", 5L)); + givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200122", "20200122", 1L); + givenNonAggregatedFrequencyRows("ZETA", cf, "wiki", "20200123", "20200125", 5L); + + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying datatypes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testDatatypesSpecified(String cf) { + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); + + // Index holes for NAME-maze on 20200103 and 20200105 (excluded by the datatype filter). + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. 
+ givenNonAggregatedFrequencyRows("ALPHA", COLF_F, "csv", "20200110", "20200115", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200110", "20200110", 1L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200111", "20200112", 5L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200113", "20200113", 1L); + givenNonAggregatedFrequencyRows("ALPHA", cf, "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for EVENT_DATE-maze on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", cf, "maze", "20200123", "20200125", 5L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-csv on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for ZETA-imdb on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "imdb", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "imdb", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ALPHA", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying fields and datatypes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldsAndDatatypesSpecified(String cf) { + // Retrieve field index holes for fields NAME and ZETA. + givenFields("NAME", "ZETA"); + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + // Index holes for NAME-wiki on 20200103 and 20200105. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "wiki", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L)); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200104", "20200104", 5L); + givenNonAggregatedFrequencyRows("NAME", cf, "wiki", "20200105", "20200105", 1L); + // Index holes for NAME-maze on 20200103 and 20200105 (excluded by the datatype filter). 
+ givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createRangedDateFrequencyMap("20200101", "20200105", 5L)); + givenAggregatedFrequencyRow("NAME", cf, "maze", createRangedDateFrequencyMap("20200101", "20200102", 5L, "20200103", "20200103", 1L, "20200104", + "20200104", 5L, "20200105", "20200105", 1L)); + // Index holes for ALPHA-csv on 20200110 and 20200113. + givenAggregatedFrequencyRow("ALPHA", COLF_F, "csv", createRangedDateFrequencyMap("20200110", "20200115", 5L)); + givenAggregatedFrequencyRow("ALPHA", cf, "csv", createRangedDateFrequencyMap("20200110", "20200110", 1L, "20200111", "20200112", 5L, "20200113", + "20200113", 1L, "20200114", "20200115", 5L)); + // Index hole for EVENT_DATE-wiki on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "wiki", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for EVENT_DATE-maze on 20200122. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("EVENT_DATE", cf, "maze", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + givenAggregatedFrequencyRow("URI", COLF_F, "maze", createRangedDateFrequencyMap("20200216", "20200328", 5L)); + givenAggregatedFrequencyRow("URI", cf, "maze", createRangedDateFrequencyMap("20200216", "20200220", 5L, "20200221", "20200221", 1L, "20200222", + "20200302", 5L, "20200303", "20200303", 1L, "20200304", "20200315", 5L, "20200316", "20200316", 1L, "20200317", "20200328", 5L)); + // Index hole for ZETA-csv on 20200122. 
+ givenAggregatedFrequencyRow("ZETA", COLF_F, "csv", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "csv", + createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L, "20200123", "20200125", 5L)); + // Index hole for ZETA-imdb on 20200122. + givenAggregatedFrequencyRow("ZETA", COLF_F, "imdb", createRangedDateFrequencyMap("20200120", "20200125", 5L)); + givenAggregatedFrequencyRow("ZETA", cf, "imdb", createRangedDateFrequencyMap("20200120", "20200121", 5L, "20200122", "20200122", 1L)); + givenNonAggregatedFrequencyRows("ZETA", cf, "imdb", "20200123", "20200125", 5L); + + writeMutations(); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); } } - } diff --git a/src/test/java/datawave/query/util/MetadataHelperTest.java b/src/test/java/datawave/query/util/MetadataHelperTest.java index 5486880..ef5dc45 100644 --- a/src/test/java/datawave/query/util/MetadataHelperTest.java +++ b/src/test/java/datawave/query/util/MetadataHelperTest.java @@ -1,26 +1,36 @@ package datawave.query.util; +import static datawave.data.ColumnFamilyConstants.COLF_F; +import static datawave.query.util.TestUtils.createDateFrequencyMap; +import static org.apache.accumulo.core.iterators.LongCombiner.VAR_LEN_ENCODER; + import java.io.File; +import java.io.IOException; import java.net.URISyntaxException; +import java.util.ArrayList; import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Set; import org.apache.accumulo.core.client.AccumuloClient; import 
org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; -import org.apache.accumulo.core.client.BatchWriter; -import org.apache.accumulo.core.client.BatchWriterConfig; -import org.apache.accumulo.core.client.MutationsRejectedException; import org.apache.accumulo.core.client.TableExistsException; import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.accumulo.core.data.Mutation; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.security.Authorizations; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; import com.google.common.collect.Maps; @@ -28,15 +38,24 @@ import datawave.accumulo.inmemory.InMemoryAccumuloClient; import datawave.accumulo.inmemory.InMemoryInstance; +import datawave.iterators.FrequencyMetadataAggregator; import datawave.query.composite.CompositeMetadataHelper; +import datawave.query.model.DateFrequencyMap; +import datawave.util.time.DateHelper; +import datawave.webservice.common.connection.WrappedAccumuloClient; public class MetadataHelperTest { - private static final String TABLE_METADATA = "testMetadataTable"; + private static final String TABLE_METADATA = "metadata"; private static final String[] AUTHS = {"FOO"}; + private static final Set AUTHORIZATIONS = Collections.singleton(new Authorizations(AUTHS)); + private static final String NULL_BYTE = "\0"; + private AccumuloClient accumuloClient; private MetadataHelper helper; + private final List mutations = new ArrayList<>(); + @BeforeAll static void beforeAll() throws URISyntaxException { File dir = new File(Objects.requireNonNull(ClassLoader.getSystemClassLoader().getResource(".")).toURI()); @@ -45,72 +64,332 @@ static void 
beforeAll() throws URISyntaxException { } @BeforeEach - public void setup() throws TableNotFoundException, AccumuloException, TableExistsException, AccumuloSecurityException { + public void setup() throws AccumuloException, TableExistsException, AccumuloSecurityException { accumuloClient = new InMemoryAccumuloClient("root", new InMemoryInstance(MetadataHelperTest.class.toString())); if (!accumuloClient.tableOperations().exists(TABLE_METADATA)) { accumuloClient.tableOperations().create(TABLE_METADATA); } - helper = new MetadataHelper(createAllFieldMetadataHelper(), Collections.emptySet(), accumuloClient, TABLE_METADATA, Collections.emptySet(), + + helper = new MetadataHelper(createAllFieldMetadataHelper(), Collections.emptySet(), accumuloClient, TABLE_METADATA, AUTHORIZATIONS, Collections.emptySet()); } private AllFieldMetadataHelper createAllFieldMetadataHelper() { final Set allMetadataAuths = Collections.emptySet(); - final Set auths = Collections.singleton(new Authorizations(AUTHS)); - TypeMetadataHelper tmh = new TypeMetadataHelper(Maps.newHashMap(), allMetadataAuths, accumuloClient, TABLE_METADATA, auths, false); - CompositeMetadataHelper cmh = new CompositeMetadataHelper(accumuloClient, TABLE_METADATA, auths); - return new AllFieldMetadataHelper(tmh, cmh, accumuloClient, TABLE_METADATA, auths, allMetadataAuths); + TypeMetadataHelper tmh = new TypeMetadataHelper(Maps.newHashMap(), allMetadataAuths, accumuloClient, TABLE_METADATA, AUTHORIZATIONS, false); + CompositeMetadataHelper cmh = new CompositeMetadataHelper(accumuloClient, TABLE_METADATA, AUTHORIZATIONS); + return new AllFieldMetadataHelper(tmh, cmh, accumuloClient, TABLE_METADATA, AUTHORIZATIONS, allMetadataAuths); } @AfterEach void tearDown() throws AccumuloException, TableNotFoundException, AccumuloSecurityException { accumuloClient.tableOperations().delete(TABLE_METADATA); + this.mutations.clear(); } - @Test - public void testSingleFieldFilter() throws TableNotFoundException { - writeMutation("rowA", 
"t", "dataTypeA", new Value("value")); - - Assertions.assertEquals(Collections.singleton("rowA"), helper.getAllFields(Collections.singleton("dataTypeA"))); - Assertions.assertEquals(Collections.singleton("rowA"), helper.getAllFields(null)); - Assertions.assertEquals(Collections.singleton("rowA"), helper.getAllFields(Collections.emptySet())); + /** + * Write the given mutations to the metadata table. + */ + private void writeMutations() { + TestUtils.writeMutations(accumuloClient, TABLE_METADATA, mutations); + } + + private void givenMutation(Mutation mutation) { + this.mutations.add(mutation); + } + + private void givenMutation(String row, String columnFamily, String columnQualifier, Value value) { + Mutation mutation = new Mutation(row); + mutation.put(columnFamily, columnQualifier, value); + givenMutation(mutation); + } + + private void givenNonAggregatedFrequencyRows(String row, Text colf, String datatype, String startDate, String endDate, long count) { + Mutation mutation = new Mutation(row); + Value value = new Value(VAR_LEN_ENCODER.encode(count)); + List dates = TestUtils.getDatesInRange(startDate, endDate); + dates.forEach((date) -> mutation.put(colf, new Text(datatype + NULL_BYTE + date), value)); + givenMutation(mutation); + } + + private void givenAggregatedFrequencyRow(String row, Text colf, String datatype, DateFrequencyMap map) { + Mutation mutation = new Mutation(row); + Value value = new Value(WritableUtils.toByteArray(map)); + mutation.put(colf, new Text(datatype + NULL_BYTE + FrequencyMetadataAggregator.AGGREGATED), value); + givenMutation(mutation); } - @Test - public void testMultipleFieldFilter() throws TableNotFoundException { - writeMutation("rowA", "t", "dataTypeA", new Value("value")); - writeMutation("rowB", "t", "dataTypeB", new Value("value")); + /** + * Tests for {@link MetadataHelper#getAllFields(Set)}. 
+ */ + @Nested + public class GetAllFieldsTest { + @Test + public void testSingleFieldFilter() throws TableNotFoundException { + givenMutation("rowA", "t", "dataTypeA", new Value("value")); + + writeMutations(); + + Assertions.assertEquals(Collections.singleton("rowA"), helper.getAllFields(Collections.singleton("dataTypeA"))); + Assertions.assertEquals(Collections.singleton("rowA"), helper.getAllFields(null)); + Assertions.assertEquals(Collections.singleton("rowA"), helper.getAllFields(Collections.emptySet())); + } + + @Test + public void testMultipleFieldFilter() throws TableNotFoundException { + givenMutation("rowA", "t", "dataTypeA", new Value("value")); + givenMutation("rowB", "t", "dataTypeB", new Value("value")); + + writeMutations(); + + Assertions.assertEquals(Collections.singleton("rowB"), helper.getAllFields(Collections.singleton("dataTypeB"))); + Assertions.assertEquals(Sets.newHashSet("rowA", "rowB"), helper.getAllFields(null)); + Assertions.assertEquals(Sets.newHashSet("rowA", "rowB"), helper.getAllFields(Collections.emptySet())); + } - Assertions.assertEquals(Collections.singleton("rowB"), helper.getAllFields(Collections.singleton("dataTypeB"))); - Assertions.assertEquals(Sets.newHashSet("rowA", "rowB"), helper.getAllFields(null)); - Assertions.assertEquals(Sets.newHashSet("rowA", "rowB"), helper.getAllFields(Collections.emptySet())); + @Test + public void testMultipleFieldFilter2() throws TableNotFoundException { + givenMutation("rowA", "t", "dataTypeA", new Value("value")); + givenMutation("rowB", "t", "dataTypeB", new Value("value")); + givenMutation("rowC", "t", "dataTypeC", new Value("value")); + + writeMutations(); + + Assertions.assertEquals(Collections.singleton("rowB"), helper.getAllFields(Collections.singleton("dataTypeB"))); + Assertions.assertEquals(Sets.newHashSet("rowA", "rowB", "rowC"), helper.getAllFields(null)); + Assertions.assertEquals(Sets.newHashSet("rowA", "rowB", "rowC"), helper.getAllFields(Collections.emptySet())); + } } - 
@Test - public void testMultipleFieldFilter2() throws TableNotFoundException { - writeMutation("rowA", "t", "dataTypeA", new Value("value")); - writeMutation("rowB", "t", "dataTypeB", new Value("value")); - writeMutation("rowC", "t", "dataTypeC", new Value("value")); + /** + * Tests for {@link MetadataHelper#getCardinalityForField(String, Date, Date)} and + * {@link MetadataHelper#getCardinalityForField(String, String, Date, Date)}. + */ + @Nested + public class GetCardinalityForFieldTests { + + /** + * Test against a table that has only non-aggregated entries as matches. + */ + @Test + void testNonAggregatedEntriesOnly() throws TableNotFoundException { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200111", "20200120", 1L); // 5 entries within date range. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 1L); // 12 entries within date range. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200110", 1L); // 7 entries within date range. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200101", "20200102", 1L); // No entries within date range. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 1L); // Field does not match. + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 1L); // Field does not match. + + writeMutations(); + + Assertions.assertEquals(24L, helper.getCardinalityForField("NAME", DateHelper.parse("20200104"), DateHelper.parse("20200115"))); + Assertions.assertEquals(12L, helper.getCardinalityForField("NAME", "wiki", DateHelper.parse("20200104"), DateHelper.parse("20200115"))); + } + + /** + * Test against a table that has only aggregated entries as matches. 
+ */ + @Test + void testAggregatedEntriesOnly() throws TableNotFoundException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L, "20200104", 3L, + "20200105", 3L, "20200106", 3L, "20200107", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L, "20200107", 3L, + "20200108", 3L, "20200111", 3L, "20200113", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L, "20200111", 3L, + "20200114", 3L, "20200115", 3L, "20200116", 3L, "20200120", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L, "20200120", 3L)); // Does + // not + // contain + // target + // date. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); // Field does not + // match. + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); // Field does not + // match. + writeMutations(); + + Assertions.assertEquals(33L, helper.getCardinalityForField("NAME", DateHelper.parse("20200104"), DateHelper.parse("20200115"))); + Assertions.assertEquals(12L, helper.getCardinalityForField("NAME", "wiki", DateHelper.parse("20200104"), DateHelper.parse("20200115"))); + } - Assertions.assertEquals(Collections.singleton("rowB"), helper.getAllFields(Collections.singleton("dataTypeB"))); - Assertions.assertEquals(Sets.newHashSet("rowA", "rowB", "rowC"), helper.getAllFields(null)); - Assertions.assertEquals(Sets.newHashSet("rowA", "rowB", "rowC"), helper.getAllFields(Collections.emptySet())); + /** + * Test against a table that has both aggregated and non-aggregated entries as matches. 
+ */ + @Test + void testMixedEntryFormats() throws TableNotFoundException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L, "20200104", 3L, + "20200105", 3L, "20200106", 3L, "20200107", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L, "20200107", 3L, + "20200108", 3L, "20200111", 3L, "20200113", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200113", "20200120", 3L); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L, "20200120", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200103", "20200105", 3L); + // Following does not match field. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Assertions.assertEquals(51L, helper.getCardinalityForField("NAME", DateHelper.parse("20200104"), DateHelper.parse("20200115"))); + Assertions.assertEquals(21L, helper.getCardinalityForField("NAME", "wiki", DateHelper.parse("20200104"), DateHelper.parse("20200115"))); + } } - private void writeMutation(String row, String columnFamily, String columnQualifier, Value value) throws TableNotFoundException { - Mutation mutation = new Mutation(row); - mutation.put(columnFamily, columnQualifier, value); - writeMutation(mutation); + /** + * Tests for {@link MetadataHelper#getCountsByFieldInDayWithTypes(String, String, AccumuloClient, WrappedAccumuloClient)} (Map.Entry)}. + */ + @Nested + public class CountsByFieldInDayWithTypesTests { + + /** + * Test against a table that has only non-aggregated entries as matches. + */ + @Test + void testNonAggregatedEntriesOnly() throws TableNotFoundException, IOException { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 2L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200101", "20200120", 3L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200101", "20200102", 3L); // Does not contain target date. 
+ givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 1L); + expected.put("wiki", 2L); + expected.put("maze", 3L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes("NAME", "20200110", accumuloClient, null); + + Assertions.assertEquals(expected, actual); + } + + /** + * Test against a table that has only aggregated entries as matches. + */ + @Test + void testAggregatedEntriesOnly() throws TableNotFoundException, IOException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); // Does not contain target date. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 5L); + expected.put("wiki", 15L); + expected.put("maze", 55L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes("NAME", "20200102", accumuloClient, null); + + Assertions.assertEquals(expected, actual); + } + + /** + * Test against a table that has both aggregated and non-aggregated entries as matches. + */ + @Test + void testMixedEntryFormats() throws TableNotFoundException, IOException { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200101", 1L, "20200102", 5L, "20200103", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200101", "20200120", 1L); // Should get summed into previous. + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200101", 1L, "20200102", 15L, "20200103", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200101", 1L, "20200102", 55L, "20200103", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200115", "20200120", 3L); // Does not have entry for 20200102, should not get + // incremented. + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); // Does not contain target date. + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200103", "20200105", 3L); // Does not contain target date. 
+ givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Map expected = new HashMap<>(); + expected.put("csv", 6L); + expected.put("wiki", 15L); + expected.put("maze", 55L); + + HashMap actual = helper.getCountsByFieldInDayWithTypes("NAME", "20200102", accumuloClient, null); + + Assertions.assertEquals(expected, actual); + } } - private void writeMutation(Mutation m) throws TableNotFoundException { - BatchWriterConfig config = new BatchWriterConfig(); - config.setMaxMemory(0); - try (BatchWriter writer = accumuloClient.createBatchWriter(TABLE_METADATA, config)) { - writer.addMutation(m); - writer.flush(); - } catch (MutationsRejectedException e) { - throw new RuntimeException(e); + /** + * Tests for {@link MetadataHelper#getEarliestOccurrenceOfFieldWithType(String, String, AccumuloClient, WrappedAccumuloClient)}. + */ + @Nested + public class GetEarliestOccurrenceOfFieldWithTypeTests { + + /** + * Test against a table that has only non-aggregated entries as matches. 
+ */ + @Test + void testNonAggregatedEntriesOnly() { + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200103", "20200120", 1L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 2L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200105", "20200120", 3L); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200107", "20200102", 3L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Assertions.assertEquals(DateHelper.parse("20200101"), helper.getEarliestOccurrenceOfFieldWithType("NAME", null, accumuloClient, null)); + Assertions.assertEquals(DateHelper.parse("20200105"), helper.getEarliestOccurrenceOfFieldWithType("NAME", "maze", accumuloClient, null)); + } + + /** + * Test against a table that has only aggregated entries as matches. 
+ */ + @Test + void testAggregatedEntriesOnly() { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200113", 1L, "20200115", 5L, "20200116", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200111", 1L, "20200112", 15L, "20200113", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200102", 1L, "20200104", 55L, "20200105", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + writeMutations(); + + Assertions.assertEquals(DateHelper.parse("20200101"), helper.getEarliestOccurrenceOfFieldWithType("NAME", null, accumuloClient, null)); + Assertions.assertEquals(DateHelper.parse("20200102"), helper.getEarliestOccurrenceOfFieldWithType("NAME", "maze", accumuloClient, null)); + } + + /** + * Test against a table that has both aggregated and non-aggregated entries as matches. 
+ */ + @Test + void testMixedEntryFormats() { + givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200111", 1L, "20200112", 5L, "20200113", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200111", "20200120", 1L); + givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200111", 1L, "20200112", 15L, "20200113", 3L)); + givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200111", 1L, "20200112", 55L, "20200113", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200103", "20200120", 3L); + givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200111", 1L, "20200113", 3L)); + givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200101", "20200115", 3L); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L)); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L); + givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L); + writeMutations(); + + Assertions.assertEquals(DateHelper.parse("20200101"), helper.getEarliestOccurrenceOfFieldWithType("NAME", null, accumuloClient, null)); + Assertions.assertEquals(DateHelper.parse("20200103"), helper.getEarliestOccurrenceOfFieldWithType("NAME", "maze", accumuloClient, null)); } } } diff --git a/src/test/java/datawave/query/util/TestUtils.java b/src/test/java/datawave/query/util/TestUtils.java new file mode 100644 index 0000000..40cb31a --- /dev/null +++ 
b/src/test/java/datawave/query/util/TestUtils.java @@ -0,0 +1,132 @@ +package datawave.query.util; + +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Collection; +import java.util.Date; +import java.util.List; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.BatchWriter; +import org.apache.accumulo.core.client.BatchWriterConfig; +import org.apache.accumulo.core.client.MutationsRejectedException; +import org.apache.accumulo.core.client.TableNotFoundException; +import org.apache.accumulo.core.data.Mutation; + +import datawave.query.model.DateFrequencyMap; +import datawave.util.time.DateHelper; + +public class TestUtils { + + private static final String NULL_BYTE = "\0"; + + /** + * Write the given mutations to the specified table via the accumulo client. + */ + public static void writeMutations(AccumuloClient client, String tableName, Collection mutations) { + BatchWriterConfig config = new BatchWriterConfig(); + config.setMaxMemory(0); + try (BatchWriter writer = client.createBatchWriter(tableName, config)) { + writer.addMutations(mutations); + writer.flush(); + } catch (MutationsRejectedException | TableNotFoundException e) { + throw new RuntimeException(e); + } + } + + /** + * Return the set of dates contained within the start and end date + * + * @param startDateStr + * @param endDateStr + * @return + */ + public static List getDatesInRange(String startDateStr, String endDateStr) { + Date startDate = DateHelper.parse(startDateStr); + Date endDate = DateHelper.parse(endDateStr); + + List dates = new ArrayList<>(); + dates.add(startDateStr); + + Calendar calendar = Calendar.getInstance(); + calendar.setTime(startDate); + while (true) { + calendar.add(Calendar.DAY_OF_MONTH, 1); + Date date = calendar.getTime(); + if (date.before(endDate) || date.equals(endDate)) { + dates.add(DateHelper.format(date)); + } else { + break; + } + } + + return dates; + } + + /** + * Create and a return a {@link 
DateFrequencyMap} map with the specified dates and counts. The args are expected to be alternating String dates and long + * counts. For example: + * + *
+     * {
+     *     @code
+     *     DateFrequencyMap map = createDateFrequencyMap("20200101", 12L, "20200102", 34L, "20200103", 55L);
+     * }
+     * 
+ * + * will result in a map with a count of 12 for the date 01-01-2020, 34 for the date 01-02-2020, and 55 for the date 01-03-2020. + * + * @param entries + * the entries + * @return the date frequency map + */ + public static DateFrequencyMap createDateFrequencyMap(Object... entries) { + DateFrequencyMap map = new DateFrequencyMap(); + int lastEntry = entries.length - 1; + for (int i = 0; i < lastEntry; i++) { + String date = (String) entries[i]; + i++; + long count = (Long) entries[i]; + map.put(date, count); + } + return map; + } + + /** + * Create and a return a {@link DateFrequencyMap} map with counts for the specified date ranges. The args are expected to be alternating String date ranges + * and long counts. For example: + * + *
+     *  {@code
+     * DateFrequencyMap map = createDateFrequencyMap("20200101", "20200105", 12L, "20200106", "20200110", 34L, "20200111", "20200115" 55L);
+     * }
+     * 
+ * + * will result in a map with a count of 12 for the dates 01-01-2020 to 01-05-2020, 34 for the dates 01-06-2020 to 01-10-2020, and 55 for the dates + * 01-11-2020 to 01-15-2020. + * + * @param entries + * the entries + * @return the date frequency map + */ + public static DateFrequencyMap createRangedDateFrequencyMap(Object... entries) { + DateFrequencyMap map = new DateFrequencyMap(); + int lastEntry = entries.length - 1; + for (int i = 0; i < lastEntry; i++) { + String startDate = (String) entries[i]; + i++; + String endDate = (String) entries[i]; + i++; + long count = (Long) entries[i]; + List dates = getDatesInRange(startDate, endDate); + for (String date : dates) { + map.put(date, count); + } + } + return map; + } + + private TestUtils() { + throw new UnsupportedOperationException(); + } +} diff --git a/src/test/resources/MarkingFunctionsContext.xml b/src/test/resources/MarkingFunctionsContext.xml new file mode 100644 index 0000000..6496e90 --- /dev/null +++ b/src/test/resources/MarkingFunctionsContext.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + +