Skip to content

Commit

Permalink
re #2444: Added the parsing of older index date formats to be treated…
Browse files Browse the repository at this point in the history
… as index boundary markers
  • Loading branch information
ivakegg committed Jun 27, 2024
1 parent d14577b commit 9796df5
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 23 deletions.
77 changes: 54 additions & 23 deletions src/main/java/datawave/query/util/AllFieldMetadataHelper.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.format.DateTimeParseException;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
Expand Down Expand Up @@ -1443,31 +1445,50 @@ Map<String,Map<String,FieldIndexHole>> findHoles() throws IOException {
String cq = key.getColumnQualifier().toString();
int offset = cq.indexOf(NULL_BYTE);
if (offset < 0) {
// we can assume this is an entry of the older format (perhaps the value aggregation is not being applied)
log.error("Found an index entry missing the date: " + key);
continue;
}
currDatatype = cq.substring(0, offset);

// Check if the current field and datatype are part of the fields and datatypes we want to retrieve field index holes for.
if (!isPartOfTarget(currFieldName, currDatatype)) {
continue;
}

String cqRemainder = cq.substring((offset + 1));
// check for a marker of <dt>\0<date>\0true/false vs just <dt>\0<date>
// where the boolean denotes whether we can assume the field is indexed (true) or not indexed (false) on and before this date
offset = cqRemainder.indexOf(NULL_BYTE);
if (offset >= 0) {
currBoundaryValue = Boolean.valueOf(cqRemainder.substring(offset + 1));
currDate = DateHelper.parse(cqRemainder.substring(0, offset));
currDatatype = cq;

// Check if the current field and datatype are part of the fields and datatypes we want to retrieve field index holes for.
if (!isPartOfTarget(currFieldName, currDatatype)) {
continue;
}

// we can treat this like an index marker but the ts of the entry denotes the boundary
currDate = getBaseDate(key.getTimestamp());
log.warn("Found an index entry missing the date, treating as an index marker at " + currDate + " : " + key);
currBoundaryValue = true;
currCount = 0;
} else {
currBoundaryValue = null;
currDate = DateHelper.parse(cqRemainder);
ByteArrayInputStream byteStream = new ByteArrayInputStream(entry.getValue().get());
DataInputStream inputStream = new DataInputStream(byteStream);
currCount = WritableUtils.readVLong(inputStream);
currDatatype = cq.substring(0, offset);

// Check if the current field and datatype are part of the fields and datatypes we want to retrieve field index holes for.
if (!isPartOfTarget(currFieldName, currDatatype)) {
continue;
}

String cqRemainder = cq.substring((offset + 1));
// check for a marker of <dt>\0<date>\0true/false vs just <dt>\0<date>
// where the boolean denotes whether we can assume the field is indexed (true) or not indexed (false) on and before this date
offset = cqRemainder.indexOf(NULL_BYTE);
if (offset >= 0) {
currBoundaryValue = Boolean.valueOf(cqRemainder.substring(offset + 1));
currDate = DateHelper.parse(cqRemainder.substring(0, offset));
currCount = 0;
} else {
currBoundaryValue = null;
try {
currDate = DateHelper.parse(cqRemainder);
ByteArrayInputStream byteStream = new ByteArrayInputStream(entry.getValue().get());
DataInputStream inputStream = new DataInputStream(byteStream);
currCount = WritableUtils.readVLong(inputStream);
} catch (DateTimeParseException e) {
// probably the really old type classname format instead of a date.
// we can treat this like an index marker but the ts of the entry denotes the boundary
currDate = getBaseDate(key.getTimestamp());
log.warn("Found an index entry missing the date, treating as an index marker at " + currDate + " : " + key);
currBoundaryValue = true;
currCount = 0;
}
}
}

// If this is the very first entry we've looked at, update our tracking variables
Expand Down Expand Up @@ -1533,6 +1554,16 @@ Map<String,Map<String,FieldIndexHole>> findHoles() throws IOException {
return getImmutableFieldIndexHoles();
}

/**
 * Truncate a timestamp to the beginning of the day that contains it.
 * <p>
 * The day boundary is resolved with {@link Calendar#getInstance()}, i.e. using the JVM's default time zone and locale.
 * NOTE(review): presumably this needs to agree with the time zone DateHelper.parse uses for yyyyMMdd dates -- confirm.
 *
 * @param ts
 *            a timestamp in milliseconds since the epoch
 * @return the date with the hour, minute, second and millisecond fields all set to zero
 */
private Date getBaseDate(long ts) {
    Calendar startOfDay = Calendar.getInstance();
    startOfDay.setTimeInMillis(ts);
    // clear every time-of-day field so only the year/month/day portion remains
    for (int timeField : new int[] {Calendar.HOUR_OF_DAY, Calendar.MINUTE, Calendar.SECOND, Calendar.MILLISECOND}) {
        startOfDay.set(timeField, 0);
    }
    return startOfDay.getTime();
}

/**
* Return whether the given field and datatype represent a pairing that should be evaluated for field index holes.
*/
Expand Down
73 changes: 73 additions & 0 deletions src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@

import datawave.accumulo.inmemory.InMemoryAccumuloClient;
import datawave.accumulo.inmemory.InMemoryInstance;
import datawave.data.type.LcNoDiacriticsType;
import datawave.query.composite.CompositeMetadataHelper;
import datawave.query.model.FieldIndexHole;
import datawave.util.time.DateHelper;
Expand Down Expand Up @@ -289,6 +290,7 @@ void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) {

/**
* Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps.
* This uses a negative index marker.
*/
@ParameterizedTest
@ValueSource(strings = {"i", "ri"})
Expand All @@ -312,6 +314,7 @@ void testFieldIndexHoleWithNotIndexedMarker(String cf) {
/**
* Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the
* threshold.
* This uses a positive index marker.
*/
@ParameterizedTest
@ValueSource(strings = {"i", "ri"})
Expand All @@ -331,6 +334,52 @@ void testFieldIndexHoleWithIndexedMarker(String cf) {
Assertions.assertEquals(expected, fieldIndexHoles);
}

/**
 * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the
 * threshold.
 * This uses a positive index marker derived from an older date-less format: the marker entry is written with only the datatype in the column
 * qualifier and the marker date (20200103) carried in the entry timestamp.
 * <p>
 * NAME/wiki has frequency counts for 20200101-20200110 and index counts for 20200107-20200110, so with the marker at 20200103 the expected hole is
 * 20200104-20200106. NAME/csv has matching frequency and index counts and no hole is expected for it.
 *
 * @param cf
 *            the index column family under test ("i" or "ri")
 */
@ParameterizedTest
@ValueSource(strings = {"i", "ri"})
void testFieldIndexHoleWithIndexedMarkerSansDate(String cf) {
    FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator();
    mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 1L);
    // date-less marker: only the entry timestamp identifies the boundary date
    mutationCreator.addIndexMarkerMutation(cf, "NAME", "wiki", "20200103");
    mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L);
    mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L);
    mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L);
    writeMutations(mutationCreator.getMutations());

    Map<String,Map<String,FieldIndexHole>> fieldIndexHoles = getIndexHoleFunction(cf).get();
    // the formatter is switched off only around the long expected-value line, then switched back on
    // @formatter:off
    Map<String,Map<String,FieldIndexHole>> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106")));
    // @formatter:on
    Assertions.assertEquals(expected, fieldIndexHoles);
}

/**
 * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the
 * threshold.
 * This uses a positive index marker derived from an older date-less format with type class: the marker entry's column qualifier holds the datatype
 * followed by a type class name instead of a date, and the marker date (20200103) is carried in the entry timestamp.
 * <p>
 * NAME/wiki has frequency counts for 20200101-20200110 and index counts for 20200107-20200110, so with the marker at 20200103 the expected hole is
 * 20200104-20200106. NAME/csv has matching frequency and index counts and no hole is expected for it.
 *
 * @param cf
 *            the index column family under test ("i" or "ri")
 */
@ParameterizedTest
@ValueSource(strings = {"i", "ri"})
void testFieldIndexHoleWithIndexedMarkerOldTypeFormat(String cf) {
    FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator();
    mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 1L);
    // old-format marker: a type class name sits where the date would normally be
    mutationCreator.addIndexMarkerMutation(cf, "NAME", "wiki", "20200103", LcNoDiacriticsType.class);
    mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L);
    mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L);
    mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L);
    writeMutations(mutationCreator.getMutations());

    Map<String,Map<String,FieldIndexHole>> fieldIndexHoles = getIndexHoleFunction(cf).get();
    // the formatter is switched off only around the long expected-value line, then switched back on
    // @formatter:off
    Map<String,Map<String,FieldIndexHole>> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106")));
    // @formatter:on
    Assertions.assertEquals(expected, fieldIndexHoles);
}

@ParameterizedTest
@ValueSource(strings = {"i", "ri"})
void testFieldIndexHoleWithIndexedMarkerAndMissingFrequency(String cf) {
Expand Down Expand Up @@ -1066,6 +1115,14 @@ private void addIndexMarkerMutation(String cf, String fieldName, String datatype
addMutation(fieldName, cf, datatype, endDate, indexed);
}

/**
 * Add an index marker entry in the date-less form: the entry is keyed by field name, column family and datatype only, and the given date is used
 * for the entry timestamp rather than being written into the key.
 *
 * @param cf
 *            the column family to write the marker under
 * @param fieldName
 *            the field name, used as the row
 * @param datatype
 *            the datatype, used as the whole column qualifier
 * @param endDate
 *            the marker date (yyyyMMdd)
 */
private void addIndexMarkerMutation(String cf, String fieldName, String datatype, String endDate) {
    // note the argument order switch: the row (field name) comes first for addMutation
    this.addMutation(fieldName, cf, datatype, endDate);
}

/**
 * Add an index marker entry in the old type-class form: the type's class name takes the place of the date in the entry key, and the given date is
 * used for the entry timestamp.
 *
 * @param cf
 *            the column family to write the marker under
 * @param fieldName
 *            the field name, used as the row
 * @param datatype
 *            the datatype, used as the first part of the column qualifier
 * @param endDate
 *            the marker date (yyyyMMdd)
 * @param typeClass
 *            the class whose name is appended to the column qualifier
 */
private void addIndexMarkerMutation(String cf, String fieldName, String datatype, String endDate, Class<?> typeClass) {
    // note the argument order switch: the row (field name) comes first for addMutation
    addMutation(fieldName, cf, datatype, endDate, typeClass);
}

private List<String> getDatesInRange(String startDateStr, String endDateStr) {
Date startDate = DateHelper.parse(startDateStr);
Date endDate = DateHelper.parse(endDateStr);
Expand Down Expand Up @@ -1100,6 +1157,22 @@ private void addMutation(String row, String columnFamily, String datatype, Strin
mutations.add(mutation);
}

/**
 * Queue a mutation whose column qualifier is just the datatype (no date component) and whose value is empty.
 *
 * @param row
 *            the row for the mutation
 * @param columnFamily
 *            the column family
 * @param datatype
 *            the datatype, written as the entire column qualifier
 * @param date
 *            a date string that is converted to the entry timestamp via getTimestamp
 */
private void addMutation(String row, String columnFamily, String datatype, String date) {
    Mutation marker = new Mutation(row);
    // the date only survives as the entry timestamp
    long timestamp = getTimestamp(date);
    Value emptyValue = new Value();
    marker.put(columnFamily, datatype, timestamp, emptyValue);
    mutations.add(marker);
}

/**
 * Queue a mutation whose column qualifier is the datatype, NULL_BYTE, then the name of the given class, with an empty value.
 *
 * @param row
 *            the row for the mutation
 * @param columnFamily
 *            the column family
 * @param datatype
 *            the datatype, written as the first part of the column qualifier
 * @param date
 *            a date string that is converted to the entry timestamp via getTimestamp
 * @param type
 *            the class whose name is written after the datatype in the column qualifier
 */
private void addMutation(String row, String columnFamily, String datatype, String date, Class<?> type) {
    Mutation mutation = new Mutation(row);
    // the class name sits where a date would otherwise follow the datatype
    mutation.put(columnFamily, datatype + NULL_BYTE + type.getName(), getTimestamp(date), new Value());
    mutations.add(mutation);
}

/**
 * Convert a date string into an entry timestamp.
 *
 * @param date
 *            a date string accepted by DateHelper.parse
 * @return the parsed date as milliseconds since the epoch
 */
private long getTimestamp(String date) {
    Date parsed = DateHelper.parse(date);
    return parsed.getTime();
}

/**
 * Return the mutations accumulated so far.
 *
 * @return the underlying list of mutations (the list itself, not a copy)
 */
private List<Mutation> getMutations() {
    return this.mutations;
}
Expand Down

0 comments on commit 9796df5

Please sign in to comment.