Skip to content

Commit

Permalink
Detect regex patterns we may need to handle at eval_only time (#39)
Browse files Browse the repository at this point in the history
* detect wildcard patterns we may need to handle at eval_only time
* implement a way to check if a normalized regex has lost information, or is lossy
* Return false for lossy regex on Lc normalizers for now
* Extend the lossy regex method into the type

---------

Co-authored-by: hlgp <[email protected]>
Co-authored-by: Ivan Bella <[email protected]>
  • Loading branch information
3 people authored Oct 7, 2024
1 parent ca73444 commit ace1017
Show file tree
Hide file tree
Showing 13 changed files with 184 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ public String normalizeRegex(String fieldRegex) throws IllegalArgumentException
throw new IllegalArgumentException("Cannot normalize a regex against a geometry field");
}

/**
 * Regex lossiness cannot be evaluated for this normalizer: the call is always rejected.
 *
 * @param in
 *            the regex (never inspected)
 * @return never returns normally
 * @throws IllegalArgumentException
 *             always
 */
@Override
public boolean normalizedRegexIsLossy(String in) {
    final String reason = "Cannot normalize a regex against a geometry field";
    throw new IllegalArgumentException(reason);
}

/**
 * Normalizes a geometry delegate: the geometry is passed to {@code getSingleIndexFromGeometry} and the
 * result of that call is encoded with {@code getEncodedStringFromIndexBytes}.
 *
 * @param geometry
 *            the geometry to normalize
 * @return the encoded string produced for the geometry's single index
 */
public String normalizeDelegateType(T geometry) {
return getEncodedStringFromIndexBytes(getSingleIndexFromGeometry(geometry));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,9 @@ public abstract class AbstractNormalizer<T> implements Normalizer<T> {
public Collection<String> expand(String in) {
    // The default expansion is just the single normalized form of the input.
    final String normalized = normalize(in);
    return Collections.singletonList(normalized);
}

/**
 * Default implementation: reports that normalizing a regex never loses information.
 *
 * @param in
 *            the regex (not inspected by this default implementation)
 * @return always {@code false}
 */
@Override
public boolean normalizedRegexIsLossy(String in) {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ public String normalizeRegex(String fieldRegex) {
}
}

/**
 * Reports whether normalizing the given regex loses information. This implementation deliberately
 * always answers {@code false}; see the comment in the body for the rationale.
 *
 * @param regex
 *            the regex (not inspected)
 * @return always {@code false}
 */
@Override
public boolean normalizedRegexIsLossy(String regex) {
// Although this normalizer is in fact lossy, we still return false
// because users are accustomed to overmatching when a pattern includes
// diacritics or upper case letters. We may consider changing this
// down the road, but for now we return false.
return false;
}

@Override
public String normalizeDelegateType(String delegateIn) {
return normalize(delegateIn);
Expand Down
9 changes: 9 additions & 0 deletions src/main/java/datawave/data/normalizer/LcNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ public String normalizeRegex(String fieldRegex) {
}
}

/**
 * Reports whether normalizing the given regex loses information. This implementation deliberately
 * always answers {@code false}; see the comment in the body for the rationale.
 *
 * @param regex
 *            the regex (not inspected)
 * @return always {@code false}
 */
@Override
public boolean normalizedRegexIsLossy(String regex) {
// Although this normalizer is in fact lossy, we still return false
// because users are accustomed to overmatching when a pattern includes
// diacritics or upper case letters. We may consider changing this
// down the road, but for now we return false.
return false;
}

@Override
public String normalizeDelegateType(String delegateIn) {
return normalize(delegateIn);
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/datawave/data/normalizer/Normalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,7 @@ public interface Normalizer<T> extends Serializable {

String normalizeRegex(String in);

/**
 * Reports whether normalizing the given regex loses information, i.e. whether the normalized regex is lossy.
 * Implementations may decline to answer by throwing (presumably when regexes are not supported for the
 * field type; verify against the individual implementation).
 *
 * @param in
 *            the regex, prior to normalization
 * @return {@code true} if the normalized form of the regex is considered lossy, {@code false} otherwise
 */
boolean normalizedRegexIsLossy(String in);

Collection<String> expand(String in);
}
8 changes: 7 additions & 1 deletion src/main/java/datawave/data/normalizer/NumberNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public String normalize(String fv) {
}

/**
* We cannot support regex against numbers
* We can support regex against numbers.
*/
public String normalizeRegex(String fieldRegex) {
try {
Expand All @@ -40,6 +40,12 @@ public String normalizeRegex(String fieldRegex) {
}
}

/**
 * Reports whether normalizing the given numeric regex loses information. The regex is classified by
 * {@link NumericRegexEncoder#getZeroRegexStatus(String)}; it is considered lossy when that status is
 * {@link ZeroRegexStatus#LEADING} or {@link ZeroRegexStatus#TRAILING}.
 *
 * @param untrimmedRegex
 *            the regex as supplied, before any zero trimming
 * @return {@code true} if the regex has leading or trailing zero status, {@code false} otherwise
 */
@Override
public boolean normalizedRegexIsLossy(String untrimmedRegex) {
    ZeroRegexStatus status = NumericRegexEncoder.getZeroRegexStatus(untrimmedRegex);

    // Enum constants are singletons, so identity comparison is exact and cannot throw on a null status.
    return status == ZeroRegexStatus.LEADING || status == ZeroRegexStatus.TRAILING;
}

@Override
public String normalizeDelegateType(BigDecimal delegateIn) {
return normalize(delegateIn.toString());
Expand Down
5 changes: 5 additions & 0 deletions src/main/java/datawave/data/normalizer/ZeroRegexStatus.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package datawave.data.normalizer;

/**
 * Classification of a numeric regex with respect to zeros at its edges.
 */
public enum ZeroRegexStatus {
    /** The pattern may begin with zeros. */
    LEADING,
    /** The pattern ends with zeros. */
    TRAILING,
    /** Neither of the above applies. */
    NONE
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import com.google.common.base.CharMatcher;

import datawave.data.normalizer.ZeroRegexStatus;
import datawave.data.normalizer.regex.visitor.AlternationDeduper;
import datawave.data.normalizer.regex.visitor.AnchorTrimmer;
import datawave.data.normalizer.regex.visitor.DecimalPointPlacer;
Expand Down Expand Up @@ -143,6 +144,10 @@ private NumericRegexEncoder(String pattern) {
this.pattern = pattern;
}

/**
 * Parses the given regex with {@code RegexParser} and returns the zero status that
 * {@code ZeroTrimmer.getStatus} computes for the children of the parsed root.
 *
 * @param regex
 *            the regex to classify
 * @return the {@link ZeroRegexStatus} reported by {@code ZeroTrimmer} for the parsed pattern
 */
public static ZeroRegexStatus getZeroRegexStatus(String regex) {
return ZeroTrimmer.getStatus(RegexParser.parse(regex).getChildren());
}

private String encode() {
if (log.isDebugEnabled()) {
log.debug("Encoding pattern " + pattern);
Expand Down
35 changes: 35 additions & 0 deletions src/main/java/datawave/data/normalizer/regex/RegexUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,24 @@ public static boolean matchesChar(Node node, char character) {
}
}

/**
 * Return whether the given group contains, among its direct children, a single-character node that is the
 * given character. Children that are not {@link SingleCharNode} instances are ignored.
 *
 * @param node
 *            the node to inspect; must be a {@link GroupNode}
 * @param character
 *            the character to look for
 * @return true if any direct single-character child is the given character, or false otherwise
 * @throws ClassCastException
 *             if the node is not a {@link GroupNode}
 */
public static boolean groupNodeMatches(Node node, char character) {
    GroupNode group = (GroupNode) node;

    for (Node child : group.getChildren()) {
        // One matching single character settles the answer; any other child is simply skipped.
        if (child instanceof SingleCharNode && isChar(child, character)) {
            return true;
        }
    }
    return false;
}

/**
* Return whether the given node is a regex element that can only match against the given character.
*
Expand Down Expand Up @@ -374,6 +392,23 @@ public static boolean matchesZero(Node node) {
return matchesChar(node, RegexConstants.ZERO);
}

/**
 * Return whether the given node explicitly names the given character. Only single-character, character-class
 * and group nodes are examined, each by its dedicated check; every other node type yields false.
 *
 * @param node
 *            the node to inspect
 * @param character
 *            the character to look for
 * @return true if the type-specific check for the node accepts the character, or false otherwise
 */
public static boolean matchesCharExplicitly(Node node, char character) {
    final boolean explicit;
    switch (node.getType()) {
        case SINGLE_CHAR:
            explicit = isChar(node, character);
            break;
        case CHAR_CLASS:
            explicit = charClassMatches(node, character);
            break;
        case GROUP:
            explicit = groupNodeMatches(node, character);
            break;
        default:
            explicit = false;
            break;
    }
    return explicit;
}

/**
 * Return whether the given node explicitly names the zero character; shorthand for
 * {@link #matchesCharExplicitly(Node, char)} with {@code RegexConstants.ZERO}.
 *
 * @param node
 *            the node to inspect
 * @return true if the node explicitly matches the zero character, or false otherwise
 */
public static boolean matchesZeroExplicitly(Node node) {
return matchesCharExplicitly(node, RegexConstants.ZERO);
}

/**
* Return whether the given node is a regex element that can only match against the character '0'.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@

import org.apache.commons.lang3.tuple.Pair;

import datawave.data.normalizer.ZeroRegexStatus;
import datawave.data.normalizer.regex.AnyCharNode;
import datawave.data.normalizer.regex.EncodedPatternNode;
import datawave.data.normalizer.regex.EscapedSingleCharNode;
import datawave.data.normalizer.regex.ExpressionNode;
import datawave.data.normalizer.regex.GroupNode;
import datawave.data.normalizer.regex.IntegerNode;
import datawave.data.normalizer.regex.IntegerRangeNode;
Expand Down Expand Up @@ -42,6 +45,60 @@ public static Node trim(Node node) {
return (Node) node.accept(visitor, null);
}

/**
 * Classify the given regex nodes by where zeros may occur. Leading zeros are checked first and take
 * precedence; trailing zeros are only considered when no leading zeros are possible.
 *
 * @param encodedRegexNodes
 *            the nodes of the pattern, in pattern order
 * @return {@link ZeroRegexStatus#LEADING}, {@link ZeroRegexStatus#TRAILING} or {@link ZeroRegexStatus#NONE}
 */
public static ZeroRegexStatus getStatus(List<Node> encodedRegexNodes) {
    if (hasPossiblyLeadingZeroes(encodedRegexNodes)) {
        return ZeroRegexStatus.LEADING;
    }

    final boolean trailing = hasTrailingZeroes(encodedRegexNodes);
    return trailing ? ZeroRegexStatus.TRAILING : ZeroRegexStatus.NONE;
}

/**
 * Return whether the pattern described by the given nodes ends with zeros.
 * <p>
 * The nodes are examined from the end of the pattern towards its start. At each step any quantifiers and
 * question marks are skipped. If the next node cannot match zero, the answer is false. If it names zero
 * explicitly, the answer is true. A node that can match zero without naming it is consumed and the scan
 * continues. Running out of nodes yields true.
 * <p>
 * The scan works on a reversed private copy, so the caller's list is left in its original order and may be
 * unmodifiable.
 *
 * @param encodedRegexNodes
 *            the nodes of the pattern, in pattern order; not modified
 * @return true if the pattern has trailing zeros, or false otherwise
 */
private static boolean hasTrailingZeroes(List<Node> encodedRegexNodes) {
    // Reverse a copy: Collections.reverse works in place and would otherwise reorder the caller's list.
    List<Node> reversedNodes = new java.util.ArrayList<>(encodedRegexNodes);
    Collections.reverse(reversedNodes);

    NodeListIterator iter = new NodeListIterator(reversedNodes);

    while (iter.hasNext()) {
        // In reversed order the quantifiers and question marks precede the element they apply to.
        iter.seekPastQuantifiers();
        iter.seekPastQuestionMarks();

        Node next = iter.peekNext();

        if (!RegexUtils.matchesZero(next)) {
            return false;
        }
        if (RegexUtils.matchesZeroExplicitly(next)) {
            return true;
        }
        // The node can match zero without naming it; keep looking further towards the start.
        iter.next();
    }
    return true;
}

/**
 * Return whether the pattern described by the given nodes may start with zeros.
 * <p>
 * Nodes are examined from the start. Hyphens and escaped periods are skipped. The first other node decides
 * the answer: true if it can match zero, false if it cannot. Running out of nodes yields true.
 *
 * @param encodedRegexNodes
 *            the nodes of the pattern, in pattern order
 * @return true if the pattern may have leading zeros, or false otherwise
 */
private static boolean hasPossiblyLeadingZeroes(List<Node> encodedRegexNodes) {
    NodeListIterator cursor = new NodeListIterator(encodedRegexNodes);

    while (cursor.hasNext()) {
        Node candidate = cursor.peekNext();

        if (RegexUtils.matchesZero(candidate)) {
            return true;
        }

        boolean skippable = RegexUtils.isChar(candidate, RegexConstants.HYPHEN)
                        || candidate.equals(new EscapedSingleCharNode(RegexConstants.PERIOD));
        if (!skippable) {
            return false;
        }
        cursor.next();
    }

    return true;
}

@Override
public Object visitEncodedPattern(EncodedPatternNode node, Object data) {
EncodedPatternNode trimmed = new EncodedPatternNode();
Expand Down
5 changes: 5 additions & 0 deletions src/main/java/datawave/data/type/BaseType.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ public String normalizeRegex(String in) {
return normalizer.normalizeRegex(in);
}

/**
 * Reports whether normalizing the given regex is lossy, as determined by this type's normalizer.
 *
 * @param in
 *            the regex, prior to normalization
 * @return whatever the underlying normalizer's {@code normalizedRegexIsLossy} returns for the regex
 */
@Override
public boolean normalizedRegexIsLossy(String in) {
return normalizer.normalizedRegexIsLossy(in);
}

@Override
public void normalizeAndSetNormalizedValue(T valueToNormalize) {
setNormalizedValue(normalizer.normalizeDelegateType(valueToNormalize));
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/datawave/data/type/Type.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ public interface Type<T extends Comparable<T>> extends Comparable<Type<T>> {

String normalizeRegex(String in);

/**
 * Reports whether normalizing the given regex for this type loses information, i.e. whether the normalized
 * regex is lossy.
 *
 * @param in
 *            the regex, prior to normalization
 * @return {@code true} if the normalized form of the regex is considered lossy, {@code false} otherwise
 */
boolean normalizedRegexIsLossy(String in);

Collection<String> expand(String in);

Collection<String> expand();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@

import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
import org.locationtech.jts.util.Assert;

import datawave.data.normalizer.ZeroRegexStatus;
import datawave.data.normalizer.regex.Node;
import datawave.data.normalizer.regex.RegexParser;

class ZeroTrimmerTest {

Expand Down Expand Up @@ -284,6 +287,8 @@ void testNoLeadingOrTrailingZeros() {
assertTrimmedTo("45.*", "\\+[b-z]E45.*");
assertTrimmedTo("300454.*", "\\+[f-z]E300454.*");
assertTrimmedTo("300.*0003", "\\+[c-z]E300.*0003");
assertTrimmedTo("300.*000[1-9]", "\\+[c-z]E300.*000[1-9]");

}

@Test
Expand All @@ -299,6 +304,35 @@ void testSingleElementPatterns() {
assertTrimmedTo("\\d{3}", "\\+[a-c]E\\d{3}");
}

/**
 * Verify the zero status reported for patterns with no edge zeros, possibly leading zeros, and trailing zeros.
 */
@Test
void testStatus() {
    // TODO: more test cases

    // Neither leading nor trailing zeros.
    String[] nonePatterns = {"300.*0003", "300.*000[1-9]", "45.*", "-45.*"};
    for (String pattern : nonePatterns) {
        assertStatus(pattern, ZeroRegexStatus.NONE);
    }

    // Possibly leading zeros.
    String[] leadingPatterns = {".*?", ".*?11", "[04][05][06]", "[04]{1,3}[05][06]", "\\d{3}", ".\\.000034.*", "00345.*", "\\.000034.*",
                    "-00345.*"};
    for (String pattern : leadingPatterns) {
        assertStatus(pattern, ZeroRegexStatus.LEADING);
    }

    // Trailing zeros.
    String[] trailingPatterns = {"3.*0{0,}[01]", "3400\\.0000.", "340.*", "3400{3}0{2}"};
    for (String pattern : trailingPatterns) {
        assertStatus(pattern, ZeroRegexStatus.TRAILING);
    }
}

@Test
void testTrailingZerosWithoutQuantifiers() {
assertTrimmedTo(".*34300", "\\+[e-zA-Z]E.*343");
Expand All @@ -314,6 +348,10 @@ void testMixedAlternation() {
assertTrimmedTo("234\\.45|343.*|0\\.00[0]34.*", "\\+cE2\\.3445|\\+[c-z]E343.*|\\+WE34.*");
}

/**
 * Assert that {@link ZeroTrimmer#getStatus} reports the expected status for the given pattern.
 *
 * @param pattern
 *            the regex pattern to parse and classify
 * @param status
 *            the expected status
 */
private void assertStatus(String pattern, ZeroRegexStatus status) {
    ZeroRegexStatus actual = ZeroTrimmer.getStatus(RegexParser.parse(pattern).getChildren());
    // JTS Assert.equals takes (expected, actual, message); keep that order so failure messages read correctly.
    Assert.equals(status, actual, "pattern: " + pattern);
}

private void assertTrimmedTo(String pattern, String expectedPattern) {
Node actual = SimpleNumberEncoder.encode(parse(pattern));
actual = ExponentialBinAdder.addBins(actual);
Expand Down

0 comments on commit ace1017

Please sign in to comment.