From f8a750181b2e41dbb108fc7a73550085cfbb502d Mon Sep 17 00:00:00 2001 From: YANGDB Date: Thu, 14 Nov 2024 15:29:11 -0700 Subject: [PATCH] replace loading of grok patterns from the resources folder in favour of hard coded java map (#906) Signed-off-by: YANGDB --- .../sql/common/grok/DefaultPatterns.java | 89 +++++++++++++++++++ .../sql/common/grok/GrokCompiler.java | 78 +--------------- .../opensearch/sql/ppl/utils/ParseUtils.java | 4 - ...PLLogicalPlanGrokTranslatorTestSuite.scala | 1 - 4 files changed, 92 insertions(+), 80 deletions(-) create mode 100644 ppl-spark-integration/src/main/java/org/opensearch/sql/common/grok/DefaultPatterns.java diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/common/grok/DefaultPatterns.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/common/grok/DefaultPatterns.java new file mode 100644 index 000000000..411542fb4 --- /dev/null +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/common/grok/DefaultPatterns.java @@ -0,0 +1,89 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.sql.common.grok; + +import java.util.Map; + +public interface DefaultPatterns { + + /** + * populate map with default patterns as they appear under the '/resources/patterns/*' resource folder + */ + static Map withDefaultPatterns(Map patterns) { + patterns.put("PATH" , "(?:%{UNIXPATH}|%{WINPATH})"); + patterns.put("MONTH" , "\\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\\b"); + patterns.put("TZ" , "(?:[PMCE][SD]T|UTC)"); + patterns.put("DATESTAMP_OTHER" , "%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}"); + patterns.put("HTTPDATE" , "%{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT}"); + patterns.put("HOST" , "%{HOSTNAME:UNWANTED}"); + patterns.put("DATESTAMP_EVENTLOG" , "%{YEAR}%{MONTHNUM2}%{MONTHDAY}%{HOUR}%{MINUTE}%{SECOND}"); + patterns.put("MESSAGESLOG" , "%{SYSLOGBASE} %{DATA}"); + patterns.put("WINDOWSMAC" , "(?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})"); + patterns.put("YEAR" , "(?>\\d\\d){1,2}"); + patterns.put("POSINT" , "\\b(?:[1-9][0-9]*)\\b"); + patterns.put("USERNAME" , "[a-zA-Z0-9._-]+"); + patterns.put("MINUTE" , "(?:[0-5][0-9])"); + patterns.put("UUID" , "[A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}"); + patterns.put("DATE_US" , "%{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}"); + patterns.put("LOGLEVEL" , "([A|a]lert|ALERT|[T|t]race|TRACE|[D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?)"); + patterns.put("WINPATH" , "(?>[A-Za-z]+:|\\)(?:\\[^\\?*]*)+"); + patterns.put("NUMBER" , "(?:%{BASE10NUM:UNWANTED})"); + patterns.put("WORD" , "\\b\\w+\\b"); + patterns.put("QS" , "%{QUOTEDSTRING:UNWANTED}"); + patterns.put("TIMESTAMP_ISO8601" , "%{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}?"); + patterns.put("MONTHNUM" , "(?:0?[1-9]|1[0-2])"); + patterns.put("NOTSPACE" , "\\S+"); + patterns.put("IPV6" , "((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:)))(%.+)?"); + patterns.put("IPV4" , "(?[+-]?(?:(?:[0-9]+(?:\\.[0-9]+)?)|(?:\\.[0-9]+)))"); + patterns.put("NONNEGINT" , "\\b(?:[0-9]+)\\b"); + patterns.put("DATESTAMP_RFC822" , "%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}"); + patterns.put("URI" , "%{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?"); + patterns.put("INT" , "(?:[+-]?(?:[0-9]+))"); + patterns.put("SPACE" , "\\s*"); + patterns.put("GREEDYDATA" , ".*"); + patterns.put("ISO8601_SECOND" , "(?:%{SECOND}|60)"); + patterns.put("UNIXPATH" , "(?>/(?>[\\w_%!$@:.,~-]+|\\.)*)+"); + patterns.put("TTY" , "(?:/dev/(pts|tty([pq])?)(\\w+)?/?(?:[0-9]+))"); + patterns.put("COMBINEDAPACHELOG" , "%{COMMONAPACHELOG} %{QS:referrer} %{QS:agent}"); + patterns.put("URIPROTO" , "[A-Za-z]+(\\+[A-Za-z+]+)?"); + patterns.put("HOSTPORT" , "(?:%{IPORHOST}:%{POSINT:PORT})"); + patterns.put("SYSLOGPROG" , "%{PROG:program}(?:\\[%{POSINT:pid}\\])?"); + patterns.put("SYSLOGBASE" , "%{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:"); + patterns.put("SYSLOGFACILITY" , "<%{NONNEGINT:facility}.%{NONNEGINT:priority}>"); + patterns.put("DATESTAMP" , "%{DATE}[- ]%{TIME}"); + patterns.put("TIME" , "(?!<[0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9])"); + patterns.put("USER" , "%{USERNAME:UNWANTED}"); + patterns.put("COMMONMAC" , "(?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})"); + patterns.put("IPORHOST" , "(?:%{HOSTNAME:UNWANTED}|%{IP:UNWANTED})"); + patterns.put("BASE16NUM" , "(?(?\"(?>\\\\.|[^\\\\\"]+)+\"|\"\"|(?>'(?>\\\\.|[^\\\\']+)+')|''|(?>`(?>\\\\.|[^\\\\`]+)+`)|``))"); + patterns.put("DAY" , "(?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)"); + patterns.put("ISO8601_TIMEZONE" , "(?:Z|[+-]%{HOUR}(?::?%{MINUTE}))"); + patterns.put("PROG" , "(?:[\\w._/%-]+)"); + return patterns; + } +} diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/common/grok/GrokCompiler.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/common/grok/GrokCompiler.java index 7d51038cd..b9dd2df83 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/common/grok/GrokCompiler.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/common/grok/GrokCompiler.java @@ -26,14 +26,12 @@ import java.util.regex.Pattern; import static java.lang.String.format; +import static org.opensearch.sql.common.grok.DefaultPatterns.withDefaultPatterns; public class GrokCompiler implements Serializable { - - // We don't want \n and commented line - private static final Pattern patternLinePattern = Pattern.compile("^([A-z0-9_]+)\\s+(.*)$"); - + /** {@code Grok} patterns definitions. */ - private final Map grokPatternDefinitions = new HashMap<>(); + private final Map grokPatternDefinitions = withDefaultPatterns(new HashMap<>()); private GrokCompiler() {} @@ -41,76 +39,6 @@ public static GrokCompiler newInstance() { return new GrokCompiler(); } - public Map getPatternDefinitions() { - return grokPatternDefinitions; - } - - /** - * Registers a new pattern definition. - * - * @param name : Pattern Name - * @param pattern : Regular expression Or {@code Grok} pattern - * @throws GrokException runtime expt - */ - public void register(String name, String pattern) { - name = Objects.requireNonNull(name).trim(); - pattern = Objects.requireNonNull(pattern).trim(); - - if (!name.isEmpty() && !pattern.isEmpty()) { - grokPatternDefinitions.put(name, pattern); - } - } - - /** Registers multiple pattern definitions. */ - public void register(Map patternDefinitions) { - Objects.requireNonNull(patternDefinitions); - patternDefinitions.forEach(this::register); - } - - /** - * Registers multiple pattern definitions from a given inputStream, and decoded as a UTF-8 source. - */ - public void register(InputStream input) throws IOException { - register(input, StandardCharsets.UTF_8); - } - - /** Registers multiple pattern definitions from a given inputStream. */ - public void register(InputStream input, Charset charset) throws IOException { - try (BufferedReader in = new BufferedReader(new InputStreamReader(input, charset))) { - in.lines() - .map(patternLinePattern::matcher) - .filter(Matcher::matches) - .forEach(m -> register(m.group(1), m.group(2))); - } - } - - /** Registers multiple pattern definitions from a given Reader. */ - public void register(Reader input) throws IOException { - new BufferedReader(input) - .lines() - .map(patternLinePattern::matcher) - .filter(Matcher::matches) - .forEach(m -> register(m.group(1), m.group(2))); - } - - public void registerDefaultPatterns() { - registerPatternFromClasspath("/patterns/patterns"); - } - - public void registerPatternFromClasspath(String path) throws GrokException { - registerPatternFromClasspath(path, StandardCharsets.UTF_8); - } - - /** registerPatternFromClasspath. */ - public void registerPatternFromClasspath(String path, Charset charset) throws GrokException { - final InputStream inputStream = this.getClass().getResourceAsStream(path); - try (Reader reader = new InputStreamReader(inputStream, charset)) { - register(reader); - } catch (IOException e) { - throw new GrokException(e.getMessage(), e); - } - } - /** Compiles a given Grok pattern and returns a Grok object which can parse the pattern. */ public Grok compile(String pattern) throws IllegalArgumentException { return compile(pattern, false); diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ParseUtils.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ParseUtils.java index a463767f0..6a4d4b032 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ParseUtils.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ParseUtils.java @@ -138,10 +138,6 @@ public static String extractPattern(String patterns, List columns) { public static class GrokExpression { private static final GrokCompiler grokCompiler = GrokCompiler.newInstance(); - static { - grokCompiler.registerDefaultPatterns(); - } - public static Expression getRegExpCommand(Expression sourceField, org.apache.spark.sql.catalyst.expressions.Literal patternLiteral, org.apache.spark.sql.catalyst.expressions.Literal groupIndexLiteral) { return new RegExpExtract(sourceField, patternLiteral, groupIndexLiteral); } diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGrokTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGrokTranslatorTestSuite.scala index f33a4a66b..91da923de 100644 --- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGrokTranslatorTestSuite.scala +++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGrokTranslatorTestSuite.scala @@ -30,7 +30,6 @@ class PPLLogicalPlanGrokTranslatorTestSuite test("test grok email & host expressions") { val grokCompiler = GrokCompiler.newInstance - grokCompiler.registerDefaultPatterns() /* Grok pattern to compile, here httpd logs */ /* Grok pattern to compile, here httpd logs */ val grok = grokCompiler.compile(".+@%{HOSTNAME:host}")