DIFF test regexp line terminator optimization #1

Open
wants to merge 5 commits into base: branch-24.12
94 changes: 94 additions & 0 deletions integration_tests/src/main/python/regexp_perf_test.py
@@ -0,0 +1,94 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

import difflib
import sys

from data_gen import *
from spark_session import *

if not is_jvm_charset_utf8():
    pytestmark = [pytest.mark.regexp_perf, pytest.mark.skip(reason=str("Current locale doesn't support UTF-8, regexp support is disabled"))]
else:
    pytestmark = pytest.mark.regexp_perf


def mk_str_gen(pattern):
    return StringGen(pattern).with_special_case('').with_special_pattern('.{0,10}')

def do_cudf_rlike_test(spark, name, str_gen, num_regexes=10):
    # Collect rlike results with the transpiler enabled (the default), then again with it
    # disabled so the Java pattern is handed to cuDF as-is, and print a unified diff of the two.
    re_gen = StringGen('[bf]o{0,2}:?\\+?\\$')
    df = unary_op_df(spark, str_gen)
    regexes = gen_scalar_values(re_gen, num_regexes, force_no_nulls=True)
    exprs = ["a"] + [f"(a rlike '{regex}')" for regex in regexes]
    transpiled = df.selectExpr(*exprs).collect()
    spark.conf.set("spark.rapids.sql.regexp.transpiler.enabled", False)
    df = unary_op_df(spark, str_gen)
    cudf = df.selectExpr(*exprs).collect()
    print(name)
    sys.stdout.writelines(difflib.unified_diff(
        a=[f"{x}\n" for x in transpiled],
        b=[f"{x}\n" for x in cudf],
        fromfile='TRANSPILED OUTPUT',
        tofile='CUDF OUTPUT'))


def do_cudf_extract_test(spark, name, str_gen, transpile, num_regexes=1):
    re_gen = StringGen('\\([bf]oo:?\\+?\\)\\$')
    # df = unary_op_df(spark, str_gen)
    # regexes = gen_scalar_values(re_gen, num_regexes, force_no_nulls=True)
    regexes = ['(boo:+)$']
    exprs = ["a"] + [f"regexp_extract(a,'{regex}', 1)" for regex in regexes]
    # transpiled = df.selectExpr(*exprs).collect()
    spark.conf.set("spark.rapids.sql.regexp.transpiler.enabled", transpile)
    df = unary_op_df(spark, str_gen)
    # cudf = df.selectExpr(*exprs).collect()
    print(name)
    debug_df(df.selectExpr(*exprs))
    # sys.stdout.writelines(difflib.unified_diff(
    #     a=[f"{x}\n" for x in transpiled],
    #     b=[f"{x}\n" for x in cudf],
    #     fromfile='TRANSPILED OUTPUT',
    #     tofile='CUDF OUTPUT'))


def test_re_rlike_newline(request):
    str_gen = mk_str_gen('([bf]o{0,2}|:){1,100}\n') \
        .with_special_case('boo:and:foo\n')
    with_gpu_session(lambda spark: do_cudf_rlike_test(spark, request.node.name, str_gen))


def test_re_rlike_line_terminators(request):
    str_gen = mk_str_gen('([bf]o{0,2}|:){1,100}(\r\n)|[\r\n\u0085\u2028\u2029]') \
        .with_special_case('boo:and:foo\n') \
        .with_special_case('boo:and:foo\r\n')
    with_gpu_session(lambda spark: do_cudf_rlike_test(spark, request.node.name, str_gen))

@pytest.mark.parametrize('transpile', [True, False], ids=idfn)
def test_re_extract_newline(request, transpile):
    str_gen = mk_str_gen('([bf]oo|:){1,100}\n') \
        .with_special_case('boo:and:foo\n')
    with_gpu_session(lambda spark: do_cudf_extract_test(spark, request.node.name, str_gen, transpile))

@pytest.mark.parametrize('transpile', [True, False], ids=idfn)
def test_re_extract_line_terminators(request, transpile):
    str_gen = mk_str_gen('([bf]oo|:){1,100}(\r\n)|[\r\n\u0085\u2028\u2029]') \
        .with_special_case('boo:and:foo\n') \
        .with_special_case('boo:and:foo\r\n')
    with_gpu_session(lambda spark: do_cudf_extract_test(spark, request.node.name, str_gen, transpile))



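For readers skimming the diff, the comparison pattern that do_cudf_rlike_test relies on can be exercised on its own. A minimal, self-contained sketch with made-up row strings standing in for the two collect() results (the values below are illustrative, not real test output):

import difflib
import sys

# Hypothetical stand-ins for rows collected with the transpiler enabled vs. disabled.
transpiled = ["Row(a='boo:and:foo\\n', rlike=True)", "Row(a='boo:and:foo\\r\\n', rlike=True)"]
raw_cudf   = ["Row(a='boo:and:foo\\n', rlike=True)", "Row(a='boo:and:foo\\r\\n', rlike=False)"]

# Any divergence between the two code paths shows up as a unified diff on stdout.
sys.stdout.writelines(difflib.unified_diff(
    a=[f"{x}\n" for x in transpiled],
    b=[f"{x}\n" for x in raw_cudf],
    fromfile='TRANSPILED OUTPUT',
    tofile='CUDF OUTPUT'))

An empty diff means the raw cuDF path matched the transpiled path for every generated input.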
@@ -49,7 +49,8 @@ class GpuRegExpReplaceMeta(
    }

    expr.regexp match {
      case Literal(s: UTF8String, DataTypes.StringType) if s != null =>
      case Literal(s: UTF8String, DataTypes.StringType)
          if conf.isRegexpTranspilerEnabled && s != null =>
        javaPattern = Some(s.toString())
        try {
          val (pat, repl) =
@@ -77,7 +78,14 @@ class GpuRegExpReplaceMeta(
          case e: RegexUnsupportedException =>
            willNotWorkOnGpu(e.getMessage)
        }

      case Literal(s: UTF8String, DataTypes.StringType) if s != null =>
        javaPattern = Some(s.toString())
        cudfPattern = Some(s.toString())
        replacement.map { r => GpuRegExpUtils.backrefConversion(r) }.foreach {
          case (hasBackref, convertedRep) =>
            containsBackref = hasBackref
            replacement = Some(GpuRegExpUtils.unescapeReplaceString(convertedRep))
        }
      case _ =>
        willNotWorkOnGpu(s"only non-null literal strings are supported on GPU")
    }
@@ -1574,6 +1574,13 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern")
    .bytesConf(ByteUnit.BYTE)
    .createWithDefault(Integer.MAX_VALUE)

  val ENABLE_REGEXP_TRANSPILER = conf("spark.rapids.sql.regexp.transpiler.enabled")
    .doc("Enables the transpilation of regular expressions to a format where cuDF can produce " +
      "the equivalent result for Spark.")
    .internal()
    .booleanConf
    .createWithDefault(true)

  // INTERNAL TEST AND DEBUG CONFIGS

  val TEST_RETRY_OOM_INJECTION_MODE = conf("spark.rapids.sql.test.injectRetryOOM")
@@ -3159,6 +3166,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging {

  lazy val isRegExpEnabled: Boolean = get(ENABLE_REGEXP)

  lazy val isRegexpTranspilerEnabled: Boolean = get(ENABLE_REGEXP_TRANSPILER)

  lazy val maxRegExpStateMemory: Long = {
    val size = get(REGEXP_MAX_STATE_MEMORY_BYTES)
    if (size > 3 * gpuTargetBatchSizeBytes) {
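The new flag is internal but can still be toggled per session, which is how the regexp_perf tests above switch between the two code paths. A hypothetical pyspark snippet (it assumes a running SparkSession named spark with the RAPIDS Accelerator plugin active; the DataFrame contents are illustrative only):

# Disable the transpiler so literal regex patterns are handed to cuDF unmodified.
spark.conf.set("spark.rapids.sql.regexp.transpiler.enabled", False)

df = spark.createDataFrame([("boo:and:foo\r\n",), ("boo:and:foo\n",)], ["a"])
# With the flag off, this exercises the raw cuDF regex path on the GPU.
df.selectExpr("a", "a rlike 'foo$'").show()

# Re-enable the default, transpiled path for comparison.
spark.conf.set("spark.rapids.sql.regexp.transpiler.enabled", True)

Whether the two settings agree for line-terminator patterns like 'foo$' is exactly what the diff-based tests in this PR check.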
@@ -850,7 +850,7 @@ class CudfRegexTranspiler(mode: RegexMode) {
  private def lineTerminatorMatcher(exclude: Set[Char], excludeCRLF: Boolean,
      capture: Boolean): RegexAST = {
    val terminatorChars = new ListBuffer[RegexCharacterClassComponent]()
    terminatorChars ++= lineTerminatorChars.filter(!exclude.contains(_)).map(RegexChar)
    terminatorChars ++= Seq('\r').filter(!exclude.contains(_)).map(RegexChar)

    if (terminatorChars.size == 0 && excludeCRLF) {
      RegexEmpty()
@@ -863,7 +863,8 @@
        None
      )
    } else {
      RegexGroup(capture = capture, RegexParser.parse("\r|\u0085|\u2028|\u2029|\r\n"), None)
      // RegexGroup(capture = capture, RegexParser.parse("\r|\u0085|\u2028|\u2029|\r\n"), None)
      RegexGroup(capture = capture, RegexParser.parse("\r?"), None)
    }
  }

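The narrowed alternation above changes what the transpiler emits for '$'-style anchors. As background on why a rewrite is needed at all: Java's Pattern lets '$' match before a trailing '\r\n', '\r', '\u0085', '\u2028', or '\u2029' as well as '\n', whereas simpler engines only special-case '\n'. A small standalone sketch using Python's re, which behaves like the simpler engines (the strings are illustrative only):

import re

# Python's re lets '$' match only at end-of-input or just before a final '\n'.
print(bool(re.search(r'foo$', 'foo\n')))      # True
# Java's Pattern would also match before '\r\n' or the Unicode terminators; re does not.
print(bool(re.search(r'foo$', 'foo\r\n')))    # False
print(bool(re.search(r'foo$', 'foo\u2028')))  # False

Whether dropping '\u0085', '\u2028', and '\u2029' from the rewrite still matches Spark's CPU results for the generated data is what test_re_rlike_line_terminators probes.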
@@ -1068,7 +1068,8 @@ class GpuRLikeMeta(
  override def tagExprForGpu(): Unit = {
    GpuRegExpUtils.tagForRegExpEnabled(this)
    expr.right match {
      case Literal(str: UTF8String, DataTypes.StringType) if str != null =>
      case Literal(str: UTF8String, DataTypes.StringType)
          if conf.isRegexpTranspilerEnabled && str != null =>
        try {
          // verify that we support this regex and can transpile it to cuDF format
          val originalPattern = str.toString
@@ -1084,6 +1085,8 @@
          case e: RegexUnsupportedException =>
            willNotWorkOnGpu(e.getMessage)
        }
      case Literal(str: UTF8String, DataTypes.StringType) if str != null =>
        pattern = Some(str.toString)
      case _ =>
        willNotWorkOnGpu(s"only non-null literal strings are supported on GPU")
    }
@@ -1326,7 +1329,8 @@ class GpuRegExpExtractMeta(
    }

    expr.regexp match {
      case Literal(str: UTF8String, DataTypes.StringType) if str != null =>
      case Literal(str: UTF8String, DataTypes.StringType)
          if conf.isRegexpTranspilerEnabled && str != null =>
        try {
          val javaRegexpPattern = str.toString
          // verify that we support this regex and can transpile it to cuDF format
@@ -1340,6 +1344,9 @@
          case e: RegexUnsupportedException =>
            willNotWorkOnGpu(e.getMessage)
        }
      case Literal(str: UTF8String, DataTypes.StringType) if str != null =>
        pattern = Some(str.toString)
        numGroups = GpuRegExpUtils.countGroups(str.toString)
      case _ =>
        willNotWorkOnGpu(s"only non-null literal strings are supported on GPU")
    }
@@ -1454,7 +1461,8 @@ class GpuRegExpExtractAllMeta(
    }

    expr.regexp match {
      case Literal(str: UTF8String, DataTypes.StringType) if str != null =>
      case Literal(str: UTF8String, DataTypes.StringType)
          if conf.isRegexpTranspilerEnabled && str != null =>
        try {
          val javaRegexpPattern = str.toString
          // verify that we support this regex and can transpile it to cuDF format
@@ -1468,6 +1476,9 @@
          case e: RegexUnsupportedException =>
            willNotWorkOnGpu(e.getMessage)
        }
      case Literal(str: UTF8String, DataTypes.StringType) if str != null =>
        pattern = Some(str.toString)
        numGroups = GpuRegExpUtils.countGroups(str.toString)
      case _ =>
        willNotWorkOnGpu(s"only non-null literal strings are supported on GPU")
    }
@@ -1716,7 +1727,10 @@ abstract class StringSplitRegExpMeta[INPUT <: TernaryExpression](expr: INPUT,
        transpiler.transpileToSplittableString(utf8Str.toString) match {
          case Some(simplified) =>
            pattern = simplified
          case None =>
          case _ if conf.isRegexpTranspilerEnabled =>
            pattern = utf8Str.toString
            isRegExp = true
          case _ =>
            try {
              val (transpiledAST, _) = transpiler.getTranspiledAST(utf8Str.toString, None, None)
              GpuRegExpUtils.validateRegExpComplexity(this, transpiledAST)