DIFF test regexp line terminator optimization #1

Open
wants to merge 5 commits into base: branch-24.12
94 changes: 94 additions & 0 deletions integration_tests/src/main/python/regexp_perf_test.py
@@ -0,0 +1,94 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

import difflib
import sys

from data_gen import *
from spark_session import *

if not is_jvm_charset_utf8():
    pytestmark = [pytest.mark.regexp_perf, pytest.mark.skip(reason=str("Current locale doesn't support UTF-8, regexp support is disabled"))]
else:
    pytestmark = pytest.mark.regexp_perf


def mk_str_gen(pattern):
    return StringGen(pattern).with_special_case('').with_special_pattern('.{0,10}')

def do_cudf_rlike_test(spark, name, str_gen, num_regexes=10):
    # Collect rlike results with the transpiler enabled (the default), then again with it
    # disabled so the Java pattern is handed to cuDF as-is, and print a unified diff of the two.
    re_gen = StringGen('[bf]o{0,2}:?\\+?\\$')
    df = unary_op_df(spark, str_gen)
    regexes = gen_scalar_values(re_gen, num_regexes, force_no_nulls=True)
    exprs = ["a"] + [f"(a rlike '{regex}')" for regex in regexes]
    transpiled = df.selectExpr(*exprs).collect()
    spark.conf.set("spark.rapids.sql.regexp.transpiler.enabled", False)
    df = unary_op_df(spark, str_gen)
    cudf = df.selectExpr(*exprs).collect()
    print(name)
    sys.stdout.writelines(difflib.unified_diff(
        a=[f"{x}\n" for x in transpiled],
        b=[f"{x}\n" for x in cudf],
        fromfile='TRANSPILED OUTPUT',
        tofile='CUDF OUTPUT'))


def do_cudf_extract_test(spark, name, str_gen, transpile, num_regexes=1):
    re_gen = StringGen('\\([bf]oo:?\\+?\\)\\$')
    # df = unary_op_df(spark, str_gen)
    # regexes = gen_scalar_values(re_gen, num_regexes, force_no_nulls=True)
    regexes = ['(boo:+)$']
    exprs = ["a"] + [f"regexp_extract(a,'{regex}', 1)" for regex in regexes]
    # transpiled = df.selectExpr(*exprs).collect()
    spark.conf.set("spark.rapids.sql.regexp.transpiler.enabled", transpile)
    df = unary_op_df(spark, str_gen)
    # cudf = df.selectExpr(*exprs).collect()
    print(name)
    debug_df(df.selectExpr(*exprs))
    # sys.stdout.writelines(difflib.unified_diff(
    #     a=[f"{x}\n" for x in transpiled],
    #     b=[f"{x}\n" for x in cudf],
    #     fromfile='TRANSPILED OUTPUT',
    #     tofile='CUDF OUTPUT'))


def test_re_rlike_newline(request):
    str_gen = mk_str_gen('([bf]o{0,2}|:){1,100}\n') \
        .with_special_case('boo:and:foo\n')
    with_gpu_session(lambda spark: do_cudf_rlike_test(spark, request.node.name, str_gen))


def test_re_rlike_line_terminators(request):
    str_gen = mk_str_gen('([bf]o{0,2}|:){1,100}(\r\n)|[\r\n\u0085\u2028\u2029]') \
        .with_special_case('boo:and:foo\n') \
        .with_special_case('boo:and:foo\r\n')
    with_gpu_session(lambda spark: do_cudf_rlike_test(spark, request.node.name, str_gen))

@pytest.mark.parametrize('transpile', [True, False], ids=idfn)
def test_re_extract_newline(request, transpile):
    str_gen = mk_str_gen('([bf]oo|:){1,100}\n') \
        .with_special_case('boo:and:foo\n')
    with_gpu_session(lambda spark: do_cudf_extract_test(spark, request.node.name, str_gen, transpile))

@pytest.mark.parametrize('transpile', [True, False], ids=idfn)
def test_re_extract_line_terminators(request, transpile):
    str_gen = mk_str_gen('([bf]oo|:){1,100}(\r\n)|[\r\n\u0085\u2028\u2029]') \
        .with_special_case('boo:and:foo\n') \
        .with_special_case('boo:and:foo\r\n')
    with_gpu_session(lambda spark: do_cudf_extract_test(spark, request.node.name, str_gen, transpile))



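For readers skimming the diff, the comparison pattern that do_cudf_rlike_test relies on can be exercised on its own. A minimal, self-contained sketch with made-up row strings standing in for the two collect() results (the values below are illustrative, not real test output):

import difflib
import sys

# Hypothetical stand-ins for rows collected with the transpiler enabled vs. disabled.
transpiled = ["Row(a='boo:and:foo\\n', rlike=True)", "Row(a='boo:and:foo\\r\\n', rlike=True)"]
raw_cudf   = ["Row(a='boo:and:foo\\n', rlike=True)", "Row(a='boo:and:foo\\r\\n', rlike=False)"]

# Any divergence between the two code paths shows up as a unified diff on stdout.
sys.stdout.writelines(difflib.unified_diff(
    a=[f"{x}\n" for x in transpiled],
    b=[f"{x}\n" for x in raw_cudf],
    fromfile='TRANSPILED OUTPUT',
    tofile='CUDF OUTPUT'))

An empty diff means the raw cuDF path matched the transpiled path for every generated input.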
@@ -49,7 +49,8 @@ class GpuRegExpReplaceMeta(
    }

    expr.regexp match {
      case Literal(s: UTF8String, DataTypes.StringType) if s != null =>
      case Literal(s: UTF8String, DataTypes.StringType)
          if conf.isRegexpTranspilerEnabled && s != null =>
        javaPattern = Some(s.toString())
        try {
          val (pat, repl) =
@@ -77,7 +78,14 @@ class GpuRegExpReplaceMeta(
          case e: RegexUnsupportedException =>
            willNotWorkOnGpu(e.getMessage)
        }

      case Literal(s: UTF8String, DataTypes.StringType) if s != null =>
        javaPattern = Some(s.toString())
        cudfPattern = Some(s.toString())
        replacement.map { r => GpuRegExpUtils.backrefConversion(r) }.foreach {
          case (hasBackref, convertedRep) =>
            containsBackref = hasBackref
            replacement = Some(GpuRegExpUtils.unescapeReplaceString(convertedRep))
        }
      case _ =>
        willNotWorkOnGpu(s"only non-null literal strings are supported on GPU")
    }
@@ -1574,6 +1574,13 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern")
    .bytesConf(ByteUnit.BYTE)
    .createWithDefault(Integer.MAX_VALUE)

  val ENABLE_REGEXP_TRANSPILER = conf("spark.rapids.sql.regexp.transpiler.enabled")
    .doc("Enables the transpilation of regular expressions to a format where cuDF can produce " +
      "the equivalent result for Spark.")
    .internal()
    .booleanConf
    .createWithDefault(true)

  // INTERNAL TEST AND DEBUG CONFIGS

  val TEST_RETRY_OOM_INJECTION_MODE = conf("spark.rapids.sql.test.injectRetryOOM")
@@ -3159,6 +3166,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging {

  lazy val isRegExpEnabled: Boolean = get(ENABLE_REGEXP)

  lazy val isRegexpTranspilerEnabled: Boolean = get(ENABLE_REGEXP_TRANSPILER)

  lazy val maxRegExpStateMemory: Long = {
    val size = get(REGEXP_MAX_STATE_MEMORY_BYTES)
    if (size > 3 * gpuTargetBatchSizeBytes) {
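The new flag is internal but can still be toggled per session, which is how the regexp_perf tests above switch between the two code paths. A hypothetical pyspark snippet (it assumes a running SparkSession named spark with the RAPIDS Accelerator plugin active; the DataFrame contents are illustrative only):

# Disable the transpiler so literal regex patterns are handed to cuDF unmodified.
spark.conf.set("spark.rapids.sql.regexp.transpiler.enabled", False)

df = spark.createDataFrame([("boo:and:foo\r\n",), ("boo:and:foo\n",)], ["a"])
# With the flag off, this exercises the raw cuDF regex path on the GPU.
df.selectExpr("a", "a rlike 'foo$'").show()

# Re-enable the default, transpiled path for comparison.
spark.conf.set("spark.rapids.sql.regexp.transpiler.enabled", True)

Whether the two settings agree for line-terminator patterns like 'foo$' is exactly what the diff-based tests in this PR check.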
@@ -850,7 +850,7 @@ class CudfRegexTranspiler(mode: RegexMode) {
  private def lineTerminatorMatcher(exclude: Set[Char], excludeCRLF: Boolean,
      capture: Boolean): RegexAST = {
    val terminatorChars = new ListBuffer[RegexCharacterClassComponent]()
    terminatorChars ++= lineTerminatorChars.filter(!exclude.contains(_)).map(RegexChar)
    terminatorChars ++= Seq('\r').filter(!exclude.contains(_)).map(RegexChar)

    if (terminatorChars.size == 0 && excludeCRLF) {
      RegexEmpty()
@@ -863,7 +863,8 @@
        None
      )
    } else {
      RegexGroup(capture = capture, RegexParser.parse("\r|\u0085|\u2028|\u2029|\r\n"), None)
      // RegexGroup(capture = capture, RegexParser.parse("\r|\u0085|\u2028|\u2029|\r\n"), None)
      RegexGroup(capture = capture, RegexParser.parse("\r?"), None)
    }
  }

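The narrowed alternation above changes what the transpiler emits for '$'-style anchors. As background on why a rewrite is needed at all: Java's Pattern lets '$' match before a trailing '\r\n', '\r', '\u0085', '\u2028', or '\u2029' as well as '\n', whereas simpler engines only special-case '\n'. A small standalone sketch using Python's re, which behaves like the simpler engines (the strings are illustrative only):

import re

# Python's re lets '$' match only at end-of-input or just before a final '\n'.
print(bool(re.search(r'foo$', 'foo\n')))      # True
# Java's Pattern would also match before '\r\n' or the Unicode terminators; re does not.
print(bool(re.search(r'foo$', 'foo\r\n')))    # False
print(bool(re.search(r'foo$', 'foo\u2028')))  # False

Whether dropping '\u0085', '\u2028', and '\u2029' from the rewrite still matches Spark's CPU results for the generated data is what test_re_rlike_line_terminators probes.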
@@ -1068,7 +1068,8 @@ class GpuRLikeMeta(
  override def tagExprForGpu(): Unit = {
    GpuRegExpUtils.tagForRegExpEnabled(this)
    expr.right match {
      case Literal(str: UTF8String, DataTypes.StringType) if str != null =>
      case Literal(str: UTF8String, DataTypes.StringType)
          if conf.isRegexpTranspilerEnabled && str != null =>
        try {
          // verify that we support this regex and can transpile it to cuDF format
          val originalPattern = str.toString
@@ -1084,6 +1085,8 @@
          case e: RegexUnsupportedException =>
            willNotWorkOnGpu(e.getMessage)
        }
      case Literal(str: UTF8String, DataTypes.StringType) if str != null =>
        pattern = Some(str.toString)
      case _ =>
        willNotWorkOnGpu(s"only non-null literal strings are supported on GPU")
    }
@@ -1326,7 +1329,8 @@ class GpuRegExpExtractMeta(
    }

    expr.regexp match {
      case Literal(str: UTF8String, DataTypes.StringType) if str != null =>
      case Literal(str: UTF8String, DataTypes.StringType)
          if conf.isRegexpTranspilerEnabled && str != null =>
        try {
          val javaRegexpPattern = str.toString
          // verify that we support this regex and can transpile it to cuDF format
@@ -1340,6 +1344,9 @@
          case e: RegexUnsupportedException =>
            willNotWorkOnGpu(e.getMessage)
        }
      case Literal(str: UTF8String, DataTypes.StringType) if str != null =>
        pattern = Some(str.toString)
        numGroups = GpuRegExpUtils.countGroups(str.toString)
      case _ =>
        willNotWorkOnGpu(s"only non-null literal strings are supported on GPU")
    }
@@ -1454,7 +1461,8 @@ class GpuRegExpExtractAllMeta(
    }

    expr.regexp match {
      case Literal(str: UTF8String, DataTypes.StringType) if str != null =>
      case Literal(str: UTF8String, DataTypes.StringType)
          if conf.isRegexpTranspilerEnabled && str != null =>
        try {
          val javaRegexpPattern = str.toString
          // verify that we support this regex and can transpile it to cuDF format
@@ -1468,6 +1476,9 @@
          case e: RegexUnsupportedException =>
            willNotWorkOnGpu(e.getMessage)
        }
      case Literal(str: UTF8String, DataTypes.StringType) if str != null =>
        pattern = Some(str.toString)
        numGroups = GpuRegExpUtils.countGroups(str.toString)
      case _ =>
        willNotWorkOnGpu(s"only non-null literal strings are supported on GPU")
    }
@@ -1716,7 +1727,10 @@ abstract class StringSplitRegExpMeta[INPUT <: TernaryExpression](expr: INPUT,
        transpiler.transpileToSplittableString(utf8Str.toString) match {
          case Some(simplified) =>
            pattern = simplified
          case None =>
          case _ if conf.isRegexpTranspilerEnabled =>
            pattern = utf8Str.toString
            isRegExp = true
          case _ =>
            try {
              val (transpiledAST, _) = transpiler.getTranspiledAST(utf8Str.toString, None, None)
              GpuRegExpUtils.validateRegExpComplexity(this, transpiledAST)