From e268ab465006b195eac76e7c4c283a0822e16648 Mon Sep 17 00:00:00 2001
From: Marc Rovira
Date: Tue, 25 Jun 2024 15:28:25 +0200
Subject: [PATCH 1/2] Add str_length built-in check for PySpark backend

Signed-off-by: Marc Rovira
---
 pandera/backends/pyspark/builtin_checks.py | 27 ++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/pandera/backends/pyspark/builtin_checks.py b/pandera/backends/pyspark/builtin_checks.py
index 0725fbafd..f26f716f8 100644
--- a/pandera/backends/pyspark/builtin_checks.py
+++ b/pandera/backends/pyspark/builtin_checks.py
@@ -1,10 +1,10 @@
 """PySpark implementation of built-in checks"""
 
 import re
-from typing import Any, Iterable, TypeVar
+from typing import Any, Iterable, Optional, TypeVar
 
 import pyspark.sql.types as pst
-from pyspark.sql.functions import col
+from pyspark.sql.functions import col, length, lit
 
 import pandera.strategies as st
 from pandera.api.extensions import register_builtin_check
@@ -332,3 +332,26 @@ def str_endswith(data: PysparkDataframeColumnObject, string: str) -> bool:
     """
     cond = col(data.column_name).endswith(string)
     return data.dataframe.filter(~cond).limit(1).count() == 0
+
+
+@register_builtin_check(
+    error="str_length({min_value}, {max_value})",
+)
+@register_input_datatypes(acceptable_datatypes=convert_to_list(STRING_TYPE))
+def str_length(
+    data: PysparkDataframeColumnObject,
+    min_value: Optional[int] = None,
+    max_value: Optional[int] = None,
+) -> bool:
+    """Ensure that the length of strings in a column is within a specified range."""
+    if min_value is None and max_value is None:
+        raise ValueError("Must provide at least one of 'min_value' and 'max_value'")
+
+    str_len = length(col(data.column_name))
+    cond = lit(True)
+    if min_value is not None:
+        cond = cond & (str_len >= min_value)
+    if max_value is not None:
+        cond = cond & (str_len <= max_value)
+
+    return data.dataframe.filter(~cond).limit(1).count() == 0
\ No newline at end of file

From e6db6fe3d4eade04b4947b2c7e6eef9b56fc39b7 Mon Sep 17 00:00:00 2001
From: Marc Rovira <54272586+marrov@users.noreply.github.com>
Date: Thu, 27 Jun 2024 07:17:55 +0000
Subject: [PATCH 2/2] Fix formatting on str_length

Signed-off-by: Marc Rovira <54272586+marrov@users.noreply.github.com>
---
 pandera/backends/pyspark/builtin_checks.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandera/backends/pyspark/builtin_checks.py b/pandera/backends/pyspark/builtin_checks.py
index f26f716f8..4acdf5ced 100644
--- a/pandera/backends/pyspark/builtin_checks.py
+++ b/pandera/backends/pyspark/builtin_checks.py
@@ -345,7 +345,9 @@ def str_length(
 ) -> bool:
     """Ensure that the length of strings in a column is within a specified range."""
     if min_value is None and max_value is None:
-        raise ValueError("Must provide at least one of 'min_value' and 'max_value'")
+        raise ValueError(
+            "Must provide at least one of 'min_value' and 'max_value'"
+        )
 
     str_len = length(col(data.column_name))
     cond = lit(True)
@@ -354,4 +356,4 @@ def str_length(
     if max_value is not None:
         cond = cond & (str_len <= max_value)
 
-    return data.dataframe.filter(~cond).limit(1).count() == 0
\ No newline at end of file
+    return data.dataframe.filter(~cond).limit(1).count() == 0