Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add str_length built-in check for PySpark backend #1709

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions pandera/backends/pyspark/builtin_checks.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""PySpark implementation of built-in checks"""

from typing import Any, Iterable, TypeVar
import re
from typing import Any, Iterable, Optional, TypeVar

import pyspark.sql.types as pst
from pyspark.sql.functions import col
from pyspark.sql.functions import col, length, lit

import pandera.strategies as st
from pandera.api.extensions import register_builtin_check
Expand Down Expand Up @@ -328,3 +329,28 @@
"""
cond = col(data.column_name).endswith(string)
return data.dataframe.filter(~cond).limit(1).count() == 0


@register_builtin_check(
    error="str_length({min_value}, {max_value})",
)
@register_input_datatypes(acceptable_datatypes=convert_to_list(STRING_TYPE))
def str_length(
    data: PysparkDataframeColumnObject,
    min_value: Optional[int] = None,
    max_value: Optional[int] = None,
) -> bool:
    """Check that every string in the column has a length inside the bounds.

    :param data: object exposing the ``dataframe`` to validate and the
        ``column_name`` of the string column under test.
    :param min_value: inclusive lower bound on the string length, or
        ``None`` to leave the lower side unbounded.
    :param max_value: inclusive upper bound on the string length, or
        ``None`` to leave the upper side unbounded.
    :returns: ``True`` when no row is selected by the negated length
        condition, ``False`` otherwise.
    :raises ValueError: if both ``min_value`` and ``max_value`` are ``None``.
    """
    has_lower = min_value is not None
    has_upper = max_value is not None
    if not (has_lower or has_upper):
        raise ValueError(
            "Must provide at least one of 'min_value' and 'max_value'"
        )

    char_count = length(col(data.column_name))

    # Collect only the bounds that were actually requested.
    bounds = []
    if has_lower:
        bounds.append(char_count >= min_value)
    if has_upper:
        bounds.append(char_count <= max_value)

    # Start from a literal TRUE so each requested bound can be AND-ed on.
    in_range = lit(True)
    for bound in bounds:
        in_range = in_range & bound

    # One offending row is enough to fail, so cap the scan with limit(1).
    # NOTE(review): rows holding nulls presumably yield a null condition and
    # are therefore not selected as violations -- confirm this is intended.
    violations = data.dataframe.filter(~in_range)
    return violations.limit(1).count() == 0

Check warning on line 356 in pandera/backends/pyspark/builtin_checks.py

View check run for this annotation

Codecov / codecov/patch

pandera/backends/pyspark/builtin_checks.py#L356

Added line #L356 was not covered by tests
Loading