Skip to content

Commit

Permalink
feat: Add SQL support for the NORMALIZE string function (#20705)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie authored Jan 15, 2025
1 parent e8d10a9 commit 73cb2a2
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 7 deletions.
3 changes: 0 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 0 additions & 3 deletions crates/polars-sql/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ repository = { workspace = true }
description = "SQL transpiler for Polars. Converts SQL to Polars logical plans"

[dependencies]
arrow = { workspace = true }
polars-core = { workspace = true, features = ["rows"] }
polars-error = { workspace = true }
polars-lazy = { workspace = true, features = ["abs", "binary_encoding", "concat_str", "cross_join", "cum_agg", "dtype-date", "dtype-decimal", "dtype-struct", "is_in", "list_eval", "log", "meta", "regex", "round_series", "sign", "string_normalize", "string_reverse", "strings", "timezones", "trigonometry"] }
Expand All @@ -19,10 +18,8 @@ polars-time = { workspace = true }
polars-utils = { workspace = true }

hex = { workspace = true }
once_cell = { workspace = true }
rand = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
sqlparser = { workspace = true }

[dev-dependencies]
Expand Down
41 changes: 40 additions & 1 deletion crates/polars-sql/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use polars_core::prelude::{
use polars_lazy::dsl::Expr;
#[cfg(feature = "list_eval")]
use polars_lazy::dsl::ListNameSpaceExtension;
use polars_ops::chunked_array::UnicodeForm;
use polars_plan::dsl::{coalesce, concat_str, len, max_horizontal, min_horizontal, when};
use polars_plan::plans::{typed_lit, LiteralValue};
use polars_plan::prelude::LiteralValue::Null;
Expand Down Expand Up @@ -376,6 +377,13 @@ pub(crate) enum PolarsSQLFunctions {
/// SELECT LTRIM(column_1) FROM df;
/// ```
LTrim,
/// SQL 'normalize' function
/// Converts a string to the given Unicode normalization form
/// (one of NFC, NFKC, NFD, or NFKD - unquoted); defaults to NFC
/// when no form is given.
/// ```sql
/// SELECT NORMALIZE(column_1, NFC) FROM df;
/// ```
Normalize,
/// SQL 'octet_length' function
/// Returns the length of a given string in bytes.
/// ```sql
Expand All @@ -391,7 +399,7 @@ pub(crate) enum PolarsSQLFunctions {
/// SQL 'replace' function
/// Replace a given substring with another string.
/// ```sql
/// SELECT REPLACE(column_1,'old','new') FROM df;
/// SELECT REPLACE(column_1, 'old', 'new') FROM df;
/// ```
Replace,
/// SQL 'reverse' function
Expand Down Expand Up @@ -859,6 +867,7 @@ impl PolarsSQLFunctions {
"left" => Self::Left,
"lower" => Self::Lower,
"ltrim" => Self::LTrim,
"normalize" => Self::Normalize,
"octet_length" => Self::OctetLength,
"strpos" => Self::StrPos,
"regexp_like" => Self::RegexpLike,
Expand Down Expand Up @@ -1152,6 +1161,36 @@ impl SQLFunctionVisitor<'_> {
},
}
},
Normalize => {
    // NORMALIZE(<expr>) or NORMALIZE(<expr>, <form>); a missing form means NFC.
    let args = extract_args(function)?;
    match args.len() {
        1 => self.visit_unary(|e| e.str().normalize(UnicodeForm::NFC)),
        2 => {
            // The form has to be a bare (unquoted) identifier; anything else
            // (string literal, quoted identifier, expression) is rejected.
            let form = match args[1] {
                FunctionArgExpr::Expr(SQLExpr::Identifier(Ident {
                    value: name,
                    quote_style: None,
                    span: _,
                })) => {
                    // Form names are matched case-insensitively.
                    match name.to_uppercase().as_str() {
                        "NFC" => UnicodeForm::NFC,
                        "NFD" => UnicodeForm::NFD,
                        "NFKC" => UnicodeForm::NFKC,
                        "NFKD" => UnicodeForm::NFKD,
                        _ => {
                            polars_bail!(SQLSyntax: "invalid 'form' for NORMALIZE (found {})", name)
                        },
                    }
                },
                _ => {
                    polars_bail!(SQLSyntax: "invalid 'form' for NORMALIZE (found {})", args[1])
                },
            };
            // The second visited expression is ignored: the form was already
            // resolved from the raw SQL argument above.
            self.try_visit_binary(|e, _form: Expr| Ok(e.str().normalize(form.clone())))
        },
        n_args => {
            polars_bail!(SQLSyntax: "NORMALIZE expects 1-2 arguments (found {})", n_args)
        },
    }
},
OctetLength => self.visit_unary(|e| e.str().len_bytes()),
StrPos => {
// note: SQL is 1-indexed; returns zero if no match found
Expand Down
35 changes: 35 additions & 0 deletions py-polars/docs/source/reference/sql/functions/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ String
- Returns a lowercased column.
* - :ref:`LTRIM <ltrim>`
- Strips whitespaces from the left.
* - :ref:`NORMALIZE <normalize>`
- Converts a string to the specified Unicode normalization form (one of NFC, NFD, NFKC, NFKD).
* - :ref:`OCTET_LENGTH <octet_length>`
- Returns the length of a given string in bytes.
* - :ref:`REGEXP_LIKE <regexp_like>`
Expand Down Expand Up @@ -366,6 +368,39 @@ Strips whitespaces from the left.
# │ DD ┆ DD │
# └───────┴─────────┘
.. _normalize:

NORMALIZE
---------
Converts a string to the specified Unicode normalization form (one of NFC, NFD, NFKC, NFKD).
If no normalization form is provided, NFC is used by default.

**Example:**

.. code-block:: python

    df = pl.DataFrame({
        "txt": [
            "Test",
            "Ⓣⓔⓢⓣ",
            "𝕿𝖊𝖘𝖙",
            "𝕋𝕖𝕤𝕥",
            "𝗧𝗲𝘀𝘁",
        ],
    })
    df.sql("""
      SELECT NORMALIZE(txt, NFKC) FROM self
    """).to_series()
    # shape: (5,)
    # Series: 'txt' [str]
    # [
    #     "Test"
    #     "Test"
    #     "Test"
    #     "Test"
    #     "Test"
    # ]

.. _octet_length:

OCTET_LENGTH
Expand Down
15 changes: 15 additions & 0 deletions py-polars/tests/unit/sql/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,21 @@ def test_string_like_multiline() -> None:
assert df.sql(f"SELECT txt FROM self WHERE txt LIKE '{s}'").item() == s


@pytest.mark.parametrize("form", ["NFKC", "NFKD"])
def test_string_normalize(form: str) -> None:
    """Check that NORMALIZE with a compatibility form maps each stylised input to "Test"."""
    samples = ["Test", "𝕋𝕖𝕤𝕥", "𝕿𝖊𝖘𝖙", "𝗧𝗲𝘀𝘁", "Ⓣⓔⓢⓣ"]  # noqa: RUF001
    # The form is interpolated unquoted: it is passed to SQL as a bare identifier.
    query = f"""
SELECT txt, NORMALIZE(txt,{form}) AS norm_txt
FROM self
"""
    out = pl.DataFrame({"txt": samples}).sql(query)
    expected = {
        "txt": samples,
        "norm_txt": ["Test"] * len(samples),
    }
    assert out.to_dict(as_series=False) == expected


def test_string_position() -> None:
df = pl.Series(
name="city",
Expand Down

0 comments on commit 73cb2a2

Please sign in to comment.