feat: Add SQL support for the NORMALIZE string function (#20705)

pola-rs · Jan 15, 2025 · 73cb2a2 · 73cb2a2
1 parent e8d10a9
commit 73cb2a2
Show file tree

Hide file tree

Showing 5 changed files with 90 additions and 7 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/polars-sql/Cargo.toml b/crates/polars-sql/Cargo.toml
@@ -9,7 +9,6 @@ repository = { workspace = true }
 description = "SQL transpiler for Polars. Converts SQL to Polars logical plans"
 
 [dependencies]
-arrow = { workspace = true }
 polars-core = { workspace = true, features = ["rows"] }
 polars-error = { workspace = true }
 polars-lazy = { workspace = true, features = ["abs", "binary_encoding", "concat_str", "cross_join", "cum_agg", "dtype-date", "dtype-decimal", "dtype-struct", "is_in", "list_eval", "log", "meta", "regex", "round_series", "sign", "string_normalize", "string_reverse", "strings", "timezones", "trigonometry"] }
@@ -19,10 +18,8 @@ polars-time = { workspace = true }
 polars-utils = { workspace = true }
 
 hex = { workspace = true }
-once_cell = { workspace = true }
 rand = { workspace = true }
 serde = { workspace = true }
-serde_json = { workspace = true }
 sqlparser = { workspace = true }
 
 [dev-dependencies]

diff --git a/crates/polars-sql/src/functions.rs b/crates/polars-sql/src/functions.rs
@@ -8,6 +8,7 @@ use polars_core::prelude::{
 use polars_lazy::dsl::Expr;
 #[cfg(feature = "list_eval")]
 use polars_lazy::dsl::ListNameSpaceExtension;
+use polars_ops::chunked_array::UnicodeForm;
 use polars_plan::dsl::{coalesce, concat_str, len, max_horizontal, min_horizontal, when};
 use polars_plan::plans::{typed_lit, LiteralValue};
 use polars_plan::prelude::LiteralValue::Null;
@@ -376,6 +377,13 @@ pub(crate) enum PolarsSQLFunctions {
     /// SELECT LTRIM(column_1) FROM df;
     /// ```
     LTrim,
+    /// SQL 'normalize' function
+    /// Convert string to the given Unicode normalization form
+    /// (one of NFC, NFD, NFKC, or NFKD, written as an unquoted identifier).
+    /// If the form is omitted, NFC is used.
+    /// ```sql
+    /// SELECT NORMALIZE(column_1, NFC) FROM df;
+    /// ```
+    Normalize,
     /// SQL 'octet_length' function
     /// Returns the length of a given string in bytes.
     /// ```sql
@@ -391,7 +399,7 @@ pub(crate) enum PolarsSQLFunctions {
     /// SQL 'replace' function
     /// Replace a given substring with another string.
     /// ```sql
-    /// SELECT REPLACE(column_1,'old','new') FROM df;
+    /// SELECT REPLACE(column_1, 'old', 'new') FROM df;
     /// ```
     Replace,
     /// SQL 'reverse' function
@@ -859,6 +867,7 @@ impl PolarsSQLFunctions {
             "left" => Self::Left,
             "lower" => Self::Lower,
             "ltrim" => Self::LTrim,
+            "normalize" => Self::Normalize,
             "octet_length" => Self::OctetLength,
             "strpos" => Self::StrPos,
             "regexp_like" => Self::RegexpLike,
@@ -1152,6 +1161,36 @@ impl SQLFunctionVisitor<'_> {
                     },
                 }
             },
+            Normalize => {
+                // NORMALIZE(<expr> [, <form>]): the form falls back to NFC when omitted.
+                let args = extract_args(function)?;
+                let n_args = args.len();
+                match n_args {
+                    1 => self.visit_unary(|expr| expr.str().normalize(UnicodeForm::NFC)),
+                    2 => {
+                        // The form has to be a bare (unquoted) identifier; anything
+                        // else is reported back to the user as an invalid form.
+                        let FunctionArgExpr::Expr(SQLExpr::Identifier(Ident {
+                            value: form_name,
+                            quote_style: None,
+                            span: _,
+                        })) = args[1]
+                        else {
+                            polars_bail!(SQLSyntax: "invalid 'form' for NORMALIZE (found {})", args[1])
+                        };
+                        // Form names are matched case-insensitively.
+                        let form = match form_name.to_uppercase().as_str() {
+                            "NFC" => UnicodeForm::NFC,
+                            "NFD" => UnicodeForm::NFD,
+                            "NFKC" => UnicodeForm::NFKC,
+                            "NFKD" => UnicodeForm::NFKD,
+                            _ => {
+                                polars_bail!(SQLSyntax: "invalid 'form' for NORMALIZE (found {})", form_name)
+                            },
+                        };
+                        // The closure's second argument is ignored: the form was
+                        // already resolved from the identifier above.
+                        self.try_visit_binary(|expr, _form: Expr| {
+                            Ok(expr.str().normalize(form.clone()))
+                        })
+                    },
+                    _ => {
+                        polars_bail!(SQLSyntax: "NORMALIZE expects 1-2 arguments (found {})", n_args)
+                    },
+                }
+            },
             OctetLength => self.visit_unary(|e| e.str().len_bytes()),
             StrPos => {
                 // note: SQL is 1-indexed; returns zero if no match found

diff --git a/py-polars/docs/source/reference/sql/functions/string.rst b/py-polars/docs/source/reference/sql/functions/string.rst
@@ -27,6 +27,8 @@ String
      - Returns a lowercased column.
    * - :ref:`LTRIM <ltrim>`
      - Strips whitespaces from the left.
+   * - :ref:`NORMALIZE <normalize>`
+     - Converts a string to the specified Unicode normalization form (one of NFC, NFD, NFKC, NFKD).
    * - :ref:`OCTET_LENGTH <octet_length>`
      - Returns the length of a given string in bytes.
    * - :ref:`REGEXP_LIKE <regexp_like>`
@@ -366,6 +368,39 @@ Strips whitespaces from the left.
     # │   DD  ┆ DD      │
     # └───────┴─────────┘
 
+.. _normalize:
+
+NORMALIZE
+---------
+Converts a string to the specified Unicode normalization form (one of NFC, NFD, NFKC, NFKD).
+The form is written as an unquoted keyword; if it is not provided, NFC is used by default.
+
+**Example:**
+
+.. code-block:: python
+
+    df = pl.DataFrame({
+        "txt": [
+            "Ｔｅｓｔ",
+            "Ⓣⓔⓢⓣ",
+            "𝕿𝖊𝖘𝖙",
+            "𝕋𝕖𝕤𝕥",
+            "𝗧𝗲𝘀𝘁",
+        ],
+    })
+    df.sql("""
+      SELECT NORMALIZE(txt, NFKC) FROM self
+    """).to_series()
+    # shape: (5,)
+    # Series: 'txt' [str]
+    # [
+    #   "Test"
+    #   "Test"
+    #   "Test"
+    #   "Test"
+    #   "Test"
+    # ]
+
 .. _octet_length:
 
 OCTET_LENGTH

diff --git a/py-polars/tests/unit/sql/test_strings.py b/py-polars/tests/unit/sql/test_strings.py
@@ -275,6 +275,21 @@ def test_string_like_multiline() -> None:
         assert df.sql(f"SELECT txt FROM self WHERE txt LIKE '{s}'").item() == s
 
 
+@pytest.mark.parametrize("form", ["NFKC", "NFKD"])
+def test_string_normalize(form: str) -> None:
+    df = pl.DataFrame({"txt": ["Ｔｅｓｔ", "𝕋𝕖𝕤𝕥", "𝕿𝖊𝖘𝖙", "𝗧𝗲𝘀𝘁", "Ⓣⓔⓢⓣ"]})  # noqa: RUF001
+    res = df.sql(
+        f"""
+        SELECT txt, NORMALIZE(txt,{form}) AS norm_txt
+        FROM self
+        """
+    )
+    assert res.to_dict(as_series=False) == {
+        "txt": ["Ｔｅｓｔ", "𝕋𝕖𝕤𝕥", "𝕿𝖊𝖘𝖙", "𝗧𝗲𝘀𝘁", "Ⓣⓔⓢⓣ"],  # noqa: RUF001
+        "norm_txt": ["Test", "Test", "Test", "Test", "Test"],
+    }
+
+
 def test_string_position() -> None:
     df = pl.Series(
         name="city",