From 9eea14faf27ad46bd5eed49d2654cbdc4a1068dd Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 23 Sep 2024 12:30:10 +0200
Subject: [PATCH 1/2] Mutualize char normalizers

---
 .../src/normalizer/character_converter.rs     | 188 ++++++++++++++++++
 charabia/src/normalizer/mod.rs                |  34 +++-
 2 files changed, 214 insertions(+), 8 deletions(-)
 create mode 100644 charabia/src/normalizer/character_converter.rs
diff --git a/charabia/src/normalizer/character_converter.rs b/charabia/src/normalizer/character_converter.rs
new file mode 100644
index 0000000..fead6f8
--- /dev/null
+++ b/charabia/src/normalizer/character_converter.rs
@@ -0,0 +1,188 @@
+use super::{CharNormalizer, CharOrStr};
+use crate::{Script, Token};
+
+/// This module contains the implementation of the `CharacterConverterNormalizer` struct, which is a character normalizer
+
+pub struct CharacterConverterNormalizer;
+
+// All normalizers only need to implement the method `normalize_char` and the method `should_normalize` of the `CharNormalizer` trait.
+impl CharNormalizer for CharacterConverterNormalizer {
+    // Lowercases `c`, then runs each resulting char through the conversion table.
+    fn normalize_char(&self, c: char) -> Option<CharOrStr> {
+        // at most the first two chars of the lowercase expansion are considered.
+        let mut lowered = c.to_lowercase();
+        let (head, tail) = (lowered.next(), lowered.next());
+
+        // `?` covers the empty expansion, for which there is nothing to return.
+        let head = normalize_char(head?);
+        let tail = match tail {
+            // the original char was converted into exactly one char:
+            // return its conversion directly instead of building a string for it.
+            None => return head,
+            Some(second) => normalize_char(second),
+        };
+        match (head, tail) {
+            // both halves survived the conversion: glue them together.
+            (Some(head), Some(tail)) => Some(head.merge(&tail)),
+            // otherwise keep whichever half is left, if any.
+            (head, tail) => head.or(tail),
+        }
+    }
+
+    // Returns `true` if the Normalizer should be used.
+    fn should_normalize(&self, token: &Token) -> bool {
+        true
+    }
+}
+
+fn normalize_char(c: char) -> Option<CharOrStr> {
+    match c {
+        'œ' | 'Œ' => Some("oe".to_string().into()),
+        'æ' | 'Æ' => Some("ae".to_string().into()),
+        'ـ' => None,
+        'ٱ' => Some('ا'.into()),
+        'ى' => Some('ي'.into()),
+        'ة' => Some('ه'.into()),
+        '’' | '‘' | '‛' => Some('\''.into()),
+        #[cfg(feature = "turkish")]
+        'ı' => Some('i'.into()),
+        #[cfg(feature = "vietnamese")]
+        'Ð' | 'Đ' | 'đ' | 'ð' => Some("d".to_string().into()),
+        _ => Some(c.into()),
+    }
+}
+
+fn is_control(c: char) -> bool {
+    c.is_control() && !c.is_whitespace()
+}
+
+// Test the normalizer:
+#[cfg(test)]
+mod test {
+    use std::borrow::Cow::Owned;
+
+    use crate::normalizer::test::test_normalizer;
+    use crate::normalizer::{Normalizer, NormalizerOption};
+    use crate::token::TokenKind;
+
+    // base tokens to normalize.
+    fn tokens() -> Vec<Token<'static>> {
+        vec![
+            Token {
+                lemma: Owned("œ".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("Œ".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("æ".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("Æ".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Latin,
+                ..Default::default()
+            },
+        ]
+    }
+
+    // expected result of the current Normalizer.
+    fn normalizer_result() -> Vec<Token<'static>> {
+        vec![
+            Token {
+                lemma: Owned("oe".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 2)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("oe".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 2)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ae".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 2)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ae".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 2)]),
+                ..Default::default()
+            },
+        ]
+    }
+
+    // expected result of the complete Normalizer pipeline.
+    fn normalized_tokens() -> Vec<Token<'static>> {
+        vec![
+            Token {
+                lemma: Owned("oe".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 2)]),
+                kind: TokenKind::Word,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("oe".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 2)]),
+                kind: TokenKind::Word,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ae".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 2)]),
+                kind: TokenKind::Word,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ae".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 2)]),
+                kind: TokenKind::Word,
+                ..Default::default()
+            },
+        ]
+    }
+
+    test_normalizer!(
+        CharacterConverterNormalizer,
+        tokens(),
+        normalizer_result(),
+        normalized_tokens()
+    );
+}
diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index 2cc31ad..2c53cd3 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -21,6 +21,7 @@ use self::swedish_recomposition::SwedishRecompositionNormalizer;
 pub use self::turkish::TurkishNormalizer;
 #[cfg(feature = "vietnamese")]
 pub use self::vietnamese::VietnameseNormalizer;
+use crate::normalizer::character_converter::CharacterConverterNormalizer;
 use crate::segmenter::SegmentedTokenIter;
 use crate::Token;
 
@@ -47,6 +48,7 @@ mod turkish;
 mod vietnamese;
 
 mod ae_oe_normalizer;
+mod character_converter;
 
 /// List of [`Normalizer`]s used by [`Normalize::normalize`] that are not considered lossy.
 pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
@@ -62,21 +64,22 @@ pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
 /// List of [`Normalizer`]s used by [`Normalize::normalize`] that are considered lossy.
 pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
     vec![
-        Box::new(LowercaseNormalizer),
-        Box::new(QuoteNormalizer),
-        Box::new(AeOeNormalizer),
+        // Box::new(LowercaseNormalizer),
+        // Box::new(QuoteNormalizer),
+        // Box::new(AeOeNormalizer),
+        Box::new(CharacterConverterNormalizer),
         #[cfg(feature = "chinese-normalization")]
         Box::new(ChineseNormalizer),
         #[cfg(feature = "japanese-transliteration")]
         Box::new(JapaneseNormalizer),
         #[cfg(feature = "greek")]
         Box::new(GreekNormalizer),
-        Box::new(ArabicNormalizer),
+        // Box::new(ArabicNormalizer),
         Box::new(NonspacingMarkNormalizer),
-        #[cfg(feature = "vietnamese")]
-        Box::new(VietnameseNormalizer),
-        #[cfg(feature = "turkish")]
-        Box::new(TurkishNormalizer),
+        // #[cfg(feature = "vietnamese")]
+        // Box::new(VietnameseNormalizer),
+        // #[cfg(feature = "turkish")]
+        // Box::new(TurkishNormalizer),
     ]
 });
 
@@ -226,6 +229,21 @@ pub enum CharOrStr {
     Str(String),
 }
 
+impl CharOrStr {
+    /// Concatenates `self` followed by `other` into a freshly allocated string.
+    /// The result is always the `Str` variant, even when both inputs are `Char`.
+    pub fn merge(&self, other: &Self) -> Self {
+        let mut merged = String::new();
+        for part in [self, other] {
+            match part {
+                Self::Char(c) => merged.push(*c),
+                Self::Str(s) => merged.push_str(s),
+            }
+        }
+        Self::Str(merged)
+    }
+}
+
 impl From<char> for CharOrStr {
     fn from(c: char) -> Self {
         Self::Char(c)

From f8d8308cdb8db80819be7eeed5652cc4a995cc71 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 24 Sep 2024 09:30:54 +0200
Subject: [PATCH 2/2] Use binary search

---
 .../src/normalizer/character_converter.rs     | 107 +++++++++++++++---
 1 file changed, 91 insertions(+), 16 deletions(-)

diff --git a/charabia/src/normalizer/character_converter.rs b/charabia/src/normalizer/character_converter.rs
index fead6f8..0494462 100644
--- a/charabia/src/normalizer/character_converter.rs
+++ b/charabia/src/normalizer/character_converter.rs
@@ -1,6 +1,30 @@
 use super::{CharNormalizer, CharOrStr};
 use crate::{Script, Token};
 
+const CHAR_PAIRS: &[(char, Option<(char, Option<char>)>)] = &[
+    ('Æ', Some(('a', Some('e')))),
+    #[cfg(feature = "vietnamese")]
+    ('Ð', Some(('d', None))),
+    ('æ', Some(('a', Some('e')))),
+    #[cfg(feature = "vietnamese")]
+    ('ð', Some(('d', None))),
+    #[cfg(feature = "vietnamese")]
+    ('Đ', Some(('d', None))),
+    #[cfg(feature = "vietnamese")]
+    ('đ', Some(('d', None))),
+    #[cfg(feature = "turkish")]
+    ('ı', Some(('i', None))),
+    ('Œ', Some(('o', Some('e')))),
+    ('œ', Some(('o', Some('e')))),
+    ('ة', Some(('ه', None))),
+    ('ـ', None),
+    ('ى', Some(('ي', None))), // U+0649: keep rows sorted by code point, the table is binary-searched
+    ('ٱ', Some(('ا', None))), // U+0671
+    ('‘', Some(('\'', None))),
+    ('’', Some(('\'', None))),
+    ('‛', Some(('\'', None))),
+];
+
 /// This module contains the implementation of the `CharacterConverterNormalizer` struct, which is a character normalizer
 
 pub struct CharacterConverterNormalizer;
@@ -31,29 +55,37 @@ impl CharNormalizer for CharacterConverterNormalizer {
 
     // Returns `true` if the Normalizer should be used.
     fn should_normalize(&self, token: &Token) -> bool {
-        true
+        // an uppercase letter or a char listed in `CHAR_PAIRS` means there is work to do.
+        let needs_conversion =
+            |c: char| c.is_uppercase() || CHAR_PAIRS.binary_search_by_key(&c, |(k, _)| *k).is_ok();
+        token.lemma.chars().any(needs_conversion)
     }
 }
 
 fn normalize_char(c: char) -> Option<CharOrStr> {
-    match c {
-        'œ' | 'Œ' => Some("oe".to_string().into()),
-        'æ' | 'Æ' => Some("ae".to_string().into()),
-        'ـ' => None,
-        'ٱ' => Some('ا'.into()),
-        'ى' => Some('ي'.into()),
-        'ة' => Some('ه'.into()),
-        '’' | '‘' | '‛' => Some('\''.into()),
-        #[cfg(feature = "turkish")]
-        'ı' => Some('i'.into()),
-        #[cfg(feature = "vietnamese")]
-        'Ð' | 'Đ' | 'đ' | 'ð' => Some("d".to_string().into()),
+    match CHAR_PAIRS.binary_search_by(|(k, _)| k.cmp(&c)).map(|i| &CHAR_PAIRS[i].1) {
+        Ok(Some((first, Some(second)))) => {
+            Some(CharOrStr::Char(*first).merge(&CharOrStr::Char(*second)))
+        }
+        Ok(Some((first, None))) => Some(CharOrStr::Char(*first)),
+        Ok(None) => None,
         _ => Some(c.into()),
     }
-}
 
-fn is_control(c: char) -> bool {
-    c.is_control() && !c.is_whitespace()
+    // match c {
+    //     'œ' | 'Œ' => Some("oe".to_string().into()),
+    //     'æ' | 'Æ' => Some("ae".to_string().into()),
+    //     'ـ' => None,
+    //     'ٱ' => Some('ا'.into()),
+    //     'ى' => Some('ي'.into()),
+    //     'ة' => Some('ه'.into()),
+    //     '’' | '‘' | '‛' => Some('\''.into()),
+    //     #[cfg(feature = "turkish")]
+    //     'ı' => Some('i'.into()),
+    //     #[cfg(feature = "vietnamese")]
+    //     'Ð' | 'Đ' | 'đ' | 'ð' => Some("d".to_string().into()),
+    //     _ => Some(c.into()),
+    // }
 }
 
 // Test the normalizer:
@@ -96,6 +128,14 @@ mod test {
                 script: Script::Latin,
                 ..Default::default()
             },
+            // Taa Marbuta
+            Token {
+                lemma: Owned("النهاردة".to_string()),
+                char_end: 8,
+                byte_end: 16,
+                script: Script::Arabic,
+                ..Default::default()
+            },
         ]
     }
 
@@ -134,6 +174,23 @@ mod test {
                 char_map: Some(vec![(2, 2)]),
                 ..Default::default()
             },
+            Token {
+                lemma: Owned("النهارده".to_string()),
+                char_end: 8,
+                byte_end: 16,
+                char_map: Some(vec![
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                ]),
+                script: Script::Arabic,
+                ..Default::default()
+            },
         ]
     }
 
@@ -176,6 +233,24 @@ mod test {
                 kind: TokenKind::Word,
                 ..Default::default()
             },
+            Token {
+                lemma: Owned("النهارده".to_string()),
+                char_end: 8,
+                byte_end: 16,
+                char_map: Some(vec![
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                ]),
+                script: Script::Arabic,
+                kind: TokenKind::Word,
+                ..Default::default()
+            },
         ]
     }