Commit: Merge #282
282: Make the pinyin-normalization optional r=Kerollmops a=ManyTheFish

- split the chinese feature flag into separate chinese-segmentation and chinese-normalization flags so that part of the pipeline can be deactivated
- remove pinyin normalization from the default pipeline
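
For downstream users, the practical effect is that pinyin conversion now has to be requested explicitly. A minimal consumer-side Cargo.toml sketch (the version number is assumed, not taken from this PR):

    [dependencies]
    # Chinese segmentation and kvariant normalization, without pinyin (the new default):
    charabia = { version = "0.8", default-features = false, features = ["chinese"] }

    # To keep the previous pinyin behavior, opt back in explicitly:
    # charabia = { version = "0.8", default-features = false, features = ["chinese", "chinese-normalization-pinyin"] }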

Co-authored-by: ManyTheFish <[email protected]>
meili-bors[bot] and ManyTheFish authored Apr 17, 2024
2 parents 99ab996 + 5f2c737 commit 4e1f15b
Showing 7 changed files with 181 additions and 14 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/rust.yml
@@ -26,6 +26,8 @@ jobs:
run: cargo test --verbose
- name: Run tests with japanese-transliteration on
run: cargo test --verbose --features japanese-transliteration
+- name: Run tests with chinese-normalization-pinyin on
+  run: cargo test --verbose --features chinese,chinese-normalization-pinyin
- name: Run irg-kvariants tests
run: cargo test -p irg-kvariants --verbose

5 changes: 4 additions & 1 deletion charabia/Cargo.toml
@@ -38,7 +38,10 @@ zerovec = "0.10.1"
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]

# allow chinese specialized tokenization
chinese = ["dep:pinyin", "dep:jieba-rs"]
chinese = ["chinese-segmentation", "chinese-normalization"]
chinese-segmentation = ["dep:jieba-rs"]
chinese-normalization = []
chinese-normalization-pinyin = ["dep:pinyin", "chinese-normalization"]

# allow hebrew specialized tokenization
hebrew = []
88 changes: 84 additions & 4 deletions charabia/src/normalizer/chinese.rs
@@ -1,3 +1,4 @@
+#[cfg(feature = "chinese-normalization-pinyin")]
use pinyin::ToPinyin;

use super::CharNormalizer;
@@ -23,14 +24,17 @@ impl CharNormalizer for ChineseNormalizer {
// Normalize to Pinyin
// If we don't manage to convert the kvariant, we try to convert the original character.
// If none of them are converted, we return the kvariant.
-        match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
+        #[cfg(feature = "chinese-normalization-pinyin")]
+        let kvariant = match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
             Some(converted) => {
                 let with_tone = converted.with_tone();

-                Some(with_tone.to_string().into())
+                with_tone.to_string()
             }
-            None => Some(kvariant.into()), // e.g. 杤
-        }
+            None => kvariant, // e.g. 杤
+        };
+
+        Some(kvariant.into())
}

fn should_normalize(&self, token: &Token) -> bool {
@@ -77,6 +81,7 @@ mod test {
}

// expected result of the current Normalizer.
+#[cfg(feature = "chinese-normalization-pinyin")]
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
@@ -113,6 +118,7 @@ }
}

// expected result of the complete Normalizer pipeline.
+#[cfg(feature = "chinese-normalization-pinyin")]
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
@@ -148,5 +154,79 @@
]
}

+// expected result of the current Normalizer.
+#[cfg(not(feature = "chinese-normalization-pinyin"))]
+fn normalizer_result() -> Vec<Token<'static>> {
+vec![
+Token {
+lemma: Owned("尊嚴".to_string()),
+char_end: 2,
+byte_end: 6,
+char_map: Some(vec![(3, 3), (3, 3)]),
+script: Script::Cj,
+language: Some(Language::Cmn),
+..Default::default()
+},
+Token {
+lemma: Owned("生而自由".to_string()),
+char_end: 4,
+byte_end: 12,
+char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
+script: Script::Cj,
+language: Some(Language::Cmn),
+..Default::default()
+},
+Token {
+lemma: Owned("澳䁈亞本刃𣜜".to_string()),
+char_end: 5,
+byte_end: 15,
+char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
+script: Script::Cj,
+language: Some(Language::Cmn),
+..Default::default()
+},
+]
+}
+
+// expected result of the complete Normalizer pipeline.
+#[cfg(not(feature = "chinese-normalization-pinyin"))]
+fn normalized_tokens() -> Vec<Token<'static>> {
+vec![
+Token {
+kind: TokenKind::Word,
+lemma: Owned("尊嚴".to_string()),
+char_start: 0,
+char_end: 2,
+byte_start: 0,
+byte_end: 6,
+char_map: Some(vec![(3, 3), (3, 3)]),
+script: Script::Cj,
+language: Some(Language::Cmn),
+},
+Token {
+kind: TokenKind::Word,
+lemma: Owned("生而自由".to_string()),
+char_start: 0,
+char_end: 4,
+byte_start: 0,
+byte_end: 12,
+char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
+script: Script::Cj,
+language: Some(Language::Cmn),
+},
+Token {
+kind: TokenKind::Word,
+lemma: Owned("澳䁈亞本刃𣜜".to_string()),
+char_start: 0,
+char_end: 5,
+byte_start: 0,
+byte_end: 15,
+char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
+script: Script::Cj,
+language: Some(Language::Cmn),
+},
+]
+}

test_normalizer!(ChineseNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
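
Pieced together, the patched normalize_char above reduces to the following shape — a hedged sketch under assumed types, not the crate's exact code (the kvariant lookup and the CharOrStr conversion are elided; function and parameter names are illustrative):

    use pinyin::ToPinyin;

    // `kvariant` is the already-looked-up variant of the original character `c`.
    fn lemma_for(kvariant: char, c: char) -> String {
        // Feature off: keep the kvariant untouched (the new default behavior).
        #[cfg(not(feature = "chinese-normalization-pinyin"))]
        return kvariant.to_string();

        // Feature on: convert to tone-marked pinyin, trying the kvariant first,
        // then the original character, falling back to the kvariant (e.g. 杤).
        #[cfg(feature = "chinese-normalization-pinyin")]
        match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
            Some(converted) => converted.with_tone().to_string(),
            None => kvariant.to_string(),
        }
    }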
46 changes: 46 additions & 0 deletions charabia/src/normalizer/control_char.rs
@@ -103,6 +103,7 @@ mod test {
}

// expected result of the complete Normalizer pipeline.
+#[cfg(feature = "chinese-normalization-pinyin")]
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
@@ -146,5 +147,50 @@
]
}

+// expected result of the complete Normalizer pipeline.
+#[cfg(not(feature = "chinese-normalization-pinyin"))]
+fn normalized_tokens() -> Vec<Token<'static>> {
+vec![
+Token {
+lemma: Owned("生而自由oo".to_string()),
+char_end: 9,
+byte_end: 17,
+script: Script::Cj,
+char_map: Some(vec![
+(1, 0),
+(3, 3),
+(3, 3),
+(3, 3),
+(3, 3),
+(1, 0),
+(1, 1),
+(1, 1),
+(1, 0),
+]),
+kind: TokenKind::Word,
+..Default::default()
+},
+Token {
+lemma: Owned("生而自由oo".to_string()),
+char_end: 9,
+byte_end: 17,
+script: Script::Cj,
+char_map: Some(vec![
+(1, 0),
+(3, 3),
+(3, 3),
+(3, 3),
+(3, 3),
+(1, 0),
+(1, 1),
+(1, 1),
+(1, 0),
+]),
+kind: TokenKind::Word,
+..Default::default()
+},
+]
+}

test_normalizer!(ControlCharNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
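
If I read the char_map pairs above correctly, each entry is (UTF-8 bytes of the original character, UTF-8 bytes of its normalized form), which is why stripped control characters show up as (1, 0) and kept CJK characters as (3, 3). A quick self-contained check of the byte counts:

    fn main() {
        assert_eq!('\u{0}'.len_utf8(), 1); // a 1-byte control char, dropped by the normalizer -> (1, 0)
        assert_eq!("生".len(), 3);         // a CJK char is 3 UTF-8 bytes, kept as-is -> (3, 3)
        assert_eq!("o".len(), 1);          // ASCII stays 1 byte -> (1, 1)
    }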
6 changes: 3 additions & 3 deletions charabia/src/normalizer/mod.rs
@@ -3,7 +3,7 @@ use std::borrow::Cow;
use once_cell::sync::Lazy;

pub use self::arabic::ArabicNormalizer;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-normalization")]
pub use self::chinese::ChineseNormalizer;
pub use self::classify::{Classifier, ClassifierOption};
pub use self::compatibility_decomposition::CompatibilityDecompositionNormalizer;
@@ -21,7 +21,7 @@ use crate::segmenter::SegmentedTokenIter;
use crate::Token;

mod arabic;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-normalization")]
mod chinese;
mod classify;
mod compatibility_decomposition;
@@ -50,7 +50,7 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
vec![
Box::new(LowercaseNormalizer),
Box::new(QuoteNormalizer),
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-normalization")]
Box::new(ChineseNormalizer),
#[cfg(feature = "japanese-transliteration")]
Box::new(JapaneseNormalizer),
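The registration change above relies on cfg attributes attached to individual elements of a vec! literal: when the feature is off, the element is compiled out and the normalizer never enters the pipeline. A standalone sketch of the same pattern (names are illustrative, not from the crate):

    fn pipeline_stage_names() -> Vec<&'static str> {
        vec![
            "lowercase",
            "quote",
            // Compiled in only when the feature is enabled;
            // otherwise this element vanishes from the vec entirely.
            #[cfg(feature = "chinese-normalization")]
            "chinese",
        ]
    }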
37 changes: 37 additions & 0 deletions charabia/src/segmenter/chinese.rs
@@ -64,6 +64,7 @@ mod test {
];

// Segmented and normalized version of the text.
+#[cfg(feature = "chinese-normalization-pinyin")]
const TOKENIZED: &[&str] = &[
"rénrén",
"shēngérzìyóu",
@@ -99,6 +100,42 @@
"。",
];

+#[cfg(not(feature = "chinese-normalization-pinyin"))]
+const TOKENIZED: &[&str] = &[
+"人人",
+"生而自由",
+",",
+"在",
+"尊",
+"嚴",
+"和",
+"權",
+"利",
+"上",
+"一律平等",
+"。",
+"他",
+"們",
+"賦",
+"有",
+"理性",
+"和",
+"良心",
+",",
+"並",
+"應",
+"以",
+"兄弟",
+"關",
+"係",
+"的",
+"精神",
+"互相",
+"對",
+"待",
+"。",
+];

// Macro that run several tests on the Segmenter.
test_segmenter!(ChineseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Cmn);
}
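
At the API surface, the difference between the two TOKENIZED tables plays out as below — a hedged usage sketch based on charabia's public Tokenize trait; the outputs in the comments are what the tests above suggest, not something verified here:

    use charabia::Tokenize;

    fn main() {
        // With only `chinese` enabled, lemmas stay in Chinese script: 人人, 生而自由, ...
        // With `chinese-normalization-pinyin` on top, they become: rénrén, shēngérzìyóu, ...
        for token in "人人生而自由。".tokenize() {
            println!("{}", token.lemma());
        }
    }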
11 changes: 5 additions & 6 deletions charabia/src/segmenter/mod.rs
@@ -3,11 +3,13 @@ use std::collections::HashMap;

use aho_corasick::{AhoCorasick, FindIter, MatchKind};
pub use arabic::ArabicSegmenter;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-segmentation")]
pub use chinese::ChineseSegmenter;
use either::Either;
#[cfg(feature = "japanese")]
pub use japanese::JapaneseSegmenter;
#[cfg(feature = "khmer")]
pub use khmer::KhmerSegmenter;
#[cfg(feature = "korean")]
pub use korean::KoreanSegmenter;
pub use latin::LatinSegmenter;
@@ -16,15 +18,12 @@ use slice_group_by::StrGroupBy;
#[cfg(feature = "thai")]
pub use thai::ThaiSegmenter;

#[cfg(feature = "khmer")]
pub use khmer::KhmerSegmenter;

use crate::detection::{Detect, Language, Script, StrDetection};
use crate::separators::DEFAULT_SEPARATORS;
use crate::token::Token;

mod arabic;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-segmentation")]
mod chinese;
#[cfg(feature = "japanese")]
mod japanese;
@@ -54,7 +53,7 @@ pub static SEGMENTERS: Lazy<HashMap<(Script, Language), Box<dyn Segmenter>>> = L
// latin segmenter
((Script::Latin, Language::Other), Box::new(LatinSegmenter) as Box<dyn Segmenter>),
// chinese segmenter
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-segmentation")]
((Script::Cj, Language::Cmn), Box::new(ChineseSegmenter) as Box<dyn Segmenter>),
// japanese segmenter
#[cfg(feature = "japanese")]
