From 9eea14faf27ad46bd5eed49d2654cbdc4a1068dd Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 23 Sep 2024 12:30:10 +0200 Subject: [PATCH 1/2] Mutualize char normalizers --- .../src/normalizer/character_converter.rs | 188 ++++++++++++++++++ charabia/src/normalizer/mod.rs | 34 +++- 2 files changed, 214 insertions(+), 8 deletions(-) create mode 100644 charabia/src/normalizer/character_converter.rs diff --git a/charabia/src/normalizer/character_converter.rs b/charabia/src/normalizer/character_converter.rs new file mode 100644 index 0000000..fead6f8 --- /dev/null +++ b/charabia/src/normalizer/character_converter.rs @@ -0,0 +1,188 @@ +use super::{CharNormalizer, CharOrStr}; +use crate::{Script, Token}; + +/// This module contains the implementation of the `CharacterConverterNormalizer` struct, which is a character normalizer + +pub struct CharacterConverterNormalizer; + +// All normalizers only need to implement the method `normalize_char` and the method `should_normalize` of the `CharNormalizer` trait. +impl CharNormalizer for CharacterConverterNormalizer { + // Creates the normalized version of the provided char. + fn normalize_char(&self, c: char) -> Option<CharOrStr> { + let mut normalized = c.to_lowercase(); + + // if the original character is converted in exactly one character, + // then we return the character directly instead of creating a string for it. + match (normalized.next(), normalized.next()) { + (Some(c), None) => normalize_char(c), + (Some(first), Some(second)) => { + let first = normalize_char(first); + let second = normalize_char(second); + match (first, second) { + (Some(first), Some(second)) => Some(first.merge(&second)), + (Some(first), None) => Some(first), + (None, Some(second)) => Some(second), + (None, None) => None, + } + } + (None, _) => None, + } + } + + // Returns `true` if the Normalizer should be used.
+ fn should_normalize(&self, token: &Token) -> bool { + true + } +} + +fn normalize_char(c: char) -> Option<CharOrStr> { + match c { + 'œ' | 'Œ' => Some("oe".to_string().into()), + 'æ' | 'Æ' => Some("ae".to_string().into()), + 'ـ' => None, + 'ٱ' => Some('ا'.into()), + 'ى' => Some('ي'.into()), + 'ة' => Some('ه'.into()), + '’' | '‘' | '‛' => Some('\''.into()), + #[cfg(feature = "turkish")] + 'ı' => Some('i'.into()), + #[cfg(feature = "vietnamese")] + 'Ð' | 'Đ' | 'đ' | 'ð' => Some("d".to_string().into()), + _ => Some(c.into()), + } +} + +fn is_control(c: char) -> bool { + c.is_control() && !c.is_whitespace() +} + +// Test the normalizer: +#[cfg(test)] +mod test { + use std::borrow::Cow::Owned; + + use crate::normalizer::test::test_normalizer; + use crate::normalizer::{Normalizer, NormalizerOption}; + use crate::token::TokenKind; + + // base tokens to normalize. + fn tokens() -> Vec<Token<'static>> { + vec![ + Token { + lemma: Owned("œ".to_string()), + char_end: 2, + byte_end: 2, + script: Script::Latin, + ..Default::default() + }, + Token { + lemma: Owned("Œ".to_string()), + char_end: 2, + byte_end: 2, + script: Script::Latin, + ..Default::default() + }, + Token { + lemma: Owned("æ".to_string()), + char_end: 2, + byte_end: 2, + script: Script::Latin, + ..Default::default() + }, + Token { + lemma: Owned("Æ".to_string()), + char_end: 2, + byte_end: 2, + script: Script::Latin, + ..Default::default() + }, + ] + } + + // expected result of the current Normalizer.
+ fn normalizer_result() -> Vec<Token<'static>> { + vec![ + Token { + lemma: Owned("oe".to_string()), + char_end: 2, + byte_end: 2, + script: Script::Latin, + char_map: Some(vec![(2, 2)]), + ..Default::default() + }, + Token { + lemma: Owned("oe".to_string()), + char_end: 2, + byte_end: 2, + script: Script::Latin, + char_map: Some(vec![(2, 2)]), + ..Default::default() + }, + Token { + lemma: Owned("ae".to_string()), + char_end: 2, + byte_end: 2, + script: Script::Latin, + char_map: Some(vec![(2, 2)]), + ..Default::default() + }, + Token { + lemma: Owned("ae".to_string()), + char_end: 2, + byte_end: 2, + script: Script::Latin, + char_map: Some(vec![(2, 2)]), + ..Default::default() + }, + ] + } + + // expected result of the complete Normalizer pipeline. + fn normalized_tokens() -> Vec<Token<'static>> { + vec![ + Token { + lemma: Owned("oe".to_string()), + char_end: 2, + byte_end: 2, + script: Script::Latin, + char_map: Some(vec![(2, 2)]), + kind: TokenKind::Word, + ..Default::default() + }, + Token { + lemma: Owned("oe".to_string()), + char_end: 2, + byte_end: 2, + script: Script::Latin, + char_map: Some(vec![(2, 2)]), + kind: TokenKind::Word, + ..Default::default() + }, + Token { + lemma: Owned("ae".to_string()), + char_end: 2, + byte_end: 2, + script: Script::Latin, + char_map: Some(vec![(2, 2)]), + kind: TokenKind::Word, + ..Default::default() + }, + Token { + lemma: Owned("ae".to_string()), + char_end: 2, + byte_end: 2, + script: Script::Latin, + char_map: Some(vec![(2, 2)]), + kind: TokenKind::Word, + ..Default::default() + }, + ] + } + + test_normalizer!( + CharacterConverterNormalizer, + tokens(), + normalizer_result(), + normalized_tokens() + ); +} diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs index 2cc31ad..2c53cd3 100644 --- a/charabia/src/normalizer/mod.rs +++ b/charabia/src/normalizer/mod.rs @@ -21,6 +21,7 @@ use self::swedish_recomposition::SwedishRecompositionNormalizer; pub use self::turkish::TurkishNormalizer; #[cfg(feature = "vietnamese")] pub use
self::vietnamese::VietnameseNormalizer; +use crate::normalizer::character_converter::CharacterConverterNormalizer; use crate::segmenter::SegmentedTokenIter; use crate::Token; @@ -47,6 +48,7 @@ mod turkish; mod vietnamese; mod ae_oe_normalizer; +mod character_converter; /// List of [`Normalizer`]s used by [`Normalize::normalize`] that are not considered lossy. pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| { @@ -62,21 +64,22 @@ pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| { /// List of [`Normalizer`]s used by [`Normalize::normalize`] that are considered lossy. pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| { vec![ - Box::new(LowercaseNormalizer), - Box::new(QuoteNormalizer), - Box::new(AeOeNormalizer), + // Box::new(LowercaseNormalizer), + // Box::new(QuoteNormalizer), + // Box::new(AeOeNormalizer), + Box::new(CharacterConverterNormalizer), #[cfg(feature = "chinese-normalization")] Box::new(ChineseNormalizer), #[cfg(feature = "japanese-transliteration")] Box::new(JapaneseNormalizer), #[cfg(feature = "greek")] Box::new(GreekNormalizer), - Box::new(ArabicNormalizer), + // Box::new(ArabicNormalizer), Box::new(NonspacingMarkNormalizer), - #[cfg(feature = "vietnamese")] - Box::new(VietnameseNormalizer), - #[cfg(feature = "turkish")] - Box::new(TurkishNormalizer), + // #[cfg(feature = "vietnamese")] + // Box::new(VietnameseNormalizer), + // #[cfg(feature = "turkish")] + // Box::new(TurkishNormalizer), ] }); @@ -226,6 +229,21 @@ pub enum CharOrStr { Str(String), } +impl CharOrStr { + pub fn merge(&self, other: &Self) -> Self { + let mut result = String::new(); + match self { + Self::Char(c) => result.push(*c), + Self::Str(s) => result.push_str(s), + } + match other { + Self::Char(c) => result.push(*c), + Self::Str(s) => result.push_str(s), + } + Self::Str(result) + } +} + impl From<char> for CharOrStr { fn from(c: char) -> Self { Self::Char(c) From f8d8308cdb8db80819be7eeed5652cc4a995cc71 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 24 Sep 2024 09:30:54 +0200
Subject: [PATCH 2/2] Use binary search --- .../src/normalizer/character_converter.rs | 107 +++++++++++++++--- 1 file changed, 91 insertions(+), 16 deletions(-) diff --git a/charabia/src/normalizer/character_converter.rs b/charabia/src/normalizer/character_converter.rs index fead6f8..0494462 100644 --- a/charabia/src/normalizer/character_converter.rs +++ b/charabia/src/normalizer/character_converter.rs @@ -1,6 +1,30 @@ use super::{CharNormalizer, CharOrStr}; use crate::{Script, Token}; +const CHAR_PAIRS: &[(char, Option<(char, Option<char>)>)] = &[ // keep sorted: binary-searched + ('Æ', Some(('a', Some('e')))), + #[cfg(feature = "vietnamese")] + ('Ð', Some(('d', None))), + ('æ', Some(('a', Some('e')))), + #[cfg(feature = "vietnamese")] + ('ð', Some(('d', None))), + #[cfg(feature = "vietnamese")] + ('Đ', Some(('d', None))), + #[cfg(feature = "vietnamese")] + ('đ', Some(('d', None))), + #[cfg(feature = "turkish")] + ('ı', Some(('i', None))), + ('Œ', Some(('o', Some('e')))), + ('œ', Some(('o', Some('e')))), + ('ة', Some(('ه', None))), + ('ـ', None), + ('ى', Some(('ي', None))), + ('ٱ', Some(('ا', None))), + ('‘', Some(('\'', None))), + ('’', Some(('\'', None))), + ('‛', Some(('\'', None))), +]; + /// This module contains the implementation of the `CharacterConverterNormalizer` struct, which is a character normalizer pub struct CharacterConverterNormalizer; @@ -31,29 +55,37 @@ impl CharNormalizer for CharacterConverterNormalizer { // Returns `true` if the Normalizer should be used.
fn should_normalize(&self, token: &Token) -> bool { - true + token + .lemma + .chars() + .any(|c| c.is_uppercase() || CHAR_PAIRS.binary_search_by(|(k, _)| k.cmp(&c)).is_ok()) } } fn normalize_char(c: char) -> Option<CharOrStr> { - match c { - 'œ' | 'Œ' => Some("oe".to_string().into()), - 'æ' | 'Æ' => Some("ae".to_string().into()), - 'ـ' => None, - 'ٱ' => Some('ا'.into()), - 'ى' => Some('ي'.into()), - 'ة' => Some('ه'.into()), - '’' | '‘' | '‛' => Some('\''.into()), - #[cfg(feature = "turkish")] - 'ı' => Some('i'.into()), - #[cfg(feature = "vietnamese")] - 'Ð' | 'Đ' | 'đ' | 'ð' => Some("d".to_string().into()), + match CHAR_PAIRS.binary_search_by(|(k, _)| k.cmp(&c)).map(|i| &CHAR_PAIRS[i].1) { + Ok(Some((first, Some(second)))) => { + Some(CharOrStr::Char(*first).merge(&CharOrStr::Char(*second))) + } + Ok(Some((first, None))) => Some(CharOrStr::Char(*first)), + Ok(None) => None, _ => Some(c.into()), } -} -fn is_control(c: char) -> bool { - c.is_control() && !c.is_whitespace() + // match c { + // 'œ' | 'Œ' => Some("oe".to_string().into()), + // 'æ' | 'Æ' => Some("ae".to_string().into()), + // 'ـ' => None, + // 'ٱ' => Some('ا'.into()), + // 'ى' => Some('ي'.into()), + // 'ة' => Some('ه'.into()), + // '’' | '‘' | '‛' => Some('\''.into()), + // #[cfg(feature = "turkish")] + // 'ı' => Some('i'.into()), + // #[cfg(feature = "vietnamese")] + // 'Ð' | 'Đ' | 'đ' | 'ð' => Some("d".to_string().into()), + // _ => Some(c.into()), + // } } // Test the normalizer: @@ -96,6 +128,14 @@ mod test { script: Script::Latin, ..Default::default() }, + // Taa Marbuta + Token { + lemma: Owned("النهاردة".to_string()), + char_end: 8, + byte_end: 16, + script: Script::Arabic, + ..Default::default() + }, ] } @@ -134,6 +174,23 @@ mod test { char_map: Some(vec![(2, 2)]), ..Default::default() }, + Token { + lemma: Owned("النهارده".to_string()), + char_end: 8, + byte_end: 16, + char_map: Some(vec![ + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + ]), + script: Script::Arabic, +
..Default::default() + }, ] } @@ -176,6 +233,24 @@ mod test { kind: TokenKind::Word, ..Default::default() }, + Token { + lemma: Owned("النهارده".to_string()), + char_end: 8, + byte_end: 16, + char_map: Some(vec![ + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + (2, 2), + ]), + script: Script::Arabic, + kind: TokenKind::Word, + ..Default::default() + }, ] }