From 6debc92637392674eb963389de1023802630be74 Mon Sep 17 00:00:00 2001
From: ToshinoriTakahashi
Date: Sun, 18 Aug 2024 13:15:07 +0900
Subject: [PATCH 1/3] Add turkish feature

---
 charabia/Cargo.toml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
index 6bf0568..f9e2aef 100644
--- a/charabia/Cargo.toml
+++ b/charabia/Cargo.toml
@@ -31,7 +31,7 @@ unicode-normalization = "0.1.23"
 irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
 
 [features]
-default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition"]
+default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish"]
 
 # allow chinese specialized tokenization
 chinese = ["chinese-segmentation", "chinese-normalization"]
@@ -71,6 +71,9 @@ latin-snakecase = ["dep:finl_unicode"]
 # force Charabia to recompose Swedish characters
 swedish-recomposition = []
 
+# allow turkish specialized tokenization
+turkish = []
+
 [dev-dependencies]
 criterion = "0.5"
 jemallocator = "0.5.4"
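
Because `turkish` joins the `default` feature list above, the normalizer is active out of the box; a consumer only has to request it explicitly after opting out of the defaults. A minimal sketch of such a downstream Cargo.toml (the version requirement is illustrative, not part of this patch):

    [dependencies]
    # Disable the default features, then re-enable only Turkish normalization.
    charabia = { version = "*", default-features = false, features = ["turkish"] }
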
From a8014ed8d0048fb7fa58132172896d6c90ecaf76 Mon Sep 17 00:00:00 2001
From: ToshinoriTakahashi
Date: Mon, 19 Aug 2024 01:34:07 +0900
Subject: [PATCH 2/3] Add turkish normalizer definition and tests

---
 charabia/src/normalizer/mod.rs     |   6 +
 charabia/src/normalizer/turkish.rs | 421 +++++++++++++++++++++++++++++
 2 files changed, 427 insertions(+)
 create mode 100644 charabia/src/normalizer/turkish.rs

diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index fa28e83..2cc31ad 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -17,6 +17,8 @@ use self::nonspacing_mark::NonspacingMarkNormalizer;
 use self::quote::QuoteNormalizer;
 #[cfg(feature = "swedish-recomposition")]
 use self::swedish_recomposition::SwedishRecompositionNormalizer;
+#[cfg(feature = "turkish")]
+pub use self::turkish::TurkishNormalizer;
 #[cfg(feature = "vietnamese")]
 pub use self::vietnamese::VietnameseNormalizer;
 use crate::segmenter::SegmentedTokenIter;
@@ -39,6 +41,8 @@ mod nonspacing_mark;
 mod quote;
 #[cfg(feature = "swedish-recomposition")]
 mod swedish_recomposition;
+#[cfg(feature = "turkish")]
+mod turkish;
 #[cfg(feature = "vietnamese")]
 mod vietnamese;
 
@@ -71,6 +75,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
         Box::new(NonspacingMarkNormalizer),
         #[cfg(feature = "vietnamese")]
         Box::new(VietnameseNormalizer),
+        #[cfg(feature = "turkish")]
+        Box::new(TurkishNormalizer),
     ]
 });
 
diff --git a/charabia/src/normalizer/turkish.rs b/charabia/src/normalizer/turkish.rs
new file mode 100644
index 0000000..03dfb0d
--- /dev/null
+++ b/charabia/src/normalizer/turkish.rs
@@ -0,0 +1,421 @@
+use super::{CharNormalizer, CharOrStr};
+use crate::{Script, Token};
+
+/// Turkish specialized [`Normalizer`].
+///
+/// Turkish text is normalized by:
+/// - Mapping the dotless 'ı' to 'i'
+///
+/// The Turkish alphabet[1] contains other special characters, but this
+/// `Normalizer` only needs to handle 'ı': the remaining characters are
+/// already covered by the existing `Normalizer`s applied to
+/// `Script::Latin`.
+///
+/// [1]: https://en.wikipedia.org/wiki/Turkish_alphabet
+pub struct TurkishNormalizer;
+
+impl CharNormalizer for TurkishNormalizer {
+    fn normalize_char(&self, c: char) -> Option<CharOrStr> {
+        match c {
+            'ı' => Some("i".to_string().into()),
+            _ => Some(c.into()),
+        }
+    }
+
+    fn should_normalize(&self, token: &Token) -> bool {
+        token.script == Script::Latin && token.lemma.chars().any(is_should_normalize)
+    }
+}
+
+fn is_should_normalize(c: char) -> bool {
+    c == 'ı'
+}
+
+#[cfg(test)]
+mod test {
+    use std::borrow::Cow::Owned;
+
+    use crate::normalizer::test::test_normalizer;
+    use crate::normalizer::{Normalizer, NormalizerOption};
+    use crate::token::TokenKind;
+
+    // base tokens to normalize.
+    fn tokens() -> Vec<Token<'static>> {
+        vec![
+            // Turkish alphabet
+            Token {
+                lemma: Owned("ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ".to_string()),
+                char_end: 29,
+                byte_end: 35,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("abcçdefgğhıijklmnoöprsştuüvyz".to_string()),
+                char_end: 29,
+                byte_end: 35,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            // Turkish texts containing 'ı'
+            Token {
+                lemma: Owned("çalışma".to_string()),
+                char_end: 7,
+                byte_end: 10,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("şarkı".to_string()),
+                char_end: 5,
+                byte_end: 7,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ışık".to_string()),
+                char_end: 4,
+                byte_end: 7,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            // Turkish texts without 'ı'
+            // - verify that the complete pipeline normalizes Turkish text as expected
+            Token {
+                lemma: Owned("günlük".to_string()),
+                char_end: 6,
+                byte_end: 8,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("İstanbul".to_string()),
+                char_end: 8,
+                byte_end: 9,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("İstasyon".to_string()),
+                char_end: 8,
+                byte_end: 9,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ömür".to_string()),
+                char_end: 4,
+                byte_end: 6,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ütü".to_string()),
+                char_end: 3,
+                byte_end: 5,
+                script: Script::Latin,
+                ..Default::default()
+            },
+        ]
+    }
+
+    // expected result of the current Normalizer.
+    fn normalizer_result() -> Vec<Token<'static>> {
+        vec![
+            // Turkish alphabet
+            Token {
+                lemma: Owned("ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ".to_string()),
+                char_end: 29,
+                byte_end: 35,
+                script: Script::Latin,
+                char_map: None,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("abcçdefgğhiijklmnoöprsştuüvyz".to_string()),
+                char_end: 29,
+                byte_end: 35,
+                script: Script::Latin,
+                char_map: Some(vec![
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 2),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 2),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 2),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 2),
+                    (1, 1),
+                    (1, 1),
+                    (2, 2),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            // Turkish texts containing 'ı'
+            Token {
+                lemma: Owned("çalişma".to_string()),
+                char_end: 7,
+                byte_end: 10,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 2), (1, 1), (1, 1), (2, 1), (2, 2), (1, 1), (1, 1)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("şarki".to_string()),
+                char_end: 5,
+                byte_end: 7,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 2), (1, 1), (1, 1), (1, 1), (2, 1)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("işik".to_string()),
+                char_end: 4,
+                byte_end: 7,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 1), (2, 2), (2, 1), (1, 1)]),
+                ..Default::default()
+            },
+            // Turkish texts without 'ı'
+            // - verify that the complete pipeline normalizes Turkish text as expected
+            Token {
+                lemma: Owned("günlük".to_string()),
+                char_end: 6,
+                byte_end: 8,
+                script: Script::Latin,
+                char_map: None,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("İstanbul".to_string()),
+                char_end: 8,
+                byte_end: 9,
+                script: Script::Latin,
+                char_map: None,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("İstasyon".to_string()),
+                char_end: 8,
+                byte_end: 9,
+                script: Script::Latin,
+                char_map: None,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ömür".to_string()),
+                char_end: 4,
+                byte_end: 6,
+                script: Script::Latin,
+                char_map: None,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ütü".to_string()),
+                char_end: 3,
+                byte_end: 5,
+                script: Script::Latin,
+                char_map: None,
+                ..Default::default()
+            },
+        ]
+    }
+
+    // expected result of the complete Normalizer pipeline.
+    fn normalized_tokens() -> Vec<Token<'static>> {
+        vec![
+            // Turkish alphabet
+            Token {
+                lemma: Owned("abccdefgghiijklmnooprsstuuvyz".to_string()),
+                char_end: 29,
+                byte_end: 35,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("abccdefgghiijklmnooprsstuuvyz".to_string()),
+                char_end: 29,
+                byte_end: 35,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            // Turkish texts containing 'ı'
+            Token {
+                lemma: Owned("calisma".to_string()),
+                char_end: 7,
+                byte_end: 10,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![(2, 1), (1, 1), (1, 1), (2, 1), (2, 1), (1, 1), (1, 1)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("sarki".to_string()),
+                char_end: 5,
+                byte_end: 7,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![(2, 1), (1, 1), (1, 1), (1, 1), (2, 1)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("isik".to_string()),
+                char_end: 4,
+                byte_end: 7,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![(2, 1), (2, 1), (2, 1), (1, 1)]),
+                ..Default::default()
+            },
+            // Turkish texts without 'ı'
+            // - verify that the complete pipeline normalizes Turkish text as expected
+            Token {
+                lemma: Owned("gunluk".to_string()),
+                char_end: 6,
+                byte_end: 8,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![(1, 1), (2, 1), (1, 1), (1, 1), (2, 1), (1, 1)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("istanbul".to_string()),
+                char_end: 8,
+                byte_end: 9,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("istasyon".to_string()),
+                char_end: 8,
+                byte_end: 9,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("omur".to_string()),
+                char_end: 4,
+                byte_end: 6,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![(2, 1), (1, 1), (2, 1), (1, 1)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("utu".to_string()),
+                char_end: 3,
+                byte_end: 5,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![(2, 1), (1, 1), (2, 1)]),
+                ..Default::default()
+            },
+        ]
+    }
+
+    test_normalizer!(TurkishNormalizer, tokens(), normalizer_result(), normalized_tokens());
+}
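
Each `char_map` entry in the tests above is a `(bytes_before, bytes_after)` pair for one source character, so the two-byte 'ı' (U+0131) collapsing into the one-byte 'i' is recorded as `(2, 1)`. A standalone sketch of that bookkeeping under the single Turkish rule — `normalize_turkish` is a hypothetical helper using plain `String`s, not Charabia's internal types:

    // Fold 'ı' to 'i' and record one (bytes_before, bytes_after)
    // entry per source character, mirroring the tests above.
    fn normalize_turkish(input: &str) -> (String, Vec<(u8, u8)>) {
        let mut output = String::with_capacity(input.len());
        let mut char_map = Vec::new();
        for c in input.chars() {
            let normalized = if c == 'ı' { 'i' } else { c };
            char_map.push((c.len_utf8() as u8, normalized.len_utf8() as u8));
            output.push(normalized);
        }
        (output, char_map)
    }

    fn main() {
        let (lemma, char_map) = normalize_turkish("ışık");
        assert_eq!(lemma, "işik");
        // 'ı' shrinks from 2 bytes to 1; 'ş' is left to the Latin normalizers.
        assert_eq!(char_map, vec![(2, 1), (2, 2), (2, 1), (1, 1)]);
    }
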
From d365fcfaed68e2ef90288bd9100c347efae8a172 Mon Sep 17 00:00:00 2001
From: ToshinoriTakahashi
Date: Mon, 19 Aug 2024 01:43:41 +0900
Subject: [PATCH 3/3] Update README.md

---
 charabia/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charabia/README.md b/charabia/README.md
index 9bc0d01..4ee9770 100644
--- a/charabia/README.md
+++ b/charabia/README.md
@@ -16,7 +16,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor
 
 | Script / Language | specialized segmentation | specialized normalization | Segmentation Performance level | Tokenization Performance level |
 |---------------------|-------------------------------------------------------------------------------|---------------------------|-------------------|---|
-| **Latin** | ✅ CamelCase segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
+| **Latin** | ✅ CamelCase segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
 | **Greek** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | 🟩 ~27MiB/sec | 🟨 ~8MiB/sec |
 | **Cyrillic** - **Georgian** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | 🟩 ~27MiB/sec | 🟨 ~9MiB/sec |
 | **Chinese** **CMN** 🇨🇳 | ✅ [jieba](https://github.com/messense/jieba-rs) | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |
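
With the feature enabled end to end, the default pipeline folds the dotless 'ı' to 'i' on top of the existing Latin lowercasing and diacritic removal, so "şarkı" comes out as "sarki" (matching the pipeline tests in PATCH 2/3). A minimal check through the public API, in the style of the README's own `Tokenize` example:

    use charabia::Tokenize;

    fn main() {
        // Segment + normalize; the single word should be fully folded.
        let mut tokens = "şarkı".tokenize();
        let token = tokens.next().unwrap();
        assert_eq!(token.lemma(), "sarki");
    }
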