From 6debc92637392674eb963389de1023802630be74 Mon Sep 17 00:00:00 2001
From: ToshinoriTakahashi
Date: Sun, 18 Aug 2024 13:15:07 +0900
Subject: [PATCH 1/3] Add turkish feature

---
 charabia/Cargo.toml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
index 6bf0568..f9e2aef 100644
--- a/charabia/Cargo.toml
+++ b/charabia/Cargo.toml
@@ -31,7 +31,7 @@ unicode-normalization = "0.1.23"
 irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
 
 [features]
-default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition"]
+default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish"]
 
 # allow chinese specialized tokenization
 chinese = ["chinese-segmentation", "chinese-normalization"]
@@ -71,6 +71,9 @@ latin-snakecase = ["dep:finl_unicode"]
 # force Charabia to recompose Swedish characters
 swedish-recomposition = []
 
+# allow turkish specialized tokenization
+turkish = []
+
 [dev-dependencies]
 criterion = "0.5"
 jemallocator = "0.5.4"
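
Because `turkish` joins the `default` feature list above, the normalizer is active out of the box; a consumer only has to request it explicitly after opting out of the defaults. A minimal sketch of such a downstream Cargo.toml (the version requirement is illustrative, not part of this patch):

    [dependencies]
    # Disable the default features, then re-enable only Turkish normalization.
    charabia = { version = "*", default-features = false, features = ["turkish"] }
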
From a8014ed8d0048fb7fa58132172896d6c90ecaf76 Mon Sep 17 00:00:00 2001
From: ToshinoriTakahashi
Date: Mon, 19 Aug 2024 01:34:07 +0900
Subject: [PATCH 2/3] Add turkish normalizer definition and tests

---
 charabia/src/normalizer/mod.rs     |   6 +
 charabia/src/normalizer/turkish.rs | 421 +++++++++++++++++++++++++++++
 2 files changed, 427 insertions(+)
 create mode 100644 charabia/src/normalizer/turkish.rs

diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index fa28e83..2cc31ad 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -17,6 +17,8 @@ use self::nonspacing_mark::NonspacingMarkNormalizer;
 use self::quote::QuoteNormalizer;
 #[cfg(feature = "swedish-recomposition")]
 use self::swedish_recomposition::SwedishRecompositionNormalizer;
+#[cfg(feature = "turkish")]
+pub use self::turkish::TurkishNormalizer;
 #[cfg(feature = "vietnamese")]
 pub use self::vietnamese::VietnameseNormalizer;
 use crate::segmenter::SegmentedTokenIter;
@@ -39,6 +41,8 @@ mod nonspacing_mark;
 mod quote;
 #[cfg(feature = "swedish-recomposition")]
 mod swedish_recomposition;
+#[cfg(feature = "turkish")]
+mod turkish;
 #[cfg(feature = "vietnamese")]
 mod vietnamese;
 
@@ -71,6 +75,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
         Box::new(NonspacingMarkNormalizer),
         #[cfg(feature = "vietnamese")]
         Box::new(VietnameseNormalizer),
+        #[cfg(feature = "turkish")]
+        Box::new(TurkishNormalizer),
     ]
 });
 
diff --git a/charabia/src/normalizer/turkish.rs b/charabia/src/normalizer/turkish.rs
new file mode 100644
index 0000000..03dfb0d
--- /dev/null
+++ b/charabia/src/normalizer/turkish.rs
@@ -0,0 +1,421 @@
+use super::{CharNormalizer, CharOrStr};
+use crate::{Script, Token};
+
+/// Turkish specialized [`Normalizer`].
+///
+/// Turkish text is normalized by:
+/// - Mapping the dotless 'ı' to 'i'
+///
+/// The Turkish alphabet[1] contains other special characters, but this
+/// `Normalizer` only needs to handle 'ı': the remaining characters are
+/// already covered by the existing `Normalizer`s applied to
+/// `Script::Latin`.
+///
+/// [1]: https://en.wikipedia.org/wiki/Turkish_alphabet
+pub struct TurkishNormalizer;
+
+impl CharNormalizer for TurkishNormalizer {
+    fn normalize_char(&self, c: char) -> Option<CharOrStr> {
+        match c {
+            'ı' => Some("i".to_string().into()),
+            _ => Some(c.into()),
+        }
+    }
+
+    fn should_normalize(&self, token: &Token) -> bool {
+        token.script == Script::Latin && token.lemma.chars().any(is_should_normalize)
+    }
+}
+
+fn is_should_normalize(c: char) -> bool {
+    c == 'ı'
+}
+
+#[cfg(test)]
+mod test {
+    use std::borrow::Cow::Owned;
+
+    use crate::normalizer::test::test_normalizer;
+    use crate::normalizer::{Normalizer, NormalizerOption};
+    use crate::token::TokenKind;
+
+    // base tokens to normalize.
+    fn tokens() -> Vec<Token<'static>> {
+        vec![
+            // Turkish alphabet
+            Token {
+                lemma: Owned("ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ".to_string()),
+                char_end: 29,
+                byte_end: 35,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("abcçdefgğhıijklmnoöprsştuüvyz".to_string()),
+                char_end: 29,
+                byte_end: 35,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            // Turkish texts containing 'ı'
+            Token {
+                lemma: Owned("çalışma".to_string()),
+                char_end: 7,
+                byte_end: 10,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("şarkı".to_string()),
+                char_end: 5,
+                byte_end: 7,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ışık".to_string()),
+                char_end: 4,
+                byte_end: 7,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            // Turkish texts without 'ı'
+            // - verify that the complete pipeline normalizes Turkish text as expected
+            Token {
+                lemma: Owned("günlük".to_string()),
+                char_end: 6,
+                byte_end: 8,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("İstanbul".to_string()),
+                char_end: 8,
+                byte_end: 9,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("İstasyon".to_string()),
+                char_end: 8,
+                byte_end: 9,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ömür".to_string()),
+                char_end: 4,
+                byte_end: 6,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ütü".to_string()),
+                char_end: 3,
+                byte_end: 5,
+                script: Script::Latin,
+                ..Default::default()
+            },
+        ]
+    }
+
+    // expected result of the current Normalizer.
+    fn normalizer_result() -> Vec<Token<'static>> {
+        vec![
+            // Turkish alphabet
+            Token {
+                lemma: Owned("ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ".to_string()),
+                char_end: 29,
+                byte_end: 35,
+                script: Script::Latin,
+                char_map: None,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("abcçdefgğhiijklmnoöprsştuüvyz".to_string()),
+                char_end: 29,
+                byte_end: 35,
+                script: Script::Latin,
+                char_map: Some(vec![
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 2),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 2),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 2),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 2),
+                    (1, 1),
+                    (1, 1),
+                    (2, 2),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            // Turkish texts containing 'ı'
+            Token {
+                lemma: Owned("çalişma".to_string()),
+                char_end: 7,
+                byte_end: 10,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 2), (1, 1), (1, 1), (2, 1), (2, 2), (1, 1), (1, 1)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("şarki".to_string()),
+                char_end: 5,
+                byte_end: 7,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 2), (1, 1), (1, 1), (1, 1), (2, 1)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("işik".to_string()),
+                char_end: 4,
+                byte_end: 7,
+                script: Script::Latin,
+                char_map: Some(vec![(2, 1), (2, 2), (2, 1), (1, 1)]),
+                ..Default::default()
+            },
+            // Turkish texts without 'ı'
+            // - verify that the complete pipeline normalizes Turkish text as expected
+            Token {
+                lemma: Owned("günlük".to_string()),
+                char_end: 6,
+                byte_end: 8,
+                script: Script::Latin,
+                char_map: None,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("İstanbul".to_string()),
+                char_end: 8,
+                byte_end: 9,
+                script: Script::Latin,
+                char_map: None,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("İstasyon".to_string()),
+                char_end: 8,
+                byte_end: 9,
+                script: Script::Latin,
+                char_map: None,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ömür".to_string()),
+                char_end: 4,
+                byte_end: 6,
+                script: Script::Latin,
+                char_map: None,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ütü".to_string()),
+                char_end: 3,
+                byte_end: 5,
+                script: Script::Latin,
+                char_map: None,
+                ..Default::default()
+            },
+        ]
+    }
+
+    // expected result of the complete Normalizer pipeline.
+    fn normalized_tokens() -> Vec<Token<'static>> {
+        vec![
+            // Turkish alphabet
+            Token {
+                lemma: Owned("abccdefgghiijklmnooprsstuuvyz".to_string()),
+                char_end: 29,
+                byte_end: 35,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("abccdefgghiijklmnooprsstuuvyz".to_string()),
+                char_end: 29,
+                byte_end: 35,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            // Turkish texts containing 'ı'
+            Token {
+                lemma: Owned("calisma".to_string()),
+                char_end: 7,
+                byte_end: 10,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![(2, 1), (1, 1), (1, 1), (2, 1), (2, 1), (1, 1), (1, 1)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("sarki".to_string()),
+                char_end: 5,
+                byte_end: 7,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![(2, 1), (1, 1), (1, 1), (1, 1), (2, 1)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("isik".to_string()),
+                char_end: 4,
+                byte_end: 7,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![(2, 1), (2, 1), (2, 1), (1, 1)]),
+                ..Default::default()
+            },
+            // Turkish texts without 'ı'
+            // - verify that the complete pipeline normalizes Turkish text as expected
+            Token {
+                lemma: Owned("gunluk".to_string()),
+                char_end: 6,
+                byte_end: 8,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![(1, 1), (2, 1), (1, 1), (1, 1), (2, 1), (1, 1)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("istanbul".to_string()),
+                char_end: 8,
+                byte_end: 9,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("istasyon".to_string()),
+                char_end: 8,
+                byte_end: 9,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (2, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("omur".to_string()),
+                char_end: 4,
+                byte_end: 6,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![(2, 1), (1, 1), (2, 1), (1, 1)]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("utu".to_string()),
+                char_end: 3,
+                byte_end: 5,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![(2, 1), (1, 1), (2, 1)]),
+                ..Default::default()
+            },
+        ]
+    }
+
+    test_normalizer!(TurkishNormalizer, tokens(), normalizer_result(), normalized_tokens());
+}
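
Each `char_map` entry in the tests above is a `(bytes_before, bytes_after)` pair for one source character, so the two-byte 'ı' (U+0131) collapsing into the one-byte 'i' is recorded as `(2, 1)`. A standalone sketch of that bookkeeping under the single Turkish rule — `normalize_turkish` is a hypothetical helper using plain `String`s, not Charabia's internal types:

    // Fold 'ı' to 'i' and record one (bytes_before, bytes_after)
    // entry per source character, mirroring the tests above.
    fn normalize_turkish(input: &str) -> (String, Vec<(u8, u8)>) {
        let mut output = String::with_capacity(input.len());
        let mut char_map = Vec::new();
        for c in input.chars() {
            let normalized = if c == 'ı' { 'i' } else { c };
            char_map.push((c.len_utf8() as u8, normalized.len_utf8() as u8));
            output.push(normalized);
        }
        (output, char_map)
    }

    fn main() {
        let (lemma, char_map) = normalize_turkish("ışık");
        assert_eq!(lemma, "işik");
        // 'ı' shrinks from 2 bytes to 1; 'ş' is left to the Latin normalizers.
        assert_eq!(char_map, vec![(2, 1), (2, 2), (2, 1), (1, 1)]);
    }
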
From d365fcfaed68e2ef90288bd9100c347efae8a172 Mon Sep 17 00:00:00 2001
From: ToshinoriTakahashi
Date: Mon, 19 Aug 2024 01:43:41 +0900
Subject: [PATCH 3/3] Update README.md

---
 charabia/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charabia/README.md b/charabia/README.md
index 9bc0d01..4ee9770 100644
--- a/charabia/README.md
+++ b/charabia/README.md
@@ -16,7 +16,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor
 
 | Script / Language | specialized segmentation | specialized normalization | Segmentation Performance level | Tokenization Performance level |
 |---------------------|-------------------------------------------------------------------------------|---------------------------|-------------------|---|
-| **Latin** | ✅ CamelCase segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
+| **Latin** | ✅ CamelCase segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
 | **Greek** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | 🟩 ~27MiB/sec | 🟨 ~8MiB/sec |
 | **Cyrillic** - **Georgian** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | 🟩 ~27MiB/sec | 🟨 ~9MiB/sec |
 | **Chinese** **CMN** 🇨🇳 | ✅ [jieba](https://github.com/messense/jieba-rs) | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |
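
With the feature enabled end to end, the default pipeline folds the dotless 'ı' to 'i' on top of the existing Latin lowercasing and diacritic removal, so "şarkı" comes out as "sarki" (matching the pipeline tests in PATCH 2/3). A minimal check through the public API, in the style of the README's own `Tokenize` example:

    use charabia::Tokenize;

    fn main() {
        // Segment + normalize; the single word should be fully folded.
        let mut tokens = "şarkı".tokenize();
        let token = tokens.next().unwrap();
        assert_eq!(token.lemma(), "sarki");
    }
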