Commit: Merge #282
282: Make the pinyin-normalization optional r=Kerollmops a=ManyTheFish

- split the chinese feature flag into separate chinese-segmentation and chinese-normalization flags so that part of the pipeline can be deactivated
- remove pinyin normalization from the default pipeline
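
For downstream users, the practical effect is that pinyin conversion now has to be requested explicitly. A minimal consumer-side Cargo.toml sketch (the version number is assumed, not taken from this PR):

    [dependencies]
    # Chinese segmentation and kvariant normalization, without pinyin (the new default):
    charabia = { version = "0.8", default-features = false, features = ["chinese"] }

    # To keep the previous pinyin behavior, opt back in explicitly:
    # charabia = { version = "0.8", default-features = false, features = ["chinese", "chinese-normalization-pinyin"] }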

Co-authored-by: ManyTheFish <[email protected]>
meili-bors[bot] and ManyTheFish authored Apr 17, 2024
2 parents 99ab996 + 5f2c737 commit 4e1f15b
Showing 7 changed files with 181 additions and 14 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/rust.yml
@@ -26,6 +26,8 @@ jobs:
run: cargo test --verbose
- name: Run tests with japanese-transliteration on
run: cargo test --verbose --features japanese-transliteration
+- name: Run tests with chinese-normalization-pinyin on
+  run: cargo test --verbose --features chinese,chinese-normalization-pinyin
- name: Run irg-kvariants tests
run: cargo test -p irg-kvariants --verbose

5 changes: 4 additions & 1 deletion charabia/Cargo.toml
@@ -38,7 +38,10 @@ zerovec = "0.10.1"
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]

# allow chinese specialized tokenization
chinese = ["dep:pinyin", "dep:jieba-rs"]
chinese = ["chinese-segmentation", "chinese-normalization"]
chinese-segmentation = ["dep:jieba-rs"]
chinese-normalization = []
chinese-normalization-pinyin = ["dep:pinyin", "chinese-normalization"]

# allow hebrew specialized tokenization
hebrew = []
88 changes: 84 additions & 4 deletions charabia/src/normalizer/chinese.rs
@@ -1,3 +1,4 @@
+#[cfg(feature = "chinese-normalization-pinyin")]
use pinyin::ToPinyin;

use super::CharNormalizer;
@@ -23,14 +24,17 @@ impl CharNormalizer for ChineseNormalizer {
// Normalize to Pinyin
// If we don't manage to convert the kvariant, we try to convert the original character.
// If none of them are converted, we return the kvariant.
-        match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
+        #[cfg(feature = "chinese-normalization-pinyin")]
+        let kvariant = match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
             Some(converted) => {
                 let with_tone = converted.with_tone();

-                Some(with_tone.to_string().into())
+                with_tone.to_string()
             }
-            None => Some(kvariant.into()), // e.g. 杤
-        }
+            None => kvariant, // e.g. 杤
+        };
+
+        Some(kvariant.into())
}

fn should_normalize(&self, token: &Token) -> bool {
@@ -77,6 +81,7 @@ mod test {
}

// expected result of the current Normalizer.
+#[cfg(feature = "chinese-normalization-pinyin")]
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
@@ -113,6 +118,7 @@ }
}

// expected result of the complete Normalizer pipeline.
+#[cfg(feature = "chinese-normalization-pinyin")]
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
@@ -148,5 +154,79 @@
]
}

+// expected result of the current Normalizer.
+#[cfg(not(feature = "chinese-normalization-pinyin"))]
+fn normalizer_result() -> Vec<Token<'static>> {
+vec![
+Token {
+lemma: Owned("尊嚴".to_string()),
+char_end: 2,
+byte_end: 6,
+char_map: Some(vec![(3, 3), (3, 3)]),
+script: Script::Cj,
+language: Some(Language::Cmn),
+..Default::default()
+},
+Token {
+lemma: Owned("生而自由".to_string()),
+char_end: 4,
+byte_end: 12,
+char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
+script: Script::Cj,
+language: Some(Language::Cmn),
+..Default::default()
+},
+Token {
+lemma: Owned("澳䁈亞本刃𣜜".to_string()),
+char_end: 5,
+byte_end: 15,
+char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
+script: Script::Cj,
+language: Some(Language::Cmn),
+..Default::default()
+},
+]
+}
+
+// expected result of the complete Normalizer pipeline.
+#[cfg(not(feature = "chinese-normalization-pinyin"))]
+fn normalized_tokens() -> Vec<Token<'static>> {
+vec![
+Token {
+kind: TokenKind::Word,
+lemma: Owned("尊嚴".to_string()),
+char_start: 0,
+char_end: 2,
+byte_start: 0,
+byte_end: 6,
+char_map: Some(vec![(3, 3), (3, 3)]),
+script: Script::Cj,
+language: Some(Language::Cmn),
+},
+Token {
+kind: TokenKind::Word,
+lemma: Owned("生而自由".to_string()),
+char_start: 0,
+char_end: 4,
+byte_start: 0,
+byte_end: 12,
+char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
+script: Script::Cj,
+language: Some(Language::Cmn),
+},
+Token {
+kind: TokenKind::Word,
+lemma: Owned("澳䁈亞本刃𣜜".to_string()),
+char_start: 0,
+char_end: 5,
+byte_start: 0,
+byte_end: 15,
+char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
+script: Script::Cj,
+language: Some(Language::Cmn),
+},
+]
+}

test_normalizer!(ChineseNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
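
Pieced together, the patched normalize_char above reduces to the following shape — a hedged sketch under assumed types, not the crate's exact code (the kvariant lookup and the CharOrStr conversion are elided; function and parameter names are illustrative):

    use pinyin::ToPinyin;

    // `kvariant` is the already-looked-up variant of the original character `c`.
    fn lemma_for(kvariant: char, c: char) -> String {
        // Feature off: keep the kvariant untouched (the new default behavior).
        #[cfg(not(feature = "chinese-normalization-pinyin"))]
        return kvariant.to_string();

        // Feature on: convert to tone-marked pinyin, trying the kvariant first,
        // then the original character, falling back to the kvariant (e.g. 杤).
        #[cfg(feature = "chinese-normalization-pinyin")]
        match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
            Some(converted) => converted.with_tone().to_string(),
            None => kvariant.to_string(),
        }
    }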
46 changes: 46 additions & 0 deletions charabia/src/normalizer/control_char.rs
@@ -103,6 +103,7 @@ mod test {
}

// expected result of the complete Normalizer pipeline.
+#[cfg(feature = "chinese-normalization-pinyin")]
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
@@ -146,5 +147,50 @@
]
}

+// expected result of the complete Normalizer pipeline.
+#[cfg(not(feature = "chinese-normalization-pinyin"))]
+fn normalized_tokens() -> Vec<Token<'static>> {
+vec![
+Token {
+lemma: Owned("生而自由oo".to_string()),
+char_end: 9,
+byte_end: 17,
+script: Script::Cj,
+char_map: Some(vec![
+(1, 0),
+(3, 3),
+(3, 3),
+(3, 3),
+(3, 3),
+(1, 0),
+(1, 1),
+(1, 1),
+(1, 0),
+]),
+kind: TokenKind::Word,
+..Default::default()
+},
+Token {
+lemma: Owned("生而自由oo".to_string()),
+char_end: 9,
+byte_end: 17,
+script: Script::Cj,
+char_map: Some(vec![
+(1, 0),
+(3, 3),
+(3, 3),
+(3, 3),
+(3, 3),
+(1, 0),
+(1, 1),
+(1, 1),
+(1, 0),
+]),
+kind: TokenKind::Word,
+..Default::default()
+},
+]
+}

test_normalizer!(ControlCharNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
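
If I read the char_map pairs above correctly, each entry is (UTF-8 bytes of the original character, UTF-8 bytes of its normalized form), which is why stripped control characters show up as (1, 0) and kept CJK characters as (3, 3). A quick self-contained check of the byte counts:

    fn main() {
        assert_eq!('\u{0}'.len_utf8(), 1); // a 1-byte control char, dropped by the normalizer -> (1, 0)
        assert_eq!("生".len(), 3);         // a CJK char is 3 UTF-8 bytes, kept as-is -> (3, 3)
        assert_eq!("o".len(), 1);          // ASCII stays 1 byte -> (1, 1)
    }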
6 changes: 3 additions & 3 deletions charabia/src/normalizer/mod.rs
@@ -3,7 +3,7 @@ use std::borrow::Cow;
use once_cell::sync::Lazy;

pub use self::arabic::ArabicNormalizer;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-normalization")]
pub use self::chinese::ChineseNormalizer;
pub use self::classify::{Classifier, ClassifierOption};
pub use self::compatibility_decomposition::CompatibilityDecompositionNormalizer;
@@ -21,7 +21,7 @@ use crate::segmenter::SegmentedTokenIter;
use crate::Token;

mod arabic;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-normalization")]
mod chinese;
mod classify;
mod compatibility_decomposition;
@@ -50,7 +50,7 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
vec![
Box::new(LowercaseNormalizer),
Box::new(QuoteNormalizer),
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-normalization")]
Box::new(ChineseNormalizer),
#[cfg(feature = "japanese-transliteration")]
Box::new(JapaneseNormalizer),
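The registration change above relies on cfg attributes attached to individual elements of a vec! literal: when the feature is off, the element is compiled out and the normalizer never enters the pipeline. A standalone sketch of the same pattern (names are illustrative, not from the crate):

    fn pipeline_stage_names() -> Vec<&'static str> {
        vec![
            "lowercase",
            "quote",
            // Compiled in only when the feature is enabled;
            // otherwise this element vanishes from the vec entirely.
            #[cfg(feature = "chinese-normalization")]
            "chinese",
        ]
    }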
37 changes: 37 additions & 0 deletions charabia/src/segmenter/chinese.rs
@@ -64,6 +64,7 @@ mod test {
];

// Segmented and normalized version of the text.
+#[cfg(feature = "chinese-normalization-pinyin")]
const TOKENIZED: &[&str] = &[
"rénrén",
"shēngérzìyóu",
@@ -99,6 +100,42 @@
"。",
];

+#[cfg(not(feature = "chinese-normalization-pinyin"))]
+const TOKENIZED: &[&str] = &[
+"人人",
+"生而自由",
+",",
+"在",
+"尊",
+"嚴",
+"和",
+"權",
+"利",
+"上",
+"一律平等",
+"。",
+"他",
+"們",
+"賦",
+"有",
+"理性",
+"和",
+"良心",
+",",
+"並",
+"應",
+"以",
+"兄弟",
+"關",
+"係",
+"的",
+"精神",
+"互相",
+"對",
+"待",
+"。",
+];

// Macro that run several tests on the Segmenter.
test_segmenter!(ChineseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Cmn);
}
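
At the API surface, the difference between the two TOKENIZED tables plays out as below — a hedged usage sketch based on charabia's public Tokenize trait; the outputs in the comments are what the tests above suggest, not something verified here:

    use charabia::Tokenize;

    fn main() {
        // With only `chinese` enabled, lemmas stay in Chinese script: 人人, 生而自由, ...
        // With `chinese-normalization-pinyin` on top, they become: rénrén, shēngérzìyóu, ...
        for token in "人人生而自由。".tokenize() {
            println!("{}", token.lemma());
        }
    }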
11 changes: 5 additions & 6 deletions charabia/src/segmenter/mod.rs
@@ -3,11 +3,13 @@ use std::collections::HashMap;

use aho_corasick::{AhoCorasick, FindIter, MatchKind};
pub use arabic::ArabicSegmenter;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-segmentation")]
pub use chinese::ChineseSegmenter;
use either::Either;
#[cfg(feature = "japanese")]
pub use japanese::JapaneseSegmenter;
#[cfg(feature = "khmer")]
pub use khmer::KhmerSegmenter;
#[cfg(feature = "korean")]
pub use korean::KoreanSegmenter;
pub use latin::LatinSegmenter;
@@ -16,15 +18,12 @@ use slice_group_by::StrGroupBy;
#[cfg(feature = "thai")]
pub use thai::ThaiSegmenter;

#[cfg(feature = "khmer")]
pub use khmer::KhmerSegmenter;

use crate::detection::{Detect, Language, Script, StrDetection};
use crate::separators::DEFAULT_SEPARATORS;
use crate::token::Token;

mod arabic;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-segmentation")]
mod chinese;
#[cfg(feature = "japanese")]
mod japanese;
@@ -54,7 +53,7 @@ pub static SEGMENTERS: Lazy<HashMap<(Script, Language), Box<dyn Segmenter>>> = L
// latin segmenter
((Script::Latin, Language::Other), Box::new(LatinSegmenter) as Box<dyn Segmenter>),
// chinese segmenter
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-segmentation")]
((Script::Cj, Language::Cmn), Box::new(ChineseSegmenter) as Box<dyn Segmenter>),
// japanese segmenter
#[cfg(feature = "japanese")]
