Merge #296
296: Normalizer for russian r=ManyTheFish a=aignatovich

# Pull Request
- Normalizer for Russian

## Related issue
- No related issue.

## Why could these changes be helpful?
- In written Russian it is permissible to use "е" in words whose standard spelling uses the diacritical "ё" (e.g. "ёжик" -> "ежик").

- Below is the current search behavior, using the latest version of Meilisearch available to date; parts of it are questionable.
   - Case 1: Search Query: "Ёж", Indexed: ["Ежик", "Ёжик"], Result: "Ёжик", Expected: Both
   - Case 2: Search Query: "Еж", Indexed: ["Ежик", "Ёжик"], Result: "Ежик", Expected: Both
   - Case 3: Search Query: "ёж", Indexed: ["Ежик", "Ёжик"], Result: "Ежик", Expected: Both, or at least "Ёжик". This one seems to be incorrect.

If my assumptions are correct, this change may impact some of the cases above, though it has to be validated.

## What does this PR do?
- Performs a grammatically permissible normalization of "ё" into "е" for the Russian language, given that compatibility decomposition has already replaced the 1-codepoint form (U+0451) with the 2-codepoint form ("е" + combining diaeresis U+0308); see the sketch just below.
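As a rough sketch of the intended end-to-end behavior (my assumption, to be validated: the default pipeline applies compatibility decomposition, this normalizer, and lowercasing), all three spellings should end up with the same lemma:

```rust
use charabia::Tokenize;

fn main() {
    // With the "russian" feature enabled (now part of the defaults),
    // precomposed "Ё/ё" and plain "Е/е" are expected to normalize to
    // the same lowercase lemma, so each spelling matches the others.
    for word in ["Ёжик", "Ежик", "ёжик"] {
        let lemmas: Vec<String> = word.tokenize().map(|t| t.lemma().to_string()).collect();
        println!("{word} -> {lemmas:?}"); // expected: ["ежик"] in every case
    }
}
```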

## PR checklist
Please check if your PR fulfills the following requirements:
- [ ❓ ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [ 🟢 ] Have you read the contributing guidelines?
- [ 🟢 ] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Arty I <[email protected]>
Co-authored-by: Many the fish <[email protected]>
3 people authored Aug 28, 2024
2 parents dd260b9 + 2edcf4a commit 79d85f4
Showing 4 changed files with 167 additions and 1 deletion.
5 changes: 4 additions & 1 deletion charabia/Cargo.toml
@@ -31,7 +31,7 @@ unicode-normalization = "0.1.23"
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish"]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish", "russian"]

# allow chinese specialized tokenization
chinese = ["chinese-segmentation", "chinese-normalization"]
@@ -57,6 +57,9 @@ thai = []
# allow greek specialized tokenization
greek = []

# allow russian specialized tokenization
russian = []

# allow splitting camelCase latin words
latin-camelcase = ["dep:finl_unicode"]

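Since `russian` joins the default feature set, downstream crates pick it up automatically. A hypothetical consumer's manifest (version number illustrative) could opt out like this:

```toml
[dependencies]
# Disable defaults and cherry-pick features to skip the Russian normalizer.
charabia = { version = "0.9", default-features = false, features = ["latin-camelcase"] }
```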
24 changes: 24 additions & 0 deletions charabia/src/normalizer/compatibility_decomposition.rs
@@ -52,6 +52,13 @@ mod test {
// base tokens to normalize.
fn tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("Ёё".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Cyrillic,
..Default::default()
},
Token {
// Decompose 1E69 to 0073 0323 0307
lemma: Owned("ṩ ṩ".to_string()),
@@ -74,6 +81,14 @@
// expected result of the current Normalizer.
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("Е\u{308}е\u{308}".to_string()),
char_end: 2,
byte_end: 2,
char_map: Some(vec![(2, 4), (2, 4)]),
script: Script::Cyrillic,
..Default::default()
},
Token {
lemma: Owned("s\u{0323}\u{0307} s\u{0323}\u{0307}".to_string()),
char_end: 2,
@@ -108,6 +123,15 @@ mod test {
// expected result of the complete Normalizer pipeline.
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("ее".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Cyrillic,
char_map: Some(vec![(2, 2), (2, 2)]),
kind: TokenKind::Word,
..Default::default()
},
Token {
lemma: Owned("s s".to_string()),
char_end: 2,
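For context on the char_map values in these tests: as the new russian.rs code further down shows (`char_map.push((c.len_utf8() as u8, normalized.len_utf8() as u8))`), each pair is (original byte length, normalized byte length) for one original character. A quick byte-length check explains why the map moves from (2, 4) after decomposition alone to (2, 2) after the full pipeline:

```rust
// Byte lengths behind the char_map entries above.
assert_eq!("Ё".len(), 2); // U+0401, precomposed, 2 bytes in UTF-8
assert_eq!("Е\u{308}".len(), 4); // U+0415 plus combining diaeresis U+0308
assert_eq!("е".len(), 2); // U+0435, the fully normalized form
```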
6 changes: 6 additions & 0 deletions charabia/src/normalizer/mod.rs
@@ -15,6 +15,8 @@ pub use self::japanese::JapaneseNormalizer;
pub use self::lowercase::LowercaseNormalizer;
use self::nonspacing_mark::NonspacingMarkNormalizer;
use self::quote::QuoteNormalizer;
#[cfg(feature = "russian")]
pub use self::russian::RussianNormalizer;
#[cfg(feature = "swedish-recomposition")]
use self::swedish_recomposition::SwedishRecompositionNormalizer;
#[cfg(feature = "turkish")]
@@ -39,6 +41,8 @@ mod japanese;
mod lowercase;
mod nonspacing_mark;
mod quote;
#[cfg(feature = "russian")]
mod russian;
#[cfg(feature = "swedish-recomposition")]
mod swedish_recomposition;
#[cfg(feature = "turkish")]
@@ -75,6 +79,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
Box::new(NonspacingMarkNormalizer),
#[cfg(feature = "vietnamese")]
Box::new(VietnameseNormalizer),
#[cfg(feature = "russian")]
Box::new(RussianNormalizer),
#[cfg(feature = "turkish")]
Box::new(TurkishNormalizer),
]
133 changes: 133 additions & 0 deletions charabia/src/normalizer/russian.rs
@@ -0,0 +1,133 @@
use std::borrow::Cow;

use super::{Normalizer, NormalizerOption};
use crate::{Script, Token};
use aho_corasick::AhoCorasick;
use once_cell::sync::Lazy;

pub struct RussianNormalizer;

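// Decomposed forms produced by compatibility decomposition of Ё/ё:
// "Е" or "е" followed by the combining diaeresis U+0308.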
static MATCHING_STR: Lazy<AhoCorasick> =
Lazy::new(|| AhoCorasick::new(["Е\u{308}", "е\u{308}"]).unwrap());

impl Normalizer for RussianNormalizer {
fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
match token.char_map.take() {
Some(mut char_map) => {
// if a char_map already exists, iterate over it to reconstruct sub-strings.
let mut lemma = String::new();
let mut tail = token.lemma.as_ref();
let mut normalized = String::new();
for (_, normalized_len) in char_map.iter_mut() {
let (head, t) = tail.split_at(*normalized_len as usize);
tail = t;
normalized.clear();
// then normalize each sub-string, recomputing its size in the char_map.
let mut peekable = head.chars().peekable();
while let Some(c) = peekable.next() {
let (c, peek_consumed) = normalize_russian(c, peekable.peek());

if peek_consumed {
peekable.next();
}

normalized.push(c);
}

*normalized_len = normalized.len() as u8;
lemma.push_str(normalized.as_ref());
}

token.lemma = Cow::Owned(lemma);
token.char_map = Some(char_map);
}
None => {
// if no char_map exists, iterate over the lemma normalizing characters.
let mut char_map = Vec::new();
let mut lemma = String::new();
let mut peekable = token.lemma.chars().peekable();
while let Some(c) = peekable.next() {
let (normalized, peek_consumed) = normalize_russian(c, peekable.peek());

if peek_consumed {
peekable.next();
}

if options.create_char_map {
char_map.push((c.len_utf8() as u8, normalized.len_utf8() as u8));
}
lemma.push(normalized);
}
token.lemma = Cow::Owned(lemma);
if options.create_char_map {
token.char_map = Some(char_map);
}
}
}

token
}

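// Only Cyrillic tokens that still contain a decomposed Е/е + U+0308
// need normalization; everything else passes through untouched.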
fn should_normalize(&self, token: &Token) -> bool {
token.script == Script::Cyrillic && MATCHING_STR.is_match(token.lemma())
}
}

// https://en.wikipedia.org/wiki/Russian_alphabet
// Only decomposed forms are handled here, since compatibility decomposition has already replaced the 1-codepoint forms with their decomposed equivalents.
fn normalize_russian(current: char, next: Option<&char>) -> (char, bool) {
match (current, next) {
// ё -> е (and Ё -> Е): grammatically permissible and common in writing
('Е', Some('\u{308}')) => ('Е', true),
('е', Some('\u{308}')) => ('е', true),

(c, _) => (c, false),
}
}

#[cfg(test)]
mod test {
use std::borrow::Cow::Owned;

use crate::normalizer::test::test_normalizer;
use crate::normalizer::Normalizer;
use crate::token::TokenKind;

// base tokens to normalize.
fn tokens() -> Vec<Token<'static>> {
vec![Token {
lemma: Owned("Ёё".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Cyrillic,
..Default::default()
}]
}

// expected result of the current Normalizer.
fn normalizer_result() -> Vec<Token<'static>> {
vec![Token {
lemma: Owned("Ёё".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Cyrillic,
char_map: None,
..Default::default()
}]
}

// expected result of the complete Normalizer pipeline.
fn normalized_tokens() -> Vec<Token<'static>> {
vec![Token {
lemma: Owned("ее".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Cyrillic,
char_map: Some(vec![(2, 2), (2, 2)]),
kind: TokenKind::Word,
..Default::default()
}]
}

test_normalizer!(RussianNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
