Skip to content

Commit

Permalink
Adds some more test cases and updates dictionary
Browse files: browse the repository at this point in the history
  • Loading branch information
luflow committed Sep 10, 2024
1 parent 38b8529 commit 8e02fff
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 4 deletions.
Binary file modified charabia/dictionaries/fst/german/words.fst
Binary file not shown.
4 changes: 2 additions & 2 deletions charabia/dictionaries/txt/german/words.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6192,7 +6192,7 @@ Glas
Glaser
Glasnost
Glasur
- Glatteis
+ Glatt
Glatze
Glaube
Glauben
Expand Down Expand Up @@ -21691,7 +21691,6 @@ glas
glaser
glasnost
glasur
- glatteis
glatze
glaube
glauben
Expand Down Expand Up @@ -23805,6 +23804,7 @@ kuli
kulisse
kult
kultur
+ kulturalität
kultus
kummer
kumpan
Expand Down
11 changes: 9 additions & 2 deletions charabia/src/segmenter/german.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ static WORDS_FST: Lazy<Fst<&[u8]>> =
Lazy::new(|| Fst::new(&include_bytes!("../../dictionaries/fst/german/words.fst")[..]).unwrap());

static FST_SEGMENTER: Lazy<FstSegmenter> =
- Lazy::new(|| FstSegmenter::new(&WORDS_FST, Some(4), true));
+ Lazy::new(|| FstSegmenter::new(&WORDS_FST, Some(2), true));

impl Segmenter for GermanSegmenter {
fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
Expand Down Expand Up @@ -100,7 +100,14 @@ mod test {
word1
);
test_segmentation!("Schreibprozess", &["Schreib", "prozess"], word2);
- test_segmentation!("Interkulturalität", &["Inter", "kultur", "alität"], word3);
+ test_segmentation!("Interkulturalität", &["Inter", "kulturalität"], word3);
test_segmentation!("Wissensorganisation", &["Wissens", "organisation"], word4);
test_segmentation!("Aufgabenplanung", &["Aufgaben", "planung"], word5);
test_segmentation!("Eisbrecher", &["Eis", "brecher"], word6);
test_segmentation!("Zuckerei", &["Zucker", "ei"], word7);
test_segmentation!("Glatteis", &["Glatt", "eis"], word8);
test_segmentation!("Sinnfindung", &["Sinn", "findung"], word9);
test_segmentation!("Donaudampfschifffahrtsgesellschaftskapitän", &["Donau", "dampf", "schifffahrts", "gesellschafts", "kapitän"], word10);
test_segmentation!("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", &["Rind", "fleisch", "etikettierungs", "überwachungs", "aufgaben", "übertragungs", "gesetz"], word11);
test_segmentation!("Nahrungsmittelunverträglichkeitsdiagnoseverfahren", &["Nahrungs", "mittel", "un", "verträglichkeits", "diagnose", "verfahren"], word12);
}

0 comments on commit 8e02fff

Please sign in to comment.