Skip to content

Commit

Permalink
Filter empty tokens before inserting them in the AhoCorasick automaton…
Browse files Browse the repository at this point in the history
… avoiding a char boundary panic
  • Loading branch information
ManyTheFish committed Apr 17, 2024
1 parent e3df008 commit b7d1c99
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions charabia/src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -313,26 +313,26 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> {
// TODO: avoid recreating the automaton if nothing changed
match (self.normalizer_option.classifier.separators, self.words_dict) {
(Some(separators), None) => {
let pattern = separators.into_iter().filter(|s| !s.is_empty());
let aho = AhoCorasick::builder()
.match_kind(MatchKind::LeftmostLongest)
.build(separators)
.build(pattern)
.unwrap();

self.segmenter_option.aho = Some(aho);
self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0);
}
(separators, Some(words)) => {
// use the default separators' list if a custom words' list is given but no custom separators' list.
let separators = separators.unwrap_or(DEFAULT_SEPARATORS);
// merge both lists together and create the Aho-Corasick automaton.
let mut vec = Vec::with_capacity(separators.len() + words.len());
vec.extend_from_slice(words);
vec.extend_from_slice(separators);
let pattern =
words.into_iter().chain(separators.into_iter()).filter(|s| !s.is_empty());
let aho = AhoCorasick::builder()
.match_kind(MatchKind::LeftmostLongest)
.build(vec)
.build(pattern)
.unwrap();

self.segmenter_option.aho = Some(aho);
self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0);
}
// reset the state in case the builder is reused.
(None, None) => self.segmenter_option.aho = None,
Expand Down

0 comments on commit b7d1c99

Please sign in to comment.