Skip to content

Commit

Permalink
OPENNLP-1261: The Language Detector should not ignore ngram counts.
Browse files Browse the repository at this point in the history
git push
  • Loading branch information
kottmann committed May 22, 2019
1 parent a27bc32 commit 3b894b4
Showing 1 changed file with 11 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@
import java.util.ArrayList;
import java.util.Collection;

import opennlp.tools.ngram.NGramModel;
import opennlp.tools.util.StringList;
import opennlp.tools.util.StringUtil;
import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;

Expand Down Expand Up @@ -58,14 +57,19 @@ public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength,
/**
 * Generates the language-detection context for a document: the character
 * n-grams of its normalized text.
 *
 * <p>NOTE(review): this listing is a commit diff whose +/- markers were lost,
 * so the previous and the new implementation appear interleaved and the braces
 * do not balance as shown. The old/new split marked below is inferred from the
 * commit summary and the import changes (NGramModel and StringList go away,
 * StringUtil comes in) - confirm against the actual commit.
 *
 * @param document the text to extract n-grams from
 * @return the collected n-grams as an array
 */
public String[] getContext(CharSequence document) {
// An ArrayList keeps duplicates, so every gram added below is retained.
Collection<String> context = new ArrayList<>();

// --- Previous implementation (appears to be the removed side of the diff) ---
// Fed the normalized text into an NGramModel for lengths minLength..maxLength.
NGramModel model = new NGramModel();
model.add(normalizer.normalize(document), minLength, maxLength);
// --- New implementation (appears to be the added side of the diff) ---
// Normalize once; the n-grams are cut directly out of this sequence.
CharSequence chars = normalizer.normalize(document);

// --- Previous implementation, continued ---
// Added the first token of every non-empty entry the model iterates over.
// Per the commit title this ignored n-gram counts - presumably because the
// model yields each distinct n-gram only once; verify against NGramModel.
for (StringList tokenList : model) {
if (tokenList.size() > 0) {
context.add(tokenList.getToken(0));
// --- New implementation, continued ---
// Outer loop: every gram length from minLength to maxLength, inclusive.
for (int lengthIndex = minLength; lengthIndex < maxLength + 1; lengthIndex++) {
// Inner loop: every start offset whose gram still ends inside chars
// (textIndex + lengthIndex - 1 is the index of the gram's last character).
for (int textIndex = 0;
textIndex + lengthIndex - 1 < chars.length(); textIndex++) {

// Lower-case each gram before adding it.
String gram = StringUtil.toLowerCase(
chars.subSequence(textIndex, textIndex + lengthIndex));

// One entry per occurrence, so repeated grams appear repeatedly.
context.add(gram);
}
}

// Copy the collected grams into a String array for the caller.
return context.toArray(new String[context.size()]);
}
}

0 comments on commit 3b894b4

Please sign in to comment.