refactor: clean todo comments
- HunspellTokenizer to implement stopWords

Signed-off-by: Hiroshi Miura <[email protected]>
miurahr committed Nov 8, 2024
1 parent f760492 commit ebde62d
Showing 2 changed files with 179 additions and 7 deletions.
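Before the diff, here is a minimal, self-contained sketch of the token-stream chain this commit builds: StandardTokenizer, then HunspellStemFilter, then StopFilter. It is not code from the commit; the class name StopWordChainSketch and the helper tokens() are illustrative, and the caller is assumed to supply a loaded Hunspell Dictionary plus a stop set (the commit resolves the latter per language via getEffectiveStopWordSet()).

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

public class StopWordChainSketch {
    // Tokenize, stem through Hunspell, then drop stop words; mirrors the
    // chain the patched getTokenStream() builds when stemsAllowed is true.
    static List<String> tokens(String text, Dictionary dictionary, CharArraySet stopWords)
            throws IOException {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader(text));
        try (TokenStream stream = new StopFilter(new HunspellStemFilter(tokenizer, dictionary),
                stopWords)) {
            List<String> result = new ArrayList<>();
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // Lucene consumption protocol: reset, incrementToken, end
            while (stream.incrementToken()) {
                result.add(term.toString());
            }
            stream.end();
            return result;
        }
    }
}

Note the ordering: stemming runs before stop-word filtering, matching the new getTokenStream() in the diff below.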
131 changes: 124 additions & 7 deletions src/org/omegat/tokenizer/HunspellTokenizer.java
@@ -38,9 +38,43 @@
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.da.DanishAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.ga.IrishAnalyzer;
import org.apache.lucene.analysis.gl.GalicianAnalyzer;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.lv.LatvianAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.no.NorwegianAnalyzer;
import org.apache.lucene.analysis.pl.PolishAnalyzer;
import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
import org.apache.lucene.analysis.ro.RomanianAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;

import org.omegat.core.Core;
import org.omegat.core.CoreEvents;
import org.omegat.core.events.IProjectEventListener.PROJECT_CHANGE_TYPE;
@@ -65,7 +65,7 @@ public class HunspellTokenizer extends BaseTokenizer {
private volatile Dictionary dict;
private volatile boolean failedToLoadDict;

-private Dictionary getDict() {
+protected Dictionary getDict() {
if (failedToLoadDict) {
return null;
}
@@ -95,10 +95,13 @@ protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed,
if (dictionary == null) {
return tokenizer;
}

-return new HunspellStemFilter(tokenizer, dictionary);

-/// TODO: implement stop words checks
+CharArraySet stopWords;
+if (stopWordsAllowed) {
+stopWords = getEffectiveStopWordSet();
+} else {
+stopWords = CharArraySet.EMPTY_SET;
+}
+return new StopFilter(new HunspellStemFilter(tokenizer, dictionary), stopWords);
} else {
return tokenizer;
}
@@ -127,7 +164,12 @@ private static synchronized void populateInstalledDicts() {
return;
}

-for (File file : dictionaryDir.listFiles()) {
+var fileList = dictionaryDir.listFiles();
+if (fileList == null) {
+return;
+}
+
+for (File file : fileList) {
String name = file.getName();
if (name.endsWith(OConsts.SC_AFFIX_EXTENSION)) {
Language lang = new Language(name.substring(0, name.lastIndexOf(OConsts.SC_AFFIX_EXTENSION)));
@@ -173,7 +215,7 @@ private static String[] langsToStrings(Set<Language> langs) {
result.add(lang.getLanguage().toLowerCase(Locale.ENGLISH));
result.add(lang.getLanguageCode().toLowerCase(Locale.ENGLISH));
}
-return result.toArray(new String[result.size()]);
+return result.toArray(new String[0]);
}

private static synchronized void reset() {
@@ -193,4 +235,79 @@ public static void loadPlugins() {

public static void unloadPlugins() {
}

private CharArraySet getEffectiveStopWordSet() {
Language lang = getEffectiveLanguage();
String language = lang.getLanguageCode();
String country = lang.getCountryCode();
switch (language) {
case "ar":
return ArabicAnalyzer.getDefaultStopSet();
case "hy":
return ArmenianAnalyzer.getDefaultStopSet();
case "eu":
return BasqueAnalyzer.getDefaultStopSet();
case "es":
if (country.equals("BR")) {
return BrazilianAnalyzer.getDefaultStopSet();
} else {
return SpanishAnalyzer.getDefaultStopSet();
}
case "bg":
return BulgarianAnalyzer.getDefaultStopSet();
case "ca":
return CatalanAnalyzer.getDefaultStopSet();
case "ja":
case "ko":
case "zh":
return CJKAnalyzer.getDefaultStopSet();
case "cs":
return CzechAnalyzer.getDefaultStopSet();
case "da":
return DanishAnalyzer.getDefaultStopSet();
case "nl":
return DutchAnalyzer.getDefaultStopSet();
case "en":
return EnglishAnalyzer.getDefaultStopSet();
case "fi":
return FinnishAnalyzer.getDefaultStopSet();
case "fr":
return FrenchAnalyzer.getDefaultStopSet();
case "gl":
return GalicianAnalyzer.getDefaultStopSet();
case "de":
return GermanAnalyzer.getDefaultStopSet();
case "el":
return GreekAnalyzer.getDefaultStopSet();
case "hi":
return HindiAnalyzer.getDefaultStopSet();
case "hu":
return HungarianAnalyzer.getDefaultStopSet();
case "id":
return IndonesianAnalyzer.getDefaultStopSet();
case "ga":
return IrishAnalyzer.getDefaultStopSet();
case "it":
return ItalianAnalyzer.getDefaultStopSet();
case "lv":
return LatvianAnalyzer.getDefaultStopSet();
case "nb":
return NorwegianAnalyzer.getDefaultStopSet();
case "fa":
return PersianAnalyzer.getDefaultStopSet();
case "pl":
return PolishAnalyzer.getDefaultStopSet();
case "ro":
return RomanianAnalyzer.getDefaultStopSet();
case "ru":
return RussianAnalyzer.getDefaultStopSet();
case "sv":
return SwedishAnalyzer.getDefaultStopSet();
case "th":
return ThaiAnalyzer.getDefaultStopSet();
case "tr":
return TurkishAnalyzer.getDefaultStopSet();
default:
return CharArraySet.EMPTY_SET;
}
}
}
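As a quick standalone probe (not part of the commit), the Lucene default stop sets that getEffectiveStopWordSet() dispatches to can be checked directly; the class name StopSetCheck is made up for illustration.

import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;

public class StopSetCheck {
    public static void main(String[] args) {
        CharArraySet en = EnglishAnalyzer.getDefaultStopSet();
        CharArraySet fr = FrenchAnalyzer.getDefaultStopSet();
        System.out.println(en.contains("the")); // true: English stop word
        System.out.println(fr.contains("les")); // true: French stop word
        System.out.println(en.contains("dog")); // false: content words pass through
    }
}

CharArraySet.contains() is case-sensitive unless the set was built ignore-case, which is why the analyzers' own default sets are the safe choice here.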
55 changes: 55 additions & 0 deletions test/src/org/omegat/tokenizer/TokenizerTest.java
@@ -28,12 +28,22 @@
import static org.junit.Assert.assertEquals;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.junit.Before;
import org.junit.Test;

import org.omegat.tokenizer.ITokenizer.StemmingMode;
import org.omegat.util.Language;
import org.omegat.util.TestPreferencesInitializer;
import org.omegat.util.Token;

public class TokenizerTest {

@Before
public void setUp() throws Exception {
TestPreferencesInitializer.init();
}

@Test
public void testEnglish() {
ITokenizer tok = new LuceneEnglishTokenizer();
@@ -228,6 +238,51 @@ public void testDefault() {
tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
}

@Test
public void testHunspellEnglish() {
ITokenizer tok = new HunspellTokenizerMock(new Language("en"));
String orig = "The quick, brown <x0/> jumped over 1 \"lazy\" dog.";
assertVerbatim(new String[] { "The", " ", "quick", ",", " ", "brown", " ", "<x0/>", " ",
"jumped", " ", "over", " ", "1", " ", "\"", "lazy", "\"", " ", "dog", "." },
tok.tokenizeVerbatimToStrings(orig),
tok.tokenizeVerbatim(orig),
orig);
assertResult(new String[] {"The", "quick", "brown", "jumped", "over", "lazy", "dog"},
tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
assertResult(new String[] {"The", "quick", "brown", "x0", "jumped", "over", "1", "lazy", "dog"},
tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
assertResult(new String[] {"The", "quick", "brown", "jumped", "over", "lazy", "dog"},
tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
}

@Test
public void testHunspellGerman() {
ITokenizer tok = new HunspellTokenizerMock(new Language("de"));
assertResult(new String[] { "pr\u00e4sentierte" },
tok.tokenizeWordsToStrings("pr\u00e4sentierte", StemmingMode.GLOSSARY));
assertResult(new String[] { "pr\u00e4sentieren" },
tok.tokenizeWordsToStrings("pr\u00e4sentieren", StemmingMode.GLOSSARY));
}

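/**
 * Test double: pins the effective language and returns no Hunspell dictionary,
 * so getTokenStream() falls back to the plain StandardTokenizer path (no
 * stemming, no stop-word filtering) without needing installed .aff/.dic files.
 */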
@Tokenizer(languages = { Tokenizer.DISCOVER_AT_RUNTIME })
public static class HunspellTokenizerMock extends HunspellTokenizer {
private final Language language;

public HunspellTokenizerMock(Language language) {
this.language = language;
}

@Override
protected Language getEffectiveLanguage() {
return language;
}

@Override
protected Dictionary getDict() {
return null;
}
}

private void assertVerbatim(String[] expected, String[] test, Token[] testTok, String origString) {
assertResult(expected, test);
assertEquals(StringUtils.join(expected), StringUtils.join(test));