Skip to content

Commit

Permalink
Fix stopwords filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Oct 1, 2019
1 parent 8e72b30 commit de1fd98
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 5 deletions.
5 changes: 3 additions & 2 deletions orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def supported_languages():
except LookupError: # when no NLTK data is available
pass

return [file.capitalize() for file in stopwords_listdir]
return sorted(file.capitalize() for file in stopwords_listdir)

@wait_nltk_data
def __init__(self, language='English', word_list=None):
Expand All @@ -96,7 +96,8 @@ def language(self, value):
if not self._language:
self.stopwords = []
else:
self.stopwords = set(stopwords.words(self.language.lower()))
self.stopwords = set(
x.strip() for x in stopwords.words(self.language.lower()))

def __str__(self):
config = ''
Expand Down
20 changes: 17 additions & 3 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,10 +276,24 @@ def check(self, token):
self.assertEqual(df([['a', '1']]), [['a']])

def test_stopwords(self):
filter = preprocess.StopwordsFilter('english')
f = preprocess.StopwordsFilter('english')

self.assertFalse(filter.check('a'))
self.assertTrue(filter.check('filter'))
self.assertFalse(f.check('a'))
self.assertTrue(f.check('filter'))

self.assertListEqual(
["snake", "house"],
f(["a", "snake", "is", "in", "a", "house"]))

def test_stopwords_slovene(self):
f = preprocess.StopwordsFilter('slovene')

self.assertFalse(f.check('in'))
self.assertTrue(f.check('abeceda'))

self.assertListEqual(
["kača", "hiši"],
f(["kača", "je", "v", "hiši", "in"]))

def test_lexicon(self):
filter = preprocess.LexiconFilter(['filter'])
Expand Down

0 comments on commit de1fd98

Please sign in to comment.