Skip to content

Commit

Permalink
Fix stopwords filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Sep 30, 2019
1 parent 70be2b8 commit cc5e9b8
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 2 deletions.
5 changes: 3 additions & 2 deletions orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def supported_languages():
except LookupError: # when no NLTK data is available
pass

return [file.capitalize() for file in stopwords_listdir]
return sorted(file.capitalize() for file in stopwords_listdir)

@wait_nltk_data
def __init__(self, language='English', word_list=None):
Expand All @@ -96,7 +96,8 @@ def language(self, value):
if not self._language:
self.stopwords = []
else:
self.stopwords = set(stopwords.words(self.language.lower()))
self.stopwords = set(
x.strip() for x in stopwords.words(self.language.lower()))

def __str__(self):
config = ''
Expand Down
14 changes: 14 additions & 0 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,20 @@ def test_stopwords(self):
self.assertFalse(filter.check('a'))
self.assertTrue(filter.check('filter'))

self.assertListEqual(
["snake", "house"],
filter(["a", "snake", "is", "in", "a", "house"]))

def test_stopwords_slovene(self):
filter = preprocess.StopwordsFilter('slovene')

self.assertFalse(filter.check('in'))
self.assertTrue(filter.check('abeceda'))

self.assertListEqual(
["kača", "hiši"],
filter(["kača", "je", "v", "hiši", "in"]))

def test_lexicon(self):
filter = preprocess.LexiconFilter(['filter'])
self.assertFalse(filter.check('false'))
Expand Down

0 comments on commit cc5e9b8

Please sign in to comment.