Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/corpus analysis #183

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions pypln/backend/celery_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
# MongoDB handles shared by the tasks in this module. Host, database name and
# collection names all come from the `config` module.
mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS)
database = mongo_client[config.MONGODB_DBNAME]
# Collection the corpus task reads its documents from.
document_collection = database[config.MONGODB_COLLECTION]
# Collection the corpus task writes its results to (keyed by `corpus_id`).
corpora_collection = database[config.MONGODB_CORPORA_COLLECTION]

class DocumentNotFound(Exception):
    """Signals that requested document(s) were not found in the database."""
Expand Down Expand Up @@ -69,3 +70,34 @@ def process(self, document):
and must return a dictionary with the keys to be saved in the database.
"""
raise NotImplementedError

class PyPLNCorpusTask(Task):
    """
    This is the base class for a Corpus task. It is very similar to
    `PyPLNTask`, but it needs a corpus_id and a list of document_ids.
    """

    def run(self, corpus_id, document_ids):
        """
        This method is called by Celery, and should not be overridden.
        It will call the `process` method with all the documents listed in
        `document_ids` and will update the database with the results.

        If any of the requested documents is not in the database yet, the
        task is retried with a `DocumentNotFound` exception naming the
        missing ids.

        Returns the `(corpus_id, document_ids)` pair it received.
        """
        query = {"_id": {"$in": document_ids}}

        # `find` always returns a cursor (never `None`), so the only way to
        # know whether documents are missing is to look at what the query
        # matches. Only the ids are fetched here to keep this check cheap.
        # NOTE(review): assumes document ids are hashable (e.g. ObjectId).
        found_ids = set(document["_id"] for document in
                        document_collection.find(query, {"_id": 1}))
        missing_ids = [document_id for document_id in document_ids
                       if document_id not in found_ids]
        if missing_ids:
            self.retry(exc=DocumentNotFound('Documents with ids "{}" '
                'not found in database'.format(missing_ids)))

        # `process` still receives a cursor, so the documents are streamed
        # from the database instead of being loaded all at once.
        documents = document_collection.find(query)
        result = self.process(documents)
        corpora_collection.update({"corpus_id": corpus_id}, {"$set": result},
                                  upsert=True)
        return corpus_id, document_ids

    def process(self, documents):
        """
        This method should be implemented by subclasses. It is responsible for
        performing the analysis itself. It will receive an iterable of
        dictionaries as a parameter (a database cursor over all the documents
        and the analysis that are ready for it) and must return a dictionary
        with the new keys to be saved in the corpora analysis collection.
        """
        raise NotImplementedError
2 changes: 2 additions & 0 deletions pypln/backend/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def split_uris(uri):
cast=split_uris)
# Name of the MongoDB database ('pypln' when the setting is not provided).
MONGODB_DBNAME = config('MONGODB_DBNAME', default='pypln')
# Name of the collection holding the per-document analysis.
MONGODB_COLLECTION = config('MONGODB_COLLECTION', default='analysis')
# Name of the collection holding the per-corpus analysis results.
MONGODB_CORPORA_COLLECTION = config('MONGODB_CORPORA_COLLECTION',
default='corpora_analysis')

ELASTICSEARCH_CONFIG = {
'hosts': config('ELASTICSEARCH_HOSTS',
Expand Down
3 changes: 2 additions & 1 deletion pypln/backend/workers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@
from palavras_semantic_tagger import SemanticTagger
from word_cloud import WordCloud
from elastic_indexer import ElasticIndexer
from corpus_freqdist import CorpusFreqDist


__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics',
'Bigrams', 'PalavrasRaw', 'Lemmatizer', 'NounPhrase', 'SemanticTagger',
'WordCloud', 'ElasticIndexer']
'WordCloud', 'ElasticIndexer', 'CorpusFreqDist']
32 changes: 32 additions & 0 deletions pypln/backend/workers/corpus_freqdist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# coding: utf-8
#
# Copyright 2012 NAMD-EMAP-FGV
#
# This file is part of PyPLN. You can get more information at: http://pypln.org/.
#
# PyPLN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyPLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
from pypln.backend.celery_task import PyPLNCorpusTask

from collections import Counter

class CorpusFreqDist(PyPLNCorpusTask):
    """Corpus task that merges the frequency distributions of documents."""

    def process(self, documents):
        """
        Add up the `freqdist` of every document in `documents`.

        Each document's `freqdist` is iterated as `(word, count)` pairs. The
        return value is a dictionary whose `freqdist` key holds the summed
        `(word, count)` tuples as given by `Counter.most_common()`.
        """
        per_document_counts = (
            Counter({word: count for word, count in document['freqdist']})
            for document in documents
        )
        corpus_counts = sum(per_document_counts, Counter())
        return {'freqdist': corpus_counts.most_common()}
63 changes: 63 additions & 0 deletions tests/test_celery_corpus_task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# coding: utf-8
#
# Copyright 2015 NAMD-EMAP-FGV
#
# This file is part of PyPLN. You can get more information at: http://pypln.org/.
#
# PyPLN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyPLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
from pypln.backend.celery_task import PyPLNCorpusTask
from mock import MagicMock
from utils import TaskTest


class FakeCorpusTask(PyPLNCorpusTask):
    """Minimal corpus task used to exercise `PyPLNCorpusTask` in tests."""

    def process(self, documents):
        """Return the sum of the `input` value of every document."""
        total = sum(document["input"] for document in documents)
        return {'result': total}

class TestCeleryCorpusTask(TaskTest):
    """Tests for the database handling done by `PyPLNCorpusTask.run`."""

    def test_task_should_only_get_the_correct_documents(self):
        # Imported locally: the module itself only imports `MagicMock`.
        from mock import patch

        # This is just preparing the expected input in the database. The
        # first document must NOT be handed to the task.
        self.collection.insert({'input': 999}, w=1)
        correct_doc_id_1 = self.collection.insert({'input': 1}, w=1)
        correct_doc_id_2 = self.collection.insert({'input': 1}, w=1)
        fake_corpus_id = 1

        # Patch `process` only while the task runs. Assigning the mock
        # directly on the class would leave it in place for every other test
        # that uses `FakeCorpusTask`.
        fake_process = MagicMock(return_value={'result': 2})
        with patch.object(FakeCorpusTask, 'process', fake_process):
            FakeCorpusTask().delay(fake_corpus_id,
                                   [correct_doc_id_1, correct_doc_id_2])

        # `called` is checked explicitly because `assert_called()` is not
        # available as a real assertion in every `mock` release.
        self.assertTrue(fake_process.called)

        # We need to convert the call args because the method may be called
        # with a mongo cursor rather than a list.
        # We're getting [0][0] because we want the args (not kwargs) for the
        # first call to the method.
        received_documents = list(fake_process.call_args[0][0])

        # Without the length check the loop below would pass vacuously if no
        # document at all reached `process`.
        self.assertEqual(len(received_documents), 2)
        for document in received_documents:
            self.assertEqual(document['input'], 1)

    def test_task_is_saving_the_result_to_mongo_with_the_corpus_id(self):
        expected_result = 42
        doc_id_1 = self.collection.insert({'input': 21}, w=1)
        doc_id_2 = self.collection.insert({'input': 21}, w=1)
        fake_corpus_id = 1

        FakeCorpusTask().delay(fake_corpus_id, [doc_id_1, doc_id_2])

        # The task must have stored its result under the given corpus id.
        resulting_corpus_analysis = self.corpora_collection.find_one(
            {'corpus_id': fake_corpus_id})['result']

        self.assertEqual(resulting_corpus_analysis, expected_result)
43 changes: 43 additions & 0 deletions tests/test_worker_corpus_freqdist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# coding: utf-8
#
# Copyright 2012 NAMD-EMAP-FGV
#
# This file is part of PyPLN. You can get more information at: http://pypln.org/.
#
# PyPLN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyPLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
from pypln.backend.workers import CorpusFreqDist
from utils import TaskTest


class TestCorpusFreqDistWorker(TaskTest):
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be better to test PyPLNCorpusTask separately from CorpusFreqDist? Then later if another subclass of PyPLNCorpusTask is created only the returned dict would need to be checked.

Also, is this hitting an actual mongo instance? If so, would you consider mocking the db methods?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right. I was testing both in the same test case (and not testing correctly). I separated the tests and I think it's better now.

It is really hitting an actual mongo instance. This is inherited from the old days when MongoDict was still part of our codebase. It's also one of the reasons our tests are slow. I would be very glad to mock everything and have better, more isolated and quicker tests. I would probably need your help, though @geron :)

def test_freqdist_should_return_a_list_of_tuples_with_frequency_distribution(self):
    """
    The corpus frequency distribution must contain the summed counts of
    every word, as `(word, count)` tuples ordered from most to least
    frequent.
    """
    freqdist_1 = [[u'is', 2], [u'the', 2], [u'blue', 1], [u'sun', 1],
                  [u'sky', 1], [u',', 1], [u'yellow', 1], [u'.', 1]]

    freqdist_2 = [[u'the', 2], [u'brown', 1], [u'lazy', 1],
                  [u'over', 1], [u'fox', 1], [u'dog', 1], [u'.', 1],
                  [u'quick', 1], [u'jumps', 1]]

    corpus_fd = [(u'the', 4), (u'is', 2), (u'.', 2), (u'blue', 1),
                 (u'brown', 1), (u'lazy', 1), (u'fox', 1), (u'jumps', 1),
                 (u'sun', 1), (u'dog', 1), (u'sky', 1), (u',', 1),
                 (u'yellow', 1), (u'quick', 1), (u'over', 1)]

    result = CorpusFreqDist().process([{'freqdist': freqdist_1},
                                       {'freqdist': freqdist_2}])

    resulting_corpus_fd = result['freqdist']

    # The relative order of words with the same count is an implementation
    # detail, so ordering and content are verified separately: counts must
    # never increase along the list...
    counts = [count for _, count in resulting_corpus_fd]
    self.assertEqual(counts, sorted(counts, reverse=True))

    # ...and the (word, count) tuples must match regardless of how ties
    # were ordered.
    self.assertEqual(sorted(resulting_corpus_fd), sorted(corpus_fd))
1 change: 1 addition & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def setUp(self):
app.conf.update(CELERY_ALWAYS_EAGER=True)
self.db = pymongo.Connection()[self.db_name]
self.collection = self.db[config.MONGODB_COLLECTION]
self.corpora_collection = self.db[config.MONGODB_CORPORA_COLLECTION]

def tearDown(self):
self.collection.remove({})
Expand Down