NAMD · flavioamieiro · Jun 16, 2016 · Jul 12, 2016 · Aug 11, 2016 · geron
diff --git a/pypln/backend/celery_task.py b/pypln/backend/celery_task.py
@@ -34,6 +34,7 @@
 mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS)
 database = mongo_client[config.MONGODB_DBNAME]
 document_collection = database[config.MONGODB_COLLECTION]
+corpora_collection = database[config.MONGODB_CORPORA_COLLECTION]
 
 class DocumentNotFound(Exception):
     pass
@@ -69,3 +70,34 @@ def process(self, document):
         and must return a dictionary with the keys to be saved in the database.
         """
         raise NotImplementedError
+
+class PyPLNCorpusTask(Task):
+    """
+    This is the base class for a Corpus task. It is very similar to
+    `PyPLNTask`, but it needs a corpus_id and a list of document_ids.
+    """
+
+    def run(self, corpus_id, document_ids):
+        """
+        This method is called by Celery, and should not be overridden.
+        It will call the `process` method with a list of dictionaries
+        containing all the documents and will update de database with results.
+        """
+        documents = document_collection.find({"_id": {"$in": document_ids}})
+        if documents is None:
+            self.retry(exc=DocumentNotFound('Documents with ids "{}" '
+                'not found in database'.format(document_ids)))
+        result = self.process(documents)
+        corpora_collection.update({"corpus_id": corpus_id}, {"$set": result},
+                upsert=True)
+        return corpus_id, document_ids
+
+    def process(self, documents):
+        """
+        This method should be implemented by subclasses. It is responsible for
+        performing the analysis itself. It will receive a list of dictionaries
+        as a paramenter (containing all the documents and the analysis that are
+        ready for it) and must return a dictionary with the new keys to be
+        saved in the corpora analysis collection.
+        """
+        raise NotImplementedError
diff --git a/pypln/backend/config.py b/pypln/backend/config.py
@@ -31,6 +31,8 @@ def split_uris(uri):
         cast=split_uris)
 MONGODB_DBNAME = config('MONGODB_DBNAME', default='pypln')
 MONGODB_COLLECTION =  config('MONGODB_COLLECTION', default='analysis')
+MONGODB_CORPORA_COLLECTION =  config('MONGODB_CORPORA_COLLECTION',
+        default='corpora_analysis')
 
 ELASTICSEARCH_CONFIG = {
     'hosts': config('ELASTICSEARCH_HOSTS',

diff --git a/pypln/backend/workers/__init__.py b/pypln/backend/workers/__init__.py
@@ -29,8 +29,9 @@
 from palavras_semantic_tagger import SemanticTagger
 from word_cloud import WordCloud
 from elastic_indexer import ElasticIndexer
+from corpus_freqdist import CorpusFreqDist
 
 
 __all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics',
         'Bigrams', 'PalavrasRaw', 'Lemmatizer', 'NounPhrase', 'SemanticTagger',
-        'WordCloud', 'ElasticIndexer']
+        'WordCloud', 'ElasticIndexer', 'CorpusFreqDist']
diff --git a/pypln/backend/workers/corpus_freqdist.py b/pypln/backend/workers/corpus_freqdist.py
@@ -0,0 +1,32 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+from pypln.backend.celery_task import PyPLNCorpusTask
+
+from collections import Counter
+
+class CorpusFreqDist(PyPLNCorpusTask):
+
+    def process(self, documents):
+        result = Counter()
+        for document in documents:
+            d = {}
+            for word, count in document['freqdist']:
+                d[word] = count
+            result += Counter(d)
+        return {'freqdist': result.most_common()}
diff --git a/tests/test_celery_corpus_task.py b/tests/test_celery_corpus_task.py
@@ -0,0 +1,63 @@
+# coding: utf-8
+#
+# Copyright 2015 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+from pypln.backend.celery_task import PyPLNCorpusTask
+from mock import MagicMock
+from utils import TaskTest
+
+
+class FakeCorpusTask(PyPLNCorpusTask):
+    def process(self, documents):
+        return {'result': sum([d["input"] for d in documents])}
+
+class TestCeleryCorpusTask(TaskTest):
+    def test_task_should_only_get_the_correct_documents(self):
+        # This is just preparing the expected input in the database
+        wrong_doc_id = self.collection.insert({'input': 999}, w=1)
+        correct_doc_id_1 = self.collection.insert({'input': 1}, w=1)
+        correct_doc_id_2 = self.collection.insert({'input': 1}, w=1)
+        fake_corpus_id = 1
+
+        FakeCorpusTask.process = MagicMock(return_value={'result': 2})
+
+        corpus_task = FakeCorpusTask()
+
+        corpus_task.delay(fake_corpus_id, [correct_doc_id_1, correct_doc_id_2])
+
+        corpus_task.process.assert_called()
+
+        # We need to compare the call args because it's called with a mongo
+        # cursor, not a list.
+        # We're getting [0][0] because we want the args (not kwargs) for the
+        # first call to the method.
+        call_args = list(corpus_task.process.call_args[0][0])
+        for arg in call_args:
+            self.assertEqual(arg['input'], 1)
+
+    def test_task_is_saving_the_result_to_mongo_with_the_corpus_id(self):
+        expected_result = 42
+        doc_id_1 = self.collection.insert({'input': 21}, w=1)
+        doc_id_2 = self.collection.insert({'input': 21}, w=1)
+        fake_corpus_id = 1
+
+        FakeCorpusTask().delay(fake_corpus_id, [doc_id_1, doc_id_2])
+
+        resulting_corpus_analysis = self.corpora_collection.find_one(
+                {'corpus_id': fake_corpus_id})['result']
+
+        self.assertEqual(resulting_corpus_analysis, expected_result)
diff --git a/tests/test_worker_corpus_freqdist.py b/tests/test_worker_corpus_freqdist.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+from pypln.backend.workers import CorpusFreqDist
+from utils import TaskTest
+
+
+class TestCorpusFreqDistWorker(TaskTest):
+    def test_freqdist_should_return_a_list_of_tuples_with_frequency_distribution(self):
+
+        freqdist_1 =  [[u'is', 2], [u'the', 2], [u'blue', 1], [u'sun', 1],
+                [u'sky', 1], [u',', 1], [u'yellow', 1], [u'.', 1]]
+
+        freqdist_2 = [[u'the', 2], [u'brown', 1], [u'lazy', 1],
+                [u'over', 1], [u'fox', 1], [u'dog', 1], [u'.', 1],
+                [u'quick', 1], [u'jumps', 1]]
+
+        corpus_fd =  [(u'the', 4), (u'is', 2), (u'.', 2), (u'blue', 1),
+                (u'brown', 1), (u'lazy', 1), (u'fox', 1), (u'jumps', 1),
+                (u'sun', 1), (u'dog', 1), (u'sky', 1), (u',', 1),
+                (u'yellow', 1), (u'quick', 1), (u'over', 1)]
+
+        result = CorpusFreqDist().process([{'freqdist': freqdist_1},
+            {'freqdist': freqdist_2}])
+
+        resulting_corpus_fd = result['freqdist']
+
+        self.assertEqual(resulting_corpus_fd, corpus_fd)
diff --git a/tests/utils.py b/tests/utils.py
@@ -34,6 +34,7 @@ def setUp(self):
         app.conf.update(CELERY_ALWAYS_EAGER=True)
         self.db = pymongo.Connection()[self.db_name]
         self.collection = self.db[config.MONGODB_COLLECTION]
+        self.corpora_collection = self.db[config.MONGODB_CORPORA_COLLECTION]
 
     def tearDown(self):
         self.collection.remove({})