Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/corpus analysis #140

Open
wants to merge 10 commits into
base: develop
Choose a base branch
from
4 changes: 4 additions & 0 deletions contrib/postactivate
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ function manage_with_settings() {
PYTHONPATH="$PYPLN_ROOT:$PYTHONPATH" python "$PYPLN_ROOT"/manage.py $* --settings=pypln.web.settings.$SETTINGS;
}

# Run manage.py with the project root on PYTHONPATH, without forcing a
# --settings module. "$@" forwards every argument unchanged, so arguments
# containing whitespace are not re-split (which the unquoted $* did).
function manage() {
    PYTHONPATH="$PYPLN_ROOT:$PYTHONPATH" python "$PYPLN_ROOT"/manage.py "$@"
}

alias manage_dev="manage_with_settings development"
alias manage_test="manage_with_settings test"
alias run_tests="manage_test test pypln.web.core.tests"
Expand Down
4 changes: 4 additions & 0 deletions pypln/web/backend_adapter/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ def create_indexing_pipeline(doc):
{"index_name": doc.index_name, "doc_type": doc.doc_type}})
(Extractor().si(doc_id) | ElasticIndexer().si(doc_id))()

def calculate_corpus_freqdist(corpus):
    """Hand the blob ids of every document in `corpus` to CorpusFreqDist.

    The ``blob`` value of each document in ``corpus.document_set`` is
    converted to an ``ObjectId``; the resulting list is passed, together
    with the corpus primary key, to ``CorpusFreqDist().delay``.
    """
    # Build a real list instead of relying on ``map``: on Python 3 ``map``
    # yields a lazy iterator, which is not the list the worker is given on
    # Python 2.
    blob_ids = [ObjectId(blob) for blob in
                corpus.document_set.values_list('blob', flat=True)]
    CorpusFreqDist().delay(corpus.pk, blob_ids)

def get_config_from_router(api, timeout=5):
client = Client()
client.connect(api)
Expand Down
23 changes: 18 additions & 5 deletions pypln/web/backend_adapter/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@
from mock import patch

from pypln.web.backend_adapter.pipelines import (create_indexing_pipeline,
call_default_pipeline, create_pipeline_from_document)
from pypln.web.core.models import IndexedDocument, Document, mongodb_storage
call_default_pipeline, create_pipeline_from_document, calculate_corpus_freqdist)
from pypln.web.core.models import IndexedDocument, Document, mongodb_storage, Corpus
from pypln.web.core.tests.utils import TestWithMongo


__all__ = ["CreatePipelineTest", "CreateIndexingPipelineTest",
"CreatePipelineFromDocumentTest"]
"CreatePipelineFromDocumentTest", "CorpusFreqDistTest"]

class CreatePipelineTest(TestWithMongo):

Expand Down Expand Up @@ -68,14 +68,14 @@ def test_should_create_indexing_pipelines_for_document(self, extractor):
extractor.assert_called_with()
extractor.return_value.si.assert_called_with(ObjectId(self.document.blob.name))

@patch('pypln.web.backend_adapter.pipelines.GridFSDataRetriever', autospec=True)
@patch('pypln.web.backend_adapter.pipelines.Extractor', autospec=True)
def test_should_add_index_name_to_the_document_in_mongo(self,
gridfs_data_retriever):
create_indexing_pipeline(self.document)
mongo_document = self.get_mongo_doc(self.document)
self.assertEqual(mongo_document['index_name'], self.document.index_name)

@patch('pypln.web.backend_adapter.pipelines.GridFSDataRetriever', autospec=True)
@patch('pypln.web.backend_adapter.pipelines.Extractor', autospec=True)
def test_should_add_doc_type_to_the_document_in_mongo(self,
gridfs_data_retriever):
create_indexing_pipeline(self.document)
Expand All @@ -91,3 +91,16 @@ def test_create_pipeline_from_document_instantiates_a_document_id(self, fake_cal
doc = Document.objects.all()[0]
create_pipeline_from_document(doc)
fake_call_default_pipeline.assert_called_with(ObjectId(doc.blob.name))


class CorpusFreqDistTest(TestWithMongo):
    """Checks the arguments calculate_corpus_freqdist gives to the worker."""
    fixtures = ['users', 'corpora', 'documents']

    @patch('pypln.web.backend_adapter.pipelines.CorpusFreqDist', autospec=True)
    def test_should_call_CorpusFreqDist_with_document_ids(self, worker_cls):
        # Blob id expected for corpus 2 -- presumably matches the
        # 'documents' fixture; verify there if this test starts failing.
        expected_blob_ids = [ObjectId("562526d9798ebd4616b23bb1")]
        target_corpus = Corpus.objects.get(pk=2)

        calculate_corpus_freqdist(target_corpus)

        # The worker class is instantiated without arguments ...
        worker_cls.assert_called_with()
        # ... and the task receives the corpus pk plus the blob ids.
        delay_call = worker_cls.return_value.delay
        delay_call.assert_called_with(target_corpus.pk, expected_blob_ids)
16 changes: 16 additions & 0 deletions pypln/web/core/fixtures/mongodb/corpora_analysis.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[
{
"_id" : { "$oid": "5785005257bc3a1070d8cdbf" },
"corpus_id" : 2,
"freqdist" : [
[ "á", 1 ],
[ "non-ascii", 1 ],
[ ".", 1 ],
[ "char", 1 ],
[ "file", 1 ],
[ "test", 1 ],
[ ":", 1 ],
[ "with", 1 ]
]
}
]
9 changes: 9 additions & 0 deletions pypln/web/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,15 @@
from django.contrib.auth.models import User
from django.dispatch import receiver
from django.db import models
import pymongo

from rest_framework.reverse import reverse
from rest_framework.authtoken.models import Token

from pypln.web.core.storage import MongoDBBase64Storage

# Storage backend instance shared by the file fields in this module.
mongodb_storage = MongoDBBase64Storage()
# Module-level handle on the MongoDB collection named by
# settings.MONGODB_CORPORA_COLLECTION (inside settings.MONGODB_DBNAME).
# NOTE: the pymongo connection object is created when this module is imported.
corpus_collection = pymongo.Connection(host=settings.MONGODB_URIS)[settings.MONGODB_DBNAME][settings.MONGODB_CORPORA_COLLECTION]


class Corpus(models.Model):
Expand All @@ -43,6 +45,13 @@ class Meta:
def __unicode__(self):
return self.name

@property
def properties(self):
    """Return the analysis document stored for this corpus.

    Looks up, in ``corpus_collection``, the document whose ``corpus_id``
    equals this corpus' id. An empty dict is returned when no such
    document exists.
    """
    stored = corpus_collection.find_one({"corpus_id": self.id})
    return {} if stored is None else stored


class Document(models.Model):
blob = models.FileField(upload_to='/', storage=mongodb_storage)
Expand Down
1 change: 0 additions & 1 deletion pypln/web/core/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
from django.conf import settings
from django.utils.encoding import filepath_to_uri
from pymongo import Connection
from gridfs import GridFS, NoFile


class MongoDBBase64Storage(Storage):
Expand Down
21 changes: 20 additions & 1 deletion pypln/web/core/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from pypln.web.core.models import Corpus, Document
from pypln.web.core.tests.utils import TestWithMongo

__all__ = ["CorpusModelTest", "DocumentModelTest"]
__all__ = ["CorpusModelTest", "CorpusPropertiesTest", "DocumentModelTest"]

class CorpusModelTest(TestCase):
fixtures = ['users']
Expand All @@ -46,6 +46,25 @@ def test_different_users_can_have_corpora_with_the_same_name(self):
self.assertEqual(corpus_1.name, corpus_2.name)


class CorpusPropertiesTest(TestWithMongo):
    """Tests for ``Corpus.properties`` using the corpora_analysis fixture."""
    fixtures = ['users', 'corpora', 'corpora_analysis']

    def test_returns_keyerror_when_key_does_not_exist(self):
        # Asking for an analysis that was never stored must raise KeyError.
        corpus = Corpus.objects.all()[0]
        with self.assertRaises(KeyError):
            corpus.properties['analysis_that_does_not_exist']

    def test_get_freqdist_from_store(self):
        # Same pairs as the "freqdist" entry of the corpora_analysis fixture.
        expected_data = [
            [u"á", 1], [u"non-ascii", 1], [u".", 1],
            [u"char", 1], [u"file", 1], [u"test", 1], [u":", 1],
            [u"with", 1],
        ]
        corpus = Corpus.objects.get(pk=2)
        self.assertEqual(corpus.properties['freqdist'], expected_data)


class DocumentModelTest(TestWithMongo):
fixtures = ['users', 'corpora', 'documents']

Expand Down
7 changes: 7 additions & 0 deletions pypln/web/core/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ def _pre_setup(self, *args, **kwargs):
mongodb_storage.save(os.path.basename(doc.blob.name),
StringIO(u"Test file with non-ascii char: á.".encode('utf-8')))

if hasattr(self, 'fixtures') and self.fixtures is not None and 'corpora_analysis' in self.fixtures:
filename = os.path.join(settings.PROJECT_ROOT, 'core/fixtures/mongodb/corpora_analysis.json')
with open(filename, 'r') as mongo_fixture:
for obj in json_util.loads(mongo_fixture.read()):
mongodb_storage._connection[settings.MONGODB_DBNAME][settings.MONGODB_CORPORA_COLLECTION].insert(obj, w=1)


def _post_teardown(self, *args, **kwargs):
    """Drop the storage's MongoDB database, then run the parent teardown."""
    database_name = mongodb_storage._db.name
    mongodb_storage._connection.drop_database(database_name)
    super(TestWithMongo, self)._post_teardown(*args, **kwargs)
106 changes: 106 additions & 0 deletions pypln/web/core/tests/views/test_corpus_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# -*- coding:utf-8 -*-
#
# Copyright 2012 NAMD-EMAP-FGV
#
# This file is part of PyPLN. You can get more information at: http://pypln.org/.
#
# PyPLN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyPLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
import json

from django.contrib.auth.models import User
from django.core.urlresolvers import reverse
from mock import patch

from pypln.web.core.models import Corpus, User
from pypln.web.core.tests.utils import TestWithMongo

__all__ = ["CorpusFreqDistViewTest"]


class CorpusFreqDistViewTest(TestWithMongo):
    """Tests for the ``corpus-freqdist`` endpoint (GET and PUT)."""
    fixtures = ['users', 'corpora', 'documents', 'corpora_analysis']

    def test_requires_login(self):
        response = self.client.get(reverse('corpus-freqdist',
            kwargs={'pk': 2}))
        self.assertEqual(response.status_code, 403)

    def test_returns_404_for_inexistent_corpus(self):
        self.client.login(username="user", password="user")
        response = self.client.get(reverse('corpus-freqdist',
            kwargs={'pk': 9999}))
        self.assertEqual(response.status_code, 404)

    def test_returns_404_if_user_is_not_the_owner_of_the_corpus(self):
        self.client.login(username="user", password="user")
        corpus = Corpus.objects.filter(owner__username="admin")[0]
        response = self.client.get(reverse('corpus-freqdist',
            kwargs={'pk': corpus.id}))
        self.assertEqual(response.status_code, 404)

    def test_returns_404_if_corpus_has_no_freqdist_yet(self):
        self.client.login(username="admin", password="admin")
        corpus = Corpus.objects.filter(owner__username="admin")[0]
        response = self.client.get(reverse('corpus-freqdist',
            kwargs={'pk': corpus.id}))
        self.assertEqual(response.status_code, 404)

    def test_shows_corpus_freqdist_correctly(self):
        self.client.login(username="user", password="user")
        corpus = Corpus.objects.filter(owner__username="user")[0]
        response = self.client.get(reverse('corpus-freqdist',
            kwargs={'pk': corpus.id}))

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.renderer_context['view'].get_object(),
                corpus)
        expected_data = corpus.properties['freqdist']
        self.assertEqual(response.data['value'], expected_data)

    @patch('pypln.web.core.views.calculate_corpus_freqdist')
    def test_queue_freqdist_analysis_for_a_corpus_that_still_does_not_have_one(self,
            calculate_corpus_freqdist):
        """
        This is a regression test. There used to be a bug that returned 404
        before queueing the analysis if the corpus didn't have a freqdist
        analysis yet.
        """
        self.user = User.objects.get(username="admin")
        self.client.login(username="admin", password="admin")

        corpus = self.user.corpus_set.all()[0]
        response = self.client.put(reverse('corpus-freqdist',
            kwargs={"pk": corpus.id}))

        # ``in``-based assertion instead of the deprecated dict.has_key().
        self.assertNotIn("freqdist", corpus.properties)

        self.assertEqual(response.status_code, 200)
        self.assertTrue(calculate_corpus_freqdist.called)
        calculate_corpus_freqdist.assert_called_with(corpus)

    @patch('pypln.web.core.views.calculate_corpus_freqdist')
    def test_queue_freqdist_analysis_for_a_corpus_that_has_one(self,
            calculate_corpus_freqdist):
        self.user = User.objects.get(username="user")
        self.client.login(username="user", password="user")

        corpus = self.user.corpus_set.all()[0]
        response = self.client.put(reverse('corpus-freqdist',
            kwargs={"pk": corpus.id}))

        # ``in``-based assertion instead of the deprecated dict.has_key().
        self.assertIn("freqdist", corpus.properties)

        self.assertEqual(response.status_code, 200)
        self.assertTrue(calculate_corpus_freqdist.called)
        calculate_corpus_freqdist.assert_called_with(corpus)
3 changes: 3 additions & 0 deletions pypln/web/core/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from django.conf.urls import patterns, url, include
from rest_framework.urlpatterns import format_suffix_patterns
from pypln.web.core.views import CorpusList, CorpusDetail, CorpusDocumentList
from pypln.web.core.views import CorpusFreqDist
from pypln.web.core.views import DocumentList, DocumentDetail
from pypln.web.core.views import PropertyList, PropertyDetail

Expand All @@ -28,6 +29,8 @@
url(r'^user/api-token/$', 'auth_token', name='auth_token'),
url(r'^corpora/$', CorpusList.as_view(), name='corpus-list'),
url(r'^corpora/(?P<pk>\d+)/$', CorpusDetail.as_view(), name='corpus-detail'),
url(r'^corpora/(?P<pk>\d+)/freqdist/$', CorpusFreqDist.as_view(),
name='corpus-freqdist'),
url(r'^corpora/(?P<pk>\d+)/documents/$', CorpusDocumentList.as_view(),
name='corpus-document-list'),
url(r'^documents/$', DocumentList.as_view(), name='document-list'),
Expand Down
35 changes: 34 additions & 1 deletion pypln/web/core/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
from rest_framework.response import Response
from rest_framework import serializers

from pypln.web.backend_adapter.pipelines import create_pipeline_from_document
from pypln.web.backend_adapter.pipelines import (create_pipeline_from_document,
calculate_corpus_freqdist)
from pypln.web.core.models import Corpus, Document
from pypln.web.core.serializers import CorpusSerializer, DocumentSerializer
from pypln.web.core.serializers import PropertyListSerializer
Expand Down Expand Up @@ -116,6 +117,38 @@ def get_queryset(self):
def perform_update(self, serializer):
instance = serializer.save(owner=self.request.user)

class CorpusFreqDist(generics.RetrieveUpdateAPIView):
    """
    Shows FreqDist for the corpus

    `GET` requests will show the last calculated FreqDist for the corpus

    `PUT` requests will queue a new task for calculating the Corpus
    FreqDist using the documents currently contained in the corpus

    """
    model = Corpus
    permission_classes = (permissions.IsAuthenticated, )

    class CorpusFreqDistSerializer(serializers.Serializer):
        # Exposes the "freqdist" entry of Corpus.properties as `value`.
        value = serializers.ReadOnlyField(source="properties.freqdist")

    serializer_class = CorpusFreqDistSerializer

    def get_queryset(self):
        # Only corpora owned by the requesting user are visible.
        return Corpus.objects.filter(owner=self.request.user)

    def retrieve(self, *args, **kwargs):
        # Respond with 404 until a FreqDist has been stored for the corpus.
        corpus = self.get_object()
        if "freqdist" not in corpus.properties:
            raise Http404("FreqDist for Corpus {} is not yet available".format(corpus))
        # super() already binds ``self``; passing it again would shift the
        # view into the ``request`` position of the parent's retrieve().
        return super(CorpusFreqDist, self).retrieve(*args, **kwargs)

    def perform_update(self, serializer):
        # PUT does not modify the corpus itself; it only triggers a new
        # FreqDist calculation for it.
        calculate_corpus_freqdist(serializer.instance)


class DocumentList(generics.ListCreateAPIView):
"""
Lists all documents available to the current user and creates new documents.
Expand Down
1 change: 1 addition & 0 deletions pypln/web/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def split_uris(uri):

# MongoDB database and collection names; each can be overridden through
# config() and falls back to the default given here.
MONGODB_DBNAME = config('MONGODB_DBNAME', default='pypln')
MONGODB_COLLECTION = config('MONGODB_COLLECTION', default='analysis')
# Collection that holds the corpus-level analysis documents.
MONGODB_CORPORA_COLLECTION = config('MONGODB_CORPORA_COLLECTION', default='corpora_analysis')

ALLOWED_HOSTS = config('ALLOWED_HOSTS', cast=Csv())

Expand Down