diff --git a/config/graphql/mutations.py b/config/graphql/mutations.py index 2340c1aa..a19135e6 100644 --- a/config/graphql/mutations.py +++ b/config/graphql/mutations.py @@ -11,6 +11,7 @@ from django.db import transaction from django.db.models import Q from django.utils import timezone +from filetype import filetype from graphene.types.generic import GenericScalar from graphql import GraphQLError from graphql_jwt.decorators import login_required, user_passes_test @@ -844,9 +845,23 @@ def mutate( ) try: + file_bytes = base64.b64decode(base64_file_string) + + # Check file type + kind = filetype.guess(file_bytes) + if kind is None: + return UploadDocument( + message="Unable to determine file type", ok=False, document=None + ) + + if kind.mime not in settings.ALLOWED_DOCUMENT_MIMETYPES: + return UploadDocument( + message=f"Unallowed filetype: {kind.mime}", ok=False, document=None + ) + user = info.context.user - pdf_file = ContentFile(base64.b64decode(base64_file_string), name=filename) - document = Document.objects.create( + pdf_file = ContentFile(file_bytes, name=filename) + document = Document( creator=user, title=title, description=description, @@ -854,6 +869,7 @@ def mutate( pdf_file=pdf_file, backend_lock=True, ) + document.save() set_permissions_for_obj_to_user(user, document, [PermissionTypes.CRUD]) ok = True message = "Success" diff --git a/config/settings/base.py b/config/settings/base.py index e229dd62..d59c379f 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -116,6 +116,7 @@ MIGRATION_MODULES = {"sites": "opencontractserver.contrib.sites.migrations"} # USER LIMITS (FOR USERS WITH IS_USAGE_CAPPED=True) +# ------------------------------------------------------------------------------ USAGE_CAPPED_USER_DOC_CAP_COUNT = env.int( "USAGE_CAPPED_USER_CORPUS_CAP_COUNT", default=10 ) @@ -129,6 +130,10 @@ "USAGE_CAPPED_USER_CAN_EXPORT_CORPUS", default=True ) +# UPLOAD CONTROLS +# ------------------------------------------------------------------------------ +ALLOWED_DOCUMENT_MIMETYPES = ["application/pdf"] + # AUTHENTICATION # ------------------------------------------------------------------------------ # https://docs.djangoproject.com/en/dev/ref/settings/#authentication-backends diff --git a/opencontractserver/tests/test_document_uploads.py b/opencontractserver/tests/test_document_uploads.py new file mode 100644 index 00000000..362c8892 --- /dev/null +++ b/opencontractserver/tests/test_document_uploads.py @@ -0,0 +1,119 @@ +import io +from unittest.mock import patch + +from django.contrib.auth import get_user_model +from django.test import TestCase +from docx import Document +from graphene.test import Client + +from config.graphql.schema import schema +from opencontractserver.utils.pdf import base_64_encode_bytes + +User = get_user_model() + + +class TestContext: + def __init__(self, user): + self.user = user + + +class UploadDocumentMutationTestCase(TestCase): + def setUp(self): + self.user = User.objects.create_user( + username="testuser", password="testpassword" + ) + self.client = Client(schema, context_value=TestContext(self.user)) + + def test_upload_document_mime_type_check(self): + mutation = """ + mutation UploadDocument($file: String!, $filename: String!, $title: String!, $description: String!, $customMeta: GenericScalar!) { + uploadDocument( + base64FileString: $file, + filename: $filename, + title: $title, + description: $description, + customMeta: $customMeta + ) { + ok + message + document { + id + title + } + } + } + """ # noqa + + # Mock file content + pdf_content = b"%PDF-1.5\n%\xe2\xe3\xcf\xd3\n" + + # Generate DOCX content + docx_buffer = io.BytesIO() + doc = Document() + doc.add_paragraph("This is a test DOCX file.") + doc.save(docx_buffer) + docx_content = docx_buffer.getvalue() + + txt_content = b"This is a text file." + + # Encode file content + pdf_base64 = base_64_encode_bytes(pdf_content) + docx_base64 = base_64_encode_bytes(docx_content) + txt_base64 = base_64_encode_bytes(txt_content) + + # Test PDF upload (should succeed) + with patch( + "opencontractserver.documents.models.Document.objects.create" + ) as mock_create: + mock_create.return_value = None + result = self.client.execute( + mutation, + variables={ + "file": pdf_base64, + "filename": "test.pdf", + "title": "Test PDF", + "description": "A test PDF file", + "customMeta": {}, + }, + ) + + self.assertIsNone(result.get("errors")) + self.assertTrue(result["data"]["uploadDocument"]["ok"]) + self.assertEqual(result["data"]["uploadDocument"]["message"], "Success") + + # Test DOCX upload (should fail) + result = self.client.execute( + mutation, + variables={ + "file": docx_base64, + "filename": "test.docx", + "title": "Test DOCX", + "description": "A test DOCX file", + "customMeta": {}, + }, + ) + + self.assertIsNone(result.get("errors")) + self.assertFalse(result["data"]["uploadDocument"]["ok"]) + self.assertEqual( + result["data"]["uploadDocument"]["message"], + "Unallowed filetype: application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ) + + # Test TXT upload (should fail) + result = self.client.execute( + mutation, + variables={ + "file": txt_base64, + "filename": "test.txt", + "title": "Test TXT", + "description": "A test TXT file", + "customMeta": {}, + }, + ) + + self.assertIsNone(result.get("errors")) + self.assertFalse(result["data"]["uploadDocument"]["ok"]) + self.assertEqual( + result["data"]["uploadDocument"]["message"], "Unable to determine file type" + ) diff --git a/requirements/base.txt b/requirements/base.txt index 10ca5be3..b5b451f6 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -39,7 +39,7 @@ drf-extra-fields==3.4.1 # https://github.com/Hipo/drf-extra-fields # ------------------------------------------------------------------------------ # Pawls preprocessors are available as a command line utility in their repo for now # BUT we can install them from their github repo subdirectory using the syntax below: -git+https://github.com/JSv4/PDF-Preprocessors@v1.0.5 +git+https://github.com/JSv4/PDF-Preprocessors@v1.0.5 # TODO - DEPRECATED. REMOVE. scikit-learn pdfplumber pytesseract @@ -64,6 +64,7 @@ marvin==2.3.4 # Data Processing Tools # ------------------------------------------------------------------------------- opencv-python==4.7.0.68 # https://github.com/opencv/opencv-python +filetype==1.2.0 # https://github.com/h2non/filetype.py # Permissioning # ------------------------------------------------------------------------------ diff --git a/requirements/local.txt b/requirements/local.txt index 302e1554..75ee3b9c 100644 --- a/requirements/local.txt +++ b/requirements/local.txt @@ -47,3 +47,7 @@ pytest-django==4.5.2 # https://github.com/pytest-dev/pytest-django # PDFs # ------------------------------------------------------------------------------ pypdf + +# Docx +# ------------------------------------------------------------------------------ +python-docx