Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

File inspection and Mimetype Limits on Document Upload Mutation. #144

Merged
merged 2 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions config/graphql/mutations.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from django.db import transaction
from django.db.models import Q
from django.utils import timezone
from filetype import filetype
from graphene.types.generic import GenericScalar
from graphql import GraphQLError
from graphql_jwt.decorators import login_required, user_passes_test
Expand Down Expand Up @@ -844,16 +845,31 @@ def mutate(
)

try:
file_bytes = base64.b64decode(base64_file_string)

# Check file type
kind = filetype.guess(file_bytes)
if kind is None:
return UploadDocument(
message="Unable to determine file type", ok=False, document=None
)

if kind.mime not in settings.ALLOWED_DOCUMENT_MIMETYPES:
return UploadDocument(
message=f"Unallowed filetype: {kind.mime}", ok=False, document=None
)

user = info.context.user
pdf_file = ContentFile(base64.b64decode(base64_file_string), name=filename)
document = Document.objects.create(
pdf_file = ContentFile(file_bytes, name=filename)
document = Document(
creator=user,
title=title,
description=description,
custom_meta=custom_meta,
pdf_file=pdf_file,
backend_lock=True,
)
document.save()
set_permissions_for_obj_to_user(user, document, [PermissionTypes.CRUD])
ok = True
message = "Success"
Expand Down
5 changes: 5 additions & 0 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@
MIGRATION_MODULES = {"sites": "opencontractserver.contrib.sites.migrations"}

# USER LIMITS (FOR USERS WITH IS_USAGE_CAPPED=True)
# ------------------------------------------------------------------------------
# NOTE(review): this setting is named ..._DOC_CAP_COUNT but its value is read from
# the "USAGE_CAPPED_USER_CORPUS_CAP_COUNT" environment variable. That looks like a
# copy/paste slip; confirm which variable deployments are expected to set before
# changing it, since renaming would alter existing configurations.
USAGE_CAPPED_USER_DOC_CAP_COUNT = env.int(
"USAGE_CAPPED_USER_CORPUS_CAP_COUNT", default=10
)
Expand All @@ -129,6 +130,10 @@
"USAGE_CAPPED_USER_CAN_EXPORT_CORPUS", default=True
)

# UPLOAD CONTROLS
# ------------------------------------------------------------------------------
# MIME types the document upload mutation will accept. The mutation sniffs the
# decoded upload bytes and answers ok=False with an "Unallowed filetype" message
# when the detected MIME type is not in this list.
ALLOWED_DOCUMENT_MIMETYPES = ["application/pdf"]

# AUTHENTICATION
# ------------------------------------------------------------------------------
# https://docs.djangoproject.com/en/dev/ref/settings/#authentication-backends
Expand Down
119 changes: 119 additions & 0 deletions opencontractserver/tests/test_document_uploads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import io
from unittest.mock import patch

from django.contrib.auth import get_user_model
from django.test import TestCase
from docx import Document
from graphene.test import Client

from config.graphql.schema import schema
from opencontractserver.utils.pdf import base_64_encode_bytes

# Resolve the project's active user model once, instead of importing a concrete class.
User = get_user_model()


class TestContext:
    """Minimal stand-in for the context object passed to GraphQL resolvers.

    It carries only the ``user`` attribute, which is what the upload mutation
    reads from ``info.context``.

    Args:
        user: The user the mutation should treat as the requester.
    """

    # The class name starts with "Test", so pytest would otherwise try to collect
    # this helper as a test class and warn that it defines ``__init__``.
    __test__ = False

    def __init__(self, user):
        self.user = user


class UploadDocumentMutationTestCase(TestCase):
    """Exercise the MIME-type gate of the ``uploadDocument`` GraphQL mutation.

    Three payloads are sent through the mutation:

    * a PDF, which must be accepted;
    * a DOCX, which must be rejected with its detected MIME type in the message;
    * plain text, which must be rejected because no file type can be determined.

    Each payload runs in its own ``subTest`` so that one failing scenario does
    not hide the outcome of the others.
    """

    # GraphQL document shared by every scenario.
    UPLOAD_MUTATION = """
        mutation UploadDocument($file: String!, $filename: String!, $title: String!, $description: String!, $customMeta: GenericScalar!) {
            uploadDocument(
                base64FileString: $file,
                filename: $filename,
                title: $title,
                description: $description,
                customMeta: $customMeta
            ) {
                ok
                message
                document {
                    id
                    title
                }
            }
        }
    """  # noqa

    def setUp(self):
        """Create a user and a graphene client whose context carries that user."""
        self.user = User.objects.create_user(
            username="testuser", password="testpassword"
        )
        self.client = Client(schema, context_value=TestContext(self.user))

    def _upload(self, file_bytes, filename, title, description):
        """Base64-encode ``file_bytes``, run the mutation and return the raw result."""
        return self.client.execute(
            self.UPLOAD_MUTATION,
            variables={
                "file": base_64_encode_bytes(file_bytes),
                "filename": filename,
                "title": title,
                "description": description,
                "customMeta": {},
            },
        )

    def _assert_rejected(self, result, expected_message):
        """Assert the mutation answered cleanly with ok=False and ``expected_message``."""
        self.assertIsNone(result.get("errors"))
        payload = result["data"]["uploadDocument"]
        self.assertFalse(payload["ok"])
        self.assertEqual(payload["message"], expected_message)

    def test_upload_document_mime_type_check(self):
        """PDF uploads succeed; DOCX and undetectable content are refused."""
        # Just enough of a PDF header for signature-based type detection.
        pdf_content = b"%PDF-1.5\n%\xe2\xe3\xcf\xd3\n"

        # Build a real DOCX in memory so its signature is genuine.
        docx_buffer = io.BytesIO()
        doc = Document()
        doc.add_paragraph("This is a test DOCX file.")
        doc.save(docx_buffer)
        docx_content = docx_buffer.getvalue()

        txt_content = b"This is a text file."

        with self.subTest(upload="pdf"):
            # NOTE(review): the mutation in this change builds ``Document(...)`` and
            # calls ``save()`` rather than ``Document.objects.create``, so this patch
            # appears to be a no-op. Confirm and drop it together with the import.
            with patch(
                "opencontractserver.documents.models.Document.objects.create"
            ) as mock_create:
                mock_create.return_value = None
                result = self._upload(
                    pdf_content, "test.pdf", "Test PDF", "A test PDF file"
                )

            self.assertIsNone(result.get("errors"))
            self.assertTrue(result["data"]["uploadDocument"]["ok"])
            self.assertEqual(result["data"]["uploadDocument"]["message"], "Success")

        with self.subTest(upload="docx"):
            result = self._upload(
                docx_content, "test.docx", "Test DOCX", "A test DOCX file"
            )
            self._assert_rejected(
                result,
                "Unallowed filetype: application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # noqa
            )

        with self.subTest(upload="txt"):
            result = self._upload(
                txt_content, "test.txt", "Test TXT", "A test TXT file"
            )
            self._assert_rejected(result, "Unable to determine file type")
3 changes: 2 additions & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ drf-extra-fields==3.4.1 # https://github.com/Hipo/drf-extra-fields
# ------------------------------------------------------------------------------
# Pawls preprocessors are available as a command line utility in their repo for now
# BUT we can install them from their github repo subdirectory using the syntax below:
git+https://github.com/JSv4/[email protected]
git+https://github.com/JSv4/[email protected] # TODO - DEPRECATED. REMOVE.
scikit-learn
pdfplumber
pytesseract
Expand All @@ -64,6 +64,7 @@ marvin==2.3.4
# Data Processing Tools
# -------------------------------------------------------------------------------
opencv-python==4.7.0.68 # https://github.com/opencv/opencv-python
filetype==1.2.0 # https://github.com/h2non/filetype.py

# Permissioning
# ------------------------------------------------------------------------------
Expand Down
4 changes: 4 additions & 0 deletions requirements/local.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,7 @@ pytest-django==4.5.2 # https://github.com/pytest-dev/pytest-django
# PDFs
# ------------------------------------------------------------------------------
pypdf

# Docx
# ------------------------------------------------------------------------------
python-docx