Skip to content

Commit

Permalink
Merge pull request #144 from JSv4/JSv4/add-mimetype-checks
Browse files Browse the repository at this point in the history
File inspection and Mimetype Limits on Document Upload Mutation.
  • Loading branch information
JSv4 authored Jul 11, 2024
2 parents 55d5525 + 24ebca3 commit b96f2ea
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 3 deletions.
20 changes: 18 additions & 2 deletions config/graphql/mutations.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from django.db import transaction
from django.db.models import Q
from django.utils import timezone
from filetype import filetype
from graphene.types.generic import GenericScalar
from graphql import GraphQLError
from graphql_jwt.decorators import login_required, user_passes_test
Expand Down Expand Up @@ -844,16 +845,31 @@ def mutate(
)

try:
file_bytes = base64.b64decode(base64_file_string)

# Check file type
kind = filetype.guess(file_bytes)
if kind is None:
return UploadDocument(
message="Unable to determine file type", ok=False, document=None
)

if kind.mime not in settings.ALLOWED_DOCUMENT_MIMETYPES:
return UploadDocument(
message=f"Unallowed filetype: {kind.mime}", ok=False, document=None
)

user = info.context.user
pdf_file = ContentFile(base64.b64decode(base64_file_string), name=filename)
document = Document.objects.create(
pdf_file = ContentFile(file_bytes, name=filename)
document = Document(
creator=user,
title=title,
description=description,
custom_meta=custom_meta,
pdf_file=pdf_file,
backend_lock=True,
)
document.save()
set_permissions_for_obj_to_user(user, document, [PermissionTypes.CRUD])
ok = True
message = "Success"
Expand Down
5 changes: 5 additions & 0 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@
MIGRATION_MODULES = {"sites": "opencontractserver.contrib.sites.migrations"}

# USER LIMITS (FOR USERS WITH IS_USAGE_CAPPED=True)
# ------------------------------------------------------------------------------
# Integer cap read from the environment, defaulting to 10.
# NOTE(review): the setting is named ..._DOC_CAP_COUNT but the environment
# variable it reads is USAGE_CAPPED_USER_CORPUS_CAP_COUNT. This looks like a
# copy/paste slip; confirm which variable deployments actually set before
# renaming either side.
USAGE_CAPPED_USER_DOC_CAP_COUNT = env.int(
"USAGE_CAPPED_USER_CORPUS_CAP_COUNT", default=10
)
Expand All @@ -129,6 +130,10 @@
"USAGE_CAPPED_USER_CAN_EXPORT_CORPUS", default=True
)

# UPLOAD CONTROLS
# ------------------------------------------------------------------------------
# MIME types accepted by the document upload mutation. The mutation sniffs the
# decoded file bytes with `filetype.guess` and rejects the upload when the
# detected MIME type is not in this list (or cannot be determined at all).
ALLOWED_DOCUMENT_MIMETYPES = ["application/pdf"]

# AUTHENTICATION
# ------------------------------------------------------------------------------
# https://docs.djangoproject.com/en/dev/ref/settings/#authentication-backends
Expand Down
119 changes: 119 additions & 0 deletions opencontractserver/tests/test_document_uploads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import io
from unittest.mock import patch

from django.contrib.auth import get_user_model
from django.test import TestCase
from docx import Document
from graphene.test import Client

from config.graphql.schema import schema
from opencontractserver.utils.pdf import base_64_encode_bytes

User = get_user_model()


class TestContext:
    """Minimal stand-in for the GraphQL execution context used in tests.

    An instance is handed to ``graphene.test.Client`` as ``context_value``;
    resolvers then read the acting user from ``info.context.user``.

    Args:
        user: The user object to expose as ``.user`` on the context.
    """

    # The ``Test`` name prefix matches pytest's default class-collection
    # pattern. Opt out explicitly so pytest does not try to collect this
    # helper and emit a collection warning about its ``__init__``.
    __test__ = False

    def __init__(self, user):
        self.user = user


class UploadDocumentMutationTestCase(TestCase):
    """Exercise the MIME-type gate on the ``uploadDocument`` mutation.

    Three payloads are submitted: a PDF (expected to be accepted), a DOCX
    (expected to be rejected as a disallowed type) and plain text (expected to
    be rejected because its type cannot be determined).
    """

    # GraphQL document shared by every scenario. Whitespace is not significant
    # to GraphQL, so the signature is wrapped to keep lines short.
    UPLOAD_MUTATION = """
        mutation UploadDocument(
            $file: String!,
            $filename: String!,
            $title: String!,
            $description: String!,
            $customMeta: GenericScalar!
        ) {
            uploadDocument(
                base64FileString: $file,
                filename: $filename,
                title: $title,
                description: $description,
                customMeta: $customMeta
            ) {
                ok
                message
                document {
                    id
                    title
                }
            }
        }
    """

    def setUp(self):
        """Create the acting user and a graphene client bound to that user."""
        self.user = User.objects.create_user(
            username="testuser", password="testpassword"
        )
        self.client = Client(schema, context_value=TestContext(self.user))

    @staticmethod
    def _build_docx_bytes():
        """Return the bytes of a small, valid DOCX file built in memory."""
        buffer = io.BytesIO()
        docx_file = Document()
        docx_file.add_paragraph("This is a test DOCX file.")
        docx_file.save(buffer)
        return buffer.getvalue()

    def _upload(self, content, filename, title, description):
        """Run the upload mutation and return its ``uploadDocument`` payload.

        Args:
            content: Raw file bytes; base64-encoded before submission.
            filename: Value for the ``filename`` variable.
            title: Value for the ``title`` variable.
            description: Value for the ``description`` variable.

        Returns:
            The ``data.uploadDocument`` dict from the execution result.

        Fails the current (sub)test if the GraphQL layer reports errors.
        """
        result = self.client.execute(
            self.UPLOAD_MUTATION,
            variables={
                "file": base_64_encode_bytes(content),
                "filename": filename,
                "title": title,
                "description": description,
                "customMeta": {},
            },
        )
        self.assertIsNone(result.get("errors"))
        return result["data"]["uploadDocument"]

    def test_upload_document_mime_type_check(self):
        """Accept PDFs; reject DOCX and undetectable file types.

        Each scenario runs in its own ``subTest`` so that a failure in one
        does not hide the outcome of the others.
        """
        with self.subTest("PDF upload is accepted"):
            # Only the leading magic bytes are needed for type detection.
            pdf_content = b"%PDF-1.5\n%\xe2\xe3\xcf\xd3\n"

            # NOTE(review): the mutation in this change constructs
            # `Document(...)` and calls `.save()` instead of
            # `Document.objects.create`, so this patch probably intercepts
            # nothing. Confirm and drop it if so.
            with patch(
                "opencontractserver.documents.models.Document.objects.create"
            ) as mock_create:
                mock_create.return_value = None
                payload = self._upload(
                    pdf_content, "test.pdf", "Test PDF", "A test PDF file"
                )

            self.assertTrue(payload["ok"])
            self.assertEqual(payload["message"], "Success")

        with self.subTest("DOCX upload is rejected"):
            payload = self._upload(
                self._build_docx_bytes(),
                "test.docx",
                "Test DOCX",
                "A test DOCX file",
            )

            self.assertFalse(payload["ok"])
            self.assertEqual(
                payload["message"],
                "Unallowed filetype: "
                "application/vnd.openxmlformats-officedocument"
                ".wordprocessingml.document",
            )

        with self.subTest("Plain-text upload is rejected"):
            payload = self._upload(
                b"This is a text file.",
                "test.txt",
                "Test TXT",
                "A test TXT file",
            )

            self.assertFalse(payload["ok"])
            self.assertEqual(payload["message"], "Unable to determine file type")
3 changes: 2 additions & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ drf-extra-fields==3.4.1 # https://github.com/Hipo/drf-extra-fields
# ------------------------------------------------------------------------------
# Pawls preprocessors are available as a command line utility in their repo for now
# BUT we can install them from their github repo subdirectory using the syntax below:
git+https://github.com/JSv4/[email protected]
git+https://github.com/JSv4/[email protected] # TODO - DEPRECATED. REMOVE.
scikit-learn
pdfplumber
pytesseract
Expand All @@ -64,6 +64,7 @@ marvin==2.3.4
# Data Processing Tools
# -------------------------------------------------------------------------------
opencv-python==4.7.0.68 # https://github.com/opencv/opencv-python
filetype==1.2.0 # https://github.com/h2non/filetype.py

# Permissioning
# ------------------------------------------------------------------------------
Expand Down
4 changes: 4 additions & 0 deletions requirements/local.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,7 @@ pytest-django==4.5.2 # https://github.com/pytest-dev/pytest-django
# PDFs
# ------------------------------------------------------------------------------
pypdf

# Docx
# ------------------------------------------------------------------------------
python-docx

0 comments on commit b96f2ea

Please sign in to comment.