Merge pull request #170 from JSv4/JSv4/upgrade-django-lts

Upgrade Django LTS. Finally got the coverage checks to pass!
Author: JSv4 · Date: Jul 27, 2024 · Commit: d26b78c
Parents (2): 4791f48 + 25db223

Showing 13 changed files with 465 additions and 135 deletions.
diff --git a/config/graphql/custom_connections.py b/config/graphql/custom_connections.py
@@ -13,18 +13,18 @@ class Meta:
     page_count = Int()
 
     def resolve_current_page(root, info, **kwargs):
-        print(
-            f"PdfPageAwareConnection- resolve_total_count kwargs: {kwargs} / root {dir(root)} / iteracble "
-            f"{root.iterable.count()}"
-        )
+        # print(
+        #     f"PdfPageAwareConnection- resolve_total_count kwargs: {kwargs} / root {dir(root)} / iteracble "
+        #     f"{root.iterable.count()}"
+        # )
         return 1
 
     def resolve_page_count(root, info, **kwargs):
 
         largest_page_number = max(
             list(root.iterable.values_list("page", flat=True).distinct())
         )
-        print(f"Unique page list: {largest_page_number}")
+        # print(f"Unique page list: {largest_page_number}")
 
-        print(f"PdfPageAwareConnection - resolve_edge_count kwargs: {kwargs}")
+        # print(f"PdfPageAwareConnection - resolve_edge_count kwargs: {kwargs}")
         return largest_page_number
diff --git a/config/graphql/filters.py b/config/graphql/filters.py
@@ -144,7 +144,7 @@ def filter_by_label_from_labelset_id(self, queryset, info, value):
 
     def filter_by_created_by_analysis_ids(self, queryset, info, value):
 
-        print(f"filter_by_created_by_analysis_ids - value: {value}")
+        # print(f"filter_by_created_by_analysis_ids - value: {value}")
 
         analysis_ids = value.split(",")
         if "~~MANUAL~~" in analysis_ids:
@@ -219,14 +219,14 @@ def filter_by_labelset_id(self, queryset, name, value):
 
     def filter_by_used_in_labelset_for_corpus_id(self, queryset, name, value):
 
-        print(f"Raw corpus id: {value}")
+        # print(f"Raw corpus id: {value}")
         django_pk = from_global_id(value)[1]
-        print("Lookup labels for pk", django_pk)
+        # print("Lookup labels for pk", django_pk)
         queryset = queryset.filter(Q(included_in_labelset__used_by_corpus=django_pk))
-        print(
-            "Filtered to values",
-            queryset,
-        )
+        # print(
+        #     "Filtered to values",
+        #     queryset,
+        # )
         return queryset.filter(included_in_labelset__used_by_corpus=django_pk)
 
     class Meta:

diff --git a/config/graphql/mutations.py b/config/graphql/mutations.py
@@ -63,14 +63,8 @@
     package_annotated_docs,
 )
 from opencontractserver.tasks.analyzer_tasks import start_analysis
-from opencontractserver.tasks.doc_tasks import (
-    convert_doc_to_funsd,
-    convert_doc_to_langchain_task,
-)
-from opencontractserver.tasks.export_tasks import (
-    package_funsd_exports,
-    package_langchain_exports,
-)
+from opencontractserver.tasks.doc_tasks import convert_doc_to_funsd
+from opencontractserver.tasks.export_tasks import package_funsd_exports
 from opencontractserver.tasks.extract_orchestrator_tasks import run_extract
 from opencontractserver.tasks.permissioning_tasks import (
     make_analysis_public_task,
@@ -543,7 +537,7 @@ def mutate(root, info, corpus_id, query):
                 creator=info.context.user,
                 corpus_id=from_global_id(corpus_id)[1],
             )
-            print(f"Obj created: {obj}")
+            # print(f"Obj created: {obj}")
             set_permissions_for_obj_to_user(
                 info.context.user, obj, [PermissionTypes.CRUD]
             )
@@ -611,16 +605,6 @@ def mutate(root, info, corpus_id, export_format):
                     ),
                 ).apply_async()
 
-                ok = True
-                message = "SUCCESS"
-            elif export_format == ExportType.LANGCHAIN.value:
-                chord(
-                    group(
-                        convert_doc_to_langchain_task.s(doc_id, corpus_pk)
-                        for doc_id in doc_ids
-                    ),
-                    package_langchain_exports.s(export.id, corpus_pk),
-                ).apply_async()
                 ok = True
                 message = "SUCCESS"
             elif export_format == ExportType.FUNSD:
@@ -1735,15 +1719,15 @@ def mutate(
         corpus = None
         if corpus_id is not None:
             corpus = Corpus.objects.get(pk=from_global_id(corpus_id)[1])
-            print(f"Corpus is: {corpus}")
+            # print(f"Corpus is: {corpus}")
 
         if fieldset_id is not None:
-            print(f"Fieldset id is not None: {fieldset_id}")
+            # print(f"Fieldset id is not None: {fieldset_id}")
             fieldset = Fieldset.objects.get(pk=from_global_id(fieldset_id)[1])
         else:
             if fieldset_name is None:
                 fieldset_name = f"{name} Fieldset"
-            print(f"Creating new fieldset... name will be: {fieldset_name}")
+            # print(f"Creating new fieldset... name will be: {fieldset_name}")
 
             fieldset = Fieldset.objects.create(
                 name=fieldset_name,
@@ -1765,7 +1749,7 @@ def mutate(
         extract.save()
 
         if corpus is not None:
-            print(f"Try to add corpus docs: {corpus.documents.all()}")
+            # print(f"Try to add corpus docs: {corpus.documents.all()}")
             extract.documents.add(*corpus.documents.all())
         else:
             print("Corpus IS still None... no docs to add.")
@@ -1833,7 +1817,7 @@ def mutate(root, info, extract_id, document_ids):
             doc_objs = Document.objects.filter(
                 Q(pk__in=doc_pks) & (Q(creator=user) | Q(is_public=True))
             )
-            print(f"Add documents to extract {extract}: {doc_objs}")
+            # print(f"Add documents to extract {extract}: {doc_objs}")
             extract.documents.add(*doc_objs)
 
             ok = True

diff --git a/config/graphql/queries.py b/config/graphql/queries.py
@@ -141,8 +141,6 @@ def resolve_bulk_doc_annotations_in_corpus(self, info, corpus_id, **kwargs):
                 Q(creator=info.context.user) | Q(is_public=True)
             )
 
-        print(f"Base queryset: {queryset}")
-
         # Now build query to stuff they want to see (filter to annotations in this corpus or with NO corpus FK, which
         # travel with document.
         q_objects = Q(corpus_id=corpus_django_pk) | Q(corpus_id__isnull=True)

diff --git a/opencontractserver/shared/fields.py b/opencontractserver/shared/fields.py
@@ -1,12 +1,10 @@
-import io
 import json
 import logging
 
-import PyPDF2
-from django.contrib.postgres.forms import JSONField
 from django.db.models import JSONField as DbJSONField
-from django.forms.fields import InvalidJSONInput
+from django.forms.fields import InvalidJSONInput, JSONField
 from drf_extra_fields.fields import Base64FileField
+from filetype import filetype
 
 # Logging setup
 logger = logging.getLogger(__name__)
@@ -15,24 +13,26 @@
 
 # Field to accept base64-encoded file strings for PDF only as a field on our serializers
 class PDFBase64File(Base64FileField):
-    ALLOWED_TYPES = ["pdf"]
+
+    ALLOWED_TYPES = ("pdf",)
 
     def get_file_extension(self, filename, decoded_file):
-        try:
-            PyPDF2.PdfFileReader(io.BytesIO(decoded_file))
-        except PyPDF2.utils.PdfReadError as e:
-            logger.warning(e)
+
+        # Check file type
+        kind = filetype.guess(decoded_file)
+        if kind is None:
+            logger.warning("Could not determine valid filetype")
+            return None
+        elif kind.mime != "application/pdf":
+            logger.warning(f"Not a PDF: {kind.mime}")
+            return None
         else:
             return "pdf"
 
 
 # Needed to override the default JSONField due to some undesired validation behavior where an empty dict throws a
 # validation error for JSONField.
 # See: https://stackoverflow.com/questions/55147169/django-admin-jsonfield-default-empty-dict-wont-save-in-admin
-class MyJSONField(JSONField):
-
-    empty_values = [None, "", [], ()]
-
 
 # Combined a couple things into a single custom JSON Field...
 #
@@ -48,7 +48,7 @@ class MyJSONField(JSONField):
 # http://blog.qax.io/unescaped-utf-8-in-djangos-admin-with-jsonfield/
 class UTF8JSONFormField(JSONField):
 
-    empty_values = [None, "", [], ()]
+    empty_values = [None, "", [], (), {}]
 
     def prepare_value(self, value):
         if isinstance(value, InvalidJSONInput):
@@ -66,7 +66,7 @@ class NullableJSONField(DbJSONField):
     Also lets you have null inputs, which otherwise throw a validation error...
     """
 
-    empty_values = [None, "", [], ()]
+    empty_values = [None, "", [], (), {}]
 
     def formfield(self, **kwargs):
         return super().formfield(**{"form_class": UTF8JSONFormField, **kwargs})
diff --git a/opencontractserver/tasks/doc_tasks.py b/opencontractserver/tasks/doc_tasks.py
@@ -19,7 +19,6 @@
 from config import celery_app
 from config.graphql.serializers import AnnotationLabelSerializer
 from opencontractserver.annotations.models import (
-    METADATA_LABEL,
     TOKEN_LABEL,
     Annotation,
     AnnotationLabel,
@@ -428,34 +427,6 @@ def burn_doc_annotations(
     )
 
 
-@celery_app.task()
-@validate_arguments
-def convert_doc_to_langchain_task(doc_id: int, corpus_id: int) -> tuple[str, dict]:
-    """
-    Given a doc and corpus, export text and all metadata in a tuple that can then be combined and exported for langchain
-    """
-    doc = Document.objects.get(id=doc_id)
-
-    metadata_annotations = Annotation.objects.filter(
-        document_id=doc_id,
-        corpus_id=corpus_id,
-        annotation_label__label_type=METADATA_LABEL,
-    )
-
-    if doc.txt_extract_file.name:
-        with doc.txt_extract_file.open("r") as txt_file:
-            text = txt_file.read()
-    else:
-        text = ""
-
-    metadata_json = {"doc_id": doc_id, "corpus_id": corpus_id}
-
-    for metadata in metadata_annotations:
-        metadata_json[metadata.annotation_label.text] = metadata.raw_text
-
-    return text, metadata_json
-
-
 @celery_app.task()
 def convert_doc_to_funsd(
     user_id: int, doc_id: int, corpus_id: int

diff --git a/opencontractserver/tasks/export_tasks.py b/opencontractserver/tasks/export_tasks.py
@@ -9,7 +9,6 @@
 from celery import shared_task
 from django.conf import settings
 from django.contrib.auth import get_user_model
-from django.core.files.base import ContentFile
 from django.utils import timezone
 
 from opencontractserver.corpuses.models import Corpus
@@ -105,34 +104,6 @@ def package_annotated_docs(
     logger.info(f"Export {export_id} is completed. Signal should now notify creator.")
 
 
-@shared_task
-def package_langchain_exports(
-    burned_docs: tuple[tuple[str, dict]],
-    export_id: str | int,
-    corpus_pk: str | int,
-):
-
-    logger.info(f"Package corpus for export {export_id}...")
-
-    langchain_export = []
-    corpus = Corpus.objects.get(id=corpus_pk)
-
-    for doc in burned_docs:
-
-        langchain_export.append({"page_content": doc[0], "metdata": doc[1]})
-
-    json_str = json.dumps(langchain_export)
-    json_file = ContentFile(json_str.encode("utf-8"))
-
-    export = UserExport.objects.get(pk=export_id)
-    export.file.save(f"{corpus.title} LangChain Export.json", json_file)
-    export.finished = timezone.now()
-    export.backend_lock = False
-    export.save()
-
-    logger.info(f"Export {export_id} is completed. Signal should now notify creator.")
-
-
 @shared_task
 def package_funsd_exports(
     funsd_data: tuple[

diff --git a/opencontractserver/tests/test_corpus_import.py b/opencontractserver/tests/test_corpus_import.py
@@ -34,16 +34,16 @@ def test_import(self):
         export_zip_base64_file_string = package_zip_into_base64(
             self.fixtures_path / "Test_Corpus_EXPORT.zip"
         )
-        print("\t\tLOADED")
+        # print("\t\tLOADED")
 
-        print("2)\tCreate seed corpus to import data into...")
+        # print("2)\tCreate seed corpus to import data into...")
         corpus_obj = Corpus.objects.create(
             title="New Import", creator=self.user, backend_lock=False
         )
         set_permissions_for_obj_to_user(self.user, corpus_obj, [PermissionTypes.ALL])
-        print("\t\tCREATED")
+        # print("\t\tCREATED")
 
-        print("3)\tBuild celery task to import")
+        # print("3)\tBuild celery task to import")
         base64_img_bytes = export_zip_base64_file_string.encode("utf-8")
         decoded_file_data = base64.decodebytes(base64_img_bytes)
 
@@ -53,12 +53,12 @@ def test_import(self):
                 ContentFile(decoded_file_data, name=f"corpus_import_{uuid.uuid4()}.pdf")
             )
         import_task = import_corpus.s(temporary_file.id, self.user.id, corpus_obj.id)
-        print("\t\tBUILT")
+        # print("\t\tBUILT")
 
-        print("4)\tRun the celery task...")
+        # print("4)\tRun the celery task...")
         import_results = import_task.apply().get()
         assert isinstance(import_results, str)
-        print("\t\tCOMPLETED")
+        # print("\t\tCOMPLETED")
 
         labels = AnnotationLabel.objects.all()
         assert labels.count() == 2