Skip to content

Commit

Permalink
Merge pull request #170 from JSv4/JSv4/upgrade-django-lts
Browse files Browse the repository at this point in the history
Upgrade Django LTS. Finally got the coverage checks to pass!
  • Loading branch information
JSv4 authored Jul 27, 2024
2 parents 4791f48 + 25db223 commit d26b78c
Show file tree
Hide file tree
Showing 13 changed files with 465 additions and 135 deletions.
12 changes: 6 additions & 6 deletions config/graphql/custom_connections.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,18 @@ class Meta:
page_count = Int()

def resolve_current_page(root, info, **kwargs):
print(
f"PdfPageAwareConnection- resolve_total_count kwargs: {kwargs} / root {dir(root)} / iteracble "
f"{root.iterable.count()}"
)
# print(
# f"PdfPageAwareConnection- resolve_total_count kwargs: {kwargs} / root {dir(root)} / iteracble "
# f"{root.iterable.count()}"
# )
return 1

def resolve_page_count(root, info, **kwargs):

largest_page_number = max(
list(root.iterable.values_list("page", flat=True).distinct())
)
print(f"Unique page list: {largest_page_number}")
# print(f"Unique page list: {largest_page_number}")

print(f"PdfPageAwareConnection - resolve_edge_count kwargs: {kwargs}")
# print(f"PdfPageAwareConnection - resolve_edge_count kwargs: {kwargs}")
return largest_page_number
14 changes: 7 additions & 7 deletions config/graphql/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def filter_by_label_from_labelset_id(self, queryset, info, value):

def filter_by_created_by_analysis_ids(self, queryset, info, value):

print(f"filter_by_created_by_analysis_ids - value: {value}")
# print(f"filter_by_created_by_analysis_ids - value: {value}")

analysis_ids = value.split(",")
if "~~MANUAL~~" in analysis_ids:
Expand Down Expand Up @@ -219,14 +219,14 @@ def filter_by_labelset_id(self, queryset, name, value):

def filter_by_used_in_labelset_for_corpus_id(self, queryset, name, value):

print(f"Raw corpus id: {value}")
# print(f"Raw corpus id: {value}")
django_pk = from_global_id(value)[1]
print("Lookup labels for pk", django_pk)
# print("Lookup labels for pk", django_pk)
queryset = queryset.filter(Q(included_in_labelset__used_by_corpus=django_pk))
print(
"Filtered to values",
queryset,
)
# print(
# "Filtered to values",
# queryset,
# )
return queryset.filter(included_in_labelset__used_by_corpus=django_pk)

class Meta:
Expand Down
32 changes: 8 additions & 24 deletions config/graphql/mutations.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,8 @@
package_annotated_docs,
)
from opencontractserver.tasks.analyzer_tasks import start_analysis
from opencontractserver.tasks.doc_tasks import (
convert_doc_to_funsd,
convert_doc_to_langchain_task,
)
from opencontractserver.tasks.export_tasks import (
package_funsd_exports,
package_langchain_exports,
)
from opencontractserver.tasks.doc_tasks import convert_doc_to_funsd
from opencontractserver.tasks.export_tasks import package_funsd_exports
from opencontractserver.tasks.extract_orchestrator_tasks import run_extract
from opencontractserver.tasks.permissioning_tasks import (
make_analysis_public_task,
Expand Down Expand Up @@ -543,7 +537,7 @@ def mutate(root, info, corpus_id, query):
creator=info.context.user,
corpus_id=from_global_id(corpus_id)[1],
)
print(f"Obj created: {obj}")
# print(f"Obj created: {obj}")
set_permissions_for_obj_to_user(
info.context.user, obj, [PermissionTypes.CRUD]
)
Expand Down Expand Up @@ -611,16 +605,6 @@ def mutate(root, info, corpus_id, export_format):
),
).apply_async()

ok = True
message = "SUCCESS"
elif export_format == ExportType.LANGCHAIN.value:
chord(
group(
convert_doc_to_langchain_task.s(doc_id, corpus_pk)
for doc_id in doc_ids
),
package_langchain_exports.s(export.id, corpus_pk),
).apply_async()
ok = True
message = "SUCCESS"
elif export_format == ExportType.FUNSD:
Expand Down Expand Up @@ -1735,15 +1719,15 @@ def mutate(
corpus = None
if corpus_id is not None:
corpus = Corpus.objects.get(pk=from_global_id(corpus_id)[1])
print(f"Corpus is: {corpus}")
# print(f"Corpus is: {corpus}")

if fieldset_id is not None:
print(f"Fieldset id is not None: {fieldset_id}")
# print(f"Fieldset id is not None: {fieldset_id}")
fieldset = Fieldset.objects.get(pk=from_global_id(fieldset_id)[1])
else:
if fieldset_name is None:
fieldset_name = f"{name} Fieldset"
print(f"Creating new fieldset... name will be: {fieldset_name}")
# print(f"Creating new fieldset... name will be: {fieldset_name}")

fieldset = Fieldset.objects.create(
name=fieldset_name,
Expand All @@ -1765,7 +1749,7 @@ def mutate(
extract.save()

if corpus is not None:
print(f"Try to add corpus docs: {corpus.documents.all()}")
# print(f"Try to add corpus docs: {corpus.documents.all()}")
extract.documents.add(*corpus.documents.all())
else:
print("Corpus IS still None... no docs to add.")
Expand Down Expand Up @@ -1833,7 +1817,7 @@ def mutate(root, info, extract_id, document_ids):
doc_objs = Document.objects.filter(
Q(pk__in=doc_pks) & (Q(creator=user) | Q(is_public=True))
)
print(f"Add documents to extract {extract}: {doc_objs}")
# print(f"Add documents to extract {extract}: {doc_objs}")
extract.documents.add(*doc_objs)

ok = True
Expand Down
2 changes: 0 additions & 2 deletions config/graphql/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,6 @@ def resolve_bulk_doc_annotations_in_corpus(self, info, corpus_id, **kwargs):
Q(creator=info.context.user) | Q(is_public=True)
)

print(f"Base queryset: {queryset}")

# Now build the query for what they want to see (filter to annotations in this corpus or with NO corpus FK, which
# travel with the document).
q_objects = Q(corpus_id=corpus_django_pk) | Q(corpus_id__isnull=True)
Expand Down
30 changes: 15 additions & 15 deletions opencontractserver/shared/fields.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import io
import json
import logging

import PyPDF2
from django.contrib.postgres.forms import JSONField
from django.db.models import JSONField as DbJSONField
from django.forms.fields import InvalidJSONInput
from django.forms.fields import InvalidJSONInput, JSONField
from drf_extra_fields.fields import Base64FileField
from filetype import filetype

# Logging setup
logger = logging.getLogger(__name__)
Expand All @@ -15,24 +13,26 @@

# Field to accept base64-encoded file strings for PDF only as a field on our serializers
class PDFBase64File(Base64FileField):
ALLOWED_TYPES = ["pdf"]

ALLOWED_TYPES = ("pdf",)

def get_file_extension(self, filename, decoded_file):
try:
PyPDF2.PdfFileReader(io.BytesIO(decoded_file))
except PyPDF2.utils.PdfReadError as e:
logger.warning(e)

# Check file type
kind = filetype.guess(decoded_file)
if kind is None:
logger.warning("Could not determine valid filetype")
return None
elif kind.mime != "application/pdf":
logger.warning(f"Not a PDF: {kind.mime}")
return None
else:
return "pdf"


# Needed to override the default JSONField due to some undesired validation behavior where an empty dict throws a
# validation error for JSONField.
# See: https://stackoverflow.com/questions/55147169/django-admin-jsonfield-default-empty-dict-wont-save-in-admin
class MyJSONField(JSONField):

empty_values = [None, "", [], ()]


# Combined a couple things into a single custom JSON Field...
#
Expand All @@ -48,7 +48,7 @@ class MyJSONField(JSONField):
# http://blog.qax.io/unescaped-utf-8-in-djangos-admin-with-jsonfield/
class UTF8JSONFormField(JSONField):

empty_values = [None, "", [], ()]
empty_values = [None, "", [], (), {}]

def prepare_value(self, value):
if isinstance(value, InvalidJSONInput):
Expand All @@ -66,7 +66,7 @@ class NullableJSONField(DbJSONField):
Also lets you have null inputs, which otherwise throw a validation error...
"""

empty_values = [None, "", [], ()]
empty_values = [None, "", [], (), {}]

def formfield(self, **kwargs):
return super().formfield(**{"form_class": UTF8JSONFormField, **kwargs})
29 changes: 0 additions & 29 deletions opencontractserver/tasks/doc_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from config import celery_app
from config.graphql.serializers import AnnotationLabelSerializer
from opencontractserver.annotations.models import (
METADATA_LABEL,
TOKEN_LABEL,
Annotation,
AnnotationLabel,
Expand Down Expand Up @@ -428,34 +427,6 @@ def burn_doc_annotations(
)


@celery_app.task()
@validate_arguments
def convert_doc_to_langchain_task(doc_id: int, corpus_id: int) -> tuple[str, dict]:
"""
Given a doc and corpus, export text and all metadata in a tuple that can then be combined and exported for langchain
"""
doc = Document.objects.get(id=doc_id)

metadata_annotations = Annotation.objects.filter(
document_id=doc_id,
corpus_id=corpus_id,
annotation_label__label_type=METADATA_LABEL,
)

if doc.txt_extract_file.name:
with doc.txt_extract_file.open("r") as txt_file:
text = txt_file.read()
else:
text = ""

metadata_json = {"doc_id": doc_id, "corpus_id": corpus_id}

for metadata in metadata_annotations:
metadata_json[metadata.annotation_label.text] = metadata.raw_text

return text, metadata_json


@celery_app.task()
def convert_doc_to_funsd(
user_id: int, doc_id: int, corpus_id: int
Expand Down
29 changes: 0 additions & 29 deletions opencontractserver/tasks/export_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from celery import shared_task
from django.conf import settings
from django.contrib.auth import get_user_model
from django.core.files.base import ContentFile
from django.utils import timezone

from opencontractserver.corpuses.models import Corpus
Expand Down Expand Up @@ -105,34 +104,6 @@ def package_annotated_docs(
logger.info(f"Export {export_id} is completed. Signal should now notify creator.")


@shared_task
def package_langchain_exports(
burned_docs: tuple[tuple[str, dict]],
export_id: str | int,
corpus_pk: str | int,
):

logger.info(f"Package corpus for export {export_id}...")

langchain_export = []
corpus = Corpus.objects.get(id=corpus_pk)

for doc in burned_docs:

langchain_export.append({"page_content": doc[0], "metdata": doc[1]})

json_str = json.dumps(langchain_export)
json_file = ContentFile(json_str.encode("utf-8"))

export = UserExport.objects.get(pk=export_id)
export.file.save(f"{corpus.title} LangChain Export.json", json_file)
export.finished = timezone.now()
export.backend_lock = False
export.save()

logger.info(f"Export {export_id} is completed. Signal should now notify creator.")


@shared_task
def package_funsd_exports(
funsd_data: tuple[
Expand Down
14 changes: 7 additions & 7 deletions opencontractserver/tests/test_corpus_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,16 @@ def test_import(self):
export_zip_base64_file_string = package_zip_into_base64(
self.fixtures_path / "Test_Corpus_EXPORT.zip"
)
print("\t\tLOADED")
# print("\t\tLOADED")

print("2)\tCreate seed corpus to import data into...")
# print("2)\tCreate seed corpus to import data into...")
corpus_obj = Corpus.objects.create(
title="New Import", creator=self.user, backend_lock=False
)
set_permissions_for_obj_to_user(self.user, corpus_obj, [PermissionTypes.ALL])
print("\t\tCREATED")
# print("\t\tCREATED")

print("3)\tBuild celery task to import")
# print("3)\tBuild celery task to import")
base64_img_bytes = export_zip_base64_file_string.encode("utf-8")
decoded_file_data = base64.decodebytes(base64_img_bytes)

Expand All @@ -53,12 +53,12 @@ def test_import(self):
ContentFile(decoded_file_data, name=f"corpus_import_{uuid.uuid4()}.pdf")
)
import_task = import_corpus.s(temporary_file.id, self.user.id, corpus_obj.id)
print("\t\tBUILT")
# print("\t\tBUILT")

print("4)\tRun the celery task...")
# print("4)\tRun the celery task...")
import_results = import_task.apply().get()
assert isinstance(import_results, str)
print("\t\tCOMPLETED")
# print("\t\tCOMPLETED")

labels = AnnotationLabel.objects.all()
assert labels.count() == 2
Expand Down
Loading

0 comments on commit d26b78c

Please sign in to comment.