Skip to content

Commit

Permalink
Merge pull request #117 from JSv4/JSv4/add-data-extraction
Browse files Browse the repository at this point in the history
Add Data Extraction
  • Loading branch information
JSv4 authored Jun 19, 2024
2 parents ef648e4 + ece27d2 commit f55cdcf
Show file tree
Hide file tree
Showing 135 changed files with 44,503 additions and 1,774 deletions.
4 changes: 3 additions & 1 deletion .envs/.test/.django
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,7 @@ CELERY_FLOWER_PASSWORD=U3Md7XEGNZ67HlNwHon8fbwiT0GemPeCrwDubZ6BRvX3dwxMEsLhuLoiU
# ------------------------------------------------------------------------------
USE_AUTH0=false

# Turn on Embeddings Microservice
# LLM SETTINGS
# ------------------------------------------------------------------------------
OPENAI_API_KEY=FAKE_API_KEY
OPENAI_MODEL=gpt-4o
52 changes: 51 additions & 1 deletion .github/workflows/backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,21 @@ jobs:
linter:
runs-on: ubuntu-latest
steps:
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false

# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: false
dotnet: false
haskell: false
large-packages: true
docker-images: true
swap-storage: true

- name: Checkout Code Repository
uses: actions/[email protected]
Expand All @@ -41,13 +56,28 @@ jobs:
requirements/local.txt
- name: Run pre-commit
uses: pre-commit/action@v3.0.0
uses: pre-commit/action@v3.0.1

# With no caching at all the entire ci process takes 4m 30s to complete!
pytest:
runs-on: ubuntu-latest

steps:
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false

# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: false
dotnet: false
haskell: false
large-packages: true
docker-images: true
swap-storage: true

- name: Checkout Code Repository
uses: actions/[email protected]
Expand All @@ -67,6 +97,26 @@ jobs:
- name: Collect Static Files
run: docker-compose -f test.yml run --rm django python manage.py collectstatic

- name: Verify Docker Containers
run: |
docker-compose -f test.yml ps
- name: Inspect Docker Network
run: |
docker network inspect $(docker-compose -f test.yml ps -q | xargs docker inspect --format='{{range .NetworkSettings.Networks}}{{.NetworkID}}{{end}}' | uniq)
- name: Capture Docker Compose Logs
if: failure()
run: |
docker-compose -f test.yml logs --no-color > docker-compose-logs.txt
- name: Upload Docker Compose Logs
if: failure()
uses: actions/upload-artifact@v3
with:
name: docker-compose-logs
path: docker-compose-logs.txt

- name: Build Pytest Coverage File
run: |
docker-compose -f test.yml run django coverage run -m pytest --cov-report=xml --cov
Expand Down
2 changes: 1 addition & 1 deletion .idea/OpenContracts.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ default_stages: [commit]

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.2.0
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
Expand Down
1 change: 0 additions & 1 deletion =3.0.0

This file was deleted.

9 changes: 8 additions & 1 deletion compose/local/django/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ COPY ./requirements .
RUN pip wheel --wheel-dir /usr/src/app/wheels \
-r ${BUILD_ENVIRONMENT}.txt


# Python 'run' stage
FROM python as python-run-stage

Expand Down Expand Up @@ -99,6 +98,10 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
# copy python dependency wheels from python-build-stage
COPY --from=python-build-stage /usr/src/app/wheels /wheels/

# Install CPU-only PyTorch builds (no CUDA) and sentence-transformers
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
RUN pip install sentence-transformers

# use wheels to install python dependencies
RUN pip install --no-cache-dir --no-index --find-links=/wheels/ /wheels/* \
&& rm -rf /wheels/
Expand All @@ -108,6 +111,10 @@ RUN echo "RUN STAGE GITHUB_ACTIONS: $GITHUB_ACTIONS"
COPY ./setup_codecov.sh .
RUN if [ "$GITHUB_ACTIONS" ] ; then echo "GITHUB ACTION MODE" && chmod u+x setup_codecov.sh && ./setup_codecov.sh ; else echo "NOT GITHUB ACTION. DO NOT INSTALL CODECOV" ; fi

# Download sentence transformer binaries
COPY download_embeddings_model.py .
RUN mkdir -p /models
RUN python download_embeddings_model.py

COPY ./compose/production/django/entrypoint /entrypoint
RUN sed -i 's/\r$//g' /entrypoint
Expand Down
5 changes: 5 additions & 0 deletions config/graphql/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import inspect
import logging
import traceback
from abc import ABC

import django.db.models
Expand Down Expand Up @@ -161,6 +162,7 @@ def mutate(cls, root, info, *args, **kwargs):
from_global_id(kwargs.get(global_id, None))[1]
)
else:
logger.info(f"pk field is: {kwargs.get(pk_field, None)}")
pk_value = from_global_id(kwargs.get(pk_field, None))[1]
kwargs[pk_field] = pk_value

Expand All @@ -170,6 +172,8 @@ def mutate(cls, root, info, *args, **kwargs):
pk=from_global_id(kwargs.get(cls.IOSettings.lookup_field, None))[1]
)

logger.info(f"Retrieved obj: {obj}")

# Check the object isn't locked by another user
if hasattr(obj, "user_lock") and obj.user_lock is not None:
if info.context.user.id == obj.user_lock_id:
Expand Down Expand Up @@ -228,6 +232,7 @@ def mutate(cls, root, info, *args, **kwargs):
)

except Exception as e:
logger.error(traceback.format_exc())
message = f"Mutation failed due to error: {e}"

return cls(ok=ok, message=message, obj_id=obj_id)
83 changes: 70 additions & 13 deletions config/graphql/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,15 @@
LabelSet,
Relationship,
)
from opencontractserver.corpuses.models import Corpus
from opencontractserver.corpuses.models import Corpus, CorpusQuery
from opencontractserver.documents.models import Document
from opencontractserver.extracts.models import (
Column,
Datacell,
Extract,
Fieldset,
LanguageModel,
)
from opencontractserver.users.models import Assignment, UserExport

User = get_user_model()
Expand All @@ -29,7 +36,6 @@ class Meta:


class AnalyzerFilter(django_filters.FilterSet):

analyzer_id = filters.CharFilter(method="filter_by_analyzer_id")

def filter_by_analyzer_id(self, queryset, info, value):
Expand Down Expand Up @@ -62,7 +68,6 @@ class Meta:


class AnalysisFilter(django_filters.FilterSet):

#####################################################################
# Filter by analyses that have received callbacks
received_callback_results = filters.BooleanFilter(
Expand Down Expand Up @@ -108,7 +113,6 @@ class Meta:


class CorpusFilter(django_filters.FilterSet):

text_search = filters.CharFilter(method="text_search_method")

def text_search_method(self, queryset, name, value):
Expand All @@ -132,7 +136,6 @@ class Meta:


class AnnotationFilter(django_filters.FilterSet):

uses_label_from_labelset_id = django_filters.CharFilter(
method="filter_by_label_from_labelset_id"
)
Expand Down Expand Up @@ -195,7 +198,6 @@ class Meta:


class LabelFilter(django_filters.FilterSet):

used_in_labelset_id = django_filters.CharFilter(method="filter_by_labelset_id")
used_in_labelset_for_corpus_id = django_filters.CharFilter(
method="filter_by_used_in_labelset_for_corpus_id"
Expand All @@ -222,13 +224,16 @@ def filter_by_labelset_id(self, queryset, name, value):
return queryset.filter(included_in_labelset__pk=django_pk)

def filter_by_used_in_labelset_for_corpus_id(self, queryset, name, value):

print(f"Raw corpus id: {value}")
django_pk = from_global_id(value)[1]
print("Lookup labels for pk", django_pk)
queryset = queryset.filter(Q(included_in_labelset__used_by_corpus=django_pk))
print(
"Filtered to values",
queryset.filter(included_in_labelset__used_by_corpus_id=django_pk),
queryset,
)
return queryset.filter(included_in_labelset__used_by_corpus_id=django_pk)
return queryset.filter(included_in_labelset__used_by_corpus=django_pk)

class Meta:
model = AnnotationLabel
Expand All @@ -240,7 +245,6 @@ class Meta:


class LabelsetFilter(django_filters.FilterSet):

text_search = filters.CharFilter(method="text_search_method")

def text_search_method(self, queryset, name, value):
Expand All @@ -264,7 +268,6 @@ class Meta:


class RelationshipFilter(django_filters.FilterSet):

# Old-style filter from when relationships could cross documents. We think this creates too taxing a query on the
# database. If we need document-level relationships, we can create a new model for that.
# document_id = django_filters.CharFilter(method='filter_document_id')
Expand All @@ -284,7 +287,6 @@ class Meta:


class AssignmentFilter(django_filters.FilterSet):

document_id = django_filters.CharFilter(method="filter_document_id")

def filter_document_id(self, queryset, name, value):
Expand All @@ -297,7 +299,6 @@ class Meta:


class ExportFilter(django_filters.FilterSet):

# This uses the django-filters ordering capabilities. Following filters available:
# 1) created (earliest to latest)
# 2) -created (latest to earliest)
Expand Down Expand Up @@ -333,7 +334,6 @@ class Meta:


class DocumentFilter(django_filters.FilterSet):

company_search = filters.CharFilter(method="company_name_search")
has_pdf = filters.BooleanFilter(method="has_pdf_search")
has_annotations_with_ids = filters.CharFilter(
Expand Down Expand Up @@ -379,3 +379,60 @@ class Meta:
"description": ["exact", "contains"],
"id": ["exact"],
}


class LanguageModelFilter(django_filters.FilterSet):
    """Filter set for ``LanguageModel`` querysets.

    Declares ``exact`` and ``contains`` lookups on the ``model`` field.
    """

    class Meta:
        model = LanguageModel
        # Maps each filterable model field to the lookup expressions allowed on it.
        fields = {
            "model": ["exact", "contains"],
        }


class FieldsetFilter(django_filters.FilterSet):
    """Filter set for ``Fieldset`` querysets.

    Declares ``exact`` and ``contains`` lookups on ``name`` and a ``contains``
    lookup on ``description``.
    """

    class Meta:
        model = Fieldset
        # Maps each filterable model field to the lookup expressions allowed on it.
        fields = {
            "name": ["exact", "contains"],
            "description": ["contains"],
        }


class ColumnFilter(django_filters.FilterSet):
    """Filter set for ``Column`` querysets.

    Declares ``contains`` lookups on ``query`` and ``match_text``, and
    ``exact`` lookups on ``output_type``, ``limit_to_label`` and ``agentic``.
    """

    class Meta:
        model = Column
        # Maps each filterable model field to the lookup expressions allowed on it.
        fields = {
            "query": ["contains"],
            "match_text": ["contains"],
            "output_type": ["exact"],
            "limit_to_label": ["exact"],
            "agentic": ["exact"],
        }


class ExtractFilter(django_filters.FilterSet):
    """Filter set for ``Extract`` querysets.

    Declares ``exact`` and ``contains`` lookups on ``name`` and ``lte``/``gte``
    bound lookups on ``created``, ``started`` and ``finished``.
    """

    class Meta:
        model = Extract
        # Maps each filterable model field to the lookup expressions allowed on it.
        # NOTE(review): created/started/finished are presumably timestamps - confirm on the model.
        fields = {
            "name": ["exact", "contains"],
            "created": ["lte", "gte"],
            "started": ["lte", "gte"],
            "finished": ["lte", "gte"],
        }


class CorpusQueryFilter(django_filters.FilterSet):
    """Filter set for ``CorpusQuery`` querysets.

    Declares a single ``exact`` lookup on ``corpus_id``.
    """

    class Meta:
        model = CorpusQuery
        # Maps each filterable model field to the lookup expressions allowed on it.
        fields = {"corpus_id": ["exact"]}


class DatacellFilter(django_filters.FilterSet):
    """Filter set for ``Datacell`` querysets.

    Declares an ``exact`` lookup on ``data_definition`` and ``lte``/``gte``
    bound lookups on ``started``, ``completed`` and ``failed``.
    """

    class Meta:
        model = Datacell
        # Maps each filterable model field to the lookup expressions allowed on it.
        # NOTE(review): started/completed/failed are presumably timestamps - confirm on the model.
        fields = {
            "data_definition": ["exact"],
            "started": ["lte", "gte"],
            "completed": ["lte", "gte"],
            "failed": ["lte", "gte"],
        }
Loading

0 comments on commit f55cdcf

Please sign in to comment.