Tons of work here on extract and some improvements to Django Vector Store.
JSv4 · Jun 11, 2024 · 231f2fe · 231f2fe
1 parent bdfe0eb
commit 231f2fe
Show file tree

Hide file tree

Showing 36 changed files with 708 additions and 333 deletions.
diff --git a/config/graphql/filters.py b/config/graphql/filters.py
@@ -224,13 +224,18 @@ def filter_by_labelset_id(self, queryset, name, value):
         return queryset.filter(included_in_labelset__pk=django_pk)
 
     def filter_by_used_in_labelset_for_corpus_id(self, queryset, name, value):
+
+        print(f"Raw corpus id: {value}")
         django_pk = from_global_id(value)[1]
         print("Lookup labels for pk", django_pk)
+        queryset = queryset.filter(
+            Q(included_in_labelset__used_by_corpus=django_pk)
+        )
         print(
             "Filtered to values",
-            queryset.filter(included_in_labelset__used_by_corpus_id=django_pk),
+            queryset,
         )
-        return queryset.filter(included_in_labelset__used_by_corpus_id=django_pk)
+        return queryset.filter(included_in_labelset__used_by_corpus=django_pk)
 
     class Meta:
         model = AnnotationLabel

diff --git a/config/graphql/mutations.py b/config/graphql/mutations.py
@@ -1534,6 +1534,8 @@ class Arguments:
         instructions = graphene.String(required=False)
         language_model_id = graphene.ID(required=False)
         agentic = graphene.Boolean(required=False)
+        extract_is_list = graphene.Boolean(required=False)
+        must_contain_text = graphene.String(required=False)
 
     ok = graphene.Boolean()
     message = graphene.String()
@@ -1552,8 +1554,10 @@ def mutate(
         limit_to_label=None,
         instructions=None,
         agentic=None,
+        extract_is_list=None,
         language_model_id=None,
         fieldset_id=None,
+        must_contain_text=None
     ):
 
         ok = False
@@ -1591,6 +1595,12 @@ def mutate(
             if agentic is not None:
                 obj.agentic = agentic
 
+            if extract_is_list is not None:
+                obj.extract_is_list = extract_is_list
+
+            if must_contain_text is not None:
+                obj.must_contain_text = must_contain_text
+
             obj.save()
             message = "SUCCESS!"
             ok = True
@@ -1611,6 +1621,8 @@ class Arguments:
         instructions = graphene.String(required=False)
         language_model_id = graphene.ID(required=True)
         agentic = graphene.Boolean(required=False)
+        extract_is_list = graphene.Boolean(required=False)
+        must_contain_text = graphene.String(required=False)
         name = graphene.String(required=True)
 
     ok = graphene.Boolean()
@@ -1627,6 +1639,8 @@ def mutate(
         output_type,
         language_model_id,
         agentic=None,
+        extract_is_list=None,
+        must_contain_text=None,
         query=None,
         match_text=None,
         limit_to_label=None,
@@ -1648,7 +1662,9 @@ def mutate(
             limit_to_label=limit_to_label,
             instructions=instructions,
             language_model=language_model,
+            must_contain_text=must_contain_text,
             agentic=agentic if agentic is not None else False,
+            extract_is_list=extract_is_list if extract_is_list is not None else False,
             creator=info.context.user,
         )
         column.save()
@@ -1702,7 +1718,7 @@ class CreateExtract(graphene.Mutation):
     """
 
     class Arguments:
-        corpus_id = graphene.ID(required=True)
+        corpus_id = graphene.ID(required=False)
         name = graphene.String(required=True)
         fieldset_id = graphene.ID(required=False)
         fieldset_name = graphene.String(required=False)
@@ -1717,23 +1733,29 @@ class Arguments:
     def mutate(
         root,
         info,
-        corpus_id,
         name,
+        corpus_id=None,
         fieldset_id=None,
         fieldset_name=None,
         fieldset_description=None,
     ):
 
-        corpus = Corpus.objects.get(pk=from_global_id(corpus_id)[1])
+        corpus = None
+        if corpus_id is not None:
+            corpus = Corpus.objects.get(pk=from_global_id(corpus_id)[1])
+            print(f"Corpus is: {corpus}")
 
         if fieldset_id is not None:
+            print(f"Fieldset id is not None: {fieldset_id}")
             fieldset = Fieldset.objects.get(pk=from_global_id(fieldset_id)[1])
         else:
             if fieldset_name is None:
                 fieldset_name = f"{name} Fieldset"
+            print(f"Creating new fieldset... name will be: {fieldset_name}")
+
             fieldset = Fieldset.objects.create(
                 name=fieldset_name,
-                description=fieldset_description,
+                description=fieldset_description if fieldset_description is not None else f"Autogenerated {fieldset_name}",
                 creator=info.context.user,
             )
             set_permissions_for_obj_to_user(
@@ -1747,6 +1769,13 @@ def mutate(
             creator=info.context.user,
         )
         extract.save()
+
+        if corpus is not None:
+            print(f"Try to add corpus docs: {corpus.documents.all()}")
+            extract.documents.add(*corpus.documents.all())
+        else:
+            print(f"Corpus IS still None... no docs to add.")
+
         set_permissions_for_obj_to_user(
             info.context.user, extract, [PermissionTypes.CRUD]
         )

diff --git a/config/graphql/serializers.py b/config/graphql/serializers.py
@@ -71,6 +71,8 @@ class Meta:
             "instructions",
             "language_model_id",
             "agentic",
+            "extract_is_list",
+            "must_contain_text"
         ]
         read_only_fields = ["id", "created"]
 
@@ -96,6 +98,7 @@ class Meta:
             "icon",
             "text",
             "creator_id",
+            "read_only"
         ]
         read_only_fields = ["id"]
 

diff --git a/frontend/src/components/annotations/AnnotationCards.tsx b/frontend/src/components/annotations/AnnotationCards.tsx
@@ -68,6 +68,7 @@ export const AnnotationCards = ({
 
   const handleUpdate = () => {
     if (!loading && pageInfo?.hasNextPage) {
+      console.log("Fetching more annotation cards...");
       fetchMore({
         variables: {
           limit: 20,

diff --git a/frontend/src/components/annotator/Annotator.tsx b/frontend/src/components/annotator/Annotator.tsx
@@ -52,7 +52,6 @@ import { SidebarContainer } from "../common";
 import { getPawlsLayer } from "./api/rest";
 import {
   AnnotationLabelType,
-  CorpusQueryType,
   CorpusType,
   DocumentType,
   LabelDisplayBehavior,
@@ -75,7 +74,7 @@ import { getPermissions } from "../../utils/transform";
 import _ from "lodash";
 import {
   displayAnnotationOnAnnotatorLoad,
-  openedQueryObj,
+  pagesVisible,
   selectedAnalysesIds,
 } from "../../graphql/cache";
 import useWindowDimensions from "../hooks/WindowDimensionHook";
@@ -115,7 +114,6 @@ interface AnnotatorProps {
   show_selected_annotation_only: boolean;
   show_annotation_bounding_boxes: boolean;
   show_annotation_labels: LabelDisplayBehavior;
-  show_query?: CorpusQueryType | null;
   onClose: (args?: any) => void | any;
 }
 
@@ -128,13 +126,15 @@ export const Annotator = ({
   show_selected_annotation_only,
   show_annotation_bounding_boxes,
   show_annotation_labels,
-  show_query,
   onClose,
 }: AnnotatorProps) => {
+  console.log("Opened document: ", openedDocument);
+  console.log("Opened corpus: ", openedCorpus);
+  console.log("scroll_to_annotation_on_open: ", scroll_to_annotation_on_open);
+
   const { width } = useWindowDimensions();
   const responsive_sidebar_width = width <= 1000 ? "0px" : "400px";
 
-  const opened_query = useReactiveVar(openedQueryObj);
   const selected_analysis_ids = useReactiveVar(selectedAnalysesIds);
   // console.log("selected_analysis_ids", selected_analysis_ids);
 
@@ -190,6 +190,18 @@ export const Annotator = ({
       : {}),
   };
 
+  const setPageVisible = (
+    page_number: number,
+    state: "VISIBLE" | "NOT VISIBLE"
+  ) => {
+    setPagesVisible((old_pages_visible) => {
+      return {
+        ...old_pages_visible,
+        [page_number]: state,
+      };
+    });
+  };
+
   // Hold our query variables (using a state var lets us bundle updates to the
   // query var in a single useEffect that prevents multiple re-renders)
   const [annotator_query_vars, setAnnotatorQueryVars] =
@@ -444,72 +456,6 @@ export const Annotator = ({
     }
   }, [openedDocument]);
 
-  // If oquery we want to show changes, load it and its annotations into state store
-  useEffect(() => {
-    if (show_query && show_query.fullSourceList.length > 0) {
-      if (!read_only) {
-        throw new TypeError(
-          "read_only must be true when show_query is not null"
-        );
-      }
-
-      // First let's get all of the labels used in our answer by looking at the returned source annotation and getting unique list of labels by ids
-      const unique_annot_labels: AnnotationLabelType[] = _.uniqBy(
-        show_query.fullSourceList.map((source) => source.annotationLabel),
-        (label) => label.id
-      );
-      const span_label_lookup = unique_annot_labels
-        .filter((label) => label.labelType === LabelType.TokenLabel)
-        .reduce(function (obj: Record<string, any>, label) {
-          obj[label.id] = {
-            id: label.id,
-            color: label.color,
-            text: label.text,
-            icon: label.icon as SemanticICONS,
-            description: label.description,
-            labelType: label.labelType,
-          };
-          return obj;
-        }, {});
-
-      // TODO - store labels in state store
-      setSpanLabels(Object.values(span_label_lookup));
-
-      // We want to make sure we jump to the FIRST source
-      // We only want to load annotation page for selected annotation on load ONCE
-      const first_annotation = show_query.fullSourceList[0]; // TODO - make sure these are filtered by page on server
-      if (
-        loaded_page_for_annotation === null &&
-        jumped_to_annotation_on_load !== first_annotation.id
-      ) {
-        setLoadedPageForAnnotation(scroll_to_annotation_on_open);
-      }
-
-      // This is the annotations start loading
-      // Turn existing annotation data into PDFAnnotations obj and inject into state:
-      let annotation_objs: ServerAnnotation[] = show_query.fullSourceList
-        .filter((annotation) => annotation.analysis !== null)
-        .map(
-          (annot) =>
-            new ServerAnnotation(
-              annot.page,
-              annot.annotationLabel,
-              annot.rawText ? annot.rawText : "",
-              annot.json ? annot.json : {},
-              annot.myPermissions ? getPermissions(annot.myPermissions) : [],
-              annot.id
-            )
-        );
-
-      // TODO - let queries label docs and create relationships
-      // For now, we're assuming relationships and doc type labels cannot come out of the query... there is no reasons for this to remain true. Just a lot of work :-)
-      setPdfAnnotations(new PdfAnnotations(annotation_objs, [], []));
-
-      // Set up contexts for annotations
-      setViewState(ViewState.LOADED);
-    }
-  }, [show_query]);
-
   useEffect(() => {
     // console.log("New Annotator data", annotator_data);
 
@@ -526,10 +472,7 @@ export const Annotator = ({
       // );
     }
 
-    // if annotator_data changes due to loading from graphql (and we didn't somehow also have show_query set)
-    if (annotator_data && !show_query) {
-      console.log("Processing annotator data", annotator_data);
-
+    if (annotator_data) {
       // Build proper span label objs from GraphQL results
       let span_label_lookup: LooseObject = {};
       let human_span_label_lookup: LooseObject = {};
@@ -551,7 +494,6 @@ export const Annotator = ({
               return obj;
             }, {}),
         };
-        setSpanLabels(Object.values(span_label_lookup));
 
         // console.log(
         //   "Span choices",

diff --git a/frontend/src/components/annotator/PDF.tsx b/frontend/src/components/annotator/PDF.tsx
@@ -174,30 +174,6 @@ const Page = ({
 
   useEffect(() => {
     try {
-      // Saving this for now... is a great piece of code to determine if page is visible for page-by-page
-      // loading or rendering.
-      // const determinePageVisiblity = () => {
-
-      //   if (canvasRef.current !== null && scrollContainerRef && scrollContainerRef.current !== null) {
-
-      //     const scroll_window_bounding_rect = scrollContainerRef.current?.getBoundingClientRect()
-      //     const page_bounding_rect = canvasRef.current.getBoundingClientRect();
-
-      //     let pageVisibility = (
-      //       (page_bounding_rect.top >= scroll_window_bounding_rect.top && page_bounding_rect.top <= scroll_window_bounding_rect.bottom) ||
-      //       (page_bounding_rect.bottom >= scroll_window_bounding_rect.top && page_bounding_rect.bottom <= scroll_window_bounding_rect.bottom) ||
-      //       (page_bounding_rect.top < scroll_window_bounding_rect.top && page_bounding_rect.bottom >= scroll_window_bounding_rect.bottom)
-      //     );
-
-      //     // if (pageVisibility) {
-      //     //   console.log(`Page ${pageInfo.page.pageNumber} is VISIBLE!`);
-      //     //   setPageVisible(pageInfo.page.pageNumber, "VISIBLE");
-      //     // } else {
-      //     //   setPageVisible(pageInfo.page.pageNumber, "NOT VISIBLE");
-      //     // }
-      //   }
-      // };
-
       if (canvasRef.current === null) {
         onError(new Error("No canvas element"));
         return;

diff --git a/frontend/src/components/annotator/context/AnnotationStore.ts b/frontend/src/components/annotator/context/AnnotationStore.ts
@@ -320,7 +320,7 @@ export const AnnotationStore = createContext<_AnnotationStore>({
   pageSelectionQueue: [],
   spanLabels: [],
   humanSpanLabelChoices: [],
-  showStructuralLabels: false,
+  showStructuralLabels: true,
   activeSpanLabel: undefined,
   showOnlySpanLabels: [],
   docText: undefined,

diff --git a/frontend/src/components/annotator/pages/PDFView.tsx b/frontend/src/components/annotator/pages/PDFView.tsx
@@ -156,7 +156,7 @@ export const PDFView = ({
     Record<number, PDFPageInfo>
   >([]);
 
-  const [showStructuralLabels, setShowStructuralLabels] = useState(false);
+  const [showStructuralLabels, setShowStructuralLabels] = useState(true);
   const [activeSpanLabel, setActiveSpanLabel] = useState<
     AnnotationLabelType | undefined
   >(humanSpanLabelChoices.length > 0 ? humanSpanLabelChoices[0] : undefined);

diff --git a/frontend/src/components/queries/NewQuerySearch.tsx b/frontend/src/components/queries/NewQuerySearch.tsx
@@ -1,6 +1,13 @@
 import { useMutation } from "@apollo/client";
 import React from "react";
-import { Button, Container, Header, Image, Input } from "semantic-ui-react";
+import {
+  Button,
+  Container,
+  Header,
+  Icon,
+  Image,
+  Input,
+} from "semantic-ui-react";
 import {
   ASK_QUERY_OF_CORPUS,
   AskQueryOfCorpusInputType,
@@ -62,9 +69,9 @@ export const NewQuerySearch: React.FC<NewQuerySearchProps> = ({
             marginBottom: "1rem",
           }}
         >
-          <Image src="path/to/your/logo.png" size="small" />
+          <Icon name="search" size="huge" />
           <Header as="h2" style={{ marginLeft: "1rem" }}>
-            Agentic Query
+            Corpus Query
             <Header.Subheader>Query your document collection</Header.Subheader>
           </Header>
         </div>