Skip to content

Commit

Permalink
Merge branch 'dev' into logger_part_1
Browse files Browse the repository at this point in the history
  • Loading branch information
KunalTiwary authored Aug 22, 2023
2 parents 075e22c + 5def97e commit d6db03c
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 35 deletions.
5 changes: 3 additions & 2 deletions backend/functions/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
LANG_NAME_TO_CODE_ULCA,
LANG_TRANS_MODEL_CODES,
LANG_NAME_TO_CODE_AZURE,
LANG_NAME_TO_CODE_ITV2,
)
from google.cloud import vision
from users.utils import LANG_NAME_TO_CODE_ULCA
Expand Down Expand Up @@ -234,8 +235,8 @@ def get_batch_translations_using_indictransv2_nmt_api(
target_language = "Urdu"

# Convert language names to the language code
source_language = LANG_NAME_TO_CODE_ULCA[source_language]
target_language = LANG_NAME_TO_CODE_ULCA[target_language]
source_language = LANG_NAME_TO_CODE_ITV2[source_language]
target_language = LANG_NAME_TO_CODE_ITV2[target_language]

# Create the input sentences list
input_sentences = [{"source": sentence} for sentence in sentence_list]
Expand Down
2 changes: 2 additions & 0 deletions backend/organizations/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -2265,6 +2265,8 @@ def cumulative_tasks_count(self, request, pk=None):
proj_objs = Project.objects.filter(
organization_id=pk, project_type=project_type
)
if not request.user.is_authenticated:
proj_objs = proj_objs.filter(workspace_id__public_analytics=True)

languages = list(set([proj.tgt_language for proj in proj_objs]))
general_lang = []
Expand Down
10 changes: 9 additions & 1 deletion backend/projects/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,10 @@ def create_tasks_from_dataitems(items, project):
# Bulk create the tasks
Task.objects.bulk_create(tasks)

if "automatic_annotation_creation_mode" in project.metadata_json:
create_automatic_annotations(
tasks, project.metadata_json["automatic_annotation_creation_mode"]
)
if input_dataset_info["prediction"] is not None:
user_object = User.objects.get(email="[email protected]")

Expand Down Expand Up @@ -317,7 +321,7 @@ def create_parameters_for_task_creation(
sampling_parameters (dict): Parameters for sampling
variable_parameters (dict): _description_
project_id (int): ID of the project object created in this iteration
automatic_annotation_creation_mode: Creation mode for tasks
"""

filtered_items = filter_data_items(
Expand Down Expand Up @@ -360,6 +364,10 @@ def create_parameters_for_task_creation(
# Create Tasks from Parameters
tasks = create_tasks_from_dataitems(sampled_items, project)
if automatic_annotation_creation_mode != None:
project.metadata_json[
"automatic_annotation_creation_mode"
] = automatic_annotation_creation_mode
project.save()
create_automatic_annotations(tasks, automatic_annotation_creation_mode)


Expand Down
4 changes: 3 additions & 1 deletion backend/tasks/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,9 @@ def annotations(self, request, pk):
# modifications for integrations of chitralekha UI
if "enable_chitralekha_UI" in dict(request.query_params):
for ann in annotations:
modified_result = convert_result_to_chitralekha_format(ann.result)
modified_result = convert_result_to_chitralekha_format(
ann.result, ann.id
)
ann.result = modified_result

serializer = AnnotationSerializer(annotations, many=True)
Expand Down
30 changes: 30 additions & 0 deletions backend/users/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,36 @@
"Urdu": "ur",
}

# Language name -> language code mapping used when calling the
# IndicTrans v2 (ITV2) NMT translation API (see
# get_batch_translations_using_indictransv2_nmt_api).
# NOTE(review): codes look like BCP-47/ISO-639 variants specific to ITV2
# (e.g. "brx", "gom", "mni") — confirm against the ITV2 API docs.
LANG_NAME_TO_CODE_ITV2 = {
    "English": "en",
    "Assamese": "as",
    "Bhojpuri": "bho",
    "Bengali": "bn",
    "Bodo": "brx",
    "Dogri": "doi",
    "Dhivehi": "dv",
    "Konkani": "gom",
    "Gujarati": "gu",
    "Hindi": "hi",
    "Kannada": "kn",
    "Kashmiri": "ks",
    "Mizo": "lus",
    "Maithili": "mai",
    "Malayalam": "ml",
    "Manipuri": "mni",
    "Marathi": "mr",
    "Nepali": "ne",
    "Odia": "or",
    "Punjabi": "pa",
    "Sanskrit": "sa",
    "Santali": "sat",
    "Sindhi": "sd",
    "Sinhala": "si",
    "Tamil": "ta",
    "Telugu": "te",
    "Urdu": "ur",
}

# Language codes to language names
LANG_CODE_TO_NAME_GOOGLE = {
lang_code: lang_name for lang_name, lang_code in LANG_NAME_TO_CODE_GOOGLE.items()
Expand Down
107 changes: 76 additions & 31 deletions backend/utils/convert_result_to_chitralekha_format.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,88 @@
def convert_result_to_chitralekha_format(result):
result = sort_array_by_start(result)
def create_memory(result):
    """Index result entries by their "id".

    Returns a dict mapping each id to
    ``{"labels_dict_idx": i, "text_dict_idx": j}`` where ``i``/``j`` are the
    positions in ``result`` of the "labels"-typed and non-"labels"-typed
    entries carrying that id, or -1 when no such entry exists.
    """
    index_by_id = {}
    for position, entry in enumerate(result):
        slot = index_by_id.setdefault(
            entry["id"], {"labels_dict_idx": -1, "text_dict_idx": -1}
        )
        if entry["type"] == "labels":
            slot["labels_dict_idx"] = position
        else:
            slot["text_dict_idx"] = position
    return index_by_id


def convert_result_to_chitralekha_format(result, ann_id):
    """Convert an annotation ``result`` list into Chitralekha's segment format.

    Each output segment pairs a "labels" entry (speaker) with the text entry
    sharing the same "id" (via ``create_memory``). A text entry with no
    matching labels entry falls back to speaker "Speaker 0"; a labels entry
    with no matching text entry is logged as corrupt and skipped. Segments
    whose start/end times cannot be formatted are skipped as well.

    Args:
        result: list of result dicts, each with "id", "type" and "value".
        ann_id: annotation id, used only in corruption log messages.

    Returns:
        List of segment dicts sorted by "start_time"; [] when nothing
        could be converted.
    """
    # NOTE: the original span contained leftover pre-change diff lines (an
    # old index-pairing loop and a duplicate chitra_dict literal) that were
    # dead/broken code; this is the coherent post-change implementation.
    memory = create_memory(result)
    modified_result = []
    count = 1
    seen = set()  # indices already consumed as part of an emitted segment
    for i in range(len(result)):
        if i in seen:
            continue
        labels_dict_idx, text_dict_idx = (
            memory[result[i]["id"]]["labels_dict_idx"],
            memory[result[i]["id"]]["text_dict_idx"],
        )
        if labels_dict_idx == -1:
            # Text without a speaker label: default the speaker.
            text_dict = result[text_dict_idx]
            speaker_id = "Speaker 0"
            seen.add(text_dict_idx)
        elif text_dict_idx == -1:
            # Label without text: corrupt data, log and skip.
            print(
                f"The data is corrupt for annotation id-{ann_id}, data id- {result[i]['id']}. "
                f"It does not contain a corresponding text dictionary."
            )
            continue
        else:
            label_dict = result[labels_dict_idx]
            text_dict = result[text_dict_idx]
            seen.add(labels_dict_idx)
            seen.add(text_dict_idx)
            speaker_id = label_dict["value"]["labels"][0]
        text = text_dict["value"]["text"][0] if text_dict["value"]["text"] else ""
        try:
            chitra_dict = {
                "text": text,
                "end_time": convert_fractional_time_to_formatted(
                    text_dict["value"]["end"], ann_id, text_dict["id"]
                ),
                "speaker_id": speaker_id,
                "start_time": convert_fractional_time_to_formatted(
                    text_dict["value"]["start"], ann_id, text_dict["id"]
                ),
                "id": count,
            }
        except Exception:
            # Unformattable start/end — already logged by the converter.
            continue
        count += 1

        modified_result.append(chitra_dict)

    modified_result = (
        sort_result_by_start_time(modified_result) if len(modified_result) > 0 else []
    )
    return modified_result


def convert_fractional_time_to_formatted(minutes):
    """Format a fractional-minutes value as an "HH:MM:SS.sss" timestamp."""
    secs_total = minutes * 60
    hrs, secs_total = divmod(secs_total, 3600)
    mins, secs = divmod(secs_total, 60)
    return f"{int(hrs):02d}:{int(mins):02d}:{secs:06.3f}"
def convert_fractional_time_to_formatted(decimal_time, ann_id, data_id):
    """Format a fractional-minutes value as an "HH:MM:SS.mmm" timestamp.

    ``decimal_time`` may be an int, float, or numeric string; any other type
    is logged as corrupt (the subsequent float() call then raises, which the
    caller handles). ``ann_id``/``data_id`` are used only in the log message.
    """
    if not isinstance(decimal_time, (str, int, float)):
        print(
            f"The data is corrupt for annotation id-{ann_id}, data id- {data_id}. "
            f"Its start/end time are not stored as proper data type (int or float or string)."
        )
    total_minutes = float(decimal_time)
    whole_hours = int(total_minutes // 60)
    whole_minutes = int(total_minutes % 60)
    # Fractional remainder of the minutes value, expressed in seconds.
    frac_minutes = total_minutes - ((whole_hours * 60) + whole_minutes)
    whole_seconds = int(frac_minutes * 60)
    millis = int((frac_minutes * 60 - whole_seconds) * 1000)

    return f"{whole_hours:02d}:{whole_minutes:02d}:{whole_seconds:02d}.{millis:03d}"

def sort_array_by_start(array):
    """Return the entries ordered by their value["start"] field."""
    return sorted(array, key=lambda entry: entry["value"]["start"])
def sort_result_by_start_time(result):
    """Return the segments ordered by their "start_time" value."""
    def start_key(segment):
        return segment["start_time"]

    return sorted(result, key=start_key)
27 changes: 27 additions & 0 deletions docker-compose-dev-elasticsearch.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
version: '3.3'

services:
elasticsearch:
container_name: elasticsearch
image: docker.elastic.co/elasticsearch/elasticsearch:7.14.0
volumes:
- ./elasticsearch/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml
- elasticsearch_vol:/elasticsearch_data
environment:
- discovery.type=single-node
ports:
- "9200:9200"
- "9300:9300"

kibana:
container_name: kibana
image: docker.elastic.co/kibana/kibana:7.14.0
ports:
- 5601:5601
depends_on:
- elasticsearch

volumes:
elasticsearch_vol:
external: true
static_volume:
4 changes: 4 additions & 0 deletions elasticsearch/elasticsearch.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
cluster.name: "docker-cluster"
network.host: 0.0.0.0
path.data: /elasticsearch_data
discovery.type: "single-node"

0 comments on commit d6db03c

Please sign in to comment.