From cc964cc8766d864c57fd8f86bdd560726d85cf69 Mon Sep 17 00:00:00 2001 From: Kunal Tiwary Date: Thu, 17 Aug 2023 11:53:39 +0530 Subject: [PATCH 1/2] bug-fixes --- backend/functions/utils.py | 5 +- backend/projects/tasks.py | 10 +- backend/tasks/views.py | 4 +- backend/users/utils.py | 30 +++++ .../convert_result_to_chitralekha_format.py | 107 +++++++++++++----- 5 files changed, 121 insertions(+), 35 deletions(-) diff --git a/backend/functions/utils.py b/backend/functions/utils.py index 07df9516a..b5cb46b36 100644 --- a/backend/functions/utils.py +++ b/backend/functions/utils.py @@ -15,6 +15,7 @@ LANG_NAME_TO_CODE_ULCA, LANG_TRANS_MODEL_CODES, LANG_NAME_TO_CODE_AZURE, + LANG_NAME_TO_CODE_ITV2, ) from google.cloud import vision from users.utils import LANG_NAME_TO_CODE_ULCA @@ -234,8 +235,8 @@ def get_batch_translations_using_indictransv2_nmt_api( target_language = "Urdu" # Convert language names to the language code - source_language = LANG_NAME_TO_CODE_ULCA[source_language] - target_language = LANG_NAME_TO_CODE_ULCA[target_language] + source_language = LANG_NAME_TO_CODE_ITV2[source_language] + target_language = LANG_NAME_TO_CODE_ITV2[target_language] # Create the input sentences list input_sentences = [{"source": sentence} for sentence in sentence_list] diff --git a/backend/projects/tasks.py b/backend/projects/tasks.py index b6a74bc66..e03bca8ef 100644 --- a/backend/projects/tasks.py +++ b/backend/projects/tasks.py @@ -214,6 +214,10 @@ def create_tasks_from_dataitems(items, project): # Bulk create the tasks Task.objects.bulk_create(tasks) + if "automatic_annotation_creation_mode" in project.metadata_json: + create_automatic_annotations( + tasks, project.metadata_json["automatic_annotation_creation_mode"] + ) if input_dataset_info["prediction"] is not None: user_object = User.objects.get(email="prediction@ai4bharat.org") @@ -317,7 +321,7 @@ def create_parameters_for_task_creation( sampling_parameters (dict): Parameters for sampling variable_parameters (dict): _description_ project_id (int): ID of the project object created in this iteration - + automatic_annotation_creation_mode: Creation mode for tasks """ filtered_items = filter_data_items( @@ -360,6 +364,10 @@ def create_parameters_for_task_creation( # Create Tasks from Parameters tasks = create_tasks_from_dataitems(sampled_items, project) if automatic_annotation_creation_mode != None: + project.metadata_json[ + "automatic_annotation_creation_mode" + ] = automatic_annotation_creation_mode + project.save() create_automatic_annotations(tasks, automatic_annotation_creation_mode) diff --git a/backend/tasks/views.py b/backend/tasks/views.py index 6932185ca..7a4b75532 100644 --- a/backend/tasks/views.py +++ b/backend/tasks/views.py @@ -138,7 +138,9 @@ def annotations(self, request, pk): # modifications for integrations of chitralekha UI if "enable_chitralekha_UI" in dict(request.query_params): for ann in annotations: - modified_result = convert_result_to_chitralekha_format(ann.result) + modified_result = convert_result_to_chitralekha_format( + ann.result, ann.id + ) ann.result = modified_result serializer = AnnotationSerializer(annotations, many=True) diff --git a/backend/users/utils.py b/backend/users/utils.py index 64974f4a7..09d3dbec2 100644 --- a/backend/users/utils.py +++ b/backend/users/utils.py @@ -101,6 +101,36 @@ "Urdu": "ur", } +LANG_NAME_TO_CODE_ITV2 = { + "English": "en", + "Assamese": "as", + "Bhojpuri": "bho", + "Bengali": "bn", + "Bodo": "brx", + "Dogri": "doi", + "Dhivehi": "dv", + "Konkani": "gom", + "Gujarati": "gu", + "Hindi": "hi", + "Kannada": "kn", + "Kashmiri": "ks", + "Mizo": "lus", + "Maithili": "mai", + "Malayalam": "ml", + "Manipuri": "mni", + "Marathi": "mr", + "Nepali": "ne", + "Odia": "or", + "Punjabi": "pa", + "Sanskrit": "sa", + "Santali": "sat", + "Sindhi": "sd", + "Sinhala": "si", + "Tamil": "ta", + "Telugu": "te", + "Urdu": "ur", +} + # Language codes to language names LANG_CODE_TO_NAME_GOOGLE = { lang_code: lang_name for lang_name, lang_code in LANG_NAME_TO_CODE_GOOGLE.items() diff --git a/backend/utils/convert_result_to_chitralekha_format.py b/backend/utils/convert_result_to_chitralekha_format.py index 4714e2c85..3e60317cf 100644 --- a/backend/utils/convert_result_to_chitralekha_format.py +++ b/backend/utils/convert_result_to_chitralekha_format.py @@ -1,43 +1,88 @@ -def convert_result_to_chitralekha_format(result): - result = sort_array_by_start(result) +def create_memory(result): + memory = {} + for i in range(len(result)): + key = result[i]["id"] + if key not in memory: + memory[key] = {"labels_dict_idx": -1, "text_dict_idx": -1} + if result[i]["type"] == "labels": + memory[key]["labels_dict_idx"] = i + else: + memory[key]["text_dict_idx"] = i + return memory + + +def convert_result_to_chitralekha_format(result, ann_id): + memory = create_memory(result) modified_result = [] count = 1 - for i in range(1, len(result), 2): - label_dict = result[i - 1] - text_dict = result[i] + seen = set() + for i in range(len(result)): + if i in seen: + continue + labels_dict_idx, text_dict_idx = ( + memory[result[i]["id"]]["labels_dict_idx"], + memory[result[i]["id"]]["text_dict_idx"], + ) + if labels_dict_idx == -1: + text_dict = result[text_dict_idx] + speaker_id = "Speaker 0" + seen.add(text_dict_idx) + elif text_dict_idx == -1: + print( + f"The data is corrupt for annotation id-{ann_id}, data id- {result[i]['id']}. " + f"It does not contain a corresponding text dictionary." + ) + continue + else: + label_dict = result[labels_dict_idx] + text_dict = result[text_dict_idx] + seen.add(labels_dict_idx) + seen.add(text_dict_idx) + speaker_id = label_dict["value"]["labels"][0] text = text_dict["value"]["text"][0] if text_dict["value"]["text"] else "" - chitra_dict = { - "text": text, - "end_time": convert_fractional_time_to_formatted(text_dict["value"]["end"]), - "speaker_id": label_dict["value"]["labels"][0], - "start_time": convert_fractional_time_to_formatted( - text_dict["value"]["start"] - ), - "id": count, - } + try: + chitra_dict = { + "text": text, + "end_time": convert_fractional_time_to_formatted( + text_dict["value"]["end"], ann_id, text_dict["id"] + ), + "speaker_id": speaker_id, + "start_time": convert_fractional_time_to_formatted( + text_dict["value"]["start"], ann_id, text_dict["id"] + ), + "id": count, + } + except Exception: + continue count += 1 modified_result.append(chitra_dict) - + modified_result = ( + sort_result_by_start_time(modified_result) if len(modified_result) > 0 else [] + ) return modified_result -def convert_fractional_time_to_formatted(minutes): - total_seconds = minutes * 60 - - hours = int(total_seconds // 3600) - total_seconds %= 3600 - - minutes = int(total_seconds // 60) - seconds = total_seconds % 60 - - formatted_time = f"{hours:02d}:{minutes:02d}:{seconds:06.3f}" - return formatted_time +def convert_fractional_time_to_formatted(decimal_time, ann_id, data_id): + if not ( + isinstance(decimal_time, str) + or isinstance(decimal_time, int) + or isinstance(decimal_time, float) + ): + print( + f"The data is corrupt for annotation id-{ann_id}, data id- {data_id}. " + f"Its start/end time are not stored as proper data type (int or float or string)." + ) + decimal_time = float(decimal_time) + hours = int(decimal_time // 60) + remaining_minutes = int(decimal_time % 60) + seconds_fraction = decimal_time - ((hours * 60) + remaining_minutes) + seconds = int(seconds_fraction * 60) + milliseconds = int((seconds_fraction * 60 - seconds) * 1000) + return f"{hours:02d}:{remaining_minutes:02d}:{seconds:02d}.{milliseconds:03d}" -def sort_array_by_start(array): - def sort_key(entry): - return entry["value"]["start"] - sorted_array = sorted(array, key=sort_key) - return sorted_array +def sort_result_by_start_time(result): + sorted_result = sorted(result, key=lambda x: x["start_time"]) + return sorted_result From c7ab04d0a7e0c2cbcd48bfa5b7ee2cb99b278fa6 Mon Sep 17 00:00:00 2001 From: Ayush Panwar Date: Fri, 18 Aug 2023 13:51:37 +0530 Subject: [PATCH 2/2] filter out private workspace analytics for public views --- backend/organizations/views.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/organizations/views.py b/backend/organizations/views.py index 1b5bf36ef..38813c616 100644 --- a/backend/organizations/views.py +++ b/backend/organizations/views.py @@ -2265,6 +2265,8 @@ def cumulative_tasks_count(self, request, pk=None): proj_objs = Project.objects.filter( organization_id=pk, project_type=project_type ) + if not request.user.is_authenticated: + proj_objs = proj_objs.filter(workspace_id__public_analytics=True) languages = list(set([proj.tgt_language for proj in proj_objs])) general_lang = []