Skip to content

Commit

Permalink
Merge branch 'dev' into logger_part_1
Browse files Browse the repository at this point in the history
  • Loading branch information
KunalTiwary authored Aug 22, 2023
2 parents 075e22c + 5def97e commit d6db03c
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 35 deletions.
5 changes: 3 additions & 2 deletions backend/functions/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
LANG_NAME_TO_CODE_ULCA,
LANG_TRANS_MODEL_CODES,
LANG_NAME_TO_CODE_AZURE,
LANG_NAME_TO_CODE_ITV2,
)
from google.cloud import vision
from users.utils import LANG_NAME_TO_CODE_ULCA
Expand Down Expand Up @@ -234,8 +235,8 @@ def get_batch_translations_using_indictransv2_nmt_api(
target_language = "Urdu"

# Convert language names to the language code
source_language = LANG_NAME_TO_CODE_ULCA[source_language]
target_language = LANG_NAME_TO_CODE_ULCA[target_language]
source_language = LANG_NAME_TO_CODE_ITV2[source_language]
target_language = LANG_NAME_TO_CODE_ITV2[target_language]

# Create the input sentences list
input_sentences = [{"source": sentence} for sentence in sentence_list]
Expand Down
2 changes: 2 additions & 0 deletions backend/organizations/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -2265,6 +2265,8 @@ def cumulative_tasks_count(self, request, pk=None):
proj_objs = Project.objects.filter(
organization_id=pk, project_type=project_type
)
if not request.user.is_authenticated:
proj_objs = proj_objs.filter(workspace_id__public_analytics=True)

languages = list(set([proj.tgt_language for proj in proj_objs]))
general_lang = []
Expand Down
10 changes: 9 additions & 1 deletion backend/projects/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,10 @@ def create_tasks_from_dataitems(items, project):
# Bulk create the tasks
Task.objects.bulk_create(tasks)

if "automatic_annotation_creation_mode" in project.metadata_json:
create_automatic_annotations(
tasks, project.metadata_json["automatic_annotation_creation_mode"]
)
if input_dataset_info["prediction"] is not None:
user_object = User.objects.get(email="[email protected]")

Expand Down Expand Up @@ -317,7 +321,7 @@ def create_parameters_for_task_creation(
sampling_parameters (dict): Parameters for sampling
variable_parameters (dict): _description_
project_id (int): ID of the project object created in this iteration
automatic_annotation_creation_mode: Creation mode for tasks
"""

filtered_items = filter_data_items(
Expand Down Expand Up @@ -360,6 +364,10 @@ def create_parameters_for_task_creation(
# Create Tasks from Parameters
tasks = create_tasks_from_dataitems(sampled_items, project)
if automatic_annotation_creation_mode != None:
project.metadata_json[
"automatic_annotation_creation_mode"
] = automatic_annotation_creation_mode
project.save()
create_automatic_annotations(tasks, automatic_annotation_creation_mode)


Expand Down
4 changes: 3 additions & 1 deletion backend/tasks/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,9 @@ def annotations(self, request, pk):
# modifications for integrations of chitralekha UI
if "enable_chitralekha_UI" in dict(request.query_params):
for ann in annotations:
modified_result = convert_result_to_chitralekha_format(ann.result)
modified_result = convert_result_to_chitralekha_format(
ann.result, ann.id
)
ann.result = modified_result

serializer = AnnotationSerializer(annotations, many=True)
Expand Down
30 changes: 30 additions & 0 deletions backend/users/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,36 @@
"Urdu": "ur",
}

# Language name -> language code mapping used when calling the
# IndicTrans v2 (ITV2) NMT translation API (see
# get_batch_translations_using_indictransv2_nmt_api).
# NOTE(review): codes look like BCP-47/ISO-639 variants specific to ITV2
# (e.g. "brx", "gom", "mni") — confirm against the ITV2 API docs.
LANG_NAME_TO_CODE_ITV2 = {
    "English": "en",
    "Assamese": "as",
    "Bhojpuri": "bho",
    "Bengali": "bn",
    "Bodo": "brx",
    "Dogri": "doi",
    "Dhivehi": "dv",
    "Konkani": "gom",
    "Gujarati": "gu",
    "Hindi": "hi",
    "Kannada": "kn",
    "Kashmiri": "ks",
    "Mizo": "lus",
    "Maithili": "mai",
    "Malayalam": "ml",
    "Manipuri": "mni",
    "Marathi": "mr",
    "Nepali": "ne",
    "Odia": "or",
    "Punjabi": "pa",
    "Sanskrit": "sa",
    "Santali": "sat",
    "Sindhi": "sd",
    "Sinhala": "si",
    "Tamil": "ta",
    "Telugu": "te",
    "Urdu": "ur",
}

# Language codes to language names
LANG_CODE_TO_NAME_GOOGLE = {
lang_code: lang_name for lang_name, lang_code in LANG_NAME_TO_CODE_GOOGLE.items()
Expand Down
107 changes: 76 additions & 31 deletions backend/utils/convert_result_to_chitralekha_format.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,88 @@
def convert_result_to_chitralekha_format(result):
result = sort_array_by_start(result)
def create_memory(result):
    """Index result entries by their "id".

    Returns a dict mapping each id to
    ``{"labels_dict_idx": i, "text_dict_idx": j}`` where ``i``/``j`` are the
    positions in ``result`` of the "labels"-typed and non-"labels"-typed
    entries carrying that id, or -1 when no such entry exists.
    """
    index_by_id = {}
    for position, entry in enumerate(result):
        slot = index_by_id.setdefault(
            entry["id"], {"labels_dict_idx": -1, "text_dict_idx": -1}
        )
        if entry["type"] == "labels":
            slot["labels_dict_idx"] = position
        else:
            slot["text_dict_idx"] = position
    return index_by_id


def convert_result_to_chitralekha_format(result, ann_id):
    """Convert an annotation ``result`` list into Chitralekha's segment format.

    Each output segment pairs a "labels" entry (speaker) with the text entry
    sharing the same "id" (via ``create_memory``). A text entry with no
    matching labels entry falls back to speaker "Speaker 0"; a labels entry
    with no matching text entry is logged as corrupt and skipped. Segments
    whose start/end times cannot be formatted are skipped as well.

    Args:
        result: list of result dicts, each with "id", "type" and "value".
        ann_id: annotation id, used only in corruption log messages.

    Returns:
        List of segment dicts sorted by "start_time"; [] when nothing
        could be converted.
    """
    # NOTE: the original span contained leftover pre-change diff lines (an
    # old index-pairing loop and a duplicate chitra_dict literal) that were
    # dead/broken code; this is the coherent post-change implementation.
    memory = create_memory(result)
    modified_result = []
    count = 1
    seen = set()  # indices already consumed as part of an emitted segment
    for i in range(len(result)):
        if i in seen:
            continue
        labels_dict_idx, text_dict_idx = (
            memory[result[i]["id"]]["labels_dict_idx"],
            memory[result[i]["id"]]["text_dict_idx"],
        )
        if labels_dict_idx == -1:
            # Text without a speaker label: default the speaker.
            text_dict = result[text_dict_idx]
            speaker_id = "Speaker 0"
            seen.add(text_dict_idx)
        elif text_dict_idx == -1:
            # Label without text: corrupt data, log and skip.
            print(
                f"The data is corrupt for annotation id-{ann_id}, data id- {result[i]['id']}. "
                f"It does not contain a corresponding text dictionary."
            )
            continue
        else:
            label_dict = result[labels_dict_idx]
            text_dict = result[text_dict_idx]
            seen.add(labels_dict_idx)
            seen.add(text_dict_idx)
            speaker_id = label_dict["value"]["labels"][0]
        text = text_dict["value"]["text"][0] if text_dict["value"]["text"] else ""
        try:
            chitra_dict = {
                "text": text,
                "end_time": convert_fractional_time_to_formatted(
                    text_dict["value"]["end"], ann_id, text_dict["id"]
                ),
                "speaker_id": speaker_id,
                "start_time": convert_fractional_time_to_formatted(
                    text_dict["value"]["start"], ann_id, text_dict["id"]
                ),
                "id": count,
            }
        except Exception:
            # Unformattable start/end — already logged by the converter.
            continue
        count += 1

        modified_result.append(chitra_dict)

    modified_result = (
        sort_result_by_start_time(modified_result) if len(modified_result) > 0 else []
    )
    return modified_result


def convert_fractional_time_to_formatted(minutes):
    """Format a fractional-minutes value as an "HH:MM:SS.sss" timestamp."""
    secs_total = minutes * 60
    hrs, secs_total = divmod(secs_total, 3600)
    mins, secs = divmod(secs_total, 60)
    return f"{int(hrs):02d}:{int(mins):02d}:{secs:06.3f}"
def convert_fractional_time_to_formatted(decimal_time, ann_id, data_id):
    """Format a fractional-minutes value as an "HH:MM:SS.mmm" timestamp.

    ``decimal_time`` may be an int, float, or numeric string; any other type
    is logged as corrupt (the subsequent float() call then raises, which the
    caller handles). ``ann_id``/``data_id`` are used only in the log message.
    """
    if not isinstance(decimal_time, (str, int, float)):
        print(
            f"The data is corrupt for annotation id-{ann_id}, data id- {data_id}. "
            f"Its start/end time are not stored as proper data type (int or float or string)."
        )
    total_minutes = float(decimal_time)
    whole_hours = int(total_minutes // 60)
    whole_minutes = int(total_minutes % 60)
    # Fractional remainder of the minutes value, expressed in seconds.
    frac_minutes = total_minutes - ((whole_hours * 60) + whole_minutes)
    whole_seconds = int(frac_minutes * 60)
    millis = int((frac_minutes * 60 - whole_seconds) * 1000)

    return f"{whole_hours:02d}:{whole_minutes:02d}:{whole_seconds:02d}.{millis:03d}"

def sort_array_by_start(array):
    """Return the entries ordered by their value["start"] field."""
    return sorted(array, key=lambda entry: entry["value"]["start"])
def sort_result_by_start_time(result):
    """Return the segments ordered by their "start_time" value."""
    def start_key(segment):
        return segment["start_time"]

    return sorted(result, key=start_key)
27 changes: 27 additions & 0 deletions docker-compose-dev-elasticsearch.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
version: '3.3'

services:
elasticsearch:
container_name: elasticsearch
image: docker.elastic.co/elasticsearch/elasticsearch:7.14.0
volumes:
- ./elasticsearch/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml
- elasticsearch_vol:/elasticsearch_data
environment:
- discovery.type=single-node
ports:
- "9200:9200"
- "9300:9300"

kibana:
container_name: kibana
image: docker.elastic.co/kibana/kibana:7.14.0
ports:
- 5601:5601
depends_on:
- elasticsearch

volumes:
elasticsearch_vol:
external: true
static_volume:
4 changes: 4 additions & 0 deletions elasticsearch/elasticsearch.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
cluster.name: "docker-cluster"
network.host: 0.0.0.0
path.data: /elasticsearch_data
discovery.type: "single-node"

0 comments on commit d6db03c

Please sign in to comment.