Commit d4c44d1

Merge branch 'dev' into workspace_reports/frozen_users
ishvindersethi22 authored Sep 23, 2024
2 parents 23be68c + af3243c commit d4c44d1
Showing 36 changed files with 2,173 additions and 328 deletions.
22 changes: 22 additions & 0 deletions backend/dataset/migrations/0047_speechconversation_final_transcribed_json.py
@@ -0,0 +1,22 @@
# Generated by Django 3.2.14 on 2024-05-21 06:02

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("dataset", "0046_merge_20240416_2233"),
    ]

    operations = [
        migrations.AddField(
            model_name="speechconversation",
            name="final_transcribed_json",
            field=models.JSONField(
                blank=True,
                help_text="Field where data from this standardised_transcription_editing type will be exported.",
                null=True,
                verbose_name="final_transcribed_json",
            ),
        ),
    ]
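For reference, a one-line sanity check (not part of this commit) that the new field landed on the model after migrating; it relies only on Django's standard _meta API:

# Hedged sketch: assumes a configured Django environment for this repo.
from dataset.models import SpeechConversation

assert "final_transcribed_json" in {f.name for f in SpeechConversation._meta.get_fields()}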
@@ -0,0 +1,19 @@
# Generated by Django 3.2.14 on 2024-06-19 11:15

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("dataset", "0047_speechconversation_final_transcribed_json"),
    ]

    operations = [
        migrations.AddField(
            model_name="ocrdocument",
            name="bboxes_relation_prediction_json",
            field=models.JSONField(
                blank=True, null=True, verbose_name="bboxes_relation_prediction_json"
            ),
        ),
    ]
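Because the new column is null=True/blank=True, existing rows are untouched and simply carry no prediction yet. A minimal sketch (not part of this commit) of selecting such rows; the model and field names come from the diff, while the payload shape and batch size are assumptions:

# Minimal sketch, assuming a configured Django environment for this repo.
# The JSON payload below is hypothetical, purely for illustration.
from dataset.models import OCRDocument

pending = OCRDocument.objects.filter(bboxes_relation_prediction_json__isnull=True)

for doc in pending[:10]:  # illustrative batch of 10
    doc.bboxes_relation_prediction_json = {"bboxes": [], "relations": []}
    doc.save(update_fields=["bboxes_relation_prediction_json"])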
12 changes: 12 additions & 0 deletions backend/dataset/models.py
@@ -311,6 +311,10 @@ class OCRDocument(DatasetBase):
verbose_name="bboxes_relation_json", null=True, blank=True
)

bboxes_relation_prediction_json = models.JSONField(
verbose_name="bboxes_relation_prediction_json", null=True, blank=True
)

annotated_document_details_json = models.JSONField(
verbose_name="annotated_document_details_json", null=True, blank=True
)
@@ -484,6 +488,14 @@ class SpeechConversation(DatasetBase):
blank=True,
help_text=("Prepopulated prediction for the implemented models"),
)
final_transcribed_json = models.JSONField(
verbose_name="final_transcribed_json",
null=True,
blank=True,
help_text=(
"Field where data from this standardised_transcription_editing type will be exported."
),
)

def __str__(self):
return str(self.id)
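A similar hedged sketch (again not part of this commit) of filling final_transcribed_json on a single SpeechConversation row; only the model, field name, and help_text semantics come from the diff, and the JSON payload is invented:

# Hedged sketch: the payload format is an assumption, not the project's
# actual standardised_transcription_editing export format.
from dataset.models import SpeechConversation

item = SpeechConversation.objects.filter(final_transcribed_json__isnull=True).first()
if item is not None:
    item.final_transcribed_json = {"segments": [], "language": "unknown"}
    item.save(update_fields=["final_transcribed_json"])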
10 changes: 4 additions & 6 deletions backend/dataset/tasks.py
@@ -12,11 +12,9 @@
#### CELERY SHARED TASKS


-@shared_task(
-    bind=True,
-)
+@shared_task(queue="default")
def upload_data_to_data_instance(
-    self, dataset_string, pk, dataset_type, content_type, deduplicate=False
+    dataset_string, pk, dataset_type, content_type, deduplicate=False
):
# sourcery skip: raise-specific-error
"""Celery background task to upload the data to the dataset instance through file upload.
@@ -102,8 +100,8 @@ def upload_data_to_data_instance(
raise Exception(f"Upload failed for lines: {failed_rows}")


-@shared_task(bind=True)
-def deduplicate_dataset_instance_items(self, pk, deduplicate_field_list):
+@shared_task(queue="default")
+def deduplicate_dataset_instance_items(pk, deduplicate_field_list):
if len(deduplicate_field_list) == 0:
return "Field list cannot be empty"
try:
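With bind=True dropped, these task bodies no longer receive self, and queue="default" should route them to the default Celery queue. A sketch of the updated call sites, with invented arguments:

# Illustrative call sites only; the pk, dataset type, content type, and field
# names are made up. With bind=True removed, positional arguments now start
# at dataset_string / pk instead of self.
from dataset.tasks import (
    deduplicate_dataset_instance_items,
    upload_data_to_data_instance,
)

dataset_string = "text\nhello world"  # hypothetical uploaded file contents
upload_data_to_data_instance.delay(dataset_string, 42, "SentenceText", "csv", deduplicate=False)
deduplicate_dataset_instance_items.delay(42, ["text"])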
2 changes: 2 additions & 0 deletions backend/dataset/views.py
@@ -1038,6 +1038,7 @@ def project_analytics(self, request, pk=None):
calculate_word_error_rate_between_two_audio_transcription_annotation(
review_annotation.result,
review_annotation.parent_annotation.result,
project_type,
)
)
except:
@@ -1067,6 +1068,7 @@ def project_analytics(self, request, pk=None):
calculate_word_error_rate_between_two_audio_transcription_annotation(
supercheck_annotation.result,
supercheck_annotation.parent_annotation.result,
project_type,
)
)
except:
120 changes: 111 additions & 9 deletions backend/functions/tasks.py
@@ -29,7 +29,7 @@
ANNOTATED,
)
from tasks.views import SentenceOperationViewSet
-from users.models import User, LANG_CHOICES
+from users.models import User
from django.core.mail import EmailMessage

from utils.blob_functions import (
@@ -56,7 +56,11 @@
import tempfile

from shoonya_backend.locks import Lock

from utils.constants import LANG_CHOICES
from projects.tasks import filter_data_items
from projects.models import BATCH
from dataset import models as dataset_models
from projects.registry_helper import ProjectRegistry
import logging

logger = logging.getLogger(__name__)
@@ -72,6 +76,10 @@ def sentence_text_translate_and_save_translation_pairs(
input_dataset_instance_id,
output_dataset_instance_id,
batch_size,
filter_string,
sampling_mode,
sampling_parameters,
variable_parameters,
api_type="indic-trans-v2",
checks_for_particular_languages=False,
automate_missing_data_items=True,
@@ -87,6 +95,10 @@
Allowed - [indic-trans, google, indic-trans-v2, azure, blank]
checks_for_particular_languages (bool): If True, checks for the particular languages in the translations.
automate_missing_data_items (bool): If True, consider only those data items that are missing in the target dataset instance.
filter_string (str): string to filter input data.
sampling_mode (str): can be batch or full.
sampling_parameters (json): JSON containing the batch number and batch size.
"""
task_name = "sentence_text_translate_and_save_translation_pairs"
output_sentences = list(
@@ -113,6 +125,14 @@
"metadata_json",
)
)
if filter_string and sampling_mode and sampling_parameters:
input_sentences = get_filtered_items(
"SentenceText",
input_dataset_instance_id,
filter_string,
sampling_mode,
sampling_parameters,
)

# Convert the input_sentences list into a dataframe
input_sentences_complete_df = pd.DataFrame(
@@ -403,7 +423,15 @@

@shared_task(bind=True)
def generate_ocr_prediction_json(
-    self, dataset_instance_id, user_id, api_type, automate_missing_data_items
+    self,
+    dataset_instance_id,
+    user_id,
+    api_type,
+    automate_missing_data_items,
+    filter_string,
+    sampling_mode,
+    sampling_parameters,
+    variable_parameters,
):
"""Function to generate OCR prediction data and to save to the same data item.
Args:
@@ -436,7 +464,14 @@ def generate_ocr_prediction_json(
)
except Exception as e:
ocr_data_items = []

if filter_string and sampling_mode and sampling_parameters:
ocr_data_items = get_filtered_items(
"OCRDocument",
dataset_instance_id,
filter_string,
sampling_mode,
sampling_parameters,
)
# converting the dataset_instance to pandas dataframe.
ocr_data_items_df = pd.DataFrame(
ocr_data_items,
@@ -555,7 +590,15 @@ def generate_ocr_prediction_json(

@shared_task(bind=True)
def generate_asr_prediction_json(
-    self, dataset_instance_id, user_id, api_type, automate_missing_data_items
+    self,
+    dataset_instance_id,
+    user_id,
+    api_type,
+    automate_missing_data_items,
+    filter_string,
+    sampling_mode,
+    sampling_parameters,
+    variable_parameters,
):
"""Function to generate ASR prediction data and to save to the same data item.
Args:
@@ -589,7 +632,14 @@ def generate_asr_prediction_json(
)
except Exception as e:
asr_data_items = []

if filter_string and sampling_mode and sampling_parameters:
asr_data_items = get_filtered_items(
"SpeechConversation",
dataset_instance_id,
filter_string,
sampling_mode,
sampling_parameters,
)
# converting the dataset_instance to pandas dataframe.
asr_data_items_df = pd.DataFrame(
asr_data_items,
@@ -703,7 +753,16 @@ def generate_asr_prediction_json(


@shared_task(bind=True)
-def populate_draft_data_json(self, pk, user_id, fields_list):
+def populate_draft_data_json(
+    self,
+    pk,
+    user_id,
+    fields_list,
+    filter_string,
+    sampling_mode,
+    sampling_parameters,
+    variable_parameters,
+):
task_name = "populate_draft_data_json"
try:
dataset_instance = DatasetInstance.objects.get(pk=pk)
@@ -712,6 +771,10 @@ def populate_draft_data_json(self, pk, user_id, fields_list):
dataset_type = dataset_instance.dataset_type
dataset_model = apps.get_model("dataset", dataset_type)
dataset_items = dataset_model.objects.filter(instance_id=dataset_instance)
if filter_string and sampling_mode and sampling_parameters:
dataset_items = get_filtered_items(
dataset_type, pk, filter_string, sampling_mode, sampling_parameters
)
cnt = 0
for dataset_item in dataset_items:
new_draft_data_json = {}
@@ -1371,6 +1434,7 @@ def get_stats_helper(
get_most_recent_annotation(
ann_obj.parent_annotation
).result,
project_type,
)
)
except Exception as error:
@@ -1425,6 +1489,7 @@ def get_stats_helper(
get_most_recent_annotation(
ann_obj.parent_annotation.parent_annotation
).result,
project_type,
)
)
except Exception as error:
@@ -1436,6 +1501,7 @@ def get_stats_helper(
get_most_recent_annotation(
ann_obj.parent_annotation
).result,
project_type,
)
)
except Exception as error:
@@ -1447,6 +1513,7 @@ def get_stats_helper(
get_most_recent_annotation(
ann_obj.parent_annotation.parent_annotation
).result,
project_type,
)
)
except Exception as error:
@@ -1518,10 +1585,10 @@ def calculate_ced_between_two_annotations(annotation1, annotation2):
return ced_list


-def calculate_wer_between_two_annotations(annotation1, annotation2):
+def calculate_wer_between_two_annotations(annotation1, annotation2, project_type):
try:
return calculate_word_error_rate_between_two_audio_transcription_annotation(
-            annotation1, annotation2
+            annotation1, annotation2, project_type
)
except Exception as e:
return 0
@@ -1691,3 +1758,38 @@ def upload_all_projects_to_blob_and_get_url(csv_files_directory):
return "Error in generating url"
blob_url = f"https://{account_name}.blob.{endpoint_suffix}/{CONTAINER_NAME_FOR_DOWNLOAD_ALL_PROJECTS}/{blob_client.blob_name}?{sas_token}"
return blob_url


def get_filtered_items(
dataset_model,
dataset_instance_id,
filter_string,
sampling_mode,
sampling_parameters,
):
registry_helper = ProjectRegistry.get_instance()
project_type = registry_helper.get_project_name_from_dataset(dataset_model)
if not isinstance(dataset_instance_id, list):
dataset_instance_id = [dataset_instance_id]
filtered_items = filter_data_items(
project_type=project_type,
dataset_instance_ids=dataset_instance_id,
filter_string=filter_string,
)
# Apply sampling
if sampling_mode == BATCH:
batch_size = sampling_parameters["batch_size"]
try:
batch_number = sampling_parameters["batch_number"]
if len(batch_number) == 0:
batch_number = [1]
except KeyError:
batch_number = [1]
sampled_items = []
for batch_num in batch_number:
sampled_items += filtered_items[
batch_size * (batch_num - 1) : batch_size * batch_num
]
else:
sampled_items = filtered_items
return sampled_items
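The BATCH branch treats batch_number as a list of 1-indexed batch ids and concatenates one batch_size-long slice per id; a self-contained sketch of just that arithmetic:

# Stand-alone illustration of the slicing in get_filtered_items; the integer
# list stands in for the items returned by filter_data_items.
filtered_items = list(range(10))
sampling_parameters = {"batch_size": 3, "batch_number": [1, 3]}

batch_size = sampling_parameters["batch_size"]
batch_number = sampling_parameters.get("batch_number") or [1]

sampled_items = []
for batch_num in batch_number:
    sampled_items += filtered_items[batch_size * (batch_num - 1) : batch_size * batch_num]

assert sampled_items == [0, 1, 2, 6, 7, 8]  # batch 1 -> items 0..2, batch 3 -> items 6..8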
