Skip to content

Commit

Permalink
Added 11b version of BioASQ to existing bioasq_task_b.py. Closes #925
Browse files Browse the repository at this point in the history
…and #924 (#926)

* implemented bioasq 11b

* Normalized yesno answers and added choices to satisfy

runTest (__main__.TestDataLoader) [Check multiple choice]

Note: This test also used to fail for previous iterations of the dataset, but it now passes.

* fixed path of training10b.json
  • Loading branch information
mart1nro authored Jul 24, 2024
1 parent f614bae commit 03d2d96
Showing 1 changed file with 46 additions and 5 deletions.
51 changes: 46 additions & 5 deletions bigbio/hub/hub_repos/bioasq_task_b/bioasq_task_b.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,24 @@
_DATASETNAME = "bioasq_task_b"
_DISPLAYNAME = "BioASQ Task B"

# Human-readable description of the BioASQ 11b release. It is looked up
# through the _DESCRIPTION mapping under the "bioasq_11b" key.
# The backslash right after the opening quotes suppresses the leading newline,
# so the text starts directly with "The data are intended ...".
_BIOASQ_11B_DESCRIPTION = """\
The data are intended to be used as training and development data for BioASQ
11, which will take place during 2023. There is one file containing the data:
- training11b.json
The file contains the data of the first ten editions of the challenge: 4719
questions [1] with their relevant documents, snippets, concepts and RDF
triples, exact and ideal answers.
Differences with BioASQ-training10b.json
- 485 new questions added from BioASQ10
- The question with id 621ecf1a3a8413c653000061 had identical body with
5ac0a36f19833b0d7b000002. All relevant elements from both questions
are available in the merged question with id 5ac0a36f19833b0d7b000002.
[1] The distribution of 4719 questions : 1417 factoid, 1271 yesno, 1130 summary, 901 list
"""

_BIOASQ_10B_DESCRIPTION = """\
The data are intended to be used as training and development data for BioASQ
10, which will take place during 2022. There is one file containing the data:
Expand Down Expand Up @@ -361,6 +379,7 @@
Natural Language Processing' """

_DESCRIPTION = {
"bioasq_11b": _BIOASQ_11B_DESCRIPTION,
"bioasq_10b": _BIOASQ_10B_DESCRIPTION,
"bioasq_9b": _BIOASQ_9B_DESCRIPTION,
"bioasq_8b": _BIOASQ_8B_DESCRIPTION,
Expand All @@ -380,6 +399,7 @@
_LICENSE = "NLM_LICENSE"

_URLs = {
"bioasq_11b": ["BioASQ-training11b.zip", "Task11BGoldenEnriched.zip"],
"bioasq_10b": ["BioASQ-training10b.zip", "Task10BGoldenEnriched.zip"],
"bioasq_9b": ["BioASQ-training9b.zip", "Task9BGoldenEnriched.zip"],
"bioasq_8b": ["BioASQ-training8b.zip", "Task8BGoldenEnriched.zip"],
Expand Down Expand Up @@ -489,9 +509,9 @@ class BioasqTaskBDataset(datasets.GeneratorBasedBuilder):
SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)

# BioASQ2 through BioASQ10
# BioASQ2 through BioASQ11
BUILDER_CONFIGS = []
for version in range(2, 11):
for version in range(2, 12):
BUILDER_CONFIGS.append(
BigBioConfig(
name=f"bioasq_{version}b_source",
Expand Down Expand Up @@ -695,7 +715,8 @@ def _split_generators(self, dl_manager):
"bioasq_7b": "BioASQ-training7b/trainining7b.json",
"bioasq_8b": "training8b.json", # HACK - this zipfile strips the dirname
"bioasq_9b": "BioASQ-training9b/training9b.json",
"bioasq_10b": "BioASQ-training10b/training10b.json",
"bioasq_10b": "training10b.json",
"bioasq_11b": "BioASQ-training11b/training11b.json",
}

# BLURB has custom train/dev/test splits based on Task 7B
Expand Down Expand Up @@ -746,6 +767,19 @@ def _get_exact_answer(self, record):
)
return exact_answer

@staticmethod
def _normalize_yesno(yesno):
    """Normalize a single yes/no answer to the canonical ``['yes']`` or ``['no']``.

    Args:
        yesno: A sequence holding exactly one answer string, e.g. ``["Yes."]``.

    Returns:
        ``['yes']`` if the cleaned answer starts with "yes", ``['no']`` if it
        starts with "no".

    Raises:
        ValueError: If ``yesno`` does not contain exactly one answer, or if
            the answer starts with neither "yes" nor "no".
    """
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # under ``python -O``, which would let an empty list fail with IndexError
    # and let extra answers be silently dropped.
    if len(yesno) != 1:
        raise ValueError(
            f"There should be only one answer, got {len(yesno)}."
        )

    # Tolerate surrounding whitespace and casing, e.g. " Yes." -> "yes.".
    answer = yesno[0].strip().lower()

    # Prefix matching keeps variants with trailing punctuation ("yes.", "no,")
    # working; note it also accepts any other word starting with the prefix.
    if answer.startswith('yes'):
        return ['yes']
    if answer.startswith('no'):
        return ['no']
    raise ValueError(f'Unrecognized yesno value: {answer}')

def _generate_examples(self, filepath, split):
"""Yields examples as (key, example) tuples."""

Expand Down Expand Up @@ -777,6 +811,13 @@ def _generate_examples(self, filepath, split):
# for questions that do not have snippets, skip
if "snippets" not in record:
continue

choices = []
answer = self._get_exact_answer(record)
if record["type"] == 'yesno':
choices = ['yes', 'no']
answer = self._normalize_yesno(answer)

for i, snippet in enumerate(record["snippets"]):
key = f'{record["id"]}_{i}'
# ignore duplicate records
Expand All @@ -788,8 +829,8 @@ def _generate_examples(self, filepath, split):
"question_id": record["id"],
"question": record["body"],
"type": record["type"],
"choices": [],
"choices": choices,
"context": snippet["text"],
"answer": self._get_exact_answer(record),
"answer": answer,
}
uid += 1

0 comments on commit 03d2d96

Please sign in to comment.