forked from bigscience-workshop/biomedical
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bioasq_task_b.py
795 lines (723 loc) · 36.6 KB
/
bioasq_task_b.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
BioASQ Task B On Biomedical Semantic QA (Involves IR, QA, Summarization qnd
More). This task uses benchmark datasets containing development and test
questions, in English, along with gold standard (reference) answers constructed
by a team of biomedical experts. The participants have to respond with relevant
concepts, articles, snippets and RDF triples, from designated resources, as well
as exact and 'ideal' answers.
Fore more information about the challenge, the organisers and the relevant
publications please visit: http://bioasq.org/
"""
import glob
import json
import os
import re
import datasets
from .bigbiohub import qa_features
from .bigbiohub import BigBioConfig
from .bigbiohub import Tasks
_LANGUAGES = ['English']
_PUBMED = True
_LOCAL = True
_CITATION = """\
@article{tsatsaronis2015overview,
title = {
An overview of the BIOASQ large-scale biomedical semantic indexing and
question answering competition
},
author = {
Tsatsaronis, George and Balikas, Georgios and Malakasiotis, Prodromos
and Partalas, Ioannis and Zschunke, Matthias and Alvers, Michael R and
Weissenborn, Dirk and Krithara, Anastasia and Petridis, Sergios and
Polychronopoulos, Dimitris and others
},
year = 2015,
journal = {BMC bioinformatics},
publisher = {BioMed Central Ltd},
volume = 16,
number = 1,
pages = 138
}
"""
_DATASETNAME = "bioasq_task_b"
_DISPLAYNAME = "BioASQ Task B"
_BIOASQ_10B_DESCRIPTION = """\
The data are intended to be used as training and development data for BioASQ
10, which will take place during 2022. There is one file containing the data:
- training10b.json
The file contains the data of the first nine editions of the challenge: 4234
questions [1] with their relevant documents, snippets, concepts and RDF
triples, exact and ideal answers.
Differences with BioASQ-training9b.json
- 492 new questions added from BioASQ9
- The question with id 56c1f01eef6e394741000046 had identical body with
602498cb1cb411341a00009e. All relevant elements from both questions
are available in the merged question with id 602498cb1cb411341a00009e.
- The question with id 5c7039207c78d69471000065 had identical body with
601c317a1cb411341a000014. All relevant elements from both questions
are available in the merged question with id 601c317a1cb411341a000014.
- The question with id 5e4b540b6d0a27794100001c had identical body with
602828b11cb411341a0000fc. All relevant elements from both questions
are available in the merged question with id 602828b11cb411341a0000fc.
- The question with id 5fdb42fba43ad31278000027 had identical body with
5d35eb01b3a638076300000f. All relevant elements from both questions
are available in the merged question with id 5d35eb01b3a638076300000f.
- The question with id 601d76311cb411341a000045 had identical body with
6060732b94d57fd87900003d. All relevant elements from both questions
are available in the merged question with id 6060732b94d57fd87900003d.
[1] 4234 questions : 1252 factoid, 1148 yesno, 1018 summary, 816 list
"""
_BIOASQ_9B_DESCRIPTION = """\
The data are intended to be used as training and development data for BioASQ 9,
which will take place during 2021. There is one file containing the data:
- training9b.json
The file contains the data of the first seven editions of the challenge: 3742
questions [1] with their relevant documents, snippets, concepts and RDF triples,
exact and ideal answers.
Differences with BioASQ-training8b.json
- 499 new questions added from BioASQ8
- The question with id 5e30e689fbd6abf43b00003a had identical body with
5880e417713cbdfd3d000001. All relevant elements from both questions
are available in the merged question with id 5880e417713cbdfd3d000001.
[1] 3742 questions : 1091 factoid, 1033 yesno, 899 summary, 719 list
"""
_BIOASQ_8B_DESCRIPTION = """\
The data are intended to be used as training and development data for BioASQ 8,
which will take place during 2020. There is one file containing the data:
- training8b.json
The file contains the data of the first seven editions of the challenge: 3243
questions [1] with their relevant documents, snippets, concepts and RDF triples,
exact and ideal answers.
Differences with BioASQ-training7b.json
- 500 new questions added from BioASQ7
- 4 questions were removed
- The question with id 5717fb557de986d80d000009 had identical body with
571e06447de986d80d000016. All relevant elements from both questions
are available in the merged question with id 571e06447de986d80d000016.
- The question with id 5c589ddb86df2b917400000b had identical body with
5c6b7a9e7c78d69471000029. All relevant elements from both questions
are available in the merged question with id 5c6b7a9e7c78d69471000029.
- The question with id 52ffb5d12059c6d71c00007c had identical body with
52e7870a98d023950500001a. All relevant elements from both questions
are available in the merged question with id 52e7870a98d023950500001a.
- The question with id 53359338d6d3ac6a3400004f had identical body with
589a246878275d0c4a000030. All relevant elements from both questions
are available in the merged question with id 589a246878275d0c4a000030.
**** UPDATE 25/02/2020 *****
The previous version of the dataset contained an inconsistency on question with
id "5c9904eaecadf2e73f00002e", where the "ideal_answer" field was missing.
This has been fixed.
"""
_BIOASQ_7B_DESCRIPTION = """\
The data are intended to be used as training and development data for BioASQ 7,
which will take place during 2019. There is one file containing the data:
- BioASQ-trainingDataset7b.json
The file contains the data of the first six editions of the challenge: 2747
questions [1] with their relevant documents, snippets, concepts and RDF triples,
exact and ideal answers.
Differences with BioASQ-trainingDataset6b.json
- 500 new questions added from BioASQ6
- 4 questions were removed
- The question with id 569ed752ceceede94d000004 had identical body with
a new question from BioASQ6. All relevant elements from both questions
are available in the merged question with id 5abd31e0fcf456587200002c
- 3 questions were removed as incomplete: 54d643023706e89528000007,
532819afd6d3ac6a3400000f, 517545168ed59a060a00002b
- 4 questions were revised for various confusions that have been identified
- In 2 questions the ideal answer has been revised :
51406e6223fec90375000009, 5172f8118ed59a060a000019
- In 4 questions the snippets and documents list has been revised :
51406e6223fec90375000009, 5172f8118ed59a060a000019,
51593dc8d24251bc05000099, 5158a5b8d24251bc05000097
- In 198 questions the documents list has updated with missing
documents from the relevant snippets list. [2]
[1] 2747 questions : 779 factoid, 745 yesno, 667 summary, 556 list
[2] 55031181e9bde69634000014, 51406e6223fec90375000009, 54d643023706e89528000007,
52bf1b0a03868f1b06000009, 52bf19c503868f1b06000001, 51593dc8d24251bc05000099,
530a5117970c65fa6b000007, 553a8d78f321868558000003, 531a3fe3b166e2b806000038,
532819afd6d3ac6a3400000f, 5158a5b8d24251bc05000097, 553653a5bc4f83e828000007,
535d2cf09a4572de6f000004, 53386282d6d3ac6a3400005a, 517a8ce98ed59a060a000045,
55391ce8bc4f83e828000018, 5547d700f35db75526000007, 5713bf261174fb1755000011,
6f15c5a2ac5ed1459000012, 52b2e498f828ad283c000010, 570a7594cf1c325851000026,
530cefaaad0bf1360c000012, 530f685c329f5fcf1e000002, 550c4011a103b78016000009,
552faababc4f83e828000005, 54cf48acf693c3b16b00000b, 550313aae9bde6963400001f,
551177626a8cde6b72000005, 54eded8c94afd6150400000c, 550c3754a103b78016000007,
56f555b609dd18d46b000007, 54c26e29f693c3b16b000003, 54da0c524b1fd0d33c00000b,
52bf1d3c03868f1b0600000d, 5343bdd6aeec6fbd07000001, 52cb9b9b03868f1b0600002d,
55423875ec76f5e50c000002, 571366ba1174fb1755000005, 56c4d14ab04e159d0e000003,
550c44d1a103b7801600000a, 5547a01cf35db75526000005, 55422640ccca0ce74b000004,
54ecb66d445c3b5a5f000002, 553656c4bc4f83e828000009, 5172f8118ed59a060a000019,
513711055274a5fb0700000e, 54d892ee014675820d000005, 52e6c92598d0239505000019,
5353aedb288f4dae47000006, 52bf1f1303868f1b06000014, 5519113b622b19434500000f,
52b2f1724003448f5500000b, 5525317687ecba3764000007, 554a0cadf35db7552600000f,
55152bd246478f2f2c000002, 516c3960298dcd4e51000073, 571e417bbb137a4b0c00000a,
551910d3622b194345000008, 54dc8ed6c0bb8dce23000002, 511a4ec01159fa8212000004,
54d8ea2c4b1fd0d33c000002, 5148e1d6d24251bc0500003a, 515dbb3b298dcd4e51000018,
56f7c15a09dd18d46b000012, 51475d5cd24251bc0500001b, 54db7c4ac0bb8dce23000001,
57152ebbcb4ef8864c000002, 57134d511174fb1755000002, 55149f156a8cde6b72000013,
56bcd422d36b5da378000005, 54ede5c394afd61504000006, 517545168ed59a060a00002b,
5710ed19a5ed216440000003, 53442472aeec6fbd07000008, 55088e412e93f0133a000001,
54d762653706e89528000014, 550aef0ec2af5d5b7000000a, 552435602c8b63434a000009,
552446612c8b63434a00000c, 54d901ec4b1fd0d33c000006, 54cf45e7f693c3b16b00000a,
52fc8b772059c6d71c00006e, 5314d05adae131f84700000d, 5512c91b6a8cde6b7200000b,
56c5a7605795f9a73e000002, 55030a6ce9bde6963400000f, 553fac39c6a5098552000001,
531a3a58b166e2b806000037, 5509bd6a1180f13250000002, 54f9c40ddd3fc62544000001,
553c8fd1f32186855800000a, 56bce51cd36b5da37800000a, 550316a6e9bde69634000029,
55031286e9bde6963400001b, 536e46f27d100faa09000012, 5502abd1e9bde69634000008,
551af9106b348bb82c000002, 54edeb4394afd6150400000b, 5717cdd2070aa3d072000001,
56c5ade15795f9a73e000003, 531464a6e3eabad021000014, 58a0d87a78275d0c4a000053,
58a3160d60087bc10a00000a, 58a5d54860087bc10a000025, 58a0da5278275d0c4a000054,
58a3264e60087bc10a00000d, 589c8ef878275d0c4a000042, 58a3428d60087bc10a00001b,
58a3196360087bc10a00000b, 58a341eb60087bc10a000018, 58a3275960087bc10a00000f,
58a342e760087bc10a00001c, 58bd645702b8c60953000010, 58bc8e5002b8c60953000006,
58bc8e7a02b8c60953000007, 58a1da4e78275d0c4a000059, 58bcb83d02b8c6095300000f,
58bc9a5002b8c60953000008, 589dee3778275d0c4a000050, 58a32efe60087bc10a000013,
58a327bf60087bc10a000011, 58bca08702b8c6095300000a, 58bc9dbb02b8c60953000009,
58c99fcc02b8c60953000029, 58bca2f302b8c6095300000c, 58cbf1f402b8c60953000036,
58cdb41302b8c60953000042, 58cdb80302b8c60953000043, 58cdbaf302b8c60953000044,
58cb305c02b8c60953000032, 58caf86f02b8c60953000030, 58c1b2f702b8c6095300001e,
58bde18b02b8c60953000014, 58eb7898eda5a57672000006, 58caf88c02b8c60953000031,
58e11bf76fddd3e83e00000c, 58cdbbd102b8c60953000045, 58df779d6fddd3e83e000001,
58dbb4f08acda3452900001a, 58dbb8968acda3452900001b, 58add7699ef3c34033000009,
58dbbbf08acda3452900001d, 58dbba438acda3452900001c, 58dd2cb08acda34529000029,
58eb9542eda5a57672000007, 58f3ca5c70f9fc6f0f00000d, 58e9e7aa3e8b6dc87c00000d,
58e3d9ab3e8b6dc87c000002, 58eb4ce7eda5a57672000004, 58f3c8f470f9fc6f0f00000c,
58f3c62970f9fc6f0f00000b, 58adca6d9ef3c34033000007, 58f4b3ee70f9fc6f0f000013,
593ff22b70f9fc6f0f000023, 5a679875b750ff4455000004, 5a774585faa1ab7d2e000005,
5a6f7245b750ff4455000050, 5a787544faa1ab7d2e00000b, 5a74d9980384be9551000008,
5a6a02a3b750ff4455000021, 5a6e47b1b750ff4455000049, 5a87124561bb38fb24000001,
5a6e42f1b750ff4455000046, 5a8b1264fcd1d6a10c00001d, 5a981e66fcd1d6a10c00002f,
5a8718c861bb38fb24000008, 5a7615af83b0d9ea6600001f, 5a87140a61bb38fb24000003,
5a77072c9e632bc06600000a, 5a897601fcd1d6a10c000008, 5a871a6861bb38fb24000009,
5a74e9ad0384be955100000a, 5a79d25dfaa1ab7d2e00000f, 5a6900ebb750ff445500001d,
5a87145861bb38fb24000004, 5a871b8d61bb38fb2400000a, 5a897a06fcd1d6a10c00000b,
5a8dc6b4fcd1d6a10c000026, 5a8712af61bb38fb24000002, 5a8714e261bb38fb24000005,
5aa304f1d6d6b54f79000004, 5a981bcffcd1d6a10c00002d, 5aa3fa73d6d6b54f79000008,
5aa55b45d6d6b54f7900000d, 5a981dd0fcd1d6a10c00002e, 5a9700adfcd1d6a10c00002c,
5a9d8ffe1d1251d03b000022, 5a96c74cfcd1d6a10c000029, 5aa50086d6d6b54f7900000c,
5a95765bfcd1d6a10c000028, 5a96f40cfcd1d6a10c00002b, 5ab144fefcf4565872000012,
5aa67b4fd6d6b54f7900000f, 5abd5a62fcf4565872000031, 5abbe429fcf456587200001c,
5aaef38dfcf456587200000f, 5abce6acfcf4565872000022, 5aae6499fcf456587200000c
"""
_BIOASQ_6B_DESCRIPTION = """\
The data are intended to be used as training and development data for BioASQ 6,
which will take place during 2018. There is one file containing the data:
- BioASQ-trainingDataset6b.json
Differences with BioASQ-trainingDataset5b.json
- 500 new questions added from BioASQ5
- 48 pairs of questions with identical bodies have been merged into one
question having only one question-id, but all the documents, snippets,
concepts, RDF triples and answers of both questions of the pair.
- This normalization lead to the removal of 48 deprecated question
ids [2] from the dataset and to the update of the 48 remaining
questions [3].
- In cases where a pair of questions with identical bodies had some
inconsistency (e.g. different question type), the inconsistency has
been solved merging the pair manually consulting the BioASQ expert team.
- 12 questions were revised for various confusions that have been
identified
- In 8 questions the question type has been changed to better suit to
the question body. The change of type lead to corresponding changes
in exact answers existence and format : 54fc4e2e6ea36a810c000003,
530b01a6970c65fa6b000008, 530cf54dab4de4de0c000009,
531b2fc3b166e2b80600003c, 532819afd6d3ac6a3400000f,
532aad53d6d3ac6a34000010, 5710ade4cf1c32585100002c,
52f65f372059c6d71c000027
- In 6 questions the ideal answer has been revised :
532aad53d6d3ac6a34000010, 5710ade4cf1c32585100002c,
53147b52e3eabad021000015, 5147c8a6d24251bc05000027,
5509bd6a1180f13250000002, 58bbb71f22d3005309000016
- In 5 questions the exact answer has been revised :
5314bd7ddae131f847000006, 53130a77e3eabad02100000f,
53148a07dae131f847000002, 53147b52e3eabad021000015,
5147c8a6d24251bc05000027
- In 2 questions the question body has been revised :
52f65f372059c6d71c000027, 5503145ee9bde69634000022
- In lists of ideal answers, documents, snippets, concepts and RDF triples
any duplicate identical elements have been removed.
- Ideal answers in format of one string have been converted to a list with
one element for consistency with cases where more than one golden ideal
answers are available. (i.e. "ideal_ans1" converted to ["ideal_ans1"])
- For yesno questions: All exact answers have been normalized to "yes" or
"no" (replacing "Yes", "YES" and "No")
- For factoid questions: The format of the exact answer was normalized to a
list of strings for each question, representing a set of synonyms
answering the question (i.e. [`ans1`, `syn11`, ... ]).
- For list questions: The format of the exact answer was normalized to a
list of lists. Each internal list represents one element of the answer
as a set of synonyms
(i.e. [[`ans1`, `syn11`, `syn12`], [`ans2`], [`ans3`, `syn31`] ...]).
- Empty elements, e.g. empty lists of documents have been removed.
[1] 2251 questions : 619 factoid, 616 yesno, 531 summary, 485 list
[2] The 48 deprecated question ids are : 52f8b2902059c6d71c000053,
52f11bf22059c6d71c000005, 52f77edb2059c6d71c000028, 52ed795098d0239505000032,
56d1a9baab2fed4a47000002, 52f7d3472059c6d71c00002f, 52fbe2bf2059c6d71c00006c,
52ec961098d023950500002a, 52e8e98298d0239505000020, 56cae5125795f9a73e000024,
530cefaaad0bf1360c000007, 530cefaaad0bf1360c000005, 52d63b2803868f1b0600003a,
530cefaaad0bf1360c00000a, 516425ff298dcd4e51000051, 55191149622b194345000010,
52fa70142059c6d71c000056, 52f77f4d2059c6d71c00002a, 52efc016c8da89891000001a,
52efc001c8da898910000019, 52f896ae2059c6d71c000045, 52eceada98d023950500002d,
52efc05cc8da89891000001c, 515e078e298dcd4e51000031, 52fe54252059c6d71c000079,
514217a6d24251bc05000005, 52d1389303868f1b06000032, 530cf4d5e2bfff940c000003,
52fc946d2059c6d71c000071, 52e8e99e98d0239505000021, 52ef7786c8da898910000015,
52d8494698d0239505000007, 530cf51d5610acba0c000001, 52f637972059c6d71c000025,
52e9f99798d0239505000025, 515de572298dcd4e51000021, 52fe4ad52059c6d71c000077,
52f65bf02059c6d71c000026, 52e8e9d298d0239505000022, 52fa74052059c6d71c00005a,
52ffbddf2059c6d71c00007d, 56bc932aac7ad1001900001c, 56c02883ef6e394741000017,
52d2b75403868f1b06000035, 52f118aa2059c6d71c000003, 52e929eb98d0239505000023,
532c12f2d6d3ac6a3400001d, 52d8466298d0239505000006'
[3] The 48 questions resulting from merging with their pair have the
following ids: 5149aafcd24251bc05000045, 515db020298dcd4e51000011,
515db54c298dcd4e51000016, 51680a49298dcd4e51000062, 52b06a68f828ad283c000005,
52bf1aa503868f1b06000006, 52bf1af803868f1b06000008, 52bf1d6003868f1b0600000e,
52cb9b9b03868f1b0600002d, 52d2818403868f1b06000033, 52df887498d023950500000c,
52e0c9a298d0239505000010, 52e203bc98d0239505000011, 52e62bae98d0239505000015,
52e6c92598d0239505000019, 52e7bbf698d023950500001d, 52ea605098d0239505000028,
52ece29f98d023950500002c, 52ecf2dd98d023950500002e, 52ef7754c8da898910000014,
52f112bb2059c6d71c000002, 52f65f372059c6d71c000027, 52f77f752059c6d71c00002b,
52f77f892059c6d71c00002c, 52f89ee42059c6d71c00004d, 52f89f4f2059c6d71c00004e,
52f89fba2059c6d71c00004f, 52f89fc62059c6d71c000050, 52f89fd32059c6d71c000051,
52fa6ac72059c6d71c000055, 52fa73c62059c6d71c000058, 52fa73e82059c6d71c000059,
52fa74252059c6d71c00005b, 52fc8b772059c6d71c00006e, 52fc94572059c6d71c000070,
52fc94ae2059c6d71c000073, 52fc94db2059c6d71c000074, 52fe52702059c6d71c000078,
52fe58f82059c6d71c00007a, 530cefaaad0bf1360c000008, 530cefaaad0bf1360c000010,
533ba218fd9a95ea0d000007, 534bb147aeec6fbd07000014, 55167dec46478f2f2c00000a,
56c04412ef6e39474100001b, 56c1f01eef6e394741000046, 56c81fd15795f9a73e00000c,
587d016ed673c3eb14000002
"""
_BIOASQ_5B_DESCRIPTION = """\
The data are intended to be used as training and development data for BioASQ 5,
which will take place during 2017. There is one file containing the data:
- BioASQ-trainingDataset5b.json
The file contains the data of the first four editions of the challenge: 1799
questions with their relevant documents, snippets, concepts and rdf triples,
exact and ideal answers.
"""
_BIOASQ_4B_DESCRIPTION = """\
The data are intended to be used as training and development data for BioASQ 4,
which will take place during 2016. There is one file containing the data:
- BioASQ-trainingDataset4b.json
The file contains the data of the first three editions of the challenge: 1307
questions with their relevant documents, snippets, concepts and rdf triples,
exact and ideal answers from the first two editions and 497 questions with
similar annotations from the third editions of the challenge.
"""
_BIOASQ_3B_DESCRIPTION = """No README provided."""
_BIOASQ_2B_DESCRIPTION = """No README provided."""
_BIOASQ_BLURB_DESCRIPTION = """The BioASQ corpus contains multiple question
answering tasks annotated by biomedical experts, including yes/no, factoid, list,
and summary questions. Pertaining to our objective of comparing neural language
models, we focus on the the yes/no questions (Task 7b), and leave the inclusion
of other tasks to future work. Each question is paired with a reference text
containing multiple sentences from a PubMed abstract and a yes/no answer. We use
the official train/dev/test split of 670/75/140 questions.
See 'Domain-Specific Language Model Pretraining for Biomedical
Natural Language Processing' """
_DESCRIPTION = {
"bioasq_10b": _BIOASQ_10B_DESCRIPTION,
"bioasq_9b": _BIOASQ_9B_DESCRIPTION,
"bioasq_8b": _BIOASQ_8B_DESCRIPTION,
"bioasq_7b": _BIOASQ_7B_DESCRIPTION,
"bioasq_6b": _BIOASQ_6B_DESCRIPTION,
"bioasq_5b": _BIOASQ_5B_DESCRIPTION,
"bioasq_4b": _BIOASQ_4B_DESCRIPTION,
"bioasq_3b": _BIOASQ_3B_DESCRIPTION,
"bioasq_2b": _BIOASQ_2B_DESCRIPTION,
"bioasq_blurb": _BIOASQ_BLURB_DESCRIPTION,
}
_HOMEPAGE = "http://participants-area.bioasq.org/datasets/"
# Data access reqires registering with BioASQ.
# See http://participants-area.bioasq.org/accounts/register/
_LICENSE = 'National Library of Medicine Terms and Conditions'
_URLs = {
"bioasq_10b": ["BioASQ-training10b.zip", None],
"bioasq_9b": ["BioASQ-training9b.zip", "Task9BGoldenEnriched.zip"],
"bioasq_8b": ["BioASQ-training8b.zip", "Task8BGoldenEnriched.zip"],
"bioasq_7b": ["BioASQ-training7b.zip", "Task7BGoldenEnriched.zip"],
"bioasq_6b": ["BioASQ-training6b.zip", "Task6BGoldenEnriched.zip"],
"bioasq_5b": ["BioASQ-training5b.zip", "Task5BGoldenEnriched.zip"],
"bioasq_4b": ["BioASQ-training4b.zip", "Task4BGoldenEnriched.zip"],
"bioasq_3b": ["BioASQ-trainingDataset3b.zip", "Task3BGoldenEnriched.zip"],
"bioasq_2b": ["BioASQ-trainingDataset2b.zip", "Task2BGoldenEnriched.zip"],
"bioasq_blurb": ["BioASQ-training7b.zip", "Task7BGoldenEnriched.zip"],
}
# BLURB train and dev contain all yesno questions from the offical training split
# test is all yesno question from the official test split
_BLURB_SPLITS = {
"dev": {
"5313b049e3eabad021000013",
"553a8d78f321868558000003",
"5158a5b8d24251bc05000097",
"571e3d42bb137a4b0c000007",
"5175b97a8ed59a060a00002f",
"56c9e9d15795f9a73e00001d",
"56d19ffaab2fed4a47000001",
"518ccac0310faafe0800000b",
"56f12ca92ac5ed145900000e",
"51680a49298dcd4e51000062",
"5339ed7bd6d3ac6a34000060",
"516e5f33298dcd4e5100007e",
"5327139ad6d3ac6a3400000d",
"54e12ae3ae9738404b000004",
"5321b8579b2d7acc7e000008",
"514a4679d24251bc0500005b",
"54c12fd1f693c3b16b000001",
"52df887498d023950500000c",
"52f20d802059c6d71c00000a",
"532f0c4ed6d3ac6a3400002e",
"52b2f3b74003448f5500000c",
"52b2f1724003448f5500000b",
"515d9a42298dcd4e5100000d",
"5159b990d24251bc050000a3",
"54e12c30ae9738404b000005",
"553a6a9fbc4f83e82800001c",
"5509ec41c2af5d5b70000006",
"56cae40b5795f9a73e000022",
"51680b0e298dcd4e51000065",
"515df89e298dcd4e5100002f",
"54f49e56d0d681a040000004",
"571e3e2abb137a4b0c000008",
"515debe7298dcd4e51000026",
"56f6ab7009dd18d46b00000d",
"53302bced6d3ac6a34000039",
"5322de919b2d7acc7e000012",
"5709f212cf1c325851000020",
"5502abd1e9bde69634000008",
"516c220e298dcd4e51000071",
"5894597e7d9090f353000004",
"5895ec5e7d9090f353000015",
"58bbb8ae22d3005309000018",
"58bc58c302b8c60953000001",
"58c276bc02b8c60953000020",
"58c0825502b8c6095300001b",
"58ab1f6c9ef3c34033000002",
"58adbe999ef3c34033000005",
"58df3e408acda3452900002d",
"58dfec676fddd3e83e000006",
"58d8d0cc8acda34529000008",
"58b67fae22d3005309000009",
"58dbbbf08acda3452900001d",
"58dbba438acda3452900001c",
"58dbbdac8acda3452900001e",
"58dcbb8c8acda34529000021",
"5a468785966455904c00000d",
"5a70de5199e2c3af26000005",
"5a67a550b750ff4455000009",
"5a679875b750ff4455000004",
"5a7a44b4faa1ab7d2e000010",
"5a67ade5b750ff445500000c",
"5a8881118cb19eca6b000006",
"5a67b48cb750ff4455000010",
"5a679be1b750ff4455000005",
"5a7340962dc08e987e000017",
"5a737e233b9d13c70800000d",
"5a8dc57ffcd1d6a10c000025",
"5a6d186db750ff4455000031",
"5a70d43b99e2c3af26000003",
"5a70ec6899e2c3af2600000c",
"5a9ac4161d1251d03b000010",
"5a733d2a2dc08e987e000015",
"5a74acd80384be9551000006",
"5aa6800ad6d6b54f79000011",
"5a9d9ab94e03427e73000003",
}
}
_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]
_SOURCE_VERSION = "1.0.0"
_BIGBIO_VERSION = "1.0.0"
class BioasqTaskBDataset(datasets.GeneratorBasedBuilder):
"""
BioASQ Task B On Biomedical Semantic QA.
Creates configs for BioASQ2 through BioASQ10.
"""
DEFAULT_CONFIG_NAME = "bioasq_9b_source"
SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)
# BioASQ2 through BioASQ10
BUILDER_CONFIGS = []
for version in range(2, 11):
BUILDER_CONFIGS.append(
BigBioConfig(
name=f"bioasq_{version}b_source",
version=SOURCE_VERSION,
description=f"bioasq{version} Task B source schema",
schema="source",
subset_id=f"bioasq_{version}b",
)
)
BUILDER_CONFIGS.append(
BigBioConfig(
name=f"bioasq_{version}b_bigbio_qa",
version=BIGBIO_VERSION,
description=f"bioasq{version} Task B in simplified BigBio schema",
schema="bigbio_qa",
subset_id=f"bioasq_{version}b",
)
)
# BLURB Benchmark config https://microsoft.github.io/BLURB/
BUILDER_CONFIGS.append(
BigBioConfig(
name=f"bioasq_blurb_bigbio_qa",
version=BIGBIO_VERSION,
description=f"BLURB benchmark in simplified BigBio schema",
schema="bigbio_qa",
subset_id=f"bioasq_blurb",
)
)
def _info(self):
# BioASQ Task B source schema
if self.config.schema == "source":
features = datasets.Features(
{
"id": datasets.Value("string"),
"type": datasets.Value("string"),
"body": datasets.Value("string"),
"documents": datasets.Sequence(datasets.Value("string")),
"concepts": datasets.Sequence(datasets.Value("string")),
"ideal_answer": datasets.Sequence(datasets.Value("string")),
"exact_answer": datasets.Sequence(datasets.Value("string")),
"triples": [
{
"p": datasets.Value("string"),
"s": datasets.Value("string"),
"o": datasets.Value("string"),
}
],
"snippets": [
{
"offsetInBeginSection": datasets.Value("int32"),
"offsetInEndSection": datasets.Value("int32"),
"text": datasets.Value("string"),
"beginSection": datasets.Value("string"),
"endSection": datasets.Value("string"),
"document": datasets.Value("string"),
}
],
}
)
# simplified schema for QA tasks
elif self.config.schema == "bigbio_qa":
features = qa_features
return datasets.DatasetInfo(
description=_DESCRIPTION[self.config.subset_id],
features=features,
supervised_keys=None,
homepage=_HOMEPAGE,
license=str(_LICENSE),
citation=_CITATION,
)
def _dump_gold_json(self, data_dir):
"""
BioASQ test data is split into multiple records {9B1_golden.json,...,9B5_golden.json}
We combine these files into a single test set file 9Bx_golden.json
"""
# BLURB is based on version 7
version = (
re.search(r"bioasq_([0-9]+)b", self.config.subset_id).group(1)
if "blurb" not in self.config.name
else "7"
)
gold_fpath = os.path.join(
data_dir, f"Task{version}BGoldenEnriched/bx_golden.json"
)
if not os.path.exists(gold_fpath):
# combine all gold json files
filelist = glob.glob(os.path.join(data_dir, "*/*.json"))
data = {"questions": []}
for fname in sorted(filelist):
with open(fname, "rt", encoding="utf-8") as file:
data["questions"].extend(json.load(file)["questions"])
# dump gold to json
with open(gold_fpath, "wt", encoding="utf-8") as file:
json.dump(data, file, indent=2)
return f"Task{version}BGoldenEnriched/bx_golden.json"
def _blurb_split_generator(self, train_dir, test_dir):
"""
Create splits for BLURB Benchmark
"""
gold_fpath = self._dump_gold_json(test_dir)
# create train/dev splits from yesno questions
train_fpath = os.path.join(train_dir, "blurb_bioasq_train.json")
dev_fpath = os.path.join(train_dir, "blurb_bioasq_dev.json")
blurb_splits = {
"train": {"questions": []},
"dev": {"questions": []},
"test": {"questions": []},
}
if not os.path.exists(train_fpath):
data_fpath = os.path.join(train_dir, "BioASQ-training7b/trainining7b.json")
with open(data_fpath, "rt", encoding="utf-8") as file:
data = json.load(file)
for record in data["questions"]:
if record["type"] != "yesno":
continue
if record["id"] in _BLURB_SPLITS["dev"]:
blurb_splits["dev"]["questions"].append(record)
else:
blurb_splits["train"]["questions"].append(record)
with open(train_fpath, "wt", encoding="utf-8") as file:
json.dump(blurb_splits["train"], file, indent=2)
with open(dev_fpath, "wt", encoding="utf-8") as file:
json.dump(blurb_splits["dev"], file, indent=2)
# create test split from yesno questions
with open(os.path.join(test_dir, gold_fpath), "rt", encoding="utf-8") as file:
data = json.load(file)
for record in data["questions"]:
if record["type"] != "yesno":
continue
blurb_splits["test"]["questions"].append(record)
test_fpath = os.path.join(test_dir, "blurb_bioasq_test.json")
with open(test_fpath, "wt", encoding="utf-8") as file:
json.dump(blurb_splits["test"], file, indent=2)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": train_fpath,
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={
"filepath": dev_fpath,
"split": "dev",
},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"filepath": test_fpath,
"split": "test",
},
),
]
def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
if self.config.data_dir is None:
raise ValueError(
"This is a local dataset. Please pass the data_dir kwarg to load_dataset."
)
train_dir, test_dir = dl_manager.download_and_extract(
[
os.path.join(self.config.data_dir, _url)
for _url in _URLs[self.config.subset_id]
]
)
# create gold dump and get path
gold_fpath = self._dump_gold_json(test_dir)
# older versions of bioasq have different folder formats
train_fpaths = {
"bioasq_2b": "BioASQ_2013_TaskB/BioASQ-trainingDataset2b.json",
"bioasq_3b": "BioASQ-trainingDataset3b.json",
"bioasq_4b": "BioASQ-training4b/BioASQ-trainingDataset4b.json",
"bioasq_5b": "BioASQ-training5b/BioASQ-trainingDataset5b.json",
"bioasq_6b": "BioASQ-training6b/BioASQ-trainingDataset6b.json",
"bioasq_7b": "BioASQ-training7b/trainining7b.json",
"bioasq_8b": "training8b.json", # HACK - this zipfile strips the dirname
"bioasq_9b": "BioASQ-training9b/training9b.json",
"bioasq_10b": "BioASQ-training10b/training10b.json",
}
# BLURB has custom train/dev/test splits based on Task 7B
if "blurb" in self.config.name:
return self._blurb_split_generator(train_dir, test_dir)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": os.path.join(
train_dir, train_fpaths[self.config.subset_id]
),
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"filepath": os.path.join(test_dir, gold_fpath),
"split": "test",
},
),
]
def _get_exact_answer(self, record):
"""The value exact_answer can be in different formats based on question type."""
if record["type"] == "yesno":
exact_answer = [record["exact_answer"]]
elif record["type"] == "summary":
exact_answer = []
# summary question types only have an ideal answer, so use that for bigbio
if self.config.schema == "bigbio_qa":
exact_answer = (
record["ideal_answer"]
if isinstance(record["ideal_answer"], list)
else [record["ideal_answer"]]
)
elif record["type"] == "list":
exact_answer = record["exact_answer"]
elif record["type"] == "factoid":
# older version of bioasq sometimes represent this as as string
exact_answer = (
record["exact_answer"]
if isinstance(record["exact_answer"], list)
else [record["exact_answer"]]
)
return exact_answer
def _generate_examples(self, filepath, split):
"""Yields examples as (key, example) tuples."""
if self.config.schema == "source":
with open(filepath, encoding="utf-8") as file:
data = json.load(file)
for i, record in enumerate(data["questions"]):
yield i, {
"id": record["id"],
"type": record["type"],
"body": record["body"],
"documents": record["documents"],
"concepts": record["concepts"] if "concepts" in record else [],
"triples": record["triples"] if "triples" in record else [],
"ideal_answer": record["ideal_answer"]
if isinstance(record["ideal_answer"], list)
else [record["ideal_answer"]],
"exact_answer": self._get_exact_answer(record),
"snippets": record["snippets"] if "snippets" in record else [],
}
elif self.config.schema == "bigbio_qa":
# NOTE: Years 2014-2016 (BioASQ2-BioASQ4) have duplicate records
cache = set()
with open(filepath, encoding="utf-8") as file:
uid = 0
data = json.load(file)
for record in data["questions"]:
# for questions that do not have snippets, skip
if "snippets" not in record:
continue
for i, snippet in enumerate(record["snippets"]):
key = f'{record["id"]}_{i}'
# ignore duplicate records
if key not in cache:
cache.add(key)
yield uid, {
"id": key,
"document_id": snippet["document"],
"question_id": record["id"],
"question": record["body"],
"type": record["type"],
"choices": [],
"context": snippet["text"],
"answer": self._get_exact_answer(record),
}
uid += 1