* Add dataset loader for BEAR Dataset.
* Add description.
* Remove unwanted code and use jsonlines instead of json.
* Change imports to use new bigbio module.
* feat: Add BEAR dataset
* fix: Fix license name in README.md

Co-authored-by: Mario Sänger <[email protected]>
Commit c477a40 (1 parent 9da7283)
Showing 3 changed files with 921 additions and 0 deletions.
@@ -0,0 +1,47 @@
---
language:
- en
bigbio_language:
- English
license: cc-by-sa-4.0
multilinguality: monolingual
bigbio_license_shortname: CC_BY_SA_4p0
pretty_name: BEAR
homepage: https://www.ims.uni-stuttgart.de/en/research/resources/corpora/bioclaim/
bigbio_pubmed: False
bigbio_public: True
bigbio_tasks:
- NAMED_ENTITY_RECOGNITION
- RELATION_EXTRACTION
---

# Dataset Card for BEAR

## Dataset Description

- **Homepage:** https://www.ims.uni-stuttgart.de/en/research/resources/corpora/bioclaim/
- **Pubmed:** False
- **Public:** True
- **Tasks:** NER, RE

A dataset of 2100 Twitter posts annotated with 14 different types of biomedical entities (e.g., disease, treatment,
risk factor) and 20 relation types (e.g., caused, treated, worsens).
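
The loader provides two configurations, `bear_source` and `bear_bigbio_kb`. A minimal loading sketch (the `bigbio/bear` Hub path is an assumption, not part of this commit; point `load_dataset` at a local copy of the loader script if needed):

```python
from datasets import load_dataset

# Original (source) schema, the default configuration.
bear_source = load_dataset("bigbio/bear", name="bear_source")

# Harmonized BigBio knowledge-base schema (entities and relations).
bear_kb = load_dataset("bigbio/bear", name="bear_bigbio_kb")

print(bear_source["train"][0]["document_id"])
```
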
## Citation Information

```
@InProceedings{wuehrl_klinger_2022,
  author    = {Wuehrl, Amelie and Klinger, Roman},
  title     = {Recovering Patient Journeys: A Corpus of Biomedical Entities and Relations on Twitter (BEAR)},
  booktitle = {Proceedings of The 13th Language Resources and Evaluation Conference},
  month     = {June},
  year      = {2022},
  address   = {Marseille, France},
  publisher = {European Language Resources Association}
}
```
@@ -0,0 +1,282 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Dict, Iterator, List, Tuple, Union

import datasets
import jsonlines

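# bigbiohub.py is expected to sit next to this loader script; it provides the
# shared BigBio feature schema (kb_features), the BigBioConfig class, and the
# Tasks enum used below.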
from .bigbiohub import kb_features, BigBioConfig, Tasks

_CITATION = """\
@InProceedings{wuehrl_klinger_2022,
  author    = {Wuehrl, Amelie and Klinger, Roman},
  title     = {Recovering Patient Journeys: A Corpus of Biomedical Entities and Relations on Twitter (BEAR)},
  booktitle = {Proceedings of The 13th Language Resources and Evaluation Conference},
  month     = {June},
  year      = {2022},
  address   = {Marseille, France},
  publisher = {European Language Resources Association}
}
"""

_DATASETNAME = "bear"
_DISPLAYNAME = "BEAR"

_LANGUAGES = ["English"]
_PUBMED = False
_LOCAL = False
_LICENSE = "CC_BY_SA_4p0"

_DESCRIPTION = """\
A dataset of 2100 Twitter posts annotated with 14 different types of biomedical entities (e.g., disease, treatment,
risk factor) and 20 relation types (e.g., caused, treated, worsens).
"""
_HOMEPAGE = "https://www.ims.uni-stuttgart.de/en/research/resources/corpora/bioclaim/"

_URLS = {
    _DATASETNAME: (
        "https://www.ims.uni-stuttgart.de/documents/ressourcen/korpora/bioclaim/"
        "bear-corpus-WuehrlKlinger-LREC2022.zip"
    ),
}

_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]

_SOURCE_VERSION = "1.0.0"
_BIGBIO_VERSION = "1.0.0"


class BearDataset(datasets.GeneratorBasedBuilder):
    """
    BEAR: A Corpus of Biomedical Entities and Relations.

    A dataset of 2100 Twitter posts annotated with 14 different types of
    biomedical entities (e.g., disease, treatment, risk factor) and
    20 relation types (e.g., caused, treated, worsens).
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)

    BUILDER_CONFIGS = [
        BigBioConfig(
            name="bear_source",
            version=SOURCE_VERSION,
            description="bear source schema",
            schema="source",
            subset_id="bear",
        ),
        BigBioConfig(
            name="bear_bigbio_kb",
            version=BIGBIO_VERSION,
            description="bear BigBio schema",
            schema="bigbio_kb",
            subset_id="bear",
        ),
    ]

    DEFAULT_CONFIG_NAME = "bear_source"

    def _info(self) -> datasets.DatasetInfo:
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "document_id": datasets.Value("string"),
                    "document_text": datasets.Value("string"),
                    "entities": [
                        {
                            "id": datasets.Value("string"),
                            "type": datasets.Value("string"),
                            "text": datasets.Value("string"),
                            "offsets": datasets.Sequence(datasets.Value("int32")),
                        }
                    ],
                    "relations": [
                        {
                            "id": datasets.Value("string"),
                            "type": datasets.Value("string"),
                            "arg1_id": datasets.Value("string"),
                            "arg2_id": datasets.Value("string"),
                        }
                    ],
                }
            )
        elif self.config.schema == "bigbio_kb":
            features = kb_features
        else:
            raise ValueError(f"Unsupported schema: {self.config.schema}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        urls = _URLS[_DATASETNAME]
        data_dir = dl_manager.download_and_extract(urls)
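        # The archive unpacks to a "corpus" directory holding a single JSONL
        # file; the loader exposes it as one train split below.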
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": Path(data_dir) / "corpus" / "bear.jsonl",
                },
            ),
        ]

    def _generate_examples(self, filepath: Path) -> Iterator[Tuple[int, Dict]]:
        """Yields examples as (key, example) tuples."""
        uid = 0
        input_file = filepath
        with jsonlines.open(input_file, "r") as file:
            for document in file:
                document_id: str = document.pop("doc_id")
                document_text: str = document.pop("doc_text")
                entities: Dict[str, Dict[str, Union[str, int]]] = document.pop("entities", {})
                relations: List[Dict[str, Union[str, int]]] = document.pop("relations", [])

                if not entities and not relations:
                    continue

                if self.config.schema == "source":
                    source_example = self._to_source_example(
                        document_id=document_id,
                        document_text=document_text,
                        entities=entities,
                        relations=relations,
                    )
                    yield uid, source_example
                elif self.config.schema == "bigbio_kb":
                    bigbio_example = self._to_bigbio_example(
                        document_id=document_id,
                        document_text=document_text,
                        entities=entities,
                        relations=relations,
                    )
                    yield uid, bigbio_example

                uid += 1

    def _to_source_example(
        self,
        document_id: str,
        document_text: str,
        entities: Dict[str, Dict[str, Union[str, int]]],
        relations: List[Dict[str, Union[str, int]]],
    ) -> Dict:
        source_example = {
            "document_id": document_id,
            "document_text": document_text,
        }

        # Capture Entities
        _entities = []
        for entity_id, entity_values in entities.items():
            if not entity_values:
                continue
            start = entity_values.pop("begin")
            end = entity_values.pop("end")
            entity_type = entity_values.pop("tag")
            text = document_text[start:end]

            entity = {
                "id": f"{document_id}_{entity_id}",
                "type": entity_type,
                "text": text,
                "offsets": [start, end],
            }
            _entities.append(entity)
        source_example["entities"] = _entities

        # Capture Relations
        _relations = []
        for relation_id, relation_values in enumerate(relations):
            end_entity = relation_values.pop("end_entity")
            rel_tag = relation_values.pop("rel_tag")
            start_entity = relation_values.pop("start_entity")

            relation = {
                "id": f"{document_id}_relation_{relation_id}",
                "type": rel_tag,
                "arg1_id": f"{document_id}_{start_entity}",
                "arg2_id": f"{document_id}_{end_entity}",
            }
            _relations.append(relation)
        source_example["relations"] = _relations

        return source_example

    def _to_bigbio_example(
        self,
        document_id: str,
        document_text: str,
        entities: Dict[str, Dict[str, Union[str, int]]],
        relations: List[Dict[str, Union[str, int]]],
    ) -> Dict:
        bigbio_example = {
            "id": f"{document_id}_id",
            "document_id": document_id,
"passages": [ | ||
{ | ||
"id": f"{document_id}_passage", | ||
"type": "social_media_text", | ||
"text": [document_text], | ||
"offsets": [[0, len(document_text)]], | ||
} | ||
], | ||
"events": [], | ||
"coreferences": [], | ||
} | ||
|
||
# Capture Entities | ||
_entities = [] | ||
for id, entity_values in entities.items(): | ||
if not entity_values: | ||
continue | ||
start = entity_values.pop("begin") | ||
end = entity_values.pop("end") | ||
type = entity_values.pop("tag") | ||
text = document_text[start:end] | ||
|
||
entity = { | ||
"id": f"{document_id}_{id}", | ||
"type": type, | ||
"text": [text], | ||
"offsets": [[start, end]], | ||
"normalized": [], | ||
} | ||
_entities.append(entity) | ||
bigbio_example["entities"] = _entities | ||
|
||
# Capture Relations | ||
_relations = [] | ||
for id, relation_values in enumerate(relations): | ||
end_entity = relation_values.pop("end_entity") | ||
rel_tag = relation_values.pop("rel_tag") | ||
start_entity = relation_values.pop("start_entity") | ||
|
||
relation = { | ||
"id": f"{document_id}_relation_{id}", | ||
"type": rel_tag, | ||
"arg1_id": f"{document_id}_{start_entity}", | ||
"arg2_id": f"{document_id}_{end_entity}", | ||
"normalized": [], | ||
} | ||
_relations.append(relation) | ||
bigbio_example["relations"] = _relations | ||
|
||
return bigbio_example |