Closes #501: BEAR Dataset (#511)
* Add dataset loader for BEAR Dataset.

* Add description.

* Remove unwanted code and use jsonlines instead of json.

* Change imports to use new bigbio module.

* feat: Add BEAR dataset

* fix: Fix license name in README.md

---------

Co-authored-by: Mario Sänger <[email protected]>
karthikrangasai and Mario Sänger authored Oct 20, 2024
1 parent 9da7283 commit c477a40
Showing 3 changed files with 921 additions and 0 deletions.
47 changes: 47 additions & 0 deletions bigbio/hub/hub_repos/bear/README.md
@@ -0,0 +1,47 @@

---
language:
- en
bigbio_language:
- English
license: cc-by-sa-4.0
multilinguality: monolingual
bigbio_license_shortname: CC_BY_SA_4p0
pretty_name: BEAR
homepage: https://www.ims.uni-stuttgart.de/en/research/resources/corpora/bioclaim/
bigbio_pubmed: False
bigbio_public: True
bigbio_tasks:
- NAMED_ENTITY_RECOGNITION
- RELATION_EXTRACTION
---


# Dataset Card for BEAR

## Dataset Description

- **Homepage:** https://www.ims.uni-stuttgart.de/en/research/resources/corpora/bioclaim/
- **Pubmed:** False
- **Public:** True
- **Tasks:** NER, RE


A dataset of 2100 Twitter posts annotated with 14 different types of biomedical entities (e.g., disease, treatment,
risk factor, etc.) and 20 relation types (including caused, treated, worsens, etc.).
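
The loader defines two configurations, `bear_source` (the original field layout) and `bear_bigbio_kb` (the harmonized BigBio knowledge-base schema); see `bear.py` below. A minimal loading sketch, assuming the dataset is published as `bigbio/bear` on the Hugging Face Hub (the exact repo path may differ); `trust_remote_code=True` is needed on newer versions of `datasets` because the dataset is built by a loading script:

```python
from datasets import load_dataset

# Original (source) field layout
bear_source = load_dataset("bigbio/bear", name="bear_source", trust_remote_code=True)

# Harmonized BigBio knowledge-base schema
bear_kb = load_dataset("bigbio/bear", name="bear_bigbio_kb", trust_remote_code=True)

print(bear_source["train"][0]["document_id"])
```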



## Citation Information

```
@InProceedings{wuehrl_klinger_2022,
author = {Wuehrl, Amelie and Klinger, Roman},
title = {Recovering Patient Journeys: A Corpus of Biomedical Entities and Relations on Twitter (BEAR)},
booktitle = {Proceedings of The 13th Language Resources and Evaluation Conference},
month = {June},
year = {2022},
address = {Marseille, France},
publisher = {European Language Resources Association}
}
```
282 changes: 282 additions & 0 deletions bigbio/hub/hub_repos/bear/bear.py
@@ -0,0 +1,282 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Dict, Iterator, List, Tuple, Union

import datasets
import jsonlines

from .bigbiohub import kb_features, BigBioConfig, Tasks

_CITATION = """\
@InProceedings{wuehrl_klinger_2022,
author = {Wuehrl, Amelie and Klinger, Roman},
title = {Recovering Patient Journeys: A Corpus of Biomedical Entities and Relations on Twitter (BEAR)},
booktitle = {Proceedings of The 13th Language Resources and Evaluation Conference},
month = {June},
year = {2022},
address = {Marseille, France},
publisher = {European Language Resources Association}
}
"""

_DATASETNAME = "bear"
_DISPLAYNAME = "BEAR"

_LANGUAGES = ['English']
_PUBMED = False
_LOCAL = False
_LICENSE = "CC_BY_SA_4p0"

_DESCRIPTION = """\
A dataset of 2100 Twitter posts annotated with 14 different types of biomedical entities (e.g., disease, treatment,
risk factor, etc.) and 20 relation types (including caused, treated, worsens, etc.).
"""
_HOMEPAGE = "https://www.ims.uni-stuttgart.de/en/research/resources/corpora/bioclaim/"

_URLS = {
    _DATASETNAME: (
        "https://www.ims.uni-stuttgart.de/documents/ressourcen/korpora/bioclaim/"
        "bear-corpus-WuehrlKlinger-LREC2022.zip"
    ),
}

_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]

_SOURCE_VERSION = "1.0.0"
_BIGBIO_VERSION = "1.0.0"


class BearDataset(datasets.GeneratorBasedBuilder):
"""
BEAR: A Corpus of Biomedical Entities and Relations
A dataset of 2100 Twitter posts annotated with 14 different types of
biomedical entities (e.g., disease, treatment, risk factor, etc.)
and 20 relation types (including caused, treated, worsens, etc.).
"""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)

BUILDER_CONFIGS = [
BigBioConfig(
name="bear_source",
version=SOURCE_VERSION,
description="bear source schema",
schema="source",
subset_id="bear",
),
BigBioConfig(
name="bear_bigbio_kb",
version=BIGBIO_VERSION,
description="bear BigBio schema",
schema="bigbio_kb",
subset_id="bear",
),
]

DEFAULT_CONFIG_NAME = "bear_source"

def _info(self) -> datasets.DatasetInfo:

if self.config.schema == "source":
features = datasets.Features(
{
"document_id": datasets.Value("string"),
"document_text": datasets.Value("string"),
"entities": [
{
"id": datasets.Value("string"),
"type": datasets.Value("string"),
"text": datasets.Value("string"),
"offsets": datasets.Sequence(datasets.Value("int32")),
}
],
"relations": [
{
"id": datasets.Value("string"),
"type": datasets.Value("string"),
"arg1_id": datasets.Value("string"),
"arg2_id": datasets.Value("string"),
}
],
}
)
        elif self.config.schema == "bigbio_kb":
            features = kb_features
        else:
            raise NotImplementedError(f"Schema '{self.config.schema}' is not supported")

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
"""Returns SplitGenerators."""
urls = _URLS[_DATASETNAME]
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": Path(data_dir) / "corpus" / "bear.jsonl",
},
),
]

    def _generate_examples(self, filepath: Path) -> Iterator[Tuple[int, Dict]]:
"""Yields examples as (key, example) tuples."""
uid = 0
input_file = filepath
with jsonlines.open(input_file, "r") as file:
for document in file:
document_id: str = document.pop("doc_id")
document_text: str = document.pop("doc_text")
entities: Dict[str, Dict[str, Union[str, int]]] = document.pop("entities", {})
relations: List[Dict[str, Union[str, int]]] = document.pop("relations", [])

if not entities and not relations:
continue

if self.config.schema == "source":
source_example = self._to_source_example(
document_id=document_id,
document_text=document_text,
entities=entities,
relations=relations,
)
yield uid, source_example
elif self.config.schema == "bigbio_kb":
bigbio_example = self._to_bigbio_example(
document_id=document_id,
document_text=document_text,
entities=entities,
relations=relations,
)
yield uid, bigbio_example

uid += 1

def _to_source_example(
self,
document_id: str,
document_text: str,
entities: Dict[str, Dict[str, Union[str, int]]],
relations: List[Dict[str, Union[str, int]]],
) -> Dict:
source_example = {
"document_id": document_id,
"document_text": document_text,
}

# Capture Entities
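        # Annotations are keyed by their entity id; "begin"/"end" are character
        # offsets into doc_text, so the surface string can be sliced back out.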
_entities = []
        for entity_id, entity_values in entities.items():
            if not entity_values:
                continue
            start = entity_values.pop("begin")
            end = entity_values.pop("end")
            entity_type = entity_values.pop("tag")
            text = document_text[start:end]

            entity = {
                "id": f"{document_id}_{entity_id}",
                "type": entity_type,
                "text": text,
                "offsets": [start, end],
            }
            _entities.append(entity)
source_example["entities"] = _entities

# Capture Relations
_relations = []
        for rel_idx, relation_values in enumerate(relations):
            end_entity = relation_values.pop("end_entity")
            rel_tag = relation_values.pop("rel_tag")
            start_entity = relation_values.pop("start_entity")

            relation = {
                "id": f"{document_id}_relation_{rel_idx}",
"type": rel_tag,
"arg1_id": f"{document_id}_{start_entity}",
"arg2_id": f"{document_id}_{end_entity}",
}
_relations.append(relation)
source_example["relations"] = _relations

return source_example

def _to_bigbio_example(
self,
document_id: str,
document_text: str,
entities: Dict[str, Dict[str, Union[str, int]]],
relations: List[Dict[str, Union[str, int]]],
) -> Dict:
bigbio_example = {
"id": f"{document_id}_id",
"document_id": document_id,
"passages": [
{
"id": f"{document_id}_passage",
"type": "social_media_text",
"text": [document_text],
"offsets": [[0, len(document_text)]],
}
],
"events": [],
"coreferences": [],
}

# Capture Entities
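        # The raw data carries no knowledge-base grounding, which is why the
        # "normalized" lists below stay empty.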
_entities = []
        for entity_id, entity_values in entities.items():
            if not entity_values:
                continue
            start = entity_values.pop("begin")
            end = entity_values.pop("end")
            entity_type = entity_values.pop("tag")
            text = document_text[start:end]

            entity = {
                "id": f"{document_id}_{entity_id}",
                "type": entity_type,
"text": [text],
"offsets": [[start, end]],
"normalized": [],
}
_entities.append(entity)
bigbio_example["entities"] = _entities

# Capture Relations
_relations = []
        for rel_idx, relation_values in enumerate(relations):
            end_entity = relation_values.pop("end_entity")
            rel_tag = relation_values.pop("rel_tag")
            start_entity = relation_values.pop("start_entity")

            relation = {
                "id": f"{document_id}_relation_{rel_idx}",
"type": rel_tag,
"arg1_id": f"{document_id}_{start_entity}",
"arg2_id": f"{document_id}_{end_entity}",
"normalized": [],
}
_relations.append(relation)
bigbio_example["relations"] = _relations

return bigbio_example