diff --git a/flair/data.py b/flair/data.py index dd8304405..17a1a3124 100644 --- a/flair/data.py +++ b/flair/data.py @@ -1787,10 +1787,10 @@ def __init__( def __str__(self) -> str: output = ( - f"MultiCorpus: " # type: ignore[arg-type] - f"{len(self.train) if self.train else 0} train + " - f"{len(self.dev) if self.dev else 0} dev + " - f"{len(self.test) if self.test else 0} test sentences\n - " + f"MultiCorpus: " + f"{_len_dataset(self.train) if self.train else 0} train + " + f"{_len_dataset(self.dev) if self.dev else 0} dev + " + f"{_len_dataset(self.test) if self.test else 0} test sentences\n - " ) output += "\n - ".join([f"{type(corpus).__name__} {corpus!s} - {corpus.name}" for corpus in self.corpora]) return output diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py index 71b3b7c55..28f4aca98 100644 --- a/flair/datasets/biomedical.py +++ b/flair/datasets/biomedical.py @@ -96,9 +96,15 @@ def overlaps(self, other_entity) -> bool: class InternalBioNerDataset: """Internal class to represent a corpus and it's entities.""" - def __init__(self, documents: Dict[str, str], entities_per_document: Dict[str, List[Entity]]) -> None: + def __init__( + self, + documents: Dict[str, str], + entities_per_document: Dict[str, List[Entity]], + entity_types: List[str] = [], + ): self.documents = documents self.entities_per_document = entities_per_document + self.entity_types = entity_types class DpEntry(NamedTuple): @@ -111,18 +117,27 @@ class DpEntry(NamedTuple): def merge_datasets(data_sets: Iterable[InternalBioNerDataset]): all_documents = {} all_entities = {} + all_entity_types_set = set() for ds in data_sets: all_documents.update(ds.documents) all_entities.update(ds.entities_per_document) + all_entity_types_set.update(ds.entity_types) - return InternalBioNerDataset(documents=all_documents, entities_per_document=all_entities) + all_entity_types = list(all_entity_types_set) + + return InternalBioNerDataset( + documents=all_documents, + entities_per_document=all_entities, + entity_types=all_entity_types, + ) def filter_and_map_entities( dataset: InternalBioNerDataset, entity_type_to_canonical: Dict[str, str] ) -> InternalBioNerDataset: mapped_entities_per_document = {} + entity_types = list(entity_type_to_canonical.values()) for id, entities in dataset.entities_per_document.items(): new_entities = [] for entity in entities: @@ -130,11 +145,19 @@ def filter_and_map_entities( new_entity = copy(entity) new_entity.type = entity_type_to_canonical[entity.type] new_entities.append(new_entity) + elif entity.type.lower() in entity_type_to_canonical: # try lower case + new_entity = copy(entity) + new_entity.type = entity_type_to_canonical[entity.type.lower()] + new_entities.append(new_entity) else: logging.debug(f"Skip entity type {entity.type}") mapped_entities_per_document[id] = new_entities - return InternalBioNerDataset(documents=dataset.documents, entities_per_document=mapped_entities_per_document) + return InternalBioNerDataset( + documents=dataset.documents, + entities_per_document=mapped_entities_per_document, + entity_types=entity_types, + ) def filter_nested_entities(dataset: InternalBioNerDataset) -> None: @@ -337,16 +360,19 @@ def __init__( def process_dataset(self, datasets: Dict[str, InternalBioNerDataset], out_dir: Path): if "train" in datasets: - self.write_to_conll(datasets["train"], out_dir / "train.conll") + self.write_to_conll(datasets["train"], out_dir / (self.sentence_splitter.name + "_train.conll")) if "dev" in datasets: - self.write_to_conll(datasets["dev"], out_dir / 
"dev.conll") + self.write_to_conll(datasets["dev"], out_dir / (self.sentence_splitter.name + "_dev.conll")) if "test" in datasets: - self.write_to_conll(datasets["test"], out_dir / "test.conll") + self.write_to_conll(datasets["test"], out_dir / (self.sentence_splitter.name + "_test.conll")) def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path): os.makedirs(str(output_file.parent), exist_ok=True) filter_nested_entities(dataset) + # Add task description for multi-task learning + assert len(dataset.entity_types) > 0 + with output_file.open("w", encoding="utf8") as f: for document_id in Tqdm.tqdm( dataset.documents.keys(), @@ -356,6 +382,7 @@ def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path): document_text = ftfy.fix_text(dataset.documents[document_id]) document_text = re.sub(r"[\u2000-\u200B]", " ", document_text) # replace unicode space characters! document_text = document_text.replace("\xa0", " ") # replace non-break space + document_buffer = "" entities = deque( sorted( @@ -396,11 +423,13 @@ def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path): whitespace_after = "+" if flair_token.whitespace_after > 0 else "-" if len(token) > 0: - f.write(" ".join([token, tag, whitespace_after]) + "\n") + document_buffer += " ".join([token, tag, whitespace_after]) + "\n" sentence_had_tokens = True if sentence_had_tokens: - f.write("\n") + document_buffer += "\n" + + f.write(document_buffer) class HunerDataset(ColumnCorpus, ABC): @@ -421,7 +450,7 @@ def to_internal(self, data_folder: Path) -> InternalBioNerDataset: @staticmethod @abstractmethod - def split_url() -> str: + def split_url() -> Union[str, List[str]]: raise NotImplementedError def get_corpus_sentence_splitter(self) -> Optional[SentenceSplitter]: @@ -494,15 +523,22 @@ def __init__( ) def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path): - split_file = cached_path(f"{self.split_url()}.{split}", split_dir) - - with split_file.open(encoding="utf8") as f: - ids = [line.strip() for line in f if line.strip()] - ids = sorted(id_ for id_ in ids if id_ in dataset.documents) + split_urls = self.split_url() + if isinstance(split_urls, str): + split_urls = [split_urls] + split_ids_set = set() + for split_url in split_urls: + split_file = cached_path(f"{split_url}.{split}", split_dir) + with split_file.open(encoding="utf8") as f: + ids = [line.strip() for line in f if line.strip()] + ids = sorted(id_ for id_ in ids if id_ in dataset.documents) + split_ids_set.update(ids) + split_ids = sorted(split_ids_set) return InternalBioNerDataset( documents={k: dataset.documents[k] for k in ids}, - entities_per_document={k: dataset.entities_per_document[k] for k in ids}, + entities_per_document={k: dataset.entities_per_document[k] for k in split_ids}, + entity_types=dataset.entity_types, ) @@ -588,6 +624,13 @@ class HUNER_GENE_BIO_INFER(HunerDataset): """HUNER version of the BioInfer corpus containing only gene/protein annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = { + "Individual_protein": GENE_TAG, + "Gene/protein/RNA": GENE_TAG, + "Gene": GENE_TAG, + "DNA_family_or_group": GENE_TAG, + "Protein_family_or_group": GENE_TAG, + } super().__init__(*args, **kwargs) @staticmethod @@ -599,19 +642,14 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: train_data = BIO_INFER.parse_dataset(corpus_folder / "BioInfer-train.xml") test_data = BIO_INFER.parse_dataset(corpus_folder / "BioInfer-test.xml") - entity_type_mapping = { 
- "Individual_protein": GENE_TAG, - "Gene/protein/RNA": GENE_TAG, - "Gene": GENE_TAG, - "DNA_family_or_group": GENE_TAG, - "Protein_family_or_group": GENE_TAG, - } - - train_data = filter_and_map_entities(train_data, entity_type_mapping) - test_data = filter_and_map_entities(test_data, entity_type_mapping) + train_data = filter_and_map_entities(train_data, self.entity_type_mapping) + test_data = filter_and_map_entities(test_data, self.entity_type_mapping) return merge_datasets([train_data, test_data]) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + @deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class JNLPBA(ColumnCorpus): @@ -750,10 +788,11 @@ def read_file(cls, input_iob_file: Path, sentence_tag: str) -> InternalBioNerDat return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) -class HUNER_GENE_JNLPBA(HunerDataset): - """HUNER version of the JNLPBA corpus containing gene annotations.""" +class HUNER_JNLPBA(HunerDataset): + """HUNER version of the JNLPBA corpus.""" - def __init__(self, *args, **kwargs) -> None: + def __init__(self, entity_type_mapping, *args, **kwargs): + self.entity_type_mapping = entity_type_mapping super().__init__(*args, **kwargs) @staticmethod @@ -772,42 +811,42 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: sentence_separator = self.sentence_splitter.tag train_data = HunerJNLPBA.download_and_prepare_train(orig_folder, sentence_separator) - train_data = filter_and_map_entities(train_data, {"protein": GENE_TAG}) + train_data = filter_and_map_entities(train_data, self.entity_type_mapping) test_data = HunerJNLPBA.download_and_prepare_test(orig_folder, sentence_separator) - test_data = filter_and_map_entities(test_data, {"protein": GENE_TAG}) + test_data = filter_and_map_entities(test_data, self.entity_type_mapping) return merge_datasets([train_data, test_data]) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping -class HUNER_CELL_LINE_JNLPBA(HunerDataset): - """HUNER version of the JNLPBA corpus containing cell line annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) +class HUNER_GENE_JNLPBA(HUNER_JNLPBA): + """HUNER version of the JNLPBA corpus containing gene annotations.""" - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/genia" + def __init__(self, *args, **kwargs): + entity_type_mapping = {"protein": GENE_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - def get_corpus_sentence_splitter(self) -> SentenceSplitter: - return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - download_folder = data_dir / "original" - os.makedirs(str(download_folder), exist_ok=True) +class HUNER_CELL_LINE_JNLPBA(HUNER_JNLPBA): + """HUNER version of the JNLPBA corpus containing cell line annotations.""" - sentence_separator = " " - if isinstance(self.sentence_splitter, TagSentenceSplitter): - sentence_separator = self.sentence_splitter.tag + def __init__(self, *args, **kwargs): + entity_type_mapping = {"cell_line": CELL_LINE_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - train_data = HunerJNLPBA.download_and_prepare_train(download_folder, sentence_separator) - train_data = filter_and_map_entities(train_data, {"cell_line": CELL_LINE_TAG}) - 
test_data = HunerJNLPBA.download_and_prepare_test(download_folder, sentence_separator) - test_data = filter_and_map_entities(test_data, {"cell_line": CELL_LINE_TAG}) +class HUNER_ALL_JNLPBA(HUNER_JNLPBA): + """HUNER version of the JNLPBA corpus containing gene and cell line annotations.""" - return merge_datasets([train_data, test_data]) + def __init__(self, *args, **kwargs): + entity_type_mapping = { + "protein": GENE_TAG, + "cell_line": CELL_LINE_TAG, + } + super().__init__(entity_type_mapping, *args, **kwargs) class CELL_FINDER(ColumnCorpus): @@ -943,7 +982,37 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return data -@deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") +class HUNER_ALL_CELL_FINDER(HunerDataset): + """HUNER version of the CellFinder corpus containing cell line, species and gene annotations.""" + + def __init__(self, *args, **kwargs): + self.entity_type_mapping = { + "CellLine": CELL_LINE_TAG, + "Species": SPECIES_TAG, + "GeneProtein": GENE_TAG, + } + super().__init__(*args, **kwargs) + + @staticmethod + def split_url() -> List[str]: + split_urls = [ + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cellfinder_cellline", + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cellfinder_species", + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cellfinder_protein", + ] + return split_urls + + def to_internal(self, data_dir: Path) -> InternalBioNerDataset: + data = CELL_FINDER.download_and_prepare(data_dir) + + data = filter_and_map_entities(data, self.entity_type_mapping) + + return data + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + + class MIRNA(ColumnCorpus): """Original miRNA corpus. 
@@ -1067,13 +1136,15 @@ def get_mirna_subset(dataset: InternalBioNerDataset, split_url: str, split_dir: return InternalBioNerDataset( documents={k: dataset.documents[k] for k in ids}, entities_per_document={k: dataset.entities_per_document[k] for k in ids}, + entity_types=dataset.entity_types, ) -class HUNER_GENE_MIRNA(HunerDataset): - """HUNER version of the miRNA corpus containing protein / gene annotations.""" +class HUNER_MIRNA(HunerDataset): + """HUNER version of the miRNA corpus.""" - def __init__(self, *args, **kwargs) -> None: + def __init__(self, entity_type_mapping, *args, **kwargs): + self.entity_type_mapping = entity_type_mapping super().__init__(*args, **kwargs) @staticmethod @@ -1098,84 +1169,51 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: sentence_separator = self.sentence_splitter.tag train_data = MIRNA.download_and_prepare_train(download_folder, sentence_separator) - train_data = filter_and_map_entities(train_data, {"Genes/Proteins": GENE_TAG}) + train_data = filter_and_map_entities(train_data, self.entity_type_mapping) test_data = MIRNA.download_and_prepare_test(download_folder, sentence_separator) - test_data = filter_and_map_entities(test_data, {"Genes/Proteins": GENE_TAG}) + test_data = filter_and_map_entities(test_data, self.entity_type_mapping) return merge_datasets([train_data, test_data]) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping -class HUNER_SPECIES_MIRNA(HunerDataset): - """HUNER version of the miRNA corpus containing species annotations.""" - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/miRNA" - - def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path): - # In the huner split files there is no information whether a given id originates - # from the train or test file of the original corpus - so we have to adapt corpus - # splitting here - return HunerMiRNAHelper.get_mirna_subset(dataset, f"{self.split_url()}.{split}", split_dir) - - def get_corpus_sentence_splitter(self) -> SentenceSplitter: - return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - download_folder = data_dir / "original" - os.makedirs(str(download_folder), exist_ok=True) +class HUNER_GENE_MIRNA(HUNER_MIRNA): + """HUNER version of the miRNA corpus containing protein / gene annotations.""" - sentence_separator = " " - if isinstance(self.sentence_splitter, TagSentenceSplitter): - sentence_separator = self.sentence_splitter.tag + def __init__(self, *args, **kwargs): + entity_type_mapping = {"Genes/Proteins": GENE_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - train_data = MIRNA.download_and_prepare_train(download_folder, sentence_separator) - train_data = filter_and_map_entities(train_data, {"Species": SPECIES_TAG}) - test_data = MIRNA.download_and_prepare_test(download_folder, sentence_separator) - test_data = filter_and_map_entities(test_data, {"Species": SPECIES_TAG}) +class HUNER_SPECIES_MIRNA(HUNER_MIRNA): + """HUNER version of the miRNA corpus containing species annotations.""" - return merge_datasets([train_data, test_data]) + def __init__(self, *args, **kwargs): + entity_type_mapping = {"Species": SPECIES_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_DISEASE_MIRNA(HunerDataset): +class 
HUNER_DISEASE_MIRNA(HUNER_MIRNA): """HUNER version of the miRNA corpus containing disease annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/miRNA" - - def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path): - # In the huner split files there is no information whether a given id originates - # from the train or test file of the original corpus - so we have to adapt corpus - # splitting here - return HunerMiRNAHelper.get_mirna_subset(dataset, f"{self.split_url()}.{split}", split_dir) - - def get_corpus_sentence_splitter(self) -> SentenceSplitter: - return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) - - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - download_folder = data_dir / "original" - os.makedirs(str(download_folder), exist_ok=True) - - sentence_separator = " " - if isinstance(self.sentence_splitter, TagSentenceSplitter): - sentence_separator = self.sentence_splitter.tag + def __init__(self, *args, **kwargs): + entity_type_mapping = {"Diseases": DISEASE_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - train_data = MIRNA.download_and_prepare_train(download_folder, sentence_separator) - train_data = filter_and_map_entities(train_data, {"Diseases": DISEASE_TAG}) - test_data = MIRNA.download_and_prepare_test(download_folder, sentence_separator) - test_data = filter_and_map_entities(test_data, {"Diseases": DISEASE_TAG}) +class HUNER_ALL_MIRNA(HUNER_MIRNA): + """HUNER version of the miRNA corpus containing gene, species and disease annotations.""" - return merge_datasets([train_data, test_data]) + def __init__(self, *args, **kwargs): + entity_type_mapping = { + "Genes/Proteins": GENE_TAG, + "Species": SPECIES_TAG, + "Diseases": DISEASE_TAG, + } + super().__init__(entity_type_mapping, *args, **kwargs) class KaewphanCorpusHelper: @@ -1511,10 +1549,11 @@ def parse_dataset(data_dir: Path) -> InternalBioNerDataset: return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) -class HUNER_SPECIES_LOCTEXT(HunerDataset): - """HUNER version of the Loctext corpus containing species annotations.""" +class HUNER_LOCTEXT(HunerDataset): + """HUNER version of the Loctext corpus.""" - def __init__(self, *args, **kwargs) -> None: + def __init__(self, entity_type_mapping, *args, **kwargs): + self.entity_type_mapping = entity_type_mapping super().__init__(*args, **kwargs) @staticmethod @@ -1525,24 +1564,34 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: LOCTEXT.download_dataset(data_dir) dataset = LOCTEXT.parse_dataset(data_dir) - return filter_and_map_entities(dataset, {"species": SPECIES_TAG}) + return filter_and_map_entities(dataset, self.entity_type_mapping) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + + +class HUNER_SPECIES_LOCTEXT(HUNER_LOCTEXT): + """HUNER version of the Loctext corpus containing species annotations.""" + def __init__(self, *args, **kwargs): + entity_type_mapping = {"species": SPECIES_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_GENE_LOCTEXT(HunerDataset): + +class HUNER_GENE_LOCTEXT(HUNER_LOCTEXT): """HUNER version of the Loctext corpus containing protein annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) + def __init__(self, *args, **kwargs): + 
entity_type_mapping = {"protein": GENE_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/loctext" - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - LOCTEXT.download_dataset(data_dir) - dataset = LOCTEXT.parse_dataset(data_dir) +class HUNER_ALL_LOCTEXT(HUNER_LOCTEXT): + """HUNER version of the Loctext corpus containing species and protein annotations.""" - return filter_and_map_entities(dataset, {"protein": GENE_TAG}) + def __init__(self, *args, **kwargs): + entity_type_mapping = {"species": SPECIES_TAG, "protein": GENE_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) @deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") @@ -1834,13 +1883,19 @@ def download_and_parse_dataset(data_dir: Path): if document_text[start:end] != text: raise AssertionError - return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) + entity_types = [SPECIES_TAG] + return InternalBioNerDataset( + documents=documents, + entities_per_document=entities_per_document, + entity_types=entity_types, + ) class HUNER_SPECIES_LINNEAUS(HunerDataset): """HUNER version of the LINNEAUS corpus containing species annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Species": SPECIES_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -1850,6 +1905,9 @@ def split_url() -> str: def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return LINNEAUS.download_and_parse_dataset(data_dir) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + @deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class CDR(ColumnCorpus): @@ -1919,6 +1977,7 @@ class HUNER_DISEASE_CDR(HunerDataset): """HUNER version of the IEPA corpus containing disease annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Disease": DISEASE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -1932,15 +1991,19 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: dev_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_DevelopmentSet.BioC.xml") test_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TestSet.BioC.xml") all_data = merge_datasets([train_data, dev_data, test_data]) - all_data = filter_and_map_entities(all_data, {"Disease": DISEASE_TAG}) + all_data = filter_and_map_entities(all_data, self.entity_type_mapping) return all_data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class HUNER_CHEMICAL_CDR(HunerDataset): """HUNER version of the IEPA corpus containing chemical annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Chemical": CHEMICAL_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -1954,12 +2017,45 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: dev_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_DevelopmentSet.BioC.xml") test_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TestSet.BioC.xml") all_data = merge_datasets([train_data, dev_data, test_data]) - all_data = filter_and_map_entities(all_data, {"Chemical": CHEMICAL_TAG}) + all_data = filter_and_map_entities(all_data, 
self.entity_type_mapping) return all_data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + + +class HUNER_ALL_CDR(HunerDataset): + """HUNER version of the IEPA corpus containing disease and chemical annotations.""" + + def __init__(self, *args, **kwargs): + self.entity_type_mapping = {"Disease": DISEASE_TAG, "Chemical": CHEMICAL_TAG} + super().__init__(*args, **kwargs) + + @staticmethod + def split_url() -> List[str]: + split_urls = [ + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/CDRDisease", + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/CDRChem", + ] + return split_urls + + def to_internal(self, data_dir: Path) -> InternalBioNerDataset: + os.makedirs(str(data_dir), exist_ok=True) + CDR.download_dataset(data_dir) + train_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TrainingSet.BioC.xml") + dev_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_DevelopmentSet.BioC.xml") + test_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TestSet.BioC.xml") + all_data = merge_datasets([train_data, dev_data, test_data]) + + all_data = filter_and_map_entities(all_data, self.entity_type_mapping) + + return all_data + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + -@deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class VARIOME(ColumnCorpus): """Variome corpus as provided by http://corpora.informatik.hu-berlin.de/corpora/brat2bioc/hvp_bioc.xml.zip. @@ -2056,6 +2152,7 @@ class HUNER_GENE_VARIOME(HunerDataset): """HUNER version of the Variome corpus containing gene annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"gene": GENE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2066,15 +2163,19 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: os.makedirs(str(data_dir), exist_ok=True) VARIOME.download_dataset(data_dir) all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml") - all_data = filter_and_map_entities(all_data, {"gene": GENE_TAG}) + all_data = filter_and_map_entities(all_data, self.entity_type_mapping) return all_data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class HUNER_DISEASE_VARIOME(HunerDataset): """HUNER version of the Variome corpus containing disease annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Disorder": DISEASE_TAG, "disease": DISEASE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2085,15 +2186,19 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: os.makedirs(str(data_dir), exist_ok=True) VARIOME.download_dataset(data_dir) all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml") - all_data = filter_and_map_entities(all_data, {"Disorder": DISEASE_TAG, "disease": DISEASE_TAG}) + all_data = filter_and_map_entities(all_data, self.entity_type_mapping) return all_data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class HUNER_SPECIES_VARIOME(HunerDataset): """HUNER version of the Variome corpus containing species annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Living_Beings": SPECIES_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2104,12 +2209,48 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 
os.makedirs(str(data_dir), exist_ok=True) VARIOME.download_dataset(data_dir) all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml") - all_data = filter_and_map_entities(all_data, {"Living_Beings": SPECIES_TAG}) + all_data = filter_and_map_entities(all_data, self.entity_type_mapping) return all_data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + + +class HUNER_ALL_VARIOME(HunerDataset): + """HUNER version of the Variome corpus containing gene, disease and species annotations.""" + + def __init__(self, *args, **kwargs): + self.entity_type_mapping = { + "gene": GENE_TAG, + "Disorder": DISEASE_TAG, + "disease": DISEASE_TAG, + "Living_Beings": SPECIES_TAG, + } + super().__init__(*args, **kwargs) + + @staticmethod + def split_url() -> List[str]: + split_urls = [ + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/variome_gene", + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/variome_disease", + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/variome_species", + ] + return split_urls + + def to_internal(self, data_dir: Path) -> InternalBioNerDataset: + os.makedirs(str(data_dir), exist_ok=True) + VARIOME.download_dataset(data_dir) + all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml") + + all_data = filter_and_map_entities(all_data, self.entity_type_mapping) + + return all_data + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + -@deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class NCBI_DISEASE(ColumnCorpus): """Original NCBI disease corpus containing disease annotations. @@ -2237,13 +2378,19 @@ def parse_input_file(input_file: Path): documents[document_id] = document_text entities_per_document[document_id] = entities - return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) + entity_types = [DISEASE_TAG] + return InternalBioNerDataset( + documents=documents, + entities_per_document=entities_per_document, + entity_types=entity_types, + ) class HUNER_DISEASE_NCBI(HunerDataset): """HUNER version of the NCBI corpus containing disease annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Disease": DISEASE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2259,6 +2406,9 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return merge_datasets([train_data, dev_data, test_data]) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class ScaiCorpus(ColumnCorpus): """Base class to support the SCAI chemicals and disease corpora.""" @@ -2412,6 +2562,16 @@ class HUNER_CHEMICAL_SCAI(HunerDataset): """HUNER version of the SCAI chemicals corpus containing chemical annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = { + "FAMILY": CHEMICAL_TAG, + "TRIVIALVAR": CHEMICAL_TAG, + "PARTIUPAC": CHEMICAL_TAG, + "TRIVIAL": CHEMICAL_TAG, + "ABBREVIATION": CHEMICAL_TAG, + "IUPAC": CHEMICAL_TAG, + "MODIFIER": CHEMICAL_TAG, + "SUM": CHEMICAL_TAG, + } super().__init__(*args, **kwargs) @staticmethod @@ -2422,8 +2582,40 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: original_file = SCAI_CHEMICALS.perform_corpus_download(data_dir) corpus = ScaiCorpus.parse_input_file(original_file) - # Map all entities to chemicals - entity_mapping = { + return filter_and_map_entities(corpus, 
self.entity_type_mapping) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + + +class HUNER_DISEASE_SCAI(HunerDataset): + """HUNER version of the SCAI chemicals corpus containing disease annotations.""" + + def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"DISEASE": DISEASE_TAG, "ADVERSE": DISEASE_TAG} + super().__init__(*args, **kwargs) + + @staticmethod + def split_url() -> str: + return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/scai_disease" + + def to_internal(self, data_dir: Path) -> InternalBioNerDataset: + original_file = SCAI_DISEASE.perform_corpus_download(data_dir) + corpus = ScaiCorpus.parse_input_file(original_file) + + return filter_and_map_entities(corpus, self.entity_type_mapping) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + + +class HUNER_ALL_SCAI(HunerDataset): + """HUNER version of the SCAI chemicals corpus containing chemical and disease annotations.""" + + def __init__(self, *args, **kwargs): + self.entity_type_mapping = { + "DISEASE": DISEASE_TAG, + "ADVERSE": DISEASE_TAG, "FAMILY": CHEMICAL_TAG, "TRIVIALVAR": CHEMICAL_TAG, "PARTIUPAC": CHEMICAL_TAG, @@ -2433,28 +2625,24 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: "MODIFIER": CHEMICAL_TAG, "SUM": CHEMICAL_TAG, } - - return filter_and_map_entities(corpus, entity_mapping) - - -class HUNER_DISEASE_SCAI(HunerDataset): - """HUNER version of the SCAI chemicals corpus containing chemical annotations.""" - - def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/scai_disease" + def split_url() -> List[str]: + split_urls = [ + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/scai_chemicals", + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/scai_disease", + ] + return split_urls def to_internal(self, data_dir: Path) -> InternalBioNerDataset: original_file = SCAI_DISEASE.perform_corpus_download(data_dir) corpus = ScaiCorpus.parse_input_file(original_file) - # Map all entities to disease - entity_mapping = {"DISEASE": DISEASE_TAG, "ADVERSE": DISEASE_TAG} + return filter_and_map_entities(corpus, self.entity_type_mapping) - return filter_and_map_entities(corpus, entity_mapping) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping @deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") @@ -2562,6 +2750,7 @@ class HUNER_GENE_OSIRIS(HunerDataset): """HUNER version of the OSIRIS corpus containing (only) gene annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"ge": GENE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2572,8 +2761,10 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: original_file = OSIRIS.download_dataset(data_dir) corpus = OSIRIS.parse_dataset(original_file / "OSIRIScorpusv02") - entity_type_mapping = {"ge": GENE_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) + return filter_and_map_entities(corpus, self.entity_type_mapping) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping class S800(ColumnCorpus): @@ -2658,6 +2849,7 @@ class HUNER_SPECIES_S800(HunerDataset): """HUNER version of the S800 corpus containing species annotations.""" def 
__init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Species": SPECIES_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2667,10 +2859,13 @@ def split_url() -> str: def to_internal(self, data_dir: Path) -> InternalBioNerDataset: S800.download_dataset(data_dir) data = S800.parse_dataset(data_dir) - data = filter_and_map_entities(data, {"Species": SPECIES_TAG}) + data = filter_and_map_entities(data, self.entity_type_mapping) return data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class GPRO(ColumnCorpus): """Original GPRO corpus containing gene annotations. @@ -2783,13 +2978,19 @@ def parse_input_file(text_file: Path, ann_file: Path) -> InternalBioNerDataset: document_text = documents[document_id] assert columns[4] == document_text[start:end] - return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) + entity_types = [GENE_TAG] + return InternalBioNerDataset( + documents=documents, + entities_per_document=entities_per_document, + entity_types=entity_types, + ) class HUNER_GENE_GPRO(HunerDataset): """HUNER version of the GPRO corpus containing gene annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Gene": GENE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2809,6 +3010,9 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return merge_datasets([train_data, dev_data]) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class DECA(ColumnCorpus): """Original DECA corpus containing gene annotations. @@ -2892,13 +3096,19 @@ def parse_corpus(text_dir: Path, gold_file: Path) -> InternalBioNerDataset: document_text = documents[document_id] assert document_text[start:end] == columns[3] - return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) + entity_types = [GENE_TAG] + return InternalBioNerDataset( + documents=documents, + entities_per_document=entities_per_document, + entity_types=entity_types, + ) class HUNER_GENE_DECA(HunerDataset): """HUNER version of the DECA corpus containing gene annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Gene": GENE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2912,6 +3122,9 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return DECA.parse_corpus(text_dir, gold_file) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class FSU(ColumnCorpus): """Original FSU corpus containing protein and derived annotations. 
@@ -2961,6 +3174,7 @@ def download_corpus(cls, data_dir: Path) -> Path: def parse_corpus(corpus_dir: Path, sentence_separator: str) -> InternalBioNerDataset: documents = {} entities_per_document = {} + entity_types_set = set() for subcorpus in corpus_dir.iterdir(): if not subcorpus.is_dir(): @@ -3026,25 +3240,33 @@ def parse_corpus(corpus_dir: Path, sentence_separator: str) -> InternalBioNerDat entities = [] sent_offset = 0 for sent, sent_entities in zip(sentence_texts, pre_entities): - entities += [ - Entity( - (start + sent_offset, end + sent_offset), - ent_type, - ) - for (start, end, ent_type) in sent_entities - ] + for start, end, ent_type in sent_entities: + entities.append(Entity((start + sent_offset, end + sent_offset), ent_type)) + entity_types_set.add(ent_type) sent_offset += len(sent) + len(sentence_separator) documents[document_id] = document entities_per_document[document_id] = entities - return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) + entity_types = list(entity_types_set) + return InternalBioNerDataset( + documents=documents, + entities_per_document=entities_per_document, + entity_types=entity_types, + ) class HUNER_GENE_FSU(HunerDataset): """HUNER version of the FSU corpus containing (only) gene annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = { + "protein": GENE_TAG, + "protein_familiy_or_group": GENE_TAG, + "protein_complex": GENE_TAG, + "protein_variant": GENE_TAG, + "protein_enum": GENE_TAG, + } super().__init__(*args, **kwargs) @staticmethod @@ -3063,14 +3285,10 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: corpus = FSU.parse_corpus(corpus_dir, sentence_separator) - entity_type_mapping = { - "protein": GENE_TAG, - "protein_familiy_or_group": GENE_TAG, - "protein_complex": GENE_TAG, - "protein_variant": GENE_TAG, - "protein_enum": GENE_TAG, - } - return filter_and_map_entities(corpus, entity_type_mapping) + return filter_and_map_entities(corpus, self.entity_type_mapping) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping class CRAFT(ColumnCorpus): @@ -3556,6 +3774,18 @@ class HUNER_CHEMICAL_CEMP(HunerDataset): """HUNER version of the CEMP corpus containing chemical annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = { + x: CHEMICAL_TAG + for x in [ + "ABBREVIATION", + "FAMILY", + "FORMULA", + "IDENTIFIERS", + "MULTIPLE", + "SYSTEMATIC", + "TRIVIAL", + ] + } super().__init__(*args, **kwargs) @staticmethod @@ -3574,19 +3804,10 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: dev_data = CEMP.parse_input_file(dev_text_file, dev_ann_file) dataset = merge_datasets([train_data, dev_data]) - entity_type_mapping = { - x: CHEMICAL_TAG - for x in [ - "ABBREVIATION", - "FAMILY", - "FORMULA", - "IDENTIFIERS", - "MULTIPLE", - "SYSTEMATIC", - "TRIVIAL", - ] - } - return filter_and_map_entities(dataset, entity_type_mapping) + return filter_and_map_entities(dataset, self.entity_type_mapping) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") @@ -3708,10 +3929,11 @@ def get_entities(f): return entities -class HUNER_CHEMICAL_CHEBI(HunerDataset): - """HUNER version of the CHEBI corpus containing chemical annotations.""" +class HUNER_CHEBI(HunerDataset): + """HUNER version of the CHEBI corpus.""" - def __init__(self, *args, **kwargs) 
-> None: + def __init__(self, entity_type_mapping, *args, **kwargs): + self.entity_type_mapping = entity_type_mapping super().__init__(*args, **kwargs) @staticmethod @@ -3721,42 +3943,46 @@ def split_url() -> str: def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDataset: corpus_dir = CHEBI.download_dataset(data_dir) dataset = CHEBI.parse_dataset(corpus_dir, annotator=annotator) - entity_type_mapping = {"Chemical": CHEMICAL_TAG} - return filter_and_map_entities(dataset, entity_type_mapping) + return filter_and_map_entities(dataset, self.entity_type_mapping) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping -class HUNER_GENE_CHEBI(HunerDataset): - """HUNER version of the CHEBI corpus containing gene annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) +class HUNER_CHEMICAL_CHEBI(HUNER_CHEBI): + """HUNER version of the CHEBI corpus containing chemical annotations.""" - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chebi_new" + def __init__(self, *args, **kwargs): + entity_type_mapping = {"Chemical": CHEMICAL_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDataset: - corpus_dir = CHEBI.download_dataset(data_dir) - dataset = CHEBI.parse_dataset(corpus_dir, annotator=annotator) + +class HUNER_GENE_CHEBI(HUNER_CHEBI): + """HUNER version of the CHEBI corpus containing gene annotations.""" + + def __init__(self, *args, **kwargs): entity_type_mapping = {"Protein": GENE_TAG} - return filter_and_map_entities(dataset, entity_type_mapping) + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_SPECIES_CHEBI(HunerDataset): +class HUNER_SPECIES_CHEBI(HUNER_CHEBI): """HUNER version of the CHEBI corpus containing species annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) + def __init__(self, *args, **kwargs): + entity_type_mapping = {"Species": SPECIES_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chebi_new" - def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDataset: - corpus_dir = CHEBI.download_dataset(data_dir) - dataset = CHEBI.parse_dataset(corpus_dir, annotator=annotator) - entity_type_mapping = {"Species": SPECIES_TAG} - return filter_and_map_entities(dataset, entity_type_mapping) +class HUNER_ALL_CHEBI(HUNER_CHEBI): + """HUNER version of the CHEBI corpus containing chemical, gene and species annotations.""" + + def __init__(self, *args, **kwargs): + entity_type_mapping = { + "Chemical": CHEMICAL_TAG, + "Protein": GENE_TAG, + "Species": SPECIES_TAG, + } + super().__init__(entity_type_mapping, *args, **kwargs) class BioNLPCorpus(ColumnCorpus): @@ -3819,6 +4045,7 @@ def download_corpus(data_folder: Path) -> Tuple[Path, Path, Path]: def parse_input_files(input_folder: Path) -> InternalBioNerDataset: documents = {} entities_per_document = {} + entity_types_set = set() for txt_file in input_folder.glob("*.txt"): name = txt_file.with_suffix("").name @@ -3835,9 +4062,15 @@ def parse_input_files(input_folder: Path) -> InternalBioNerDataset: if fields[0].startswith("T"): ann_type, start, end = fields[1].split() entities.append(Entity(char_span=(int(start), int(end)), entity_type=ann_type)) + 
entity_types_set.add(ann_type) entities_per_document[name] = entities + entity_types = list(entity_types_set) - return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) + return InternalBioNerDataset( + documents=documents, + entities_per_document=entities_per_document, + entity_types=entity_types, + ) @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") @@ -4484,14 +4717,12 @@ def parse_corpus(corpus_dir: Path) -> InternalBioNerDataset: return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) -class HUNER_CHEMICAL_CRAFT_V4(HunerDataset): +class HUNER_CRAFT_V4(HunerDataset): """HUNER version of the CRAFT corpus containing (only) chemical annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) + def __init__(self, entity_type_mapping, *args, **kwargs): + self.entity_type_mapping = entity_type_mapping + super().__init__(*args, **kwargs) @staticmethod def split_url() -> str: @@ -4501,58 +4732,52 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: corpus_dir = CRAFT_V4.download_corpus(data_dir) corpus = CRAFT_V4.parse_corpus(corpus_dir) - entity_type_mapping = {"chebi": CHEMICAL_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) + return filter_and_map_entities(corpus, self.entity_type_mapping) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping -class HUNER_GENE_CRAFT_V4(HunerDataset): - """HUNER version of the CRAFT corpus containing (only) gene annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) +class HUNER_CHEMICAL_CRAFT_V4(HUNER_CRAFT_V4): + """HUNER version of the CRAFT corpus containing (only) chemical annotations.""" - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft_v4" + def __init__(self, *args, **kwargs): + entity_type_mapping = {"chebi": CHEMICAL_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - corpus_dir = CRAFT_V4.download_corpus(data_dir) - corpus = CRAFT_V4.parse_corpus(corpus_dir) +class HUNER_GENE_CRAFT_V4(HUNER_CRAFT_V4): + """HUNER version of the CRAFT corpus containing (only) gene annotations.""" + + def __init__(self, *args, **kwargs): entity_type_mapping = {"pr": GENE_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_SPECIES_CRAFT_V4(HunerDataset): +class HUNER_SPECIES_CRAFT_V4(HUNER_CRAFT_V4): """HUNER version of the CRAFT corpus containing (only) species annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) + def __init__(self, *args, **kwargs): + entity_type_mapping = {"ncbitaxon": SPECIES_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft_v4" - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - corpus_dir = CRAFT_V4.download_corpus(data_dir) - corpus = CRAFT_V4.parse_corpus(corpus_dir) +class HUNER_ALL_CRAFT_V4(HUNER_CRAFT_V4): + """HUNER version of the CRAFT corpus containing chemical, gene and species annotations.""" - entity_type_mapping = {"ncbitaxon": SPECIES_TAG} - return 
filter_and_map_entities(corpus, entity_type_mapping) + def __init__(self, *args, **kwargs): + entity_type_mapping = { + "chebi": CHEMICAL_TAG, + "pr": GENE_TAG, + "ncbitaxon": SPECIES_TAG, + } + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_CHEMICAL_BIONLP2013_CG(HunerDataset): - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) +class HUNER_BIONLP2013_CG(HunerDataset): + def __init__(self, entity_type_mapping, *args, **kwargs): + self.entity_type_mapping = entity_type_mapping + super().__init__(*args, **kwargs) @staticmethod def split_url() -> str: @@ -4565,74 +4790,48 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: test_corpus = BioNLPCorpus.parse_input_files(test_dir) corpus = merge_datasets([train_corpus, dev_corpus, test_corpus]) - entity_type_mapping = {"Simple_chemical": CHEMICAL_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) + return filter_and_map_entities(corpus, self.entity_type_mapping) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping -class HUNER_DISEASE_BIONLP2013_CG(HunerDataset): - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg" +class HUNER_CHEMICAL_BIONLP2013_CG(HUNER_BIONLP2013_CG): + def __init__(self, *args, **kwargs): + entity_type_mapping = { + "Simple_chemical": CHEMICAL_TAG, + "Amino_acid": CHEMICAL_TAG, + } + super().__init__(entity_type_mapping, *args, **kwargs) - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir) - train_corpus = BioNLPCorpus.parse_input_files(train_dir) - dev_corpus = BioNLPCorpus.parse_input_files(dev_dir) - test_corpus = BioNLPCorpus.parse_input_files(test_dir) - corpus = merge_datasets([train_corpus, dev_corpus, test_corpus]) +class HUNER_DISEASE_BIONLP2013_CG(HUNER_BIONLP2013_CG): + def __init__(self, *args, **kwargs): entity_type_mapping = {"Cancer": DISEASE_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) - + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_GENE_BIONLP2013_CG(HunerDataset): - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) - - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg" - - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir) - train_corpus = BioNLPCorpus.parse_input_files(train_dir) - dev_corpus = BioNLPCorpus.parse_input_files(dev_dir) - test_corpus = BioNLPCorpus.parse_input_files(test_dir) - corpus = merge_datasets([train_corpus, dev_corpus, test_corpus]) +class HUNER_GENE_BIONLP2013_CG(HUNER_BIONLP2013_CG): + def __init__(self, *args, **kwargs): entity_type_mapping = {"Gene_or_gene_product": GENE_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_SPECIES_BIONLP2013_CG(HunerDataset): - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) +class HUNER_SPECIES_BIONLP2013_CG(HUNER_BIONLP2013_CG): + def __init__(self, *args, **kwargs): + entity_type_mapping = {"Organism": SPECIES_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - 
@staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg" - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir) - train_corpus = BioNLPCorpus.parse_input_files(train_dir) - dev_corpus = BioNLPCorpus.parse_input_files(dev_dir) - test_corpus = BioNLPCorpus.parse_input_files(test_dir) - corpus = merge_datasets([train_corpus, dev_corpus, test_corpus]) - - entity_type_mapping = {"Organism": SPECIES_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) +class HUNER_ALL_BIONLP2013_CG(HUNER_BIONLP2013_CG): + def __init__(self, *args, **kwargs): + entity_type_mapping = { + "Simple_chemical": CHEMICAL_TAG, + "Cancer": DISEASE_TAG, + "Gene_or_gene_product": GENE_TAG, + "Organism": SPECIES_TAG, + } + super().__init__(entity_type_mapping, *args, **kwargs) class AZDZ(ColumnCorpus): @@ -4801,6 +5000,7 @@ class HUNER_DISEASE_PDR(HunerDataset): """PDR Dataset with only Disease annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Disease": DISEASE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -4810,10 +5010,13 @@ def split_url() -> str: def to_internal(self, data_dir: Path) -> InternalBioNerDataset: corpus_folder = PDR.download_corpus(data_dir) corpus_data = brat_to_internal(corpus_folder, ann_file_suffixes=[".ann", ".ann2"]) - corpus_data = filter_and_map_entities(corpus_data, {"Disease": DISEASE_TAG}) + corpus_data = filter_and_map_entities(corpus_data, self.entity_type_mapping) return corpus_data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class HunerMultiCorpus(MultiCorpus): """Base class to build the union of all HUNER data sets considering a particular entity type.""" @@ -4834,32 +5037,48 @@ def entity_type_predicate(member): corpus = constructor_func(sentence_splitter=sentence_splitter) self.huner_corpora.append(corpus) - except (CompressionError, ExtractError, HeaderError, ReadError, StreamError, TarError): + except ( + CompressionError, + ExtractError, + HeaderError, + ReadError, + StreamError, + TarError, + ): logger.exception( - f"Error while processing Tar file from corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False + f"Error while processing Tar file from corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, ) except (BadZipFile, LargeZipFile): logger.exception( - f"Error while processing Zip file from corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False + f"Error while processing Zip file from corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, ) except OSError: logger.exception( - f"Error while downloading data for corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False + f"Error while downloading data for corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, ) except shutil.Error: logger.exception( - f"Error while copying data files for corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False + f"Error while copying data files for corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, ) except etree.LxmlError: logger.exception( - f"Error while processing XML file from corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False + f"Error while processing XML file from corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, ) except json.JSONDecodeError: logger.exception( - f"Error while processing JSON file from corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False + f"Error while 
processing JSON file from corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, ) except (FileNotFoundError, OSError, ValueError): - logger.exception(f"Error while preparing corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False) + logger.exception( + f"Error while preparing corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, + ) super().__init__(corpora=self.huner_corpora, name=f"HUNER-{entity_type}") @@ -4949,9 +5168,9 @@ def __init__( self.sentence_splitter = sentence_splitter if sentence_splitter else SciSpacySentenceSplitter() dataset_dir_name = self.build_corpus_directory_name(dataset_name) - data_folder = base_path / dataset_dir_name / self.sentence_splitter.name + data_folder = base_path / dataset_dir_name - train_file = data_folder / "train.conll" + train_file = data_folder / (self.sentence_splitter.name + "_train.conll") # Download data if necessary # Some datasets in BigBio only have train or test splits, not both @@ -4986,18 +5205,73 @@ def __init__( type_mapping = self.get_entity_type_mapping() if type_mapping: splits = {split: filter_and_map_entities(dataset, type_mapping) for split, dataset in splits.items()} + else: + logger.warning( + f"No entity type mapping found for {dataset_name}. Check CONLL files for task descriptions." + ) conll_writer = CoNLLWriter(sentence_splitter=self.sentence_splitter) conll_writer.process_dataset(splits, data_folder) - super().__init__(data_folder, columns, in_memory=in_memory, comment_symbol="#", sample_missing_splits=True) + super().__init__( + data_folder, + columns, + in_memory=in_memory, + comment_symbol="#", + sample_missing_splits=True, + ) def get_entity_type_mapping(self) -> Optional[Dict]: """Return the mapping of entity type given in the dataset to canonical types. Note, if a entity type is not present in the map it is discarded. """ - return None + # return None + # TODO: Add entity type mapping for all remaining bigbio datasets not in HunFlair? 
+ return { + "chemical": "chemical", + "['chemical']": "chemical", + "simple_chemical": "chemical", + "cancer": "disease", + "disease": "disease", + "['disease']": "disease", + "gene": "gene", + "['gene']": "gene", + "gene_or_gene_product": "gene", + "species": "species", + "['species']": "species", + "cellline": "cell_line", + "cell_line": "cell_line", + "protein": "gene", + # "simple_chemical": "chemical", # BioNLP ST 2013 CG + "amino_acid": "chemical", # BioNLP ST 2013 CG + # "cancer": "disease", # BioNLP ST 2013 CG + # "gene_or_gene_product": "gene", # BioNLP ST 2013 CG + "organism": "species", # BioNLP ST 2013 CG + "pathological_formation": "disease", # BioNLP ST 2013 CG + # "gene": "gene", # NLM Gene + "generif": "gene", # NLM Gene + "stargene": "gene", # NLM Gene + "domain": "gene", # NLM Gene + "other": "gene", # NLM Gene + # "chemical": "chemical", # NLM Chem + "diseaseclass": "disease", # NCBI Disease + "specificdisease": "disease", # NCBI Disease + "modifier": "disease", # NCBI Disease + "geneprotein": "gene", # Cell Finder + # "cellline": "cell_line", # Cell Finder + # "species": "species", # Cell Finder + "geneorgeneproduct": "gene", # BioRED + "chemicalentity": "chemical", # BioRED + "organismtaxon": "species", # BioRED + "diseaseorphenotypicfeature": "disease", # BioRED + "pr": "gene", # CRAFT (local) + "chebi": "chemical", # CRAFT (local) + "ncbitaxon": "species", # CRAFT (local) + # "protein": "gene", # BioID + "mondo": "disease", # CRAFT (local) + "drug": "chemical", # BERNv2 + } def build_corpus_directory_name(self, dataset_name: str) -> str: """Builds the directory name for the given data set.""" @@ -5005,8 +5279,9 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset: """Converts a dataset given in hugging datasets format to our internal corpus representation.""" - id_to_text = {} - id_to_entities: Dict[str, List] = {} + id_to_text: Dict[str, str] = {} + id_to_entities: Dict[str, list] = {} + entity_type_set = set() for document in dataset[split]: document_id = document["document_id"] passage_offsets = [] @@ -5032,9 +5307,13 @@ def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset: # Adapt entity offsets according to passage offsets entity_offset = entity["offsets"][0] - entity_offset = (entity_offset[0] - passage_offset[0], entity_offset[1] - passage_offset[0]) + entity_offset = ( + entity_offset[0] - passage_offset[0], + entity_offset[1] - passage_offset[0], + ) id_to_entities[passage_id].append(Entity(char_span=entity_offset, entity_type=entity["type"])) + entity_type_set.add(entity["type"]) # FIXME: This is just for debugging purposes # passage_text = id_to_text[passage_id] @@ -5043,15 +5322,25 @@ def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset: # if doc_text != mention_text: # print(f"Annotation error ({document['document_id']}) - Doc: {doc_text} vs. Mention: {mention_text}") - return InternalBioNerDataset(documents=id_to_text, entities_per_document=id_to_entities) + entity_types = list(entity_type_set) + return InternalBioNerDataset( + documents=id_to_text, + entities_per_document=id_to_entities, + entity_types=entity_types, + ) - def bin_search_passage(self, passages: List[Tuple[str, List[Tuple[int, int]]]], low: int, high: int, entity: Dict): - """Helper methods to find the passage to a given entity mention inclusive offset. 
+ def bin_search_passage( + self, + passages: List[Tuple[str, List[Tuple[int, int]]]], + low: int, + high: int, + entity: Dict, + ): + """Helper methods to find the passage to a given entity mention (incl. offset). The implementation uses binary search to find the passage in the ordered sequence passages. """ - # Check base case - if low > high: + if low > high: # Check base case raise NotImplementedError("There was a mistake concerning the lower and upper bound.") # Get element in the middle @@ -5093,7 +5382,13 @@ def __init__( ) def get_entity_type_mapping(self) -> Optional[Dict]: - return {"Gene": GENE_TAG, "GENERIF": GENE_TAG, "STARGENE": GENE_TAG, "Domain": GENE_TAG, "Other": GENE_TAG} + return { + "Gene": GENE_TAG, + "GENERIF": GENE_TAG, + "STARGENE": GENE_TAG, + "Domain": GENE_TAG, + "Other": GENE_TAG, + } def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() @@ -5153,6 +5448,17 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() +class HUNER_ALL_DRUGPROT(BIGBIO_NER_CORPUS): + def __init__(self, *args, **kwargs): + super().__init__(*args, dataset_name="drugprot", **kwargs) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return {"GENE-N": GENE_TAG, "GENE-Y": GENE_TAG, "CHEMICAL": CHEMICAL_TAG} + + def build_corpus_directory_name(self, dataset_name: str) -> str: + return self.__class__.__name__.lower() + + class HUNER_GENE_BIORED(BIGBIO_NER_CORPUS): def __init__( self, @@ -5288,6 +5594,23 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() +class HUNER_ALL_BIORED(BIGBIO_NER_CORPUS): + def __init__(self, *args, **kwargs): + super().__init__(*args, dataset_name="biored", **kwargs) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return { + "GeneOrGeneProduct": GENE_TAG, + "ChemicalEntity": CHEMICAL_TAG, + "DiseaseOrPhenotypicFeature": DISEASE_TAG, + "OrganismTaxon": SPECIES_TAG, + "CellLine": CELL_LINE_TAG, + } + + def build_corpus_directory_name(self, dataset_name: str) -> str: + return self.__class__.__name__.lower() + + class HUNER_GENE_CPI(BIGBIO_NER_CORPUS): def __init__( self, @@ -5342,6 +5665,17 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() +class HUNER_ALL_CPI(BIGBIO_NER_CORPUS): + def __init__(self, *args, **kwargs): + super().__init__(*args, dataset_name="cpi", **kwargs) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return {"protein": GENE_TAG, "compound": CHEMICAL_TAG} + + def build_corpus_directory_name(self, dataset_name: str) -> str: + return self.__class__.__name__.lower() + + class HUNER_GENE_BIONLP_ST_2013_PC(BIGBIO_NER_CORPUS): def __init__( self, @@ -5396,6 +5730,21 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() +class HUNER_ALL_BIONLP_ST_2013_PC(BIGBIO_NER_CORPUS): + def __init__(self, *args, **kwargs): + super().__init__(*args, dataset_name="bionlp_st_2013_pc", **kwargs) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return { + "Gene_or_gene_product": GENE_TAG, + "Complex": GENE_TAG, + "Simple_chemical": CHEMICAL_TAG, + } + + def build_corpus_directory_name(self, dataset_name: str) -> str: + return self.__class__.__name__.lower() + + class HUNER_GENE_BIONLP_ST_2013_GE(BIGBIO_NER_CORPUS): def __init__( self, @@ -5531,6 +5880,21 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() +class 
HUNER_ALL_BIONLP_ST_2011_ID(BIGBIO_NER_CORPUS): + def __init__(self, *args, **kwargs): + super().__init__(*args, dataset_name="bionlp_st_2011_id", **kwargs) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return { + "Protein": GENE_TAG, + "Chemical": CHEMICAL_TAG, + "Organism": SPECIES_TAG, + } + + def build_corpus_directory_name(self, dataset_name: str) -> str: + return self.__class__.__name__.lower() + + class HUNER_GENE_BIONLP_ST_2011_REL(BIGBIO_NER_CORPUS): def __init__( self, @@ -5714,7 +6078,26 @@ def __init__( ) def get_entity_type_mapping(self) -> Optional[Dict]: - return {"cell": CELL_LINE_TAG} + # TODO whether cell or cell line is the correct tag + return {"cellline": CELL_LINE_TAG} + + def build_corpus_directory_name(self, dataset_name: str) -> str: + return self.__class__.__name__.lower() + + +class HUNER_ALL_BIOID(BIGBIO_NER_CORPUS): + def __init__(self, *args, **kwargs): + super().__init__(*args, dataset_name="bioid", **kwargs) + + def get_entity_type_mapping(self) -> Optional[Dict]: + # TODO whether cell or cell line is the correct tag + return { + "gene": GENE_TAG, + "protein": GENE_TAG, + "chemical": CHEMICAL_TAG, + "species": SPECIES_TAG, + "cellline": CELL_LINE_TAG, + } def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() @@ -5834,23 +6217,11 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: class HUNER_GENE_TMVAR_V3(BIGBIO_NER_CORPUS): - def __init__( - self, - base_path: Optional[Union[str, Path]] = None, - in_memory: bool = True, - sentence_splitter: Optional[SentenceSplitter] = None, - train_split_name: Optional[str] = None, - dev_split_name: Optional[str] = None, - test_split_name: Optional[str] = None, - ) -> None: + def __init__(self, *args, **kwargs): super().__init__( + *args, dataset_name="tmvar_v3", - base_path=base_path, - in_memory=in_memory, - sentence_splitter=sentence_splitter, - train_split_name=train_split_name, - dev_split_name=dev_split_name, - test_split_name=test_split_name, + **kwargs, ) def get_entity_type_mapping(self) -> Optional[Dict]: @@ -5860,55 +6231,5 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() -class HUNER_SPECIES_TMVAR_V3(BIGBIO_NER_CORPUS): - def __init__( - self, - base_path: Optional[Union[str, Path]] = None, - in_memory: bool = True, - sentence_splitter: Optional[SentenceSplitter] = None, - train_split_name: Optional[str] = None, - dev_split_name: Optional[str] = None, - test_split_name: Optional[str] = None, - ) -> None: - super().__init__( - dataset_name="tmvar_v3", - base_path=base_path, - in_memory=in_memory, - sentence_splitter=sentence_splitter, - train_split_name=train_split_name, - dev_split_name=dev_split_name, - test_split_name=test_split_name, - ) - - def get_entity_type_mapping(self) -> Optional[Dict]: - return {"['Species']": SPECIES_TAG} - - def build_corpus_directory_name(self, dataset_name: str) -> str: - return self.__class__.__name__.lower() - - -class HUNER_CELL_LINE_TMVAR_V3(BIGBIO_NER_CORPUS): - def __init__( - self, - base_path: Optional[Union[str, Path]] = None, - in_memory: bool = True, - sentence_splitter: Optional[SentenceSplitter] = None, - train_split_name: Optional[str] = None, - dev_split_name: Optional[str] = None, - test_split_name: Optional[str] = None, - ) -> None: - super().__init__( - dataset_name="tmvar_v3", - base_path=base_path, - in_memory=in_memory, - sentence_splitter=sentence_splitter, - train_split_name=train_split_name, - 
dev_split_name=dev_split_name, - test_split_name=test_split_name, - ) - - def get_entity_type_mapping(self) -> Optional[Dict]: - return {"['CellLine']": CELL_LINE_TAG} - - def build_corpus_directory_name(self, dataset_name: str) -> str: - return self.__class__.__name__.lower() +if __name__ == "__main__": + HUNER_ALL_BIONLP_ST_2013_PC(base_path="/home/tmp/hunflair/tmp") diff --git a/flair/models/prefixed_tagger.py b/flair/models/prefixed_tagger.py new file mode 100644 index 000000000..b8c01c50a --- /dev/null +++ b/flair/models/prefixed_tagger.py @@ -0,0 +1,319 @@ +import logging +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +import torch +from torch.utils.data import Dataset + +import flair.data +from flair.data import Corpus, Sentence, Token +from flair.datasets import DataLoader, FlairDatapointDataset +from flair.models import SequenceTagger + + +class PrefixedSentence(Sentence): + """An PrefixedSentence expresses that a sentence is augmented and compatible with the PrefixedSequenceTagger. + + For inference, i.e. `predict` and `evaluate`, the PrefixedSequenceTagger internally encodes the sentences. + Therefore, these functions work with the regular flair sentence objects. + """ + + +class SentenceAugmentationStrategy(ABC): + """Strategy to augment a sentence with additional information or instructions.""" + + @abstractmethod + def augment_sentence( + self, sentence: Sentence, annotation_layers: Optional[Union[str, List[str]]] = None + ) -> PrefixedSentence: + """Augments the given sentence text with additional instructions for working / predicting the task on the given annotations. + + Args: + sentence: The sentence to be augmented + annotation_layers: Annotations which should be predicted. + """ + ... + + @abstractmethod + def apply_predictions( + self, + augmented_sentence: Sentence, + original_sentence: Sentence, + source_annotation_layer: str, + target_annotation_layer: str, + ): + """Transfers the predictions made on the augmented sentence to the original one. + + Args: + augmented_sentence: The augmented sentence instance + original_sentence: The original sentence before the augmentation was applied + source_annotation_layer: Annotation layer of the augmented sentence in which the predictions are stored. + target_annotation_layer: Annotation layer in which the predictions should be stored in the original sentence. + """ + ... + + @abstractmethod + def _get_state_dict(self): + """Returns the state dict for the given augmentation strategy.""" + ... + + @classmethod + def _init_strategy_with_state_dict(cls, state, **kwargs): + """Initializes the strategy from the given state.""" + + def augment_dataset( + self, dataset: Dataset[Sentence], annotation_layers: Optional[Union[str, List[str]]] = None + ) -> FlairDatapointDataset[PrefixedSentence]: + """Transforms a dataset into a dataset containing augmented sentences specific to the `PrefixedSequenceTagger`. + + The returned dataset is stored in memory. For more information on the internal sentence transformation + procedure, see the :class:`PrefixedSequenceTagger` architecture. + + Args: + dataset: A dataset of sentences to augment + annotation_layers: Annotations which should be predicted. 
+
+        Returns: A dataset of augmented sentences specific to the `PrefixedSequenceTagger`
+        """
+        data_loader: DataLoader = DataLoader(dataset, batch_size=1)
+        original_sentences: List[Sentence] = [batch[0] for batch in iter(data_loader)]
+
+        augmented_sentences = [self.augment_sentence(sentence, annotation_layers) for sentence in original_sentences]
+
+        return FlairDatapointDataset(augmented_sentences)
+
+    def augment_corpus(
+        self, corpus: Corpus[Sentence], annotation_layers: Optional[Union[str, List[str]]] = None
+    ) -> Corpus[PrefixedSentence]:
+        """Transforms a corpus into a corpus containing augmented sentences specific to the `PrefixedSequenceTagger`.
+
+        The splits of the returned corpus are stored in memory. For more information on the internal
+        sentence augmentation procedure, see the :class:`PrefixedSequenceTagger`.
+
+        Args:
+            corpus: A corpus of sentences to augment
+            annotation_layers: Annotations which should be predicted.
+
+        Returns: A corpus of augmented sentences specific to the `PrefixedSequenceTagger`
+        """
+        return Corpus(
+            train=self.augment_dataset(corpus.train, annotation_layers) if corpus.train is not None else None,
+            dev=self.augment_dataset(corpus.dev, annotation_layers) if corpus.dev is not None else None,
+            test=self.augment_dataset(corpus.test, annotation_layers) if corpus.test is not None else None,
+            name=corpus.name,
+            # If we sample missing splits, the augmented sentences that correspond to the same original sentences
+            # may get distributed into different splits. For training purposes, this is always undesired.
+            sample_missing_splits=False,
+        )
+
+
+class EntityTypeTaskPromptAugmentationStrategy(SentenceAugmentationStrategy):
+    """Augmentation strategy that prefixes each sentence with a task description specifying the entity types to be tagged.
+
+    This approach is inspired by the paper by Luo et al.:
+    AIONER: All-in-one scheme-based biomedical named entity recognition using deep learning
+    https://arxiv.org/abs/2211.16944
+
+    Example:
+        "[ Tag gene and disease ] Mutations in the TP53 tumour suppressor gene are found in ~50% of human cancers"
+    """
+
+    def __init__(self, entity_types: List[str]):
+        if len(entity_types) <= 0:
+            raise AssertionError("At least one entity type must be given")
+
+        self.entity_types = entity_types
+        self.task_prompt = self._build_tag_prompt_prefix(entity_types)
+
+    def augment_sentence(
+        self, sentence: Sentence, annotation_layers: Optional[Union[str, List[str]]] = None
+    ) -> PrefixedSentence:
+        # Prepend the task description prompt to the sentence text
+        augmented_sentence = PrefixedSentence(
+            text=self.task_prompt + [t.text for t in sentence.tokens],
+            use_tokenizer=False,
+            language_code=sentence.language_code,
+            start_position=sentence.start_position,
+        )
+
+        # Make sure annotation_layers is a list
+        if annotation_layers and isinstance(annotation_layers, str):
+            annotation_layers = [annotation_layers]
+
+        # Reconstruct all annotations from the original sentence (necessary for learning classifiers)
+        layers = annotation_layers if annotation_layers else sentence.annotation_layers.keys()
+        len_task_prompt = len(self.task_prompt)
+
+        for layer in layers:
+            for label in sentence.get_labels(layer):
+                if isinstance(label.data_point, Token):
+                    label_span = augmented_sentence[
+                        len_task_prompt + label.data_point.idx - 1 : len_task_prompt + label.data_point.idx
+                    ]
+                else:
+                    label_span = augmented_sentence[
+                        len_task_prompt
+                        + label.data_point.tokens[0].idx
+                        - 1 : len_task_prompt
+                        + label.data_point.tokens[-1].idx
+                    ]
+
+                label_span.add_label(layer, label.value, label.score)
+
+        return augmented_sentence
+
+    def apply_predictions(
+        self,
+        augmented_sentence: Sentence,
+        original_sentence: Sentence,
+        source_annotation_layer: str,
+        target_annotation_layer: str,
+    ):
+        new_labels = augmented_sentence.get_labels(source_annotation_layer)
+        len_task_prompt = len(self.task_prompt)
+
+        for label in new_labels:
+            if label.data_point.tokens[0].idx - len_task_prompt - 1 < 0:
+                continue
+            orig_span = original_sentence[
+                label.data_point.tokens[0].idx - len_task_prompt - 1 : label.data_point.tokens[-1].idx - len_task_prompt
+            ]
+            orig_span.add_label(target_annotation_layer, label.value, label.score)
+
+    def _build_tag_prompt_prefix(self, entity_types: List[str]) -> List[str]:
+        if len(entity_types) == 1:
+            prompt = f"[ Tag {entity_types[0]} ]"
+        else:
+            prompt = "[ Tag " + ", ".join(entity_types[:-1]) + " and " + entity_types[-1] + " ]"
+
+        return prompt.split()
+
+    def _get_state_dict(self):
+        return {"entity_types": self.entity_types}
+
+    @classmethod
+    def _init_strategy_with_state_dict(cls, state, **kwargs):
+        return cls(state["entity_types"])
+
+
+class PrefixedSequenceTagger(SequenceTagger):
+    def __init__(self, *args, augmentation_strategy: SentenceAugmentationStrategy, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if augmentation_strategy is None:
+            logging.warning("No augmentation strategy provided.
Make sure that the strategy is set.") + + self.augmentation_strategy = augmentation_strategy + + def _get_state_dict(self): + state = super()._get_state_dict() + state["augmentation_strategy"] = self.augmentation_strategy + + return state + + @classmethod + def _init_model_with_state_dict(cls, state, **kwargs): + strategy = state["augmentation_strategy"] + return super()._init_model_with_state_dict(state, augmentation_strategy=strategy, **kwargs) + + @classmethod + def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "PrefixedSequenceTagger": + from typing import cast + + return cast("PrefixedSequenceTagger", super().load(model_path=model_path)) + + def forward_loss(self, sentences: Union[List[Sentence], List[PrefixedSentence]]) -> Tuple[torch.Tensor, int]: + # If all sentences are not augmented -> augment them + if all(isinstance(sentence, Sentence) for sentence in sentences): + # mypy does not infer the type of "sentences" restricted by the if statement + sentences = cast(List[Sentence], sentences) + + sentences = self.augment_sentences(sentences=sentences, annotation_layers=self.tag_type) + elif not all(isinstance(sentence, PrefixedSentence) for sentence in sentences): + raise ValueError("All passed sentences must be either uniformly augmented or not.") + + # mypy does not infer the type of "sentences" restricted by code above + sentences = cast(List[Sentence], sentences) + + return super().forward_loss(sentences) + + def predict( + self, + sentences: Union[List[Sentence], Sentence, List[PrefixedSentence], PrefixedSentence], + mini_batch_size: int = 32, + return_probabilities_for_all_classes: bool = False, + verbose: bool = False, + label_name: Optional[str] = None, + return_loss=False, + embedding_storage_mode="none", + force_token_predictions: bool = False, + ): + # Compute prediction label type + prediction_label_type: str = self.label_type if label_name is None else label_name + + # make sure it's a list + if not isinstance(sentences, list) and not isinstance(sentences, flair.data.Dataset): + sentences = [sentences] + + # If all sentences are already augmented (i.e. 
compatible with this class), just forward the sentences + if all(isinstance(sentence, PrefixedSentence) for sentence in sentences): + # mypy does not infer the type of "sentences" restricted by the if statement + sentences = cast(List[Sentence], sentences) + + return super().predict( + sentences, + mini_batch_size=mini_batch_size, + return_probabilities_for_all_classes=return_probabilities_for_all_classes, + verbose=verbose, + label_name=prediction_label_type, + return_loss=return_loss, + embedding_storage_mode=embedding_storage_mode, + ) + + elif not all(isinstance(sentence, Sentence) for sentence in sentences): + raise ValueError("All passed sentences must be either uniformly augmented or not.") + + # Remove existing labels + if label_name is not None: + for sentence in sentences: + sentence.remove_labels(prediction_label_type) + + sentences = cast(List[Sentence], sentences) + + # Augment sentences - copy all annotation of the given tag type + augmented_sentences = self.augment_sentences(sentences, self.tag_type) + + mypy_safe_augmented_sentences = cast(List[Sentence], augmented_sentences) + + # Predict on augmented sentence and store it in an internal annotation layer / label + loss_and_count = super().predict( + sentences=mypy_safe_augmented_sentences, + mini_batch_size=mini_batch_size, + return_probabilities_for_all_classes=return_probabilities_for_all_classes, + verbose=verbose, + label_name=prediction_label_type, + return_loss=return_loss, + embedding_storage_mode=embedding_storage_mode, + ) + + # Append predicted labels to the original sentences + for orig_sent, aug_sent in zip(sentences, augmented_sentences): + self.augmentation_strategy.apply_predictions( + aug_sent, orig_sent, prediction_label_type, prediction_label_type + ) + + if prediction_label_type == "predicted": + orig_sent.remove_labels("predicted_bio") + orig_sent.remove_labels("gold_bio") + + if loss_and_count is not None: + return loss_and_count + + def augment_sentences( + self, sentences: Union[Sentence, List[Sentence]], annotation_layers: Optional[Union[str, List[str]]] = None + ) -> List[PrefixedSentence]: + if not isinstance(sentences, list) and not isinstance(sentences, flair.data.Dataset): + sentences = [sentences] + + return [self.augmentation_strategy.augment_sentence(sentence, annotation_layers) for sentence in sentences] diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 2bcc2c5cc..9e1ff7719 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -394,6 +394,9 @@ def _get_gold_labels(self, sentences: List[Sentence]) -> List[str]: for sentence in sentences: sentence_labels = ["O"] * len(sentence) for label in sentence.get_labels(self.label_type): + if label.value == "O": + continue + span: Span = label.data_point if self.tag_format == "BIOES": if len(span) == 1: diff --git a/tests/test_augmentation.py b/tests/test_augmentation.py new file mode 100644 index 000000000..435aed0d7 --- /dev/null +++ b/tests/test_augmentation.py @@ -0,0 +1,99 @@ +from flair.data import Sentence +from flair.models.prefixed_tagger import EntityTypeTaskPromptAugmentationStrategy, PrefixedSentence + + +def test_entity_type_task_prompt_augmentation_single_type(): + strategy = EntityTypeTaskPromptAugmentationStrategy(["genes"]) + + sent = Sentence("This is a test sentence.") + aug_sent = strategy.augment_sentence(sent) + + assert isinstance(aug_sent, PrefixedSentence) + assert aug_sent.text.startswith("[ Tag genes ] ") + assert len(aug_sent) == 10 + 
+ +def test_entity_type_task_prompt_augmentation_two_types(): + strategy = EntityTypeTaskPromptAugmentationStrategy(["genes", "diseases"]) + + sent = Sentence("This is a test sentence.") + aug_sent = strategy.augment_sentence(sent) + + assert isinstance(aug_sent, PrefixedSentence) + assert aug_sent.text.startswith("[ Tag genes and diseases ] ") + assert len(aug_sent) == 12 + + +def test_entity_type_task_prompt_augmentation_multiple_types(): + strategy = EntityTypeTaskPromptAugmentationStrategy(["genes", "diseases", "chemicals"]) + + sent = Sentence("This is a test sentence.") + aug_sent = strategy.augment_sentence(sent) + + assert isinstance(aug_sent, PrefixedSentence) + assert aug_sent.text.startswith("[ Tag genes, diseases and chemicals ] ") + assert len(aug_sent) == 13 + + +def test_entity_type_task_prompt_augmentation_label_transfer(): + strategy = EntityTypeTaskPromptAugmentationStrategy(["genes"]) + + sent = Sentence("This is a test sentence.") + sent[0:2].add_label("ner", "test", 1.0) + sent[3:4].add_label("foo", "test", 1.0) + + aug_sent = strategy.augment_sentence(sent, "ner") + + assert isinstance(aug_sent, PrefixedSentence) + assert aug_sent.text.startswith("[ Tag genes ] ") + assert len(aug_sent.get_labels("foo")) == 0 + + ner_labels = aug_sent.get_labels("ner") + assert len(ner_labels) == 1 + assert len(ner_labels[0].data_point.tokens) == 2 + assert ner_labels[0].data_point.text == "This is" + assert ner_labels[0].data_point.tokens[0].idx == 5 + assert ner_labels[0].data_point.tokens[-1].idx == 6 + + +def test_entity_type_task_prompt_augmentation_label_application(): + strategy = EntityTypeTaskPromptAugmentationStrategy(["genes"]) + + sent = Sentence("TP53 - also known as tumour protein 53 - is an onco-gene.") + + aug_sent = strategy.augment_sentence(sent, "ner") + aug_sent[4:5].add_label("predict", "gene", 1.0) + aug_sent[9:12].add_label("predict", "gene", 1.0) + aug_sent[5:6].add_label("not-predict", "gene", 1.0) + + strategy.apply_predictions(aug_sent, sent, "predict", "ner") + + ner_labels = sent.get_labels("ner") + assert len(ner_labels) == 2 + + assert ner_labels[0].data_point.text == "TP53" + assert ner_labels[0].value == "gene" + assert ner_labels[0].score == 1.0 + assert len(ner_labels[0].data_point.tokens) == 1 + assert ner_labels[0].data_point.tokens[0].idx == 1 + + assert ner_labels[1].data_point.text == "tumour protein 53" + assert ner_labels[1].value == "gene" + assert ner_labels[1].score == 1.0 + assert len(ner_labels[1].data_point.tokens) == 3 + assert ner_labels[1].data_point.tokens[0].idx == 6 + assert ner_labels[1].data_point.tokens[-1].idx == 8 + + +def test_entity_type_task_prompt_augmentation_label_application_label_in_tag(): + strategy = EntityTypeTaskPromptAugmentationStrategy(["genes"]) + + sent = Sentence("TP53 - also known as tumour protein 53 - is an onco-gene.") + + aug_sent = strategy.augment_sentence(sent, "ner") + aug_sent[2:4].add_label("predict", "gene", 1.0) # Add label in tagging prompt + + strategy.apply_predictions(aug_sent, sent, "predict", "ner") + + ner_labels = sent.get_labels("ner") + assert len(ner_labels) == 0 diff --git a/tests/test_datasets_biomedical.py b/tests/test_datasets_biomedical.py index 2ba4cf3d6..4099bb928 100644 --- a/tests/test_datasets_biomedical.py +++ b/tests/test_datasets_biomedical.py @@ -117,6 +117,7 @@ def test_write_to_conll(): ), ] }, + entity_types=["E"], ) expected_labeling = [ "This O +", @@ -142,6 +143,7 @@ def test_conll_writer_one_token_multiple_entities1(): Entity((text.find("entity2"), 
text.find("entity2") + len("entity2")), "E"), ] }, + entity_types=["E"], ) assert_conll_writer_output(dataset, ["This O +", "is O +", "entity1 B-E +", "entity2 B-E -"]) @@ -157,6 +159,7 @@ def test_conll_writer_one_token_multiple_entities2(): Entity((text.find("tity1"), text.find("tity1") + 5), "E"), ] }, + entity_types=["E"], ) assert_conll_writer_output(dataset, ["This O +", "is O +", "entity1 B-E +", "entity2 O -"])