diff --git a/flair/data.py b/flair/data.py index dd8304405..17a1a3124 100644 --- a/flair/data.py +++ b/flair/data.py @@ -1787,10 +1787,10 @@ def __init__( def __str__(self) -> str: output = ( - f"MultiCorpus: " # type: ignore[arg-type] - f"{len(self.train) if self.train else 0} train + " - f"{len(self.dev) if self.dev else 0} dev + " - f"{len(self.test) if self.test else 0} test sentences\n - " + f"MultiCorpus: " + f"{_len_dataset(self.train) if self.train else 0} train + " + f"{_len_dataset(self.dev) if self.dev else 0} dev + " + f"{_len_dataset(self.test) if self.test else 0} test sentences\n - " ) output += "\n - ".join([f"{type(corpus).__name__} {corpus!s} - {corpus.name}" for corpus in self.corpora]) return output diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py index 71b3b7c55..28f4aca98 100644 --- a/flair/datasets/biomedical.py +++ b/flair/datasets/biomedical.py @@ -96,9 +96,15 @@ def overlaps(self, other_entity) -> bool: class InternalBioNerDataset: """Internal class to represent a corpus and it's entities.""" - def __init__(self, documents: Dict[str, str], entities_per_document: Dict[str, List[Entity]]) -> None: + def __init__( + self, + documents: Dict[str, str], + entities_per_document: Dict[str, List[Entity]], + entity_types: List[str] = [], + ): self.documents = documents self.entities_per_document = entities_per_document + self.entity_types = entity_types class DpEntry(NamedTuple): @@ -111,18 +117,27 @@ class DpEntry(NamedTuple): def merge_datasets(data_sets: Iterable[InternalBioNerDataset]): all_documents = {} all_entities = {} + all_entity_types_set = set() for ds in data_sets: all_documents.update(ds.documents) all_entities.update(ds.entities_per_document) + all_entity_types_set.update(ds.entity_types) - return InternalBioNerDataset(documents=all_documents, entities_per_document=all_entities) + all_entity_types = list(all_entity_types_set) + + return InternalBioNerDataset( + documents=all_documents, + entities_per_document=all_entities, + entity_types=all_entity_types, + ) def filter_and_map_entities( dataset: InternalBioNerDataset, entity_type_to_canonical: Dict[str, str] ) -> InternalBioNerDataset: mapped_entities_per_document = {} + entity_types = list(entity_type_to_canonical.values()) for id, entities in dataset.entities_per_document.items(): new_entities = [] for entity in entities: @@ -130,11 +145,19 @@ def filter_and_map_entities( new_entity = copy(entity) new_entity.type = entity_type_to_canonical[entity.type] new_entities.append(new_entity) + elif entity.type.lower() in entity_type_to_canonical: # try lower case + new_entity = copy(entity) + new_entity.type = entity_type_to_canonical[entity.type.lower()] + new_entities.append(new_entity) else: logging.debug(f"Skip entity type {entity.type}") mapped_entities_per_document[id] = new_entities - return InternalBioNerDataset(documents=dataset.documents, entities_per_document=mapped_entities_per_document) + return InternalBioNerDataset( + documents=dataset.documents, + entities_per_document=mapped_entities_per_document, + entity_types=entity_types, + ) def filter_nested_entities(dataset: InternalBioNerDataset) -> None: @@ -337,16 +360,19 @@ def __init__( def process_dataset(self, datasets: Dict[str, InternalBioNerDataset], out_dir: Path): if "train" in datasets: - self.write_to_conll(datasets["train"], out_dir / "train.conll") + self.write_to_conll(datasets["train"], out_dir / (self.sentence_splitter.name + "_train.conll")) if "dev" in datasets: - self.write_to_conll(datasets["dev"], out_dir / 
"dev.conll") + self.write_to_conll(datasets["dev"], out_dir / (self.sentence_splitter.name + "_dev.conll")) if "test" in datasets: - self.write_to_conll(datasets["test"], out_dir / "test.conll") + self.write_to_conll(datasets["test"], out_dir / (self.sentence_splitter.name + "_test.conll")) def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path): os.makedirs(str(output_file.parent), exist_ok=True) filter_nested_entities(dataset) + # Add task description for multi-task learning + assert len(dataset.entity_types) > 0 + with output_file.open("w", encoding="utf8") as f: for document_id in Tqdm.tqdm( dataset.documents.keys(), @@ -356,6 +382,7 @@ def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path): document_text = ftfy.fix_text(dataset.documents[document_id]) document_text = re.sub(r"[\u2000-\u200B]", " ", document_text) # replace unicode space characters! document_text = document_text.replace("\xa0", " ") # replace non-break space + document_buffer = "" entities = deque( sorted( @@ -396,11 +423,13 @@ def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path): whitespace_after = "+" if flair_token.whitespace_after > 0 else "-" if len(token) > 0: - f.write(" ".join([token, tag, whitespace_after]) + "\n") + document_buffer += " ".join([token, tag, whitespace_after]) + "\n" sentence_had_tokens = True if sentence_had_tokens: - f.write("\n") + document_buffer += "\n" + + f.write(document_buffer) class HunerDataset(ColumnCorpus, ABC): @@ -421,7 +450,7 @@ def to_internal(self, data_folder: Path) -> InternalBioNerDataset: @staticmethod @abstractmethod - def split_url() -> str: + def split_url() -> Union[str, List[str]]: raise NotImplementedError def get_corpus_sentence_splitter(self) -> Optional[SentenceSplitter]: @@ -494,15 +523,22 @@ def __init__( ) def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path): - split_file = cached_path(f"{self.split_url()}.{split}", split_dir) - - with split_file.open(encoding="utf8") as f: - ids = [line.strip() for line in f if line.strip()] - ids = sorted(id_ for id_ in ids if id_ in dataset.documents) + split_urls = self.split_url() + if isinstance(split_urls, str): + split_urls = [split_urls] + split_ids_set = set() + for split_url in split_urls: + split_file = cached_path(f"{split_url}.{split}", split_dir) + with split_file.open(encoding="utf8") as f: + ids = [line.strip() for line in f if line.strip()] + ids = sorted(id_ for id_ in ids if id_ in dataset.documents) + split_ids_set.update(ids) + split_ids = sorted(split_ids_set) return InternalBioNerDataset( documents={k: dataset.documents[k] for k in ids}, - entities_per_document={k: dataset.entities_per_document[k] for k in ids}, + entities_per_document={k: dataset.entities_per_document[k] for k in split_ids}, + entity_types=dataset.entity_types, ) @@ -588,6 +624,13 @@ class HUNER_GENE_BIO_INFER(HunerDataset): """HUNER version of the BioInfer corpus containing only gene/protein annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = { + "Individual_protein": GENE_TAG, + "Gene/protein/RNA": GENE_TAG, + "Gene": GENE_TAG, + "DNA_family_or_group": GENE_TAG, + "Protein_family_or_group": GENE_TAG, + } super().__init__(*args, **kwargs) @staticmethod @@ -599,19 +642,14 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: train_data = BIO_INFER.parse_dataset(corpus_folder / "BioInfer-train.xml") test_data = BIO_INFER.parse_dataset(corpus_folder / "BioInfer-test.xml") - entity_type_mapping = { 
- "Individual_protein": GENE_TAG, - "Gene/protein/RNA": GENE_TAG, - "Gene": GENE_TAG, - "DNA_family_or_group": GENE_TAG, - "Protein_family_or_group": GENE_TAG, - } - - train_data = filter_and_map_entities(train_data, entity_type_mapping) - test_data = filter_and_map_entities(test_data, entity_type_mapping) + train_data = filter_and_map_entities(train_data, self.entity_type_mapping) + test_data = filter_and_map_entities(test_data, self.entity_type_mapping) return merge_datasets([train_data, test_data]) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + @deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class JNLPBA(ColumnCorpus): @@ -750,10 +788,11 @@ def read_file(cls, input_iob_file: Path, sentence_tag: str) -> InternalBioNerDat return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) -class HUNER_GENE_JNLPBA(HunerDataset): - """HUNER version of the JNLPBA corpus containing gene annotations.""" +class HUNER_JNLPBA(HunerDataset): + """HUNER version of the JNLPBA corpus.""" - def __init__(self, *args, **kwargs) -> None: + def __init__(self, entity_type_mapping, *args, **kwargs): + self.entity_type_mapping = entity_type_mapping super().__init__(*args, **kwargs) @staticmethod @@ -772,42 +811,42 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: sentence_separator = self.sentence_splitter.tag train_data = HunerJNLPBA.download_and_prepare_train(orig_folder, sentence_separator) - train_data = filter_and_map_entities(train_data, {"protein": GENE_TAG}) + train_data = filter_and_map_entities(train_data, self.entity_type_mapping) test_data = HunerJNLPBA.download_and_prepare_test(orig_folder, sentence_separator) - test_data = filter_and_map_entities(test_data, {"protein": GENE_TAG}) + test_data = filter_and_map_entities(test_data, self.entity_type_mapping) return merge_datasets([train_data, test_data]) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping -class HUNER_CELL_LINE_JNLPBA(HunerDataset): - """HUNER version of the JNLPBA corpus containing cell line annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) +class HUNER_GENE_JNLPBA(HUNER_JNLPBA): + """HUNER version of the JNLPBA corpus containing gene annotations.""" - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/genia" + def __init__(self, *args, **kwargs): + entity_type_mapping = {"protein": GENE_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - def get_corpus_sentence_splitter(self) -> SentenceSplitter: - return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - download_folder = data_dir / "original" - os.makedirs(str(download_folder), exist_ok=True) +class HUNER_CELL_LINE_JNLPBA(HUNER_JNLPBA): + """HUNER version of the JNLPBA corpus containing cell line annotations.""" - sentence_separator = " " - if isinstance(self.sentence_splitter, TagSentenceSplitter): - sentence_separator = self.sentence_splitter.tag + def __init__(self, *args, **kwargs): + entity_type_mapping = {"cell_line": CELL_LINE_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - train_data = HunerJNLPBA.download_and_prepare_train(download_folder, sentence_separator) - train_data = filter_and_map_entities(train_data, {"cell_line": CELL_LINE_TAG}) - 
test_data = HunerJNLPBA.download_and_prepare_test(download_folder, sentence_separator) - test_data = filter_and_map_entities(test_data, {"cell_line": CELL_LINE_TAG}) +class HUNER_ALL_JNLPBA(HUNER_JNLPBA): + """HUNER version of the JNLPBA corpus containing gene and cell line annotations.""" - return merge_datasets([train_data, test_data]) + def __init__(self, *args, **kwargs): + entity_type_mapping = { + "protein": GENE_TAG, + "cell_line": CELL_LINE_TAG, + } + super().__init__(entity_type_mapping, *args, **kwargs) class CELL_FINDER(ColumnCorpus): @@ -943,7 +982,37 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return data -@deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") +class HUNER_ALL_CELL_FINDER(HunerDataset): + """HUNER version of the CellFinder corpus containing cell line, species and gene annotations.""" + + def __init__(self, *args, **kwargs): + self.entity_type_mapping = { + "CellLine": CELL_LINE_TAG, + "Species": SPECIES_TAG, + "GeneProtein": GENE_TAG, + } + super().__init__(*args, **kwargs) + + @staticmethod + def split_url() -> List[str]: + split_urls = [ + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cellfinder_cellline", + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cellfinder_species", + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cellfinder_protein", + ] + return split_urls + + def to_internal(self, data_dir: Path) -> InternalBioNerDataset: + data = CELL_FINDER.download_and_prepare(data_dir) + + data = filter_and_map_entities(data, self.entity_type_mapping) + + return data + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + + class MIRNA(ColumnCorpus): """Original miRNA corpus. 
@@ -1067,13 +1136,15 @@ def get_mirna_subset(dataset: InternalBioNerDataset, split_url: str, split_dir: return InternalBioNerDataset( documents={k: dataset.documents[k] for k in ids}, entities_per_document={k: dataset.entities_per_document[k] for k in ids}, + entity_types=dataset.entity_types, ) -class HUNER_GENE_MIRNA(HunerDataset): - """HUNER version of the miRNA corpus containing protein / gene annotations.""" +class HUNER_MIRNA(HunerDataset): + """HUNER version of the miRNA corpus.""" - def __init__(self, *args, **kwargs) -> None: + def __init__(self, entity_type_mapping, *args, **kwargs): + self.entity_type_mapping = entity_type_mapping super().__init__(*args, **kwargs) @staticmethod @@ -1098,84 +1169,51 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: sentence_separator = self.sentence_splitter.tag train_data = MIRNA.download_and_prepare_train(download_folder, sentence_separator) - train_data = filter_and_map_entities(train_data, {"Genes/Proteins": GENE_TAG}) + train_data = filter_and_map_entities(train_data, self.entity_type_mapping) test_data = MIRNA.download_and_prepare_test(download_folder, sentence_separator) - test_data = filter_and_map_entities(test_data, {"Genes/Proteins": GENE_TAG}) + test_data = filter_and_map_entities(test_data, self.entity_type_mapping) return merge_datasets([train_data, test_data]) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping -class HUNER_SPECIES_MIRNA(HunerDataset): - """HUNER version of the miRNA corpus containing species annotations.""" - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/miRNA" - - def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path): - # In the huner split files there is no information whether a given id originates - # from the train or test file of the original corpus - so we have to adapt corpus - # splitting here - return HunerMiRNAHelper.get_mirna_subset(dataset, f"{self.split_url()}.{split}", split_dir) - - def get_corpus_sentence_splitter(self) -> SentenceSplitter: - return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - download_folder = data_dir / "original" - os.makedirs(str(download_folder), exist_ok=True) +class HUNER_GENE_MIRNA(HUNER_MIRNA): + """HUNER version of the miRNA corpus containing protein / gene annotations.""" - sentence_separator = " " - if isinstance(self.sentence_splitter, TagSentenceSplitter): - sentence_separator = self.sentence_splitter.tag + def __init__(self, *args, **kwargs): + entity_type_mapping = {"Genes/Proteins": GENE_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - train_data = MIRNA.download_and_prepare_train(download_folder, sentence_separator) - train_data = filter_and_map_entities(train_data, {"Species": SPECIES_TAG}) - test_data = MIRNA.download_and_prepare_test(download_folder, sentence_separator) - test_data = filter_and_map_entities(test_data, {"Species": SPECIES_TAG}) +class HUNER_SPECIES_MIRNA(HUNER_MIRNA): + """HUNER version of the miRNA corpus containing species annotations.""" - return merge_datasets([train_data, test_data]) + def __init__(self, *args, **kwargs): + entity_type_mapping = {"Species": SPECIES_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_DISEASE_MIRNA(HunerDataset): +class 
HUNER_DISEASE_MIRNA(HUNER_MIRNA): """HUNER version of the miRNA corpus containing disease annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/miRNA" - - def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path): - # In the huner split files there is no information whether a given id originates - # from the train or test file of the original corpus - so we have to adapt corpus - # splitting here - return HunerMiRNAHelper.get_mirna_subset(dataset, f"{self.split_url()}.{split}", split_dir) - - def get_corpus_sentence_splitter(self) -> SentenceSplitter: - return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) - - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - download_folder = data_dir / "original" - os.makedirs(str(download_folder), exist_ok=True) - - sentence_separator = " " - if isinstance(self.sentence_splitter, TagSentenceSplitter): - sentence_separator = self.sentence_splitter.tag + def __init__(self, *args, **kwargs): + entity_type_mapping = {"Diseases": DISEASE_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - train_data = MIRNA.download_and_prepare_train(download_folder, sentence_separator) - train_data = filter_and_map_entities(train_data, {"Diseases": DISEASE_TAG}) - test_data = MIRNA.download_and_prepare_test(download_folder, sentence_separator) - test_data = filter_and_map_entities(test_data, {"Diseases": DISEASE_TAG}) +class HUNER_ALL_MIRNA(HUNER_MIRNA): + """HUNER version of the miRNA corpus containing gene, species and disease annotations.""" - return merge_datasets([train_data, test_data]) + def __init__(self, *args, **kwargs): + entity_type_mapping = { + "Genes/Proteins": GENE_TAG, + "Species": SPECIES_TAG, + "Diseases": DISEASE_TAG, + } + super().__init__(entity_type_mapping, *args, **kwargs) class KaewphanCorpusHelper: @@ -1511,10 +1549,11 @@ def parse_dataset(data_dir: Path) -> InternalBioNerDataset: return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) -class HUNER_SPECIES_LOCTEXT(HunerDataset): - """HUNER version of the Loctext corpus containing species annotations.""" +class HUNER_LOCTEXT(HunerDataset): + """HUNER version of the Loctext corpus.""" - def __init__(self, *args, **kwargs) -> None: + def __init__(self, entity_type_mapping, *args, **kwargs): + self.entity_type_mapping = entity_type_mapping super().__init__(*args, **kwargs) @staticmethod @@ -1525,24 +1564,34 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: LOCTEXT.download_dataset(data_dir) dataset = LOCTEXT.parse_dataset(data_dir) - return filter_and_map_entities(dataset, {"species": SPECIES_TAG}) + return filter_and_map_entities(dataset, self.entity_type_mapping) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + + +class HUNER_SPECIES_LOCTEXT(HUNER_LOCTEXT): + """HUNER version of the Loctext corpus containing species annotations.""" + def __init__(self, *args, **kwargs): + entity_type_mapping = {"species": SPECIES_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_GENE_LOCTEXT(HunerDataset): + +class HUNER_GENE_LOCTEXT(HUNER_LOCTEXT): """HUNER version of the Loctext corpus containing protein annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) + def __init__(self, *args, **kwargs): + 
entity_type_mapping = {"protein": GENE_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/loctext" - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - LOCTEXT.download_dataset(data_dir) - dataset = LOCTEXT.parse_dataset(data_dir) +class HUNER_ALL_LOCTEXT(HUNER_LOCTEXT): + """HUNER version of the Loctext corpus containing species and protein annotations.""" - return filter_and_map_entities(dataset, {"protein": GENE_TAG}) + def __init__(self, *args, **kwargs): + entity_type_mapping = {"species": SPECIES_TAG, "protein": GENE_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) @deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") @@ -1834,13 +1883,19 @@ def download_and_parse_dataset(data_dir: Path): if document_text[start:end] != text: raise AssertionError - return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) + entity_types = [SPECIES_TAG] + return InternalBioNerDataset( + documents=documents, + entities_per_document=entities_per_document, + entity_types=entity_types, + ) class HUNER_SPECIES_LINNEAUS(HunerDataset): """HUNER version of the LINNEAUS corpus containing species annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Species": SPECIES_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -1850,6 +1905,9 @@ def split_url() -> str: def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return LINNEAUS.download_and_parse_dataset(data_dir) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + @deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class CDR(ColumnCorpus): @@ -1919,6 +1977,7 @@ class HUNER_DISEASE_CDR(HunerDataset): """HUNER version of the IEPA corpus containing disease annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Disease": DISEASE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -1932,15 +1991,19 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: dev_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_DevelopmentSet.BioC.xml") test_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TestSet.BioC.xml") all_data = merge_datasets([train_data, dev_data, test_data]) - all_data = filter_and_map_entities(all_data, {"Disease": DISEASE_TAG}) + all_data = filter_and_map_entities(all_data, self.entity_type_mapping) return all_data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class HUNER_CHEMICAL_CDR(HunerDataset): """HUNER version of the IEPA corpus containing chemical annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Chemical": CHEMICAL_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -1954,12 +2017,45 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: dev_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_DevelopmentSet.BioC.xml") test_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TestSet.BioC.xml") all_data = merge_datasets([train_data, dev_data, test_data]) - all_data = filter_and_map_entities(all_data, {"Chemical": CHEMICAL_TAG}) + all_data = filter_and_map_entities(all_data, 
self.entity_type_mapping) return all_data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + + +class HUNER_ALL_CDR(HunerDataset): + """HUNER version of the IEPA corpus containing disease and chemical annotations.""" + + def __init__(self, *args, **kwargs): + self.entity_type_mapping = {"Disease": DISEASE_TAG, "Chemical": CHEMICAL_TAG} + super().__init__(*args, **kwargs) + + @staticmethod + def split_url() -> List[str]: + split_urls = [ + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/CDRDisease", + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/CDRChem", + ] + return split_urls + + def to_internal(self, data_dir: Path) -> InternalBioNerDataset: + os.makedirs(str(data_dir), exist_ok=True) + CDR.download_dataset(data_dir) + train_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TrainingSet.BioC.xml") + dev_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_DevelopmentSet.BioC.xml") + test_data = bioc_to_internal(data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TestSet.BioC.xml") + all_data = merge_datasets([train_data, dev_data, test_data]) + + all_data = filter_and_map_entities(all_data, self.entity_type_mapping) + + return all_data + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + -@deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class VARIOME(ColumnCorpus): """Variome corpus as provided by http://corpora.informatik.hu-berlin.de/corpora/brat2bioc/hvp_bioc.xml.zip. @@ -2056,6 +2152,7 @@ class HUNER_GENE_VARIOME(HunerDataset): """HUNER version of the Variome corpus containing gene annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"gene": GENE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2066,15 +2163,19 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: os.makedirs(str(data_dir), exist_ok=True) VARIOME.download_dataset(data_dir) all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml") - all_data = filter_and_map_entities(all_data, {"gene": GENE_TAG}) + all_data = filter_and_map_entities(all_data, self.entity_type_mapping) return all_data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class HUNER_DISEASE_VARIOME(HunerDataset): """HUNER version of the Variome corpus containing disease annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Disorder": DISEASE_TAG, "disease": DISEASE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2085,15 +2186,19 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: os.makedirs(str(data_dir), exist_ok=True) VARIOME.download_dataset(data_dir) all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml") - all_data = filter_and_map_entities(all_data, {"Disorder": DISEASE_TAG, "disease": DISEASE_TAG}) + all_data = filter_and_map_entities(all_data, self.entity_type_mapping) return all_data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class HUNER_SPECIES_VARIOME(HunerDataset): """HUNER version of the Variome corpus containing species annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Living_Beings": SPECIES_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2104,12 +2209,48 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 
os.makedirs(str(data_dir), exist_ok=True) VARIOME.download_dataset(data_dir) all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml") - all_data = filter_and_map_entities(all_data, {"Living_Beings": SPECIES_TAG}) + all_data = filter_and_map_entities(all_data, self.entity_type_mapping) return all_data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + + +class HUNER_ALL_VARIOME(HunerDataset): + """HUNER version of the Variome corpus containing gene, disease and species annotations.""" + + def __init__(self, *args, **kwargs): + self.entity_type_mapping = { + "gene": GENE_TAG, + "Disorder": DISEASE_TAG, + "disease": DISEASE_TAG, + "Living_Beings": SPECIES_TAG, + } + super().__init__(*args, **kwargs) + + @staticmethod + def split_url() -> List[str]: + split_urls = [ + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/variome_gene", + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/variome_disease", + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/variome_species", + ] + return split_urls + + def to_internal(self, data_dir: Path) -> InternalBioNerDataset: + os.makedirs(str(data_dir), exist_ok=True) + VARIOME.download_dataset(data_dir) + all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml") + + all_data = filter_and_map_entities(all_data, self.entity_type_mapping) + + return all_data + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + -@deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class NCBI_DISEASE(ColumnCorpus): """Original NCBI disease corpus containing disease annotations. @@ -2237,13 +2378,19 @@ def parse_input_file(input_file: Path): documents[document_id] = document_text entities_per_document[document_id] = entities - return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) + entity_types = [DISEASE_TAG] + return InternalBioNerDataset( + documents=documents, + entities_per_document=entities_per_document, + entity_types=entity_types, + ) class HUNER_DISEASE_NCBI(HunerDataset): """HUNER version of the NCBI corpus containing disease annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Disease": DISEASE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2259,6 +2406,9 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return merge_datasets([train_data, dev_data, test_data]) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class ScaiCorpus(ColumnCorpus): """Base class to support the SCAI chemicals and disease corpora.""" @@ -2412,6 +2562,16 @@ class HUNER_CHEMICAL_SCAI(HunerDataset): """HUNER version of the SCAI chemicals corpus containing chemical annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = { + "FAMILY": CHEMICAL_TAG, + "TRIVIALVAR": CHEMICAL_TAG, + "PARTIUPAC": CHEMICAL_TAG, + "TRIVIAL": CHEMICAL_TAG, + "ABBREVIATION": CHEMICAL_TAG, + "IUPAC": CHEMICAL_TAG, + "MODIFIER": CHEMICAL_TAG, + "SUM": CHEMICAL_TAG, + } super().__init__(*args, **kwargs) @staticmethod @@ -2422,8 +2582,40 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: original_file = SCAI_CHEMICALS.perform_corpus_download(data_dir) corpus = ScaiCorpus.parse_input_file(original_file) - # Map all entities to chemicals - entity_mapping = { + return filter_and_map_entities(corpus, 
self.entity_type_mapping) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + + +class HUNER_DISEASE_SCAI(HunerDataset): + """HUNER version of the SCAI chemicals corpus containing disease annotations.""" + + def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"DISEASE": DISEASE_TAG, "ADVERSE": DISEASE_TAG} + super().__init__(*args, **kwargs) + + @staticmethod + def split_url() -> str: + return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/scai_disease" + + def to_internal(self, data_dir: Path) -> InternalBioNerDataset: + original_file = SCAI_DISEASE.perform_corpus_download(data_dir) + corpus = ScaiCorpus.parse_input_file(original_file) + + return filter_and_map_entities(corpus, self.entity_type_mapping) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + + +class HUNER_ALL_SCAI(HunerDataset): + """HUNER version of the SCAI chemicals corpus containing chemical and disease annotations.""" + + def __init__(self, *args, **kwargs): + self.entity_type_mapping = { + "DISEASE": DISEASE_TAG, + "ADVERSE": DISEASE_TAG, "FAMILY": CHEMICAL_TAG, "TRIVIALVAR": CHEMICAL_TAG, "PARTIUPAC": CHEMICAL_TAG, @@ -2433,28 +2625,24 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: "MODIFIER": CHEMICAL_TAG, "SUM": CHEMICAL_TAG, } - - return filter_and_map_entities(corpus, entity_mapping) - - -class HUNER_DISEASE_SCAI(HunerDataset): - """HUNER version of the SCAI chemicals corpus containing chemical annotations.""" - - def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/scai_disease" + def split_url() -> List[str]: + split_urls = [ + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/scai_chemicals", + "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/scai_disease", + ] + return split_urls def to_internal(self, data_dir: Path) -> InternalBioNerDataset: original_file = SCAI_DISEASE.perform_corpus_download(data_dir) corpus = ScaiCorpus.parse_input_file(original_file) - # Map all entities to disease - entity_mapping = {"DISEASE": DISEASE_TAG, "ADVERSE": DISEASE_TAG} + return filter_and_map_entities(corpus, self.entity_type_mapping) - return filter_and_map_entities(corpus, entity_mapping) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping @deprecated(version="0.13.0", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") @@ -2562,6 +2750,7 @@ class HUNER_GENE_OSIRIS(HunerDataset): """HUNER version of the OSIRIS corpus containing (only) gene annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"ge": GENE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2572,8 +2761,10 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: original_file = OSIRIS.download_dataset(data_dir) corpus = OSIRIS.parse_dataset(original_file / "OSIRIScorpusv02") - entity_type_mapping = {"ge": GENE_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) + return filter_and_map_entities(corpus, self.entity_type_mapping) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping class S800(ColumnCorpus): @@ -2658,6 +2849,7 @@ class HUNER_SPECIES_S800(HunerDataset): """HUNER version of the S800 corpus containing species annotations.""" def 
__init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Species": SPECIES_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2667,10 +2859,13 @@ def split_url() -> str: def to_internal(self, data_dir: Path) -> InternalBioNerDataset: S800.download_dataset(data_dir) data = S800.parse_dataset(data_dir) - data = filter_and_map_entities(data, {"Species": SPECIES_TAG}) + data = filter_and_map_entities(data, self.entity_type_mapping) return data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class GPRO(ColumnCorpus): """Original GPRO corpus containing gene annotations. @@ -2783,13 +2978,19 @@ def parse_input_file(text_file: Path, ann_file: Path) -> InternalBioNerDataset: document_text = documents[document_id] assert columns[4] == document_text[start:end] - return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) + entity_types = [GENE_TAG] + return InternalBioNerDataset( + documents=documents, + entities_per_document=entities_per_document, + entity_types=entity_types, + ) class HUNER_GENE_GPRO(HunerDataset): """HUNER version of the GPRO corpus containing gene annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Gene": GENE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2809,6 +3010,9 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return merge_datasets([train_data, dev_data]) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class DECA(ColumnCorpus): """Original DECA corpus containing gene annotations. @@ -2892,13 +3096,19 @@ def parse_corpus(text_dir: Path, gold_file: Path) -> InternalBioNerDataset: document_text = documents[document_id] assert document_text[start:end] == columns[3] - return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) + entity_types = [GENE_TAG] + return InternalBioNerDataset( + documents=documents, + entities_per_document=entities_per_document, + entity_types=entity_types, + ) class HUNER_GENE_DECA(HunerDataset): """HUNER version of the DECA corpus containing gene annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Gene": GENE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -2912,6 +3122,9 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return DECA.parse_corpus(text_dir, gold_file) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class FSU(ColumnCorpus): """Original FSU corpus containing protein and derived annotations. 
@@ -2961,6 +3174,7 @@ def download_corpus(cls, data_dir: Path) -> Path: def parse_corpus(corpus_dir: Path, sentence_separator: str) -> InternalBioNerDataset: documents = {} entities_per_document = {} + entity_types_set = set() for subcorpus in corpus_dir.iterdir(): if not subcorpus.is_dir(): @@ -3026,25 +3240,33 @@ def parse_corpus(corpus_dir: Path, sentence_separator: str) -> InternalBioNerDat entities = [] sent_offset = 0 for sent, sent_entities in zip(sentence_texts, pre_entities): - entities += [ - Entity( - (start + sent_offset, end + sent_offset), - ent_type, - ) - for (start, end, ent_type) in sent_entities - ] + for start, end, ent_type in sent_entities: + entities.append(Entity((start + sent_offset, end + sent_offset), ent_type)) + entity_types_set.add(ent_type) sent_offset += len(sent) + len(sentence_separator) documents[document_id] = document entities_per_document[document_id] = entities - return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) + entity_types = list(entity_types_set) + return InternalBioNerDataset( + documents=documents, + entities_per_document=entities_per_document, + entity_types=entity_types, + ) class HUNER_GENE_FSU(HunerDataset): """HUNER version of the FSU corpus containing (only) gene annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = { + "protein": GENE_TAG, + "protein_familiy_or_group": GENE_TAG, + "protein_complex": GENE_TAG, + "protein_variant": GENE_TAG, + "protein_enum": GENE_TAG, + } super().__init__(*args, **kwargs) @staticmethod @@ -3063,14 +3285,10 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: corpus = FSU.parse_corpus(corpus_dir, sentence_separator) - entity_type_mapping = { - "protein": GENE_TAG, - "protein_familiy_or_group": GENE_TAG, - "protein_complex": GENE_TAG, - "protein_variant": GENE_TAG, - "protein_enum": GENE_TAG, - } - return filter_and_map_entities(corpus, entity_type_mapping) + return filter_and_map_entities(corpus, self.entity_type_mapping) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping class CRAFT(ColumnCorpus): @@ -3556,6 +3774,18 @@ class HUNER_CHEMICAL_CEMP(HunerDataset): """HUNER version of the CEMP corpus containing chemical annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = { + x: CHEMICAL_TAG + for x in [ + "ABBREVIATION", + "FAMILY", + "FORMULA", + "IDENTIFIERS", + "MULTIPLE", + "SYSTEMATIC", + "TRIVIAL", + ] + } super().__init__(*args, **kwargs) @staticmethod @@ -3574,19 +3804,10 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: dev_data = CEMP.parse_input_file(dev_text_file, dev_ann_file) dataset = merge_datasets([train_data, dev_data]) - entity_type_mapping = { - x: CHEMICAL_TAG - for x in [ - "ABBREVIATION", - "FAMILY", - "FORMULA", - "IDENTIFIERS", - "MULTIPLE", - "SYSTEMATIC", - "TRIVIAL", - ] - } - return filter_and_map_entities(dataset, entity_type_mapping) + return filter_and_map_entities(dataset, self.entity_type_mapping) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") @@ -3708,10 +3929,11 @@ def get_entities(f): return entities -class HUNER_CHEMICAL_CHEBI(HunerDataset): - """HUNER version of the CHEBI corpus containing chemical annotations.""" +class HUNER_CHEBI(HunerDataset): + """HUNER version of the CHEBI corpus.""" - def __init__(self, *args, **kwargs) 
-> None: + def __init__(self, entity_type_mapping, *args, **kwargs): + self.entity_type_mapping = entity_type_mapping super().__init__(*args, **kwargs) @staticmethod @@ -3721,42 +3943,46 @@ def split_url() -> str: def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDataset: corpus_dir = CHEBI.download_dataset(data_dir) dataset = CHEBI.parse_dataset(corpus_dir, annotator=annotator) - entity_type_mapping = {"Chemical": CHEMICAL_TAG} - return filter_and_map_entities(dataset, entity_type_mapping) + return filter_and_map_entities(dataset, self.entity_type_mapping) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping -class HUNER_GENE_CHEBI(HunerDataset): - """HUNER version of the CHEBI corpus containing gene annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) +class HUNER_CHEMICAL_CHEBI(HUNER_CHEBI): + """HUNER version of the CHEBI corpus containing chemical annotations.""" - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chebi_new" + def __init__(self, *args, **kwargs): + entity_type_mapping = {"Chemical": CHEMICAL_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDataset: - corpus_dir = CHEBI.download_dataset(data_dir) - dataset = CHEBI.parse_dataset(corpus_dir, annotator=annotator) + +class HUNER_GENE_CHEBI(HUNER_CHEBI): + """HUNER version of the CHEBI corpus containing gene annotations.""" + + def __init__(self, *args, **kwargs): entity_type_mapping = {"Protein": GENE_TAG} - return filter_and_map_entities(dataset, entity_type_mapping) + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_SPECIES_CHEBI(HunerDataset): +class HUNER_SPECIES_CHEBI(HUNER_CHEBI): """HUNER version of the CHEBI corpus containing species annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) + def __init__(self, *args, **kwargs): + entity_type_mapping = {"Species": SPECIES_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chebi_new" - def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDataset: - corpus_dir = CHEBI.download_dataset(data_dir) - dataset = CHEBI.parse_dataset(corpus_dir, annotator=annotator) - entity_type_mapping = {"Species": SPECIES_TAG} - return filter_and_map_entities(dataset, entity_type_mapping) +class HUNER_ALL_CHEBI(HUNER_CHEBI): + """HUNER version of the CHEBI corpus containing chemical, gene and species annotations.""" + + def __init__(self, *args, **kwargs): + entity_type_mapping = { + "Chemical": CHEMICAL_TAG, + "Protein": GENE_TAG, + "Species": SPECIES_TAG, + } + super().__init__(entity_type_mapping, *args, **kwargs) class BioNLPCorpus(ColumnCorpus): @@ -3819,6 +4045,7 @@ def download_corpus(data_folder: Path) -> Tuple[Path, Path, Path]: def parse_input_files(input_folder: Path) -> InternalBioNerDataset: documents = {} entities_per_document = {} + entity_types_set = set() for txt_file in input_folder.glob("*.txt"): name = txt_file.with_suffix("").name @@ -3835,9 +4062,15 @@ def parse_input_files(input_folder: Path) -> InternalBioNerDataset: if fields[0].startswith("T"): ann_type, start, end = fields[1].split() entities.append(Entity(char_span=(int(start), int(end)), entity_type=ann_type)) + 
entity_types_set.add(ann_type) entities_per_document[name] = entities + entity_types = list(entity_types_set) - return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) + return InternalBioNerDataset( + documents=documents, + entities_per_document=entities_per_document, + entity_types=entity_types, + ) @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") @@ -4484,14 +4717,12 @@ def parse_corpus(corpus_dir: Path) -> InternalBioNerDataset: return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) -class HUNER_CHEMICAL_CRAFT_V4(HunerDataset): +class HUNER_CRAFT_V4(HunerDataset): """HUNER version of the CRAFT corpus containing (only) chemical annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) + def __init__(self, entity_type_mapping, *args, **kwargs): + self.entity_type_mapping = entity_type_mapping + super().__init__(*args, **kwargs) @staticmethod def split_url() -> str: @@ -4501,58 +4732,52 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: corpus_dir = CRAFT_V4.download_corpus(data_dir) corpus = CRAFT_V4.parse_corpus(corpus_dir) - entity_type_mapping = {"chebi": CHEMICAL_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) + return filter_and_map_entities(corpus, self.entity_type_mapping) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping -class HUNER_GENE_CRAFT_V4(HunerDataset): - """HUNER version of the CRAFT corpus containing (only) gene annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) +class HUNER_CHEMICAL_CRAFT_V4(HUNER_CRAFT_V4): + """HUNER version of the CRAFT corpus containing (only) chemical annotations.""" - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft_v4" + def __init__(self, *args, **kwargs): + entity_type_mapping = {"chebi": CHEMICAL_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - corpus_dir = CRAFT_V4.download_corpus(data_dir) - corpus = CRAFT_V4.parse_corpus(corpus_dir) +class HUNER_GENE_CRAFT_V4(HUNER_CRAFT_V4): + """HUNER version of the CRAFT corpus containing (only) gene annotations.""" + + def __init__(self, *args, **kwargs): entity_type_mapping = {"pr": GENE_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_SPECIES_CRAFT_V4(HunerDataset): +class HUNER_SPECIES_CRAFT_V4(HUNER_CRAFT_V4): """HUNER version of the CRAFT corpus containing (only) species annotations.""" - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) + def __init__(self, *args, **kwargs): + entity_type_mapping = {"ncbitaxon": SPECIES_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft_v4" - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - corpus_dir = CRAFT_V4.download_corpus(data_dir) - corpus = CRAFT_V4.parse_corpus(corpus_dir) +class HUNER_ALL_CRAFT_V4(HUNER_CRAFT_V4): + """HUNER version of the CRAFT corpus containing chemical, gene and species annotations.""" - entity_type_mapping = {"ncbitaxon": SPECIES_TAG} - return 
filter_and_map_entities(corpus, entity_type_mapping) + def __init__(self, *args, **kwargs): + entity_type_mapping = { + "chebi": CHEMICAL_TAG, + "pr": GENE_TAG, + "ncbitaxon": SPECIES_TAG, + } + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_CHEMICAL_BIONLP2013_CG(HunerDataset): - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) +class HUNER_BIONLP2013_CG(HunerDataset): + def __init__(self, entity_type_mapping, *args, **kwargs): + self.entity_type_mapping = entity_type_mapping + super().__init__(*args, **kwargs) @staticmethod def split_url() -> str: @@ -4565,74 +4790,48 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: test_corpus = BioNLPCorpus.parse_input_files(test_dir) corpus = merge_datasets([train_corpus, dev_corpus, test_corpus]) - entity_type_mapping = {"Simple_chemical": CHEMICAL_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) + return filter_and_map_entities(corpus, self.entity_type_mapping) + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping -class HUNER_DISEASE_BIONLP2013_CG(HunerDataset): - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg" +class HUNER_CHEMICAL_BIONLP2013_CG(HUNER_BIONLP2013_CG): + def __init__(self, *args, **kwargs): + entity_type_mapping = { + "Simple_chemical": CHEMICAL_TAG, + "Amino_acid": CHEMICAL_TAG, + } + super().__init__(entity_type_mapping, *args, **kwargs) - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir) - train_corpus = BioNLPCorpus.parse_input_files(train_dir) - dev_corpus = BioNLPCorpus.parse_input_files(dev_dir) - test_corpus = BioNLPCorpus.parse_input_files(test_dir) - corpus = merge_datasets([train_corpus, dev_corpus, test_corpus]) +class HUNER_DISEASE_BIONLP2013_CG(HUNER_BIONLP2013_CG): + def __init__(self, *args, **kwargs): entity_type_mapping = {"Cancer": DISEASE_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) - + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_GENE_BIONLP2013_CG(HunerDataset): - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) - - @staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg" - - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir) - train_corpus = BioNLPCorpus.parse_input_files(train_dir) - dev_corpus = BioNLPCorpus.parse_input_files(dev_dir) - test_corpus = BioNLPCorpus.parse_input_files(test_dir) - corpus = merge_datasets([train_corpus, dev_corpus, test_corpus]) +class HUNER_GENE_BIONLP2013_CG(HUNER_BIONLP2013_CG): + def __init__(self, *args, **kwargs): entity_type_mapping = {"Gene_or_gene_product": GENE_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) + super().__init__(entity_type_mapping, *args, **kwargs) -class HUNER_SPECIES_BIONLP2013_CG(HunerDataset): - def __init__(self, *args, **kwargs) -> None: - super().__init__( - *args, - **kwargs, - ) +class HUNER_SPECIES_BIONLP2013_CG(HUNER_BIONLP2013_CG): + def __init__(self, *args, **kwargs): + entity_type_mapping = {"Organism": SPECIES_TAG} + super().__init__(entity_type_mapping, *args, **kwargs) - 
@staticmethod - def split_url() -> str: - return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg" - def to_internal(self, data_dir: Path) -> InternalBioNerDataset: - train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir) - train_corpus = BioNLPCorpus.parse_input_files(train_dir) - dev_corpus = BioNLPCorpus.parse_input_files(dev_dir) - test_corpus = BioNLPCorpus.parse_input_files(test_dir) - corpus = merge_datasets([train_corpus, dev_corpus, test_corpus]) - - entity_type_mapping = {"Organism": SPECIES_TAG} - return filter_and_map_entities(corpus, entity_type_mapping) +class HUNER_ALL_BIONLP2013_CG(HUNER_BIONLP2013_CG): + def __init__(self, *args, **kwargs): + entity_type_mapping = { + "Simple_chemical": CHEMICAL_TAG, + "Cancer": DISEASE_TAG, + "Gene_or_gene_product": GENE_TAG, + "Organism": SPECIES_TAG, + } + super().__init__(entity_type_mapping, *args, **kwargs) class AZDZ(ColumnCorpus): @@ -4801,6 +5000,7 @@ class HUNER_DISEASE_PDR(HunerDataset): """PDR Dataset with only Disease annotations.""" def __init__(self, *args, **kwargs) -> None: + self.entity_type_mapping = {"Disease": DISEASE_TAG} super().__init__(*args, **kwargs) @staticmethod @@ -4810,10 +5010,13 @@ def split_url() -> str: def to_internal(self, data_dir: Path) -> InternalBioNerDataset: corpus_folder = PDR.download_corpus(data_dir) corpus_data = brat_to_internal(corpus_folder, ann_file_suffixes=[".ann", ".ann2"]) - corpus_data = filter_and_map_entities(corpus_data, {"Disease": DISEASE_TAG}) + corpus_data = filter_and_map_entities(corpus_data, self.entity_type_mapping) return corpus_data + def get_entity_type_mapping(self) -> Optional[Dict]: + return self.entity_type_mapping + class HunerMultiCorpus(MultiCorpus): """Base class to build the union of all HUNER data sets considering a particular entity type.""" @@ -4834,32 +5037,48 @@ def entity_type_predicate(member): corpus = constructor_func(sentence_splitter=sentence_splitter) self.huner_corpora.append(corpus) - except (CompressionError, ExtractError, HeaderError, ReadError, StreamError, TarError): + except ( + CompressionError, + ExtractError, + HeaderError, + ReadError, + StreamError, + TarError, + ): logger.exception( - f"Error while processing Tar file from corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False + f"Error while processing Tar file from corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, ) except (BadZipFile, LargeZipFile): logger.exception( - f"Error while processing Zip file from corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False + f"Error while processing Zip file from corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, ) except OSError: logger.exception( - f"Error while downloading data for corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False + f"Error while downloading data for corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, ) except shutil.Error: logger.exception( - f"Error while copying data files for corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False + f"Error while copying data files for corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, ) except etree.LxmlError: logger.exception( - f"Error while processing XML file from corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False + f"Error while processing XML file from corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, ) except json.JSONDecodeError: logger.exception( - f"Error while processing JSON file from corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False + f"Error while 
processing JSON file from corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, ) except (FileNotFoundError, OSError, ValueError): - logger.exception(f"Error while preparing corpus {name}:\n{sys.exc_info()[1]}\n\n", exc_info=False) + logger.exception( + f"Error while preparing corpus {name}:\n{sys.exc_info()[1]}\n\n", + exc_info=False, + ) super().__init__(corpora=self.huner_corpora, name=f"HUNER-{entity_type}") @@ -4949,9 +5168,9 @@ def __init__( self.sentence_splitter = sentence_splitter if sentence_splitter else SciSpacySentenceSplitter() dataset_dir_name = self.build_corpus_directory_name(dataset_name) - data_folder = base_path / dataset_dir_name / self.sentence_splitter.name + data_folder = base_path / dataset_dir_name - train_file = data_folder / "train.conll" + train_file = data_folder / (self.sentence_splitter.name + "_train.conll") # Download data if necessary # Some datasets in BigBio only have train or test splits, not both @@ -4986,18 +5205,73 @@ def __init__( type_mapping = self.get_entity_type_mapping() if type_mapping: splits = {split: filter_and_map_entities(dataset, type_mapping) for split, dataset in splits.items()} + else: + logger.warning( + f"No entity type mapping found for {dataset_name}. Check CONLL files for task descriptions." + ) conll_writer = CoNLLWriter(sentence_splitter=self.sentence_splitter) conll_writer.process_dataset(splits, data_folder) - super().__init__(data_folder, columns, in_memory=in_memory, comment_symbol="#", sample_missing_splits=True) + super().__init__( + data_folder, + columns, + in_memory=in_memory, + comment_symbol="#", + sample_missing_splits=True, + ) def get_entity_type_mapping(self) -> Optional[Dict]: """Return the mapping of entity type given in the dataset to canonical types. Note, if a entity type is not present in the map it is discarded. """ - return None + # return None + # TODO: Add entity type mapping for all remaining bigbio datasets not in HunFlair? 
+ return { + "chemical": "chemical", + "['chemical']": "chemical", + "simple_chemical": "chemical", + "cancer": "disease", + "disease": "disease", + "['disease']": "disease", + "gene": "gene", + "['gene']": "gene", + "gene_or_gene_product": "gene", + "species": "species", + "['species']": "species", + "cellline": "cell_line", + "cell_line": "cell_line", + "protein": "gene", + # "simple_chemical": "chemical", # BioNLP ST 2013 CG + "amino_acid": "chemical", # BioNLP ST 2013 CG + # "cancer": "disease", # BioNLP ST 2013 CG + # "gene_or_gene_product": "gene", # BioNLP ST 2013 CG + "organism": "species", # BioNLP ST 2013 CG + "pathological_formation": "disease", # BioNLP ST 2013 CG + # "gene": "gene", # NLM Gene + "generif": "gene", # NLM Gene + "stargene": "gene", # NLM Gene + "domain": "gene", # NLM Gene + "other": "gene", # NLM Gene + # "chemical": "chemical", # NLM Chem + "diseaseclass": "disease", # NCBI Disease + "specificdisease": "disease", # NCBI Disease + "modifier": "disease", # NCBI Disease + "geneprotein": "gene", # Cell Finder + # "cellline": "cell_line", # Cell Finder + # "species": "species", # Cell Finder + "geneorgeneproduct": "gene", # BioRED + "chemicalentity": "chemical", # BioRED + "organismtaxon": "species", # BioRED + "diseaseorphenotypicfeature": "disease", # BioRED + "pr": "gene", # CRAFT (local) + "chebi": "chemical", # CRAFT (local) + "ncbitaxon": "species", # CRAFT (local) + # "protein": "gene", # BioID + "mondo": "disease", # CRAFT (local) + "drug": "chemical", # BERNv2 + } def build_corpus_directory_name(self, dataset_name: str) -> str: """Builds the directory name for the given data set.""" @@ -5005,8 +5279,9 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset: """Converts a dataset given in hugging datasets format to our internal corpus representation.""" - id_to_text = {} - id_to_entities: Dict[str, List] = {} + id_to_text: Dict[str, str] = {} + id_to_entities: Dict[str, list] = {} + entity_type_set = set() for document in dataset[split]: document_id = document["document_id"] passage_offsets = [] @@ -5032,9 +5307,13 @@ def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset: # Adapt entity offsets according to passage offsets entity_offset = entity["offsets"][0] - entity_offset = (entity_offset[0] - passage_offset[0], entity_offset[1] - passage_offset[0]) + entity_offset = ( + entity_offset[0] - passage_offset[0], + entity_offset[1] - passage_offset[0], + ) id_to_entities[passage_id].append(Entity(char_span=entity_offset, entity_type=entity["type"])) + entity_type_set.add(entity["type"]) # FIXME: This is just for debugging purposes # passage_text = id_to_text[passage_id] @@ -5043,15 +5322,25 @@ def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset: # if doc_text != mention_text: # print(f"Annotation error ({document['document_id']}) - Doc: {doc_text} vs. Mention: {mention_text}") - return InternalBioNerDataset(documents=id_to_text, entities_per_document=id_to_entities) + entity_types = list(entity_type_set) + return InternalBioNerDataset( + documents=id_to_text, + entities_per_document=id_to_entities, + entity_types=entity_types, + ) - def bin_search_passage(self, passages: List[Tuple[str, List[Tuple[int, int]]]], low: int, high: int, entity: Dict): - """Helper methods to find the passage to a given entity mention inclusive offset. 
+ def bin_search_passage( + self, + passages: List[Tuple[str, List[Tuple[int, int]]]], + low: int, + high: int, + entity: Dict, + ): + """Helper methods to find the passage to a given entity mention (incl. offset). The implementation uses binary search to find the passage in the ordered sequence passages. """ - # Check base case - if low > high: + if low > high: # Check base case raise NotImplementedError("There was a mistake concerning the lower and upper bound.") # Get element in the middle @@ -5093,7 +5382,13 @@ def __init__( ) def get_entity_type_mapping(self) -> Optional[Dict]: - return {"Gene": GENE_TAG, "GENERIF": GENE_TAG, "STARGENE": GENE_TAG, "Domain": GENE_TAG, "Other": GENE_TAG} + return { + "Gene": GENE_TAG, + "GENERIF": GENE_TAG, + "STARGENE": GENE_TAG, + "Domain": GENE_TAG, + "Other": GENE_TAG, + } def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() @@ -5153,6 +5448,17 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() +class HUNER_ALL_DRUGPROT(BIGBIO_NER_CORPUS): + def __init__(self, *args, **kwargs): + super().__init__(*args, dataset_name="drugprot", **kwargs) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return {"GENE-N": GENE_TAG, "GENE-Y": GENE_TAG, "CHEMICAL": CHEMICAL_TAG} + + def build_corpus_directory_name(self, dataset_name: str) -> str: + return self.__class__.__name__.lower() + + class HUNER_GENE_BIORED(BIGBIO_NER_CORPUS): def __init__( self, @@ -5288,6 +5594,23 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() +class HUNER_ALL_BIORED(BIGBIO_NER_CORPUS): + def __init__(self, *args, **kwargs): + super().__init__(*args, dataset_name="biored", **kwargs) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return { + "GeneOrGeneProduct": GENE_TAG, + "ChemicalEntity": CHEMICAL_TAG, + "DiseaseOrPhenotypicFeature": DISEASE_TAG, + "OrganismTaxon": SPECIES_TAG, + "CellLine": CELL_LINE_TAG, + } + + def build_corpus_directory_name(self, dataset_name: str) -> str: + return self.__class__.__name__.lower() + + class HUNER_GENE_CPI(BIGBIO_NER_CORPUS): def __init__( self, @@ -5342,6 +5665,17 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() +class HUNER_ALL_CPI(BIGBIO_NER_CORPUS): + def __init__(self, *args, **kwargs): + super().__init__(*args, dataset_name="cpi", **kwargs) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return {"protein": GENE_TAG, "compound": CHEMICAL_TAG} + + def build_corpus_directory_name(self, dataset_name: str) -> str: + return self.__class__.__name__.lower() + + class HUNER_GENE_BIONLP_ST_2013_PC(BIGBIO_NER_CORPUS): def __init__( self, @@ -5396,6 +5730,21 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() +class HUNER_ALL_BIONLP_ST_2013_PC(BIGBIO_NER_CORPUS): + def __init__(self, *args, **kwargs): + super().__init__(*args, dataset_name="bionlp_st_2013_pc", **kwargs) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return { + "Gene_or_gene_product": GENE_TAG, + "Complex": GENE_TAG, + "Simple_chemical": CHEMICAL_TAG, + } + + def build_corpus_directory_name(self, dataset_name: str) -> str: + return self.__class__.__name__.lower() + + class HUNER_GENE_BIONLP_ST_2013_GE(BIGBIO_NER_CORPUS): def __init__( self, @@ -5531,6 +5880,21 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() +class 
HUNER_ALL_BIONLP_ST_2011_ID(BIGBIO_NER_CORPUS): + def __init__(self, *args, **kwargs): + super().__init__(*args, dataset_name="bionlp_st_2011_id", **kwargs) + + def get_entity_type_mapping(self) -> Optional[Dict]: + return { + "Protein": GENE_TAG, + "Chemical": CHEMICAL_TAG, + "Organism": SPECIES_TAG, + } + + def build_corpus_directory_name(self, dataset_name: str) -> str: + return self.__class__.__name__.lower() + + class HUNER_GENE_BIONLP_ST_2011_REL(BIGBIO_NER_CORPUS): def __init__( self, @@ -5714,7 +6078,26 @@ def __init__( ) def get_entity_type_mapping(self) -> Optional[Dict]: - return {"cell": CELL_LINE_TAG} + # TODO whether cell or cell line is the correct tag + return {"cellline": CELL_LINE_TAG} + + def build_corpus_directory_name(self, dataset_name: str) -> str: + return self.__class__.__name__.lower() + + +class HUNER_ALL_BIOID(BIGBIO_NER_CORPUS): + def __init__(self, *args, **kwargs): + super().__init__(*args, dataset_name="bioid", **kwargs) + + def get_entity_type_mapping(self) -> Optional[Dict]: + # TODO whether cell or cell line is the correct tag + return { + "gene": GENE_TAG, + "protein": GENE_TAG, + "chemical": CHEMICAL_TAG, + "species": SPECIES_TAG, + "cellline": CELL_LINE_TAG, + } def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() @@ -5834,23 +6217,11 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: class HUNER_GENE_TMVAR_V3(BIGBIO_NER_CORPUS): - def __init__( - self, - base_path: Optional[Union[str, Path]] = None, - in_memory: bool = True, - sentence_splitter: Optional[SentenceSplitter] = None, - train_split_name: Optional[str] = None, - dev_split_name: Optional[str] = None, - test_split_name: Optional[str] = None, - ) -> None: + def __init__(self, *args, **kwargs): super().__init__( + *args, dataset_name="tmvar_v3", - base_path=base_path, - in_memory=in_memory, - sentence_splitter=sentence_splitter, - train_split_name=train_split_name, - dev_split_name=dev_split_name, - test_split_name=test_split_name, + **kwargs, ) def get_entity_type_mapping(self) -> Optional[Dict]: @@ -5860,55 +6231,5 @@ def build_corpus_directory_name(self, dataset_name: str) -> str: return self.__class__.__name__.lower() -class HUNER_SPECIES_TMVAR_V3(BIGBIO_NER_CORPUS): - def __init__( - self, - base_path: Optional[Union[str, Path]] = None, - in_memory: bool = True, - sentence_splitter: Optional[SentenceSplitter] = None, - train_split_name: Optional[str] = None, - dev_split_name: Optional[str] = None, - test_split_name: Optional[str] = None, - ) -> None: - super().__init__( - dataset_name="tmvar_v3", - base_path=base_path, - in_memory=in_memory, - sentence_splitter=sentence_splitter, - train_split_name=train_split_name, - dev_split_name=dev_split_name, - test_split_name=test_split_name, - ) - - def get_entity_type_mapping(self) -> Optional[Dict]: - return {"['Species']": SPECIES_TAG} - - def build_corpus_directory_name(self, dataset_name: str) -> str: - return self.__class__.__name__.lower() - - -class HUNER_CELL_LINE_TMVAR_V3(BIGBIO_NER_CORPUS): - def __init__( - self, - base_path: Optional[Union[str, Path]] = None, - in_memory: bool = True, - sentence_splitter: Optional[SentenceSplitter] = None, - train_split_name: Optional[str] = None, - dev_split_name: Optional[str] = None, - test_split_name: Optional[str] = None, - ) -> None: - super().__init__( - dataset_name="tmvar_v3", - base_path=base_path, - in_memory=in_memory, - sentence_splitter=sentence_splitter, - train_split_name=train_split_name, - 
dev_split_name=dev_split_name, - test_split_name=test_split_name, - ) - - def get_entity_type_mapping(self) -> Optional[Dict]: - return {"['CellLine']": CELL_LINE_TAG} - - def build_corpus_directory_name(self, dataset_name: str) -> str: - return self.__class__.__name__.lower() +if __name__ == "__main__": + HUNER_ALL_BIONLP_ST_2013_PC(base_path="/home/tmp/hunflair/tmp") diff --git a/flair/models/prefixed_tagger.py b/flair/models/prefixed_tagger.py new file mode 100644 index 000000000..b8c01c50a --- /dev/null +++ b/flair/models/prefixed_tagger.py @@ -0,0 +1,319 @@ +import logging +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +import torch +from torch.utils.data import Dataset + +import flair.data +from flair.data import Corpus, Sentence, Token +from flair.datasets import DataLoader, FlairDatapointDataset +from flair.models import SequenceTagger + + +class PrefixedSentence(Sentence): + """An PrefixedSentence expresses that a sentence is augmented and compatible with the PrefixedSequenceTagger. + + For inference, i.e. `predict` and `evaluate`, the PrefixedSequenceTagger internally encodes the sentences. + Therefore, these functions work with the regular flair sentence objects. + """ + + +class SentenceAugmentationStrategy(ABC): + """Strategy to augment a sentence with additional information or instructions.""" + + @abstractmethod + def augment_sentence( + self, sentence: Sentence, annotation_layers: Optional[Union[str, List[str]]] = None + ) -> PrefixedSentence: + """Augments the given sentence text with additional instructions for working / predicting the task on the given annotations. + + Args: + sentence: The sentence to be augmented + annotation_layers: Annotations which should be predicted. + """ + ... + + @abstractmethod + def apply_predictions( + self, + augmented_sentence: Sentence, + original_sentence: Sentence, + source_annotation_layer: str, + target_annotation_layer: str, + ): + """Transfers the predictions made on the augmented sentence to the original one. + + Args: + augmented_sentence: The augmented sentence instance + original_sentence: The original sentence before the augmentation was applied + source_annotation_layer: Annotation layer of the augmented sentence in which the predictions are stored. + target_annotation_layer: Annotation layer in which the predictions should be stored in the original sentence. + """ + ... + + @abstractmethod + def _get_state_dict(self): + """Returns the state dict for the given augmentation strategy.""" + ... + + @classmethod + def _init_strategy_with_state_dict(cls, state, **kwargs): + """Initializes the strategy from the given state.""" + + def augment_dataset( + self, dataset: Dataset[Sentence], annotation_layers: Optional[Union[str, List[str]]] = None + ) -> FlairDatapointDataset[PrefixedSentence]: + """Transforms a dataset into a dataset containing augmented sentences specific to the `PrefixedSequenceTagger`. + + The returned dataset is stored in memory. For more information on the internal sentence transformation + procedure, see the :class:`PrefixedSequenceTagger` architecture. + + Args: + dataset: A dataset of sentences to augment + annotation_layers: Annotations which should be predicted. 
+
+        Returns: A dataset of augmented sentences specific to the `PrefixedSequenceTagger`
+        """
+        data_loader: DataLoader = DataLoader(dataset, batch_size=1)
+        original_sentences: List[Sentence] = [batch[0] for batch in iter(data_loader)]
+
+        augmented_sentences = [self.augment_sentence(sentence, annotation_layers) for sentence in original_sentences]
+
+        return FlairDatapointDataset(augmented_sentences)
+
+    def augment_corpus(
+        self, corpus: Corpus[Sentence], annotation_layers: Optional[Union[str, List[str]]] = None
+    ) -> Corpus[PrefixedSentence]:
+        """Transforms a corpus into a corpus containing augmented sentences specific to the `PrefixedSequenceTagger`.
+
+        The splits of the returned corpus are stored in memory. For more information on the internal
+        sentence augmentation procedure, see the :class:`PrefixedSequenceTagger`.
+
+        Args:
+            corpus: A corpus of sentences to augment
+            annotation_layers: Annotations which should be predicted.
+
+        Returns: A corpus of augmented sentences specific to the `PrefixedSequenceTagger`
+        """
+        return Corpus(
+            train=self.augment_dataset(corpus.train, annotation_layers) if corpus.train is not None else None,
+            dev=self.augment_dataset(corpus.dev, annotation_layers) if corpus.dev is not None else None,
+            test=self.augment_dataset(corpus.test, annotation_layers) if corpus.test is not None else None,
+            name=corpus.name,
+            # If we sample missing splits, the augmented sentences that correspond to the same original sentences
+            # may get distributed into different splits. For training purposes, this is always undesired.
+            sample_missing_splits=False,
+        )
+
+
+class EntityTypeTaskPromptAugmentationStrategy(SentenceAugmentationStrategy):
+    """Augmentation strategy that prefixes each sentence with a task description specifying the entity types to be tagged.
+
+    This approach is inspired by the paper by Luo et al.:
+    AIONER: All-in-one scheme-based biomedical named entity recognition using deep learning
+    https://arxiv.org/abs/2211.16944
+
+    Example:
+        "[ Tag gene and disease ] Mutations in the TP53 tumour suppressor gene are found in ~50% of human cancers"
+    """
+
+    def __init__(self, entity_types: List[str]):
+        if len(entity_types) <= 0:
+            raise AssertionError("At least one entity type must be given")
+
+        self.entity_types = entity_types
+        self.task_prompt = self._build_tag_prompt_prefix(entity_types)
+
+    def augment_sentence(
+        self, sentence: Sentence, annotation_layers: Optional[Union[str, List[str]]] = None
+    ) -> PrefixedSentence:
+        # Prepend the task description prompt to the sentence text
+        augmented_sentence = PrefixedSentence(
+            text=self.task_prompt + [t.text for t in sentence.tokens],
+            use_tokenizer=False,
+            language_code=sentence.language_code,
+            start_position=sentence.start_position,
+        )
+
+        # Make sure annotation_layers is a list
+        if annotation_layers and isinstance(annotation_layers, str):
+            annotation_layers = [annotation_layers]
+
+        # Reconstruct all annotations from the original sentence (necessary for learning classifiers)
+        layers = annotation_layers if annotation_layers else sentence.annotation_layers.keys()
+        len_task_prompt = len(self.task_prompt)
+
+        for layer in layers:
+            for label in sentence.get_labels(layer):
+                if isinstance(label.data_point, Token):
+                    label_span = augmented_sentence[
+                        len_task_prompt + label.data_point.idx - 1 : len_task_prompt + label.data_point.idx
+                    ]
+                else:
+                    label_span = augmented_sentence[
+                        len_task_prompt
+                        + label.data_point.tokens[0].idx
+                        - 1 : len_task_prompt
+                        + label.data_point.tokens[-1].idx
+                    ]
+
+                label_span.add_label(layer, label.value, label.score)
+
+        return augmented_sentence
+
+    def apply_predictions(
+        self,
+        augmented_sentence: Sentence,
+        original_sentence: Sentence,
+        source_annotation_layer: str,
+        target_annotation_layer: str,
+    ):
+        new_labels = augmented_sentence.get_labels(source_annotation_layer)
+        len_task_prompt = len(self.task_prompt)
+
+        for label in new_labels:
+            if label.data_point.tokens[0].idx - len_task_prompt - 1 < 0:
+                continue
+            orig_span = original_sentence[
+                label.data_point.tokens[0].idx - len_task_prompt - 1 : label.data_point.tokens[-1].idx - len_task_prompt
+            ]
+            orig_span.add_label(target_annotation_layer, label.value, label.score)
+
+    def _build_tag_prompt_prefix(self, entity_types: List[str]) -> List[str]:
+        if len(entity_types) == 1:
+            prompt = f"[ Tag {entity_types[0]} ]"
+        else:
+            prompt = "[ Tag " + ", ".join(entity_types[:-1]) + " and " + entity_types[-1] + " ]"
+
+        return prompt.split()
+
+    def _get_state_dict(self):
+        return {"entity_types": self.entity_types}
+
+    @classmethod
+    def _init_strategy_with_state_dict(cls, state, **kwargs):
+        return cls(state["entity_types"])
+
+
+class PrefixedSequenceTagger(SequenceTagger):
+    def __init__(self, *args, augmentation_strategy: SentenceAugmentationStrategy, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if augmentation_strategy is None:
+            logging.warning("No augmentation strategy provided.
Make sure that the strategy is set.") + + self.augmentation_strategy = augmentation_strategy + + def _get_state_dict(self): + state = super()._get_state_dict() + state["augmentation_strategy"] = self.augmentation_strategy + + return state + + @classmethod + def _init_model_with_state_dict(cls, state, **kwargs): + strategy = state["augmentation_strategy"] + return super()._init_model_with_state_dict(state, augmentation_strategy=strategy, **kwargs) + + @classmethod + def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "PrefixedSequenceTagger": + from typing import cast + + return cast("PrefixedSequenceTagger", super().load(model_path=model_path)) + + def forward_loss(self, sentences: Union[List[Sentence], List[PrefixedSentence]]) -> Tuple[torch.Tensor, int]: + # If all sentences are not augmented -> augment them + if all(isinstance(sentence, Sentence) for sentence in sentences): + # mypy does not infer the type of "sentences" restricted by the if statement + sentences = cast(List[Sentence], sentences) + + sentences = self.augment_sentences(sentences=sentences, annotation_layers=self.tag_type) + elif not all(isinstance(sentence, PrefixedSentence) for sentence in sentences): + raise ValueError("All passed sentences must be either uniformly augmented or not.") + + # mypy does not infer the type of "sentences" restricted by code above + sentences = cast(List[Sentence], sentences) + + return super().forward_loss(sentences) + + def predict( + self, + sentences: Union[List[Sentence], Sentence, List[PrefixedSentence], PrefixedSentence], + mini_batch_size: int = 32, + return_probabilities_for_all_classes: bool = False, + verbose: bool = False, + label_name: Optional[str] = None, + return_loss=False, + embedding_storage_mode="none", + force_token_predictions: bool = False, + ): + # Compute prediction label type + prediction_label_type: str = self.label_type if label_name is None else label_name + + # make sure it's a list + if not isinstance(sentences, list) and not isinstance(sentences, flair.data.Dataset): + sentences = [sentences] + + # If all sentences are already augmented (i.e. 
compatible with this class), just forward the sentences + if all(isinstance(sentence, PrefixedSentence) for sentence in sentences): + # mypy does not infer the type of "sentences" restricted by the if statement + sentences = cast(List[Sentence], sentences) + + return super().predict( + sentences, + mini_batch_size=mini_batch_size, + return_probabilities_for_all_classes=return_probabilities_for_all_classes, + verbose=verbose, + label_name=prediction_label_type, + return_loss=return_loss, + embedding_storage_mode=embedding_storage_mode, + ) + + elif not all(isinstance(sentence, Sentence) for sentence in sentences): + raise ValueError("All passed sentences must be either uniformly augmented or not.") + + # Remove existing labels + if label_name is not None: + for sentence in sentences: + sentence.remove_labels(prediction_label_type) + + sentences = cast(List[Sentence], sentences) + + # Augment sentences - copy all annotation of the given tag type + augmented_sentences = self.augment_sentences(sentences, self.tag_type) + + mypy_safe_augmented_sentences = cast(List[Sentence], augmented_sentences) + + # Predict on augmented sentence and store it in an internal annotation layer / label + loss_and_count = super().predict( + sentences=mypy_safe_augmented_sentences, + mini_batch_size=mini_batch_size, + return_probabilities_for_all_classes=return_probabilities_for_all_classes, + verbose=verbose, + label_name=prediction_label_type, + return_loss=return_loss, + embedding_storage_mode=embedding_storage_mode, + ) + + # Append predicted labels to the original sentences + for orig_sent, aug_sent in zip(sentences, augmented_sentences): + self.augmentation_strategy.apply_predictions( + aug_sent, orig_sent, prediction_label_type, prediction_label_type + ) + + if prediction_label_type == "predicted": + orig_sent.remove_labels("predicted_bio") + orig_sent.remove_labels("gold_bio") + + if loss_and_count is not None: + return loss_and_count + + def augment_sentences( + self, sentences: Union[Sentence, List[Sentence]], annotation_layers: Optional[Union[str, List[str]]] = None + ) -> List[PrefixedSentence]: + if not isinstance(sentences, list) and not isinstance(sentences, flair.data.Dataset): + sentences = [sentences] + + return [self.augmentation_strategy.augment_sentence(sentence, annotation_layers) for sentence in sentences] diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 2bcc2c5cc..9e1ff7719 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -394,6 +394,9 @@ def _get_gold_labels(self, sentences: List[Sentence]) -> List[str]: for sentence in sentences: sentence_labels = ["O"] * len(sentence) for label in sentence.get_labels(self.label_type): + if label.value == "O": + continue + span: Span = label.data_point if self.tag_format == "BIOES": if len(span) == 1: diff --git a/tests/test_augmentation.py b/tests/test_augmentation.py new file mode 100644 index 000000000..435aed0d7 --- /dev/null +++ b/tests/test_augmentation.py @@ -0,0 +1,99 @@ +from flair.data import Sentence +from flair.models.prefixed_tagger import EntityTypeTaskPromptAugmentationStrategy, PrefixedSentence + + +def test_entity_type_task_prompt_augmentation_single_type(): + strategy = EntityTypeTaskPromptAugmentationStrategy(["genes"]) + + sent = Sentence("This is a test sentence.") + aug_sent = strategy.augment_sentence(sent) + + assert isinstance(aug_sent, PrefixedSentence) + assert aug_sent.text.startswith("[ Tag genes ] ") + assert len(aug_sent) == 10 + 
+ +def test_entity_type_task_prompt_augmentation_two_types(): + strategy = EntityTypeTaskPromptAugmentationStrategy(["genes", "diseases"]) + + sent = Sentence("This is a test sentence.") + aug_sent = strategy.augment_sentence(sent) + + assert isinstance(aug_sent, PrefixedSentence) + assert aug_sent.text.startswith("[ Tag genes and diseases ] ") + assert len(aug_sent) == 12 + + +def test_entity_type_task_prompt_augmentation_multiple_types(): + strategy = EntityTypeTaskPromptAugmentationStrategy(["genes", "diseases", "chemicals"]) + + sent = Sentence("This is a test sentence.") + aug_sent = strategy.augment_sentence(sent) + + assert isinstance(aug_sent, PrefixedSentence) + assert aug_sent.text.startswith("[ Tag genes, diseases and chemicals ] ") + assert len(aug_sent) == 13 + + +def test_entity_type_task_prompt_augmentation_label_transfer(): + strategy = EntityTypeTaskPromptAugmentationStrategy(["genes"]) + + sent = Sentence("This is a test sentence.") + sent[0:2].add_label("ner", "test", 1.0) + sent[3:4].add_label("foo", "test", 1.0) + + aug_sent = strategy.augment_sentence(sent, "ner") + + assert isinstance(aug_sent, PrefixedSentence) + assert aug_sent.text.startswith("[ Tag genes ] ") + assert len(aug_sent.get_labels("foo")) == 0 + + ner_labels = aug_sent.get_labels("ner") + assert len(ner_labels) == 1 + assert len(ner_labels[0].data_point.tokens) == 2 + assert ner_labels[0].data_point.text == "This is" + assert ner_labels[0].data_point.tokens[0].idx == 5 + assert ner_labels[0].data_point.tokens[-1].idx == 6 + + +def test_entity_type_task_prompt_augmentation_label_application(): + strategy = EntityTypeTaskPromptAugmentationStrategy(["genes"]) + + sent = Sentence("TP53 - also known as tumour protein 53 - is an onco-gene.") + + aug_sent = strategy.augment_sentence(sent, "ner") + aug_sent[4:5].add_label("predict", "gene", 1.0) + aug_sent[9:12].add_label("predict", "gene", 1.0) + aug_sent[5:6].add_label("not-predict", "gene", 1.0) + + strategy.apply_predictions(aug_sent, sent, "predict", "ner") + + ner_labels = sent.get_labels("ner") + assert len(ner_labels) == 2 + + assert ner_labels[0].data_point.text == "TP53" + assert ner_labels[0].value == "gene" + assert ner_labels[0].score == 1.0 + assert len(ner_labels[0].data_point.tokens) == 1 + assert ner_labels[0].data_point.tokens[0].idx == 1 + + assert ner_labels[1].data_point.text == "tumour protein 53" + assert ner_labels[1].value == "gene" + assert ner_labels[1].score == 1.0 + assert len(ner_labels[1].data_point.tokens) == 3 + assert ner_labels[1].data_point.tokens[0].idx == 6 + assert ner_labels[1].data_point.tokens[-1].idx == 8 + + +def test_entity_type_task_prompt_augmentation_label_application_label_in_tag(): + strategy = EntityTypeTaskPromptAugmentationStrategy(["genes"]) + + sent = Sentence("TP53 - also known as tumour protein 53 - is an onco-gene.") + + aug_sent = strategy.augment_sentence(sent, "ner") + aug_sent[2:4].add_label("predict", "gene", 1.0) # Add label in tagging prompt + + strategy.apply_predictions(aug_sent, sent, "predict", "ner") + + ner_labels = sent.get_labels("ner") + assert len(ner_labels) == 0 diff --git a/tests/test_datasets_biomedical.py b/tests/test_datasets_biomedical.py index 2ba4cf3d6..4099bb928 100644 --- a/tests/test_datasets_biomedical.py +++ b/tests/test_datasets_biomedical.py @@ -117,6 +117,7 @@ def test_write_to_conll(): ), ] }, + entity_types=["E"], ) expected_labeling = [ "This O +", @@ -142,6 +143,7 @@ def test_conll_writer_one_token_multiple_entities1(): Entity((text.find("entity2"), 
text.find("entity2") + len("entity2")), "E"), ] }, + entity_types=["E"], ) assert_conll_writer_output(dataset, ["This O +", "is O +", "entity1 B-E +", "entity2 B-E -"]) @@ -157,6 +159,7 @@ def test_conll_writer_one_token_multiple_entities2(): Entity((text.find("tity1"), text.find("tity1") + 5), "E"), ] }, + entity_types=["E"], ) assert_conll_writer_output(dataset, ["This O +", "is O +", "entity1 B-E +", "entity2 O -"])