diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c801b96a70..5b3633d93e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,7 @@ jobs: uses: actions/cache@v3 with: path: ./cache - key: cache-v1.1 + key: cache-v1.2 - name: Run tests run: | python -c 'import flair' diff --git a/.gitignore b/.gitignore index cbd5be92d3..89a4bb39e7 100644 --- a/.gitignore +++ b/.gitignore @@ -110,4 +110,4 @@ resources/taggers/ regression_train/ /doc_build/ -scripts/ \ No newline at end of file +scripts/ diff --git a/docs/requirements.txt b/docs/requirements.txt index 8d7ae05d70..0e8c4f6141 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,4 +5,9 @@ sphinx importlib-metadata sphinx-multiversion pydata-sphinx-theme<0.14 -sphinx_design \ No newline at end of file +sphinx_design + +# previous dependencies that are required to build docs for later versions too. +semver +gensim +bpemb \ No newline at end of file diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 30f010df40..613ee639d0 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -24,6 +24,7 @@ ) from flair.datasets.base import find_train_dev_test_files from flair.file_utils import cached_path, unpack_file +from flair.tokenization import Tokenizer log = logging.getLogger("flair") @@ -41,6 +42,7 @@ def __init__( label_column_name: str = "label", metadata_column_name: str = "metadata", label_type: str = "ner", + use_tokenizer: Union[bool, Tokenizer] = True, **corpusargs, ) -> None: """Instantiates a MuliFileJsonlCorpus as, e.g., created with doccanos JSONL export. @@ -52,9 +54,12 @@ def __init__( :param train_files: the name of the train files :param test_files: the name of the test files :param dev_files: the name of the dev files, if empty, dev data is sampled from train + :param encoding: file encoding (default "utf-8") :param text_column_name: Name of the text column inside the jsonl files. :param label_column_name: Name of the label column inside the jsonl files. :param metadata_column_name: Name of the metadata column inside the jsonl files. + :param label_type: he type of label to predict (default "ner") + :param use_tokenizer: Specify a custom tokenizer to split the text into tokens. :raises RuntimeError: If no paths are given """ @@ -68,6 +73,7 @@ def __init__( metadata_column_name=metadata_column_name, label_type=label_type, encoding=encoding, + use_tokenizer=use_tokenizer, ) for train_file in train_files ] @@ -86,6 +92,8 @@ def __init__( label_column_name=label_column_name, metadata_column_name=metadata_column_name, label_type=label_type, + encoding=encoding, + use_tokenizer=use_tokenizer, ) for test_file in test_files ] @@ -104,6 +112,8 @@ def __init__( label_column_name=label_column_name, metadata_column_name=metadata_column_name, label_type=label_type, + encoding=encoding, + use_tokenizer=use_tokenizer, ) for dev_file in dev_files ] @@ -128,6 +138,7 @@ def __init__( label_type: str = "ner", autofind_splits: bool = True, name: Optional[str] = None, + use_tokenizer: Union[bool, Tokenizer] = True, **corpusargs, ) -> None: """Instantiates a JsonlCorpus with one file per Dataset (train, dev, and test). 
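The use_tokenizer parameter added above is passed through from MultiFileJsonlCorpus and JsonlCorpus down to each Sentence, so JSONL data can be split with any flair.tokenization.Tokenizer instead of the library default. A minimal usage sketch (the data folder below is hypothetical; if I read the Sentence API correctly, True keeps the default tokenizer and False falls back to plain whitespace splitting):

from flair.datasets import JsonlCorpus
from flair.tokenization import SpaceTokenizer

corpus = JsonlCorpus(
    "data/doccano_export",           # hypothetical folder with train/dev/test JSONL files
    text_column_name="text",
    label_column_name="label",
    use_tokenizer=SpaceTokenizer(),  # any Tokenizer instance, or True/False for the defaults
)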
@@ -136,11 +147,14 @@ def __init__( :param train_file: the name of the train file :param test_file: the name of the test file :param dev_file: the name of the dev file, if None, dev data is sampled from train + :param encoding: file encoding (default "utf-8") :param text_column_name: Name of the text column inside the JSONL file. :param label_column_name: Name of the label column inside the JSONL file. :param metadata_column_name: Name of the metadata column inside the JSONL file. + :param label_type: The type of label to predict (default "ner") :param autofind_splits: Whether train, test and dev file should be determined automatically :param name: name of the Corpus see flair.data.Corpus + :param use_tokenizer: Specify a custom tokenizer to split the text into tokens. """ # find train, dev and test files if not specified dev_file, test_file, train_file = find_train_dev_test_files( @@ -156,6 +170,7 @@ def __init__( label_type=label_type, name=name if data_folder is None else str(data_folder), encoding=encoding, + use_tokenizer=use_tokenizer, **corpusargs, ) @@ -169,6 +184,7 @@ def __init__( label_column_name: str = "label", metadata_column_name: str = "metadata", label_type: str = "ner", + use_tokenizer: Union[bool, Tokenizer] = True, ) -> None: """Instantiates a JsonlDataset and converts all annotated char spans to token tags using the IOB scheme. @@ -184,9 +200,12 @@ def __init__( Args: path_to_jsonl_file: File to read + encoding: file encoding (default "utf-8") text_column_name: Name of the text column label_column_name: Name of the label column metadata_column_name: Name of the metadata column + label_type: The type of label to predict (default "ner") + use_tokenizer: Specify a custom tokenizer to split the text into tokens. """ path_to_json_file = Path(path_to_jsonl_file) @@ -203,7 +222,7 @@ def __init__( raw_text = current_line[text_column_name] current_labels = current_line[label_column_name] current_metadatas = current_line.get(self.metadata_column_name, []) - current_sentence = Sentence(raw_text) + current_sentence = Sentence(raw_text, use_tokenizer=use_tokenizer) self._add_labels_to_sentence(raw_text, current_sentence, current_labels) self._add_metadatas_to_sentence(current_sentence, current_metadatas) @@ -310,6 +329,7 @@ def __init__( dev_files: the name of the dev files, if empty, dev data is sampled from train column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" to split only on tabs comment_symbol: if set, lines that begin with this symbol are treated as comments + encoding: file encoding (default "utf-8") document_separator_token: If provided, sentences that function as document boundaries are so marked skip_first_line: set to True if your dataset has a header line in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index fd46321e91..f3492178f9 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -12,7 +12,7 @@ import torch import transformers -from semver import Version +from packaging.version import Version from torch.jit import ScriptModule from transformers import ( CONFIG_MAPPING, @@ -65,7 +65,7 @@ def pad_sequence_embeddings(all_hidden_states: List[torch.Tensor]) -> torch.Tens @torch.jit.script_if_tracing def truncate_hidden_states(hidden_states: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor: - return hidden_states[:, :, : input_ids.size()[1]] + 
return hidden_states[:, :, : input_ids.size(1)] @torch.jit.script_if_tracing @@ -95,14 +95,12 @@ def combine_strided_tensors( if selected_sentences.size(0) > 1: start_part = selected_sentences[0, : half_stride + 1] mid_part = selected_sentences[:, half_stride + 1 : max_length - 1 - half_stride] - mid_part = torch.reshape(mid_part, (mid_part.shape[0] * mid_part.shape[1],) + mid_part.shape[2:]) - end_part = selected_sentences[selected_sentences.shape[0] - 1, max_length - half_stride - 1 :] + mid_part = torch.reshape(mid_part, (mid_part.size(0) * mid_part.size(1),) + mid_part.size()[2:]) + end_part = selected_sentences[selected_sentences.size(0) - 1, max_length - half_stride - 1 :] sentence_hidden_state = torch.cat((start_part, mid_part, end_part), dim=0) - sentence_hidden_states[sentence_id, : sentence_hidden_state.shape[0]] = torch.cat( - (start_part, mid_part, end_part), dim=0 - ) + sentence_hidden_states[sentence_id, : sentence_hidden_state.size(0)] = sentence_hidden_state else: - sentence_hidden_states[sentence_id, : selected_sentences.shape[1]] = selected_sentences[0, :] + sentence_hidden_states[sentence_id, : selected_sentences.size(1)] = selected_sentences[0, :] return sentence_hidden_states @@ -171,11 +169,30 @@ def fill_mean_token_embeddings( word_ids: torch.Tensor, token_lengths: torch.Tensor, ): - for i in torch.arange(all_token_embeddings.shape[0]): - for _id in torch.arange(token_lengths[i]): # type: ignore[call-overload] - all_token_embeddings[i, _id, :] = torch.nan_to_num( - sentence_hidden_states[i][word_ids[i] == _id].mean(dim=0) - ) + batch_size, max_tokens, embedding_dim = all_token_embeddings.shape + mask = word_ids >= 0 + + # sum embeddings for each token + all_token_embeddings.scatter_add_( + 1, + word_ids.clamp(min=0).unsqueeze(-1).expand(-1, -1, embedding_dim), + sentence_hidden_states * mask.unsqueeze(-1).float(), + ) + + # calculate the mean of subtokens + subtoken_counts = torch.zeros_like(all_token_embeddings[:, :, 0]) + subtoken_counts.scatter_add_(1, word_ids.clamp(min=0), mask.float()) + all_token_embeddings = torch.where( + subtoken_counts.unsqueeze(-1) > 0, + all_token_embeddings / subtoken_counts.unsqueeze(-1), + torch.zeros_like(all_token_embeddings), + ) + + # Create a mask for valid tokens based on token_lengths + token_mask = torch.arange(max_tokens, device=token_lengths.device)[None, :] < token_lengths[:, None] + all_token_embeddings = all_token_embeddings * token_mask.unsqueeze(-1) + all_token_embeddings = torch.nan_to_num(all_token_embeddings) + return all_token_embeddings @@ -1056,7 +1073,7 @@ def __init__( model, add_prefix_space=True, **transformers_tokenizer_kwargs, **kwargs ) try: - self.feature_extractor = AutoFeatureExtractor.from_pretrained(model, apply_ocr=False) + self.feature_extractor = AutoFeatureExtractor.from_pretrained(model, apply_ocr=False, **kwargs) except OSError: self.feature_extractor = None else: @@ -1222,7 +1239,7 @@ def embedding_length(self) -> int: def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): - if transformers.__version__ >= Version(4, 31, 0): + if Version(transformers.__version__) >= Version("4.31.0"): assert isinstance(state_dict, dict) state_dict.pop(f"{prefix}model.embeddings.position_ids", None) super()._load_from_state_dict( @@ -1307,7 +1324,7 @@ def __setstate__(self, state): self.__dict__[key] = embedding.__dict__[key] if model_state_dict: - if transformers.__version__ >= Version(4, 31, 0): + if Version(transformers.__version__) >= 
Version("4.31.0"): model_state_dict.pop("embeddings.position_ids", None) self.model.load_state_dict(model_state_dict) diff --git a/flair/file_utils.py b/flair/file_utils.py index f7f20a20f3..7a0118822b 100644 --- a/flair/file_utils.py +++ b/flair/file_utils.py @@ -239,7 +239,7 @@ def get_from_cache(url: str, cache_dir: Path) -> Path: return cache_path # make HEAD request to check ETag - response = requests.head(url, headers={"User-Agent": "Flair"}, allow_redirects=True) + response = requests.head(url, headers={"User-Agent": "Flair"}, allow_redirects=True, proxies=url_proxies) if response.status_code != 200: raise OSError(f"HEAD request failed for url {url} with status code {response.status_code}.") diff --git a/flair/models/entity_mention_linking.py b/flair/models/entity_mention_linking.py index 25a67cde8e..cecf2d9e57 100644 --- a/flair/models/entity_mention_linking.py +++ b/flair/models/entity_mention_linking.py @@ -1056,12 +1056,13 @@ def evaluate( embedding_storage_mode: str = "none", mini_batch_size: int = 32, main_evaluation_metric: Tuple[str, str] = ("accuracy", "f1-score"), - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, gold_label_dictionary: Optional[Dictionary] = None, return_loss: bool = True, k: int = 1, **kwargs, ) -> Result: + exclude_labels = exclude_labels if exclude_labels is not None else [] if gold_label_dictionary is not None: raise NotImplementedError("evaluating an EntityMentionLinker with a gold_label_dictionary is not supported") diff --git a/flair/models/pairwise_regression_model.py b/flair/models/pairwise_regression_model.py index c657a4fe6d..c3f34e0f69 100644 --- a/flair/models/pairwise_regression_model.py +++ b/flair/models/pairwise_regression_model.py @@ -1,6 +1,5 @@ -import typing from pathlib import Path -from typing import Any, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import torch from torch import nn @@ -12,7 +11,7 @@ from flair.data import Corpus, Dictionary, Sentence, TextPair, _iter_dataset from flair.datasets import DataLoader, FlairDatapointDataset from flair.nn.model import ReduceTransformerVocabMixin -from flair.training_utils import MetricRegression, Result, store_embeddings +from flair.training_utils import EmbeddingStorageMode, MetricRegression, Result, store_embeddings class TextPairRegressor(flair.nn.Model[TextPair], ReduceTransformerVocabMixin): @@ -91,7 +90,7 @@ def label_type(self): def get_used_tokens( self, corpus: Corpus, context_length: int = 0, respect_document_boundaries: bool = True - ) -> typing.Iterable[List[str]]: + ) -> Iterable[List[str]]: for sentence_pair in _iter_dataset(corpus.get_all_sentences()): yield [t.text for t in sentence_pair.first] yield [t.text for t in sentence_pair.first.left_context(context_length, respect_document_boundaries)] @@ -204,10 +203,16 @@ def _get_state_dict(self): return model_state @classmethod - def _init_model_with_state_dict(cls, state, **kwargs): - # add DefaultClassifier arguments + def _init_model_with_state_dict(cls, state: Dict[str, Any], **kwargs): + """Initializes a TextPairRegressor model from a state dictionary (exported by _get_state_dict). + + Requires keys 'state_dict', 'document_embeddings', and 'label_type' in the state dictionary. 
+ """ + if "document_embeddings" in state: + state["embeddings"] = state.pop("document_embeddings") # need to rename this parameter + # add Model arguments for arg in [ - "document_embeddings", + "embeddings", "label_type", "embed_separately", "dropout", @@ -276,14 +281,15 @@ def evaluate( data_points: Union[List[TextPair], Dataset], gold_label_type: str, out_path: Union[str, Path, None] = None, - embedding_storage_mode: str = "none", + embedding_storage_mode: EmbeddingStorageMode = "none", mini_batch_size: int = 32, - main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), - exclude_labels: List[str] = [], + main_evaluation_metric: Tuple[str, str] = ("correlation", "pearson"), + exclude_labels: Optional[List[str]] = None, gold_label_dictionary: Optional[Dictionary] = None, return_loss: bool = True, **kwargs, ) -> Result: + exclude_labels = exclude_labels if exclude_labels is not None else [] # read Dataset into data loader, if list of sentences passed, make Dataset first if not isinstance(data_points, Dataset): data_points = FlairDatapointDataset(data_points) diff --git a/flair/models/relation_classifier_model.py b/flair/models/relation_classifier_model.py index 468196baf0..53ccabac36 100644 --- a/flair/models/relation_classifier_model.py +++ b/flair/models/relation_classifier_model.py @@ -34,6 +34,7 @@ from flair.datasets import DataLoader, FlairDatapointDataset from flair.embeddings import DocumentEmbeddings, TransformerDocumentEmbeddings from flair.tokenization import SpaceTokenizer +from flair.training_utils import EmbeddingStorageMode logger: logging.Logger = logging.getLogger("flair") @@ -602,7 +603,7 @@ def predict( verbose: bool = False, label_name: Optional[str] = None, return_loss: bool = False, - embedding_storage_mode: str = "none", + embedding_storage_mode: EmbeddingStorageMode = "none", ) -> Optional[Tuple[torch.Tensor, int]]: """Predicts the class labels for the given sentence(s). 
diff --git a/flair/models/text_regression_model.py b/flair/models/text_regression_model.py index 351dc9bed8..894ce3087e 100644 --- a/flair/models/text_regression_model.py +++ b/flair/models/text_regression_model.py @@ -14,7 +14,7 @@ from flair.datasets import DataLoader, FlairDatapointDataset from flair.embeddings.base import load_embeddings from flair.nn.model import ReduceTransformerVocabMixin -from flair.training_utils import MetricRegression, Result, store_embeddings +from flair.training_utils import EmbeddingStorageMode, MetricRegression, Result, store_embeddings log = logging.getLogger("flair") @@ -78,7 +78,7 @@ def predict( mini_batch_size: int = 32, verbose: bool = False, label_name: Optional[str] = None, - embedding_storage_mode="none", + embedding_storage_mode: EmbeddingStorageMode = "none", ) -> List[Sentence]: if label_name is None: label_name = self.label_name if self.label_name is not None else "label" @@ -135,14 +135,15 @@ def evaluate( data_points: Union[List[Sentence], Dataset], gold_label_type: str, out_path: Optional[Union[str, Path]] = None, - embedding_storage_mode: str = "none", + embedding_storage_mode: EmbeddingStorageMode = "none", mini_batch_size: int = 32, main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, gold_label_dictionary: Optional[Dictionary] = None, return_loss: bool = True, **kwargs, ) -> Result: + exclude_labels = exclude_labels if exclude_labels is not None else [] # read Dataset into data loader, if list of sentences passed, make Dataset first if not isinstance(data_points, Dataset): data_points = FlairDatapointDataset(data_points) diff --git a/flair/nn/model.py b/flair/nn/model.py index 88f51f443b..eeb5b7c84a 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -8,6 +8,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union import torch.nn +from torch import Tensor from torch.nn.modules.loss import _Loss from torch.utils.data.dataset import Dataset from tqdm import tqdm @@ -19,7 +20,7 @@ from flair.embeddings import Embeddings from flair.embeddings.base import load_embeddings from flair.file_utils import Tqdm, load_torch_state -from flair.training_utils import Result, store_embeddings +from flair.training_utils import EmbeddingStorageMode, Result, store_embeddings log = logging.getLogger("flair") @@ -34,7 +35,7 @@ class Model(torch.nn.Module, typing.Generic[DT], ABC): @property @abstractmethod - def label_type(self): + def label_type(self) -> str: """Each model predicts labels of a certain type.""" raise NotImplementedError @@ -52,10 +53,10 @@ def evaluate( data_points: Union[List[DT], Dataset], gold_label_type: str, out_path: Optional[Union[str, Path]] = None, - embedding_storage_mode: str = "none", + embedding_storage_mode: EmbeddingStorageMode = "none", mini_batch_size: int = 32, main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, gold_label_dictionary: Optional[Dictionary] = None, return_loss: bool = True, **kwargs, @@ -80,19 +81,18 @@ def evaluate( Returns: The evaluation results. 
""" + exclude_labels = exclude_labels if exclude_labels is not None else [] raise NotImplementedError - def _get_state_dict(self): + def _get_state_dict(self) -> Dict: """Returns the state dictionary for this model.""" - state_dict = {"state_dict": self.state_dict()} - # Always include the name of the Model class for which the state dict holds - state_dict["__cls__"] = self.__class__.__name__ + state_dict = {"state_dict": self.state_dict(), "__cls__": self.__class__.__name__} return state_dict @classmethod - def _init_model_with_state_dict(cls, state, **kwargs): + def _init_model_with_state_dict(cls, state: Dict[str, Any], **kwargs): """Initialize the model from a state dictionary.""" if "embeddings" in kwargs: embeddings = kwargs.pop("embeddings") @@ -107,10 +107,11 @@ def _init_model_with_state_dict(cls, state, **kwargs): return model @staticmethod - def _fetch_model(model_name) -> str: + def _fetch_model(model_name): + # this seems to just return model name, not a model with that name return model_name - def save(self, model_file: Union[str, Path], checkpoint: bool = False): + def save(self, model_file: Union[str, Path], checkpoint: bool = False) -> None: """Saves the current model to the provided file. Args: @@ -146,8 +147,9 @@ def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "Model": new_model_path = model_cls._fetch_model(model_path) if new_model_path != model_path: return model_cls.load(new_model_path) - except Exception: - # skip any invalid loadings, e.g. not found on huggingface hub + except Exception as e: + log.debug(e) + # skip any invalid loadings, e.g. not found on HuggingFace hub continue # if the model cannot be fetched, load as a file @@ -172,13 +174,12 @@ def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "Model": # older (flair 11.3 and below) models do not contain cls information. In this case, try all subclasses for model_cls in subclasses: - # if str(model_cls) == "": continue try: model = model_cls.load(state) return model except Exception as e: print(e) - # skip any invalid loadings, e.g. not found on huggingface hub + # skip any invalid loadings, e.g. not found on HuggingFace hub continue raise ValueError(f"Could not find any model with name '{model_path}'") @@ -253,14 +254,16 @@ def evaluate( data_points: Union[List[DT], Dataset], gold_label_type: str, out_path: Optional[Union[str, Path]] = None, - embedding_storage_mode: str = "none", + embedding_storage_mode: EmbeddingStorageMode = "none", mini_batch_size: int = 32, main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, gold_label_dictionary: Optional[Dictionary] = None, return_loss: bool = True, **kwargs, ) -> Result: + exclude_labels = exclude_labels if exclude_labels is not None else [] + import numpy as np import sklearn @@ -351,7 +354,7 @@ def evaluate( predicted_values_span_aligned = [] for span in all_spans: list_of_gold_values_for_span = all_true_values.get(span, ["O"]) - # delete exluded labels if exclude_labels is given + # delete excluded labels if exclude_labels is given for excluded_label in exclude_labels: if excluded_label in list_of_gold_values_for_span: list_of_gold_values_for_span.remove(excluded_label) @@ -445,7 +448,7 @@ def evaluate( labels=labels, ) - # compute accuracy separately as it is not always in classification_report (e.. when micro avg exists) + # compute accuracy separately as it is not always in classification_report (e.g. 
when micro avg exists) accuracy_score = round(sklearn.metrics.accuracy_score(y_true, y_pred), 4) # if there is only one label, then "micro avg" = "macro avg" @@ -456,10 +459,13 @@ def evaluate( # Otherwise, it is identical to the "macro avg". In this case, we add it to the report. if "micro avg" not in classification_report_dict: classification_report_dict["micro avg"] = {} - for precision_recall_f1 in classification_report_dict["macro avg"]: - classification_report_dict["micro avg"][precision_recall_f1] = classification_report_dict[ - "accuracy" - ] + for metric_key in classification_report_dict["macro avg"]: + if metric_key != "support": + classification_report_dict["micro avg"][metric_key] = classification_report_dict["accuracy"] + else: + classification_report_dict["micro avg"][metric_key] = classification_report_dict["macro avg"][ + "support" + ] detailed_result = ( "\nResults:" @@ -513,8 +519,8 @@ def predict( return_probabilities_for_all_classes: bool = False, verbose: bool = False, label_name: Optional[str] = None, - return_loss=False, - embedding_storage_mode="none", + return_loss: bool = False, + embedding_storage_mode: EmbeddingStorageMode = "none", ): """Predicts the class labels for the given sentences. @@ -531,7 +537,7 @@ def predict( """ raise NotImplementedError - def _print_predictions(self, batch, gold_label_type): + def _print_predictions(self, batch: List[DT], gold_label_type: str) -> List[str]: lines = [] for datapoint in batch: # check if there is a label mismatch @@ -691,7 +697,7 @@ def multi_label_threshold(self, x): # setter method if "default" in x: self._multi_label_threshold = x else: - raise Exception('multi_label_threshold dict should have a "default" key') + raise ValueError('multi_label_threshold dict should have a "default" key') else: self._multi_label_threshold = {"default": x} @@ -720,7 +726,7 @@ def _prepare_label_tensor(self, prediction_data_points: List[DT2]) -> torch.Tens device=flair.device, ) - def _encode_data_points(self, sentences: List[DT], data_points: List[DT2]): + def _encode_data_points(self, sentences: List[DT], data_points: List[DT2]) -> Tensor: # embed sentences if self.should_embed_sentence: self.embeddings.embed(sentences) @@ -737,7 +743,8 @@ def _encode_data_points(self, sentences: List[DT], data_points: List[DT2]): return data_point_tensor - def _mask_scores(self, scores, data_points): + def _mask_scores(self, scores: Tensor, data_points) -> Tensor: + """Classes that inherit from DefaultClassifier may optionally mask scores.""" return scores def forward_loss(self, sentences: List[DT]) -> Tuple[torch.Tensor, int]: @@ -791,8 +798,8 @@ def predict( return_probabilities_for_all_classes: bool = False, verbose: bool = False, label_name: Optional[str] = None, - return_loss=False, - embedding_storage_mode="none", + return_loss: bool = False, + embedding_storage_mode: EmbeddingStorageMode = "none", ): """Predicts the class labels for the given sentences. The labels are directly added to the sentences. 
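A recurring change in this diff replaces mutable default arguments such as exclude_labels: List[str] = [] with Optional[List[str]] = None plus a None check at the top of the function body. The toy functions below (not Flair code) show the pitfall being avoided: a default list is created once at definition time and then shared, and mutated, across calls.

def buggy(labels=[]):
    # the same list object is reused for every call that omits the argument
    labels.append("O")
    return labels

def fixed(labels=None):
    labels = labels if labels is not None else []  # fresh list per call
    labels.append("O")
    return labels

a = buggy()
b = buggy()
print(a is b, b)          # True ['O', 'O'] -- state leaked between calls
print(fixed(), fixed())   # ['O'] ['O']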
diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index fb8590841b..03e6edc083 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -32,7 +32,7 @@ TrainingInterrupt, WeightExtractorPlugin, ) -from flair.training_utils import identify_dynamic_embeddings, log_line, store_embeddings +from flair.training_utils import EmbeddingStorageMode, identify_dynamic_embeddings, log_line, store_embeddings log = logging.getLogger("flair") @@ -147,13 +147,13 @@ def train( monitor_train_sample: float = 0.0, use_final_model_for_eval: bool = False, gold_label_dictionary_for_eval: Optional[Dictionary] = None, - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, # sampling and shuffling sampler=None, shuffle: bool = True, shuffle_first_epoch: bool = True, # evaluation and monitoring - embeddings_storage_mode: str = "cpu", + embeddings_storage_mode: EmbeddingStorageMode = "cpu", epoch: int = 0, # when and what to save save_final_model: bool = True, @@ -168,6 +168,7 @@ def train( attach_default_scheduler: bool = True, **kwargs, ): + exclude_labels = exclude_labels if exclude_labels is not None else [] if plugins is None: plugins = [] @@ -220,13 +221,13 @@ def fine_tune( monitor_train_sample: float = 0.0, use_final_model_for_eval: bool = True, gold_label_dictionary_for_eval: Optional[Dictionary] = None, - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, # sampling and shuffling sampler=None, shuffle: bool = True, shuffle_first_epoch: bool = True, # evaluation and monitoring - embeddings_storage_mode: str = "none", + embeddings_storage_mode: EmbeddingStorageMode = "none", epoch: int = 0, # when and what to save save_final_model: bool = True, @@ -243,6 +244,7 @@ def fine_tune( attach_default_scheduler: bool = True, **kwargs, ): + exclude_labels = exclude_labels if exclude_labels is not None else [] # annealing logic if plugins is None: plugins = [] @@ -313,13 +315,13 @@ def train_custom( monitor_train_sample: float = 0.0, use_final_model_for_eval: bool = False, gold_label_dictionary_for_eval: Optional[Dictionary] = None, - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, # sampling and shuffling sampler: Optional[FlairSampler] = None, shuffle: bool = True, shuffle_first_epoch: bool = True, # evaluation and monitoring - embeddings_storage_mode: str = "cpu", + embeddings_storage_mode: EmbeddingStorageMode = "cpu", epoch: int = 0, # when and what to save save_final_model: bool = True, @@ -332,7 +334,7 @@ def train_custom( # amp use_amp: bool = False, # plugins - plugins: List[TrainerPlugin] = [], + plugins: Optional[List[TrainerPlugin]] = None, **kwargs, ) -> dict: """Trains any class that implements the flair.nn.Model interface. @@ -381,6 +383,9 @@ def train_custom( A dictionary with at least the key "test_score" containing the final evaluation score. 
Some plugins add additional information to this dictionary, such as the :class:`flair.trainers.plugins.MetricHistoryPlugin` """ + exclude_labels = exclude_labels if exclude_labels is not None else [] + plugins = plugins if plugins is not None else [] + # Create output folder base_path = Path(base_path) base_path.mkdir(exist_ok=True, parents=True) diff --git a/flair/training_utils.py b/flair/training_utils.py index 2c3ce9d5f4..0b4ef91cbf 100644 --- a/flair/training_utils.py +++ b/flair/training_utils.py @@ -5,7 +5,7 @@ from functools import reduce from math import inf from pathlib import Path -from typing import Dict, List, Optional, Union +from typing import Dict, List, Literal, Optional, Union from scipy.stats import pearsonr, spearmanr from sklearn.metrics import mean_absolute_error, mean_squared_error @@ -15,6 +15,7 @@ import flair from flair.data import DT, Dictionary, Sentence, _iter_dataset +EmbeddingStorageMode = Literal["none", "cpu", "gpu"] log = logging.getLogger("flair") @@ -23,10 +24,11 @@ def __init__( self, main_score: float, detailed_results: str, - classification_report: dict = {}, - scores: dict = {}, + classification_report: Optional[dict] = None, + scores: Optional[Dict] = None, ) -> None: - assert "loss" in scores, "No loss provided." + classification_report = classification_report if classification_report is not None else {} + assert scores is not None and "loss" in scores, "No loss provided." self.main_score: float = main_score self.scores = scores @@ -363,7 +365,9 @@ def add_file_handler(log, output_file): def store_embeddings( - data_points: Union[List[DT], Dataset], storage_mode: str, dynamic_embeddings: Optional[List[str]] = None + data_points: Union[List[DT], Dataset], + storage_mode: EmbeddingStorageMode, + dynamic_embeddings: Optional[List[str]] = None, ): if isinstance(data_points, Dataset): data_points = list(_iter_dataset(data_points)) @@ -387,7 +391,7 @@ def store_embeddings( data_point.to("cpu", pin_memory=pin_memory) -def identify_dynamic_embeddings(data_points: List[DT]): +def identify_dynamic_embeddings(data_points: List[DT]) -> Optional[List]: dynamic_embeddings = [] all_embeddings = [] for data_point in data_points: diff --git a/pyproject.toml b/pyproject.toml index b3ed1eaedd..78d1692a09 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ filterwarnings = [ 'ignore:distutils Version classes are deprecated.', # faiss uses deprecated distutils. 'ignore:`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.', # transformers calls deprecated hf_hub "ignore:`torch.cuda.amp.GradScaler", # GradScaler changes in torch 2.3.0 but we want to be backwards compatible. + "ignore:`clean_up_tokenization_spaces` was not set", # Default behavior changes in transformers v4.45, raising irrelevant FutureWarning for serialized models. 
] markers = [ "integration", diff --git a/requirements.txt b/requirements.txt index 7f159bcd14..bb5ecafd45 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,5 +22,4 @@ tqdm>=4.63.0 transformer-smaller-training-vocab>=0.2.3 transformers[sentencepiece]>=4.18.0,<5.0.0 wikipedia-api>=0.5.7 -semver<4.0.0,>=3.0.0 bioc<3.0.0,>=2.0.0 diff --git a/tests/models/test_tars_classifier.py b/tests/models/test_tars_classifier.py index 818ead4a39..20bff503bd 100644 --- a/tests/models/test_tars_classifier.py +++ b/tests/models/test_tars_classifier.py @@ -12,7 +12,7 @@ class TestTarsClassifier(BaseModelTest): train_label_type = "class" model_args = {"task_name": "2_CLASS"} training_args = {"mini_batch_size": 1, "max_epochs": 2} - pretrained_model = "tars-base" + # pretrained_model = "tars-base" # disabled due to too much space requirements. @pytest.fixture() def corpus(self, tasks_base_path): diff --git a/tests/models/test_tars_ner.py b/tests/models/test_tars_ner.py index 57c4b8cc9c..dd510f8fa4 100644 --- a/tests/models/test_tars_ner.py +++ b/tests/models/test_tars_ner.py @@ -12,7 +12,7 @@ class TestTarsTagger(BaseModelTest): train_label_type = "ner" model_args = {"task_name": "2_NER"} training_args = {"mini_batch_size": 1, "max_epochs": 2} - pretrained_model = "tars-ner" + # pretrained_model = "tars-ner" # disabled due to too much space requirements. @pytest.fixture() def corpus(self, tasks_base_path): diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 1f54a9ed91..2c76b40678 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -436,7 +436,7 @@ def test_write_to_and_load_from_directory(tasks_base_path): assert loaded_corpus.train[0].to_tagged_string() == corpus.train[0].to_tagged_string() -@pytest.mark.integration() +@pytest.mark.skip() def test_hipe_2022_corpus(tasks_base_path): # This test covers the complete HIPE 2022 dataset. # https://github.com/hipe-eval/HIPE-2022-data @@ -700,7 +700,7 @@ def test_hipe_2022(dataset_version="v2.1", add_document_separator=True): test_hipe_2022(dataset_version="v2.1", add_document_separator=False) -@pytest.mark.integration() +@pytest.mark.skip() def test_icdar_europeana_corpus(tasks_base_path): # This test covers the complete ICDAR Europeana corpus: # https://github.com/stefan-it/historic-domain-adaptation-icdar @@ -718,7 +718,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str): check_number_sentences(len(corpus.test), gold_stats[language]["test"], "test") -@pytest.mark.integration() +@pytest.mark.skip() def test_masakhane_corpus(tasks_base_path): # This test covers the complete MasakhaNER dataset, including support for v1 and v2. supported_versions = ["v1", "v2"] @@ -802,7 +802,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version) -@pytest.mark.integration() +@pytest.mark.skip() def test_nermud_corpus(tasks_base_path): # This test covers the NERMuD dataset. 
Official stats can be found here: # https://github.com/dhfbk/KIND/tree/main/evalita-2023 @@ -821,6 +821,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str): check_number_sentences(len(corpus.dev), stats["dev"], "dev") +@pytest.mark.skip() def test_german_ler_corpus(tasks_base_path): corpus = flair.datasets.NER_GERMAN_LEGAL() @@ -830,7 +831,7 @@ def test_german_ler_corpus(tasks_base_path): assert len(corpus.test) == 6673, "Mismatch in number of sentences for test split" -@pytest.mark.integration() +@pytest.mark.skip() def test_masakha_pos_corpus(tasks_base_path): # This test covers the complete MasakhaPOS dataset. supported_versions = ["v1"] @@ -899,7 +900,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version) -@pytest.mark.integration() +@pytest.mark.skip() def test_german_mobie(tasks_base_path): corpus = flair.datasets.NER_GERMAN_MOBIE() @@ -984,7 +985,7 @@ def test_jsonl_corpus_loads_metadata(tasks_base_path): assert dataset.sentences[2].get_metadata("from") == 125 -@pytest.mark.integration() +@pytest.mark.skip() def test_ontonotes_download(): from urllib.parse import urlparse @@ -992,6 +993,7 @@ def test_ontonotes_download(): assert all([res.scheme, res.netloc]) +@pytest.mark.skip() def test_ontonotes_extraction(tasks_base_path): import os import tempfile diff --git a/tests/test_datasets_biomedical.py b/tests/test_datasets_biomedical.py index 4099bb9288..0264b08394 100644 --- a/tests/test_datasets_biomedical.py +++ b/tests/test_datasets_biomedical.py @@ -1,21 +1,12 @@ -import inspect import logging import os import tempfile -from operator import itemgetter from pathlib import Path -from typing import Callable, List, Optional, Type +from typing import List, Optional -import pytest -from tqdm import tqdm - -import flair -from flair.data import Sentence, Token, _iter_dataset -from flair.datasets import ColumnCorpus, biomedical from flair.datasets.biomedical import ( CoNLLWriter, Entity, - HunerDataset, InternalBioNerDataset, filter_nested_entities, ) @@ -26,80 +17,6 @@ logger.propagate = True -def has_balanced_parantheses(text: str) -> bool: - stack = [] - opening = ["(", "[", "{"] - closing = [")", "]", "}"] - for c in text: - if c in opening: - stack.append(c) - elif c in closing: - if not stack: - return False - last_paren = stack.pop() - if opening.index(last_paren) != closing.index(c): - return False - - return len(stack) == 0 - - -def gene_predicate(member): - return inspect.isclass(member) and "HUNER_GENE_" in str(member) - - -def chemical_predicate(member): - return inspect.isclass(member) and "HUNER_CHEMICAL_" in str(member) - - -def disease_predicate(member): - return inspect.isclass(member) and "HUNER_DISEASE_" in str(member) - - -def species_predicate(member): - return inspect.isclass(member) and "HUNER_SPECIES_" in str(member) - - -def cellline_predicate(member): - return inspect.isclass(member) and "HUNER_CELL_LINE_" in str(member) - - -CELLLINE_DATASETS = [ - i[1] for i in sorted(inspect.getmembers(biomedical, predicate=cellline_predicate), key=itemgetter(0)) -] -CHEMICAL_DATASETS = [ - i[1] for i in sorted(inspect.getmembers(biomedical, predicate=chemical_predicate), key=itemgetter(0)) -] -DISEASE_DATASETS = [ - i[1] for i in sorted(inspect.getmembers(biomedical, predicate=disease_predicate), key=itemgetter(0)) -] -GENE_DATASETS = [i[1] for i in sorted(inspect.getmembers(biomedical, predicate=gene_predicate), key=itemgetter(0))] 
-SPECIES_DATASETS = [ - i[1] for i in sorted(inspect.getmembers(biomedical, predicate=species_predicate), key=itemgetter(0)) -] -ALL_DATASETS = CELLLINE_DATASETS + CHEMICAL_DATASETS + DISEASE_DATASETS + GENE_DATASETS + SPECIES_DATASETS - - -def simple_tokenizer(text: str) -> List[str]: - tokens: List[str] = [] - word = "" - index = -1 - for index, char in enumerate(text): - if char == " " or char == "-": - if len(word) > 0: - tokens.append(word) - - word = "" - else: - word += char - - # increment for last token in sentence if not followed by whitespace - index += 1 - if len(word) > 0: - tokens.append(word) - - return tokens - - def test_write_to_conll(): text = "This is entity1 entity2 and a long entity3" dataset = InternalBioNerDataset( @@ -220,163 +137,3 @@ def test_filter_nested_entities(caplog): sorted(entities, key=lambda x: str(x)), ): assert str(e1) == str(e2) - - -def sanity_check_all_corpora(check: Callable[[ColumnCorpus], None]): - for _, CorpusType in tqdm(ALL_DATASETS): - corpus = CorpusType() - check(corpus) - - -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -def test_sanity_not_starting_with_minus(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - entities_starting_with_minus = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - entities = sentence.get_spans("ner") - for entity in entities: - if str(entity.tokens[0].text).startswith("-"): - entities_starting_with_minus.append(" ".join([t.text for t in entity.tokens])) - - assert len(entities_starting_with_minus) == 0, "|".join(entities_starting_with_minus) - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_no_repeating_Bs(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - longest_repeat_tokens: List[Token] = [] - repeat_tokens: List[Token] = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - for token in sentence.tokens: - if token.get_labels()[0].value.startswith("B") or token.get_labels()[0].value.startswith("S"): - repeat_tokens.append(token) - else: - if len(repeat_tokens) > len(longest_repeat_tokens): - longest_repeat_tokens = repeat_tokens - repeat_tokens = [] - - assert len(longest_repeat_tokens) < 4 - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_no_long_entities(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - longest_entity: List[str] = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - entities = sentence.get_spans("ner") - for entity in entities: - if len(entity.tokens) > len(longest_entity): - longest_entity = [t.text for t in entity.tokens] - - assert len(longest_entity) < 10, " ".join(longest_entity) - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_no_unmatched_parentheses(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - unbalanced_entities = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - entities = sentence.get_spans("ner") - for entity in entities: - entity_text = "".join(t.text for t in entity.tokens) - if not 
has_balanced_parantheses(entity_text): - unbalanced_entities.append(entity_text) - - assert unbalanced_entities == [] - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_not_too_many_entities(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - n_entities_per_sentence = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - entities = sentence.get_spans("ner") - n_entities_per_sentence.append(len(entities)) - avg_entities_per_sentence = sum(n_entities_per_sentence) / len(n_entities_per_sentence) - - assert avg_entities_per_sentence <= 5 - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_no_misaligned_entities(CorpusType: Type[HunerDataset]): - dataset_name = CorpusType.__class__.__name__.lower() - base_path = flair.cache_root / "datasets" - data_folder = base_path / dataset_name - - corpus = CorpusType() - internal = corpus.to_internal(data_folder) - for doc_id, _doc_text in internal.documents.items(): - misaligned_starts = [] - misaligned_ends: List[int] = [] - - entities = internal.entities_per_document[doc_id] - entity_starts = [i.char_span.start for i in entities] - entity_ends = [i.char_span.stop for i in entities] - - for start in entity_starts: - if start not in entity_starts: - misaligned_starts.append(start) - - for end in entity_ends: - if end not in entity_ends: - misaligned_starts.append(end) - - assert len(misaligned_starts) <= len(entities) // 10 - assert len(misaligned_ends) <= len(entities) // 10 - - -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_scispacy_tokenization(): - from flair.tokenization import SciSpacyTokenizer - - spacy_tokenizer = SciSpacyTokenizer() - - sentence = Sentence("HBeAg(+) patients", use_tokenizer=spacy_tokenizer) - assert len(sentence) == 5 - assert sentence[0].text == "HBeAg" - assert sentence[0].start_position == 0 - assert sentence[1].text == "(" - assert sentence[1].start_position == 5 - assert sentence[2].text == "+" - assert sentence[2].start_position == 6 - assert sentence[3].text == ")" - assert sentence[3].start_position == 7 - assert sentence[4].text == "patients" - assert sentence[4].start_position == 9 - - sentence = Sentence("HBeAg(+)/HBsAg(+)", use_tokenizer=spacy_tokenizer) - assert len(sentence) == 9 - - assert sentence[0].text == "HBeAg" - assert sentence[0].start_position == 0 - assert sentence[1].text == "(" - assert sentence[1].start_position == 5 - assert sentence[2].text == "+" - assert sentence[2].start_position == 6 - assert sentence[3].text == ")" - assert sentence[3].start_position == 7 - assert sentence[4].text == "/" - assert sentence[4].start_position == 8 - assert sentence[5].text == "HBsAg" - assert sentence[5].start_position == 9 - assert sentence[6].text == "(" - assert sentence[6].start_position == 14 - assert sentence[7].text == "+" - assert sentence[7].start_position == 15 - assert sentence[8].text == ")" - assert sentence[8].start_position == 16 - - sentence = Sentence("doxorubicin (DOX)-induced", use_tokenizer=spacy_tokenizer) - - assert len(sentence) == 5 - assert sentence[0].text == "doxorubicin" - assert sentence[1].text == "(" - assert sentence[2].text == "DOX" - assert sentence[3].text == ")" - assert sentence[4].text == "-induced"