diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c801b96a70..5b3633d93e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,7 @@ jobs: uses: actions/cache@v3 with: path: ./cache - key: cache-v1.1 + key: cache-v1.2 - name: Run tests run: | python -c 'import flair' diff --git a/.gitignore b/.gitignore index cbd5be92d3..89a4bb39e7 100644 --- a/.gitignore +++ b/.gitignore @@ -110,4 +110,4 @@ resources/taggers/ regression_train/ /doc_build/ -scripts/ \ No newline at end of file +scripts/ diff --git a/docs/requirements.txt b/docs/requirements.txt index 8d7ae05d70..0e8c4f6141 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,4 +5,9 @@ sphinx importlib-metadata sphinx-multiversion pydata-sphinx-theme<0.14 -sphinx_design \ No newline at end of file +sphinx_design + +# previous dependencies that are required to build docs for later versions too. +semver +gensim +bpemb \ No newline at end of file diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 30f010df40..613ee639d0 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -24,6 +24,7 @@ ) from flair.datasets.base import find_train_dev_test_files from flair.file_utils import cached_path, unpack_file +from flair.tokenization import Tokenizer log = logging.getLogger("flair") @@ -41,6 +42,7 @@ def __init__( label_column_name: str = "label", metadata_column_name: str = "metadata", label_type: str = "ner", + use_tokenizer: Union[bool, Tokenizer] = True, **corpusargs, ) -> None: """Instantiates a MuliFileJsonlCorpus as, e.g., created with doccanos JSONL export. @@ -52,9 +54,12 @@ def __init__( :param train_files: the name of the train files :param test_files: the name of the test files :param dev_files: the name of the dev files, if empty, dev data is sampled from train + :param encoding: file encoding (default "utf-8") :param text_column_name: Name of the text column inside the jsonl files. :param label_column_name: Name of the label column inside the jsonl files. :param metadata_column_name: Name of the metadata column inside the jsonl files. + :param label_type: he type of label to predict (default "ner") + :param use_tokenizer: Specify a custom tokenizer to split the text into tokens. :raises RuntimeError: If no paths are given """ @@ -68,6 +73,7 @@ def __init__( metadata_column_name=metadata_column_name, label_type=label_type, encoding=encoding, + use_tokenizer=use_tokenizer, ) for train_file in train_files ] @@ -86,6 +92,8 @@ def __init__( label_column_name=label_column_name, metadata_column_name=metadata_column_name, label_type=label_type, + encoding=encoding, + use_tokenizer=use_tokenizer, ) for test_file in test_files ] @@ -104,6 +112,8 @@ def __init__( label_column_name=label_column_name, metadata_column_name=metadata_column_name, label_type=label_type, + encoding=encoding, + use_tokenizer=use_tokenizer, ) for dev_file in dev_files ] @@ -128,6 +138,7 @@ def __init__( label_type: str = "ner", autofind_splits: bool = True, name: Optional[str] = None, + use_tokenizer: Union[bool, Tokenizer] = True, **corpusargs, ) -> None: """Instantiates a JsonlCorpus with one file per Dataset (train, dev, and test). 
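The use_tokenizer parameter added above is passed through from MultiFileJsonlCorpus and JsonlCorpus down to each Sentence, so JSONL data can be split with any flair.tokenization.Tokenizer instead of the library default. A minimal usage sketch (the data folder below is hypothetical; if I read the Sentence API correctly, True keeps the default tokenizer and False falls back to plain whitespace splitting):

from flair.datasets import JsonlCorpus
from flair.tokenization import SpaceTokenizer

corpus = JsonlCorpus(
    "data/doccano_export",           # hypothetical folder with train/dev/test JSONL files
    text_column_name="text",
    label_column_name="label",
    use_tokenizer=SpaceTokenizer(),  # any Tokenizer instance, or True/False for the defaults
)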
@@ -136,11 +147,14 @@ def __init__( :param train_file: the name of the train file :param test_file: the name of the test file :param dev_file: the name of the dev file, if None, dev data is sampled from train + :param encoding: file encoding (default "utf-8") :param text_column_name: Name of the text column inside the JSONL file. :param label_column_name: Name of the label column inside the JSONL file. :param metadata_column_name: Name of the metadata column inside the JSONL file. + :param label_type: The type of label to predict (default "ner") :param autofind_splits: Whether train, test and dev file should be determined automatically :param name: name of the Corpus see flair.data.Corpus + :param use_tokenizer: Specify a custom tokenizer to split the text into tokens. """ # find train, dev and test files if not specified dev_file, test_file, train_file = find_train_dev_test_files( @@ -156,6 +170,7 @@ def __init__( label_type=label_type, name=name if data_folder is None else str(data_folder), encoding=encoding, + use_tokenizer=use_tokenizer, **corpusargs, ) @@ -169,6 +184,7 @@ def __init__( label_column_name: str = "label", metadata_column_name: str = "metadata", label_type: str = "ner", + use_tokenizer: Union[bool, Tokenizer] = True, ) -> None: """Instantiates a JsonlDataset and converts all annotated char spans to token tags using the IOB scheme. @@ -184,9 +200,12 @@ def __init__( Args: path_to_jsonl_file: File to read + encoding: file encoding (default "utf-8") text_column_name: Name of the text column label_column_name: Name of the label column metadata_column_name: Name of the metadata column + label_type: The type of label to predict (default "ner") + use_tokenizer: Specify a custom tokenizer to split the text into tokens. """ path_to_json_file = Path(path_to_jsonl_file) @@ -203,7 +222,7 @@ def __init__( raw_text = current_line[text_column_name] current_labels = current_line[label_column_name] current_metadatas = current_line.get(self.metadata_column_name, []) - current_sentence = Sentence(raw_text) + current_sentence = Sentence(raw_text, use_tokenizer=use_tokenizer) self._add_labels_to_sentence(raw_text, current_sentence, current_labels) self._add_metadatas_to_sentence(current_sentence, current_metadatas) @@ -310,6 +329,7 @@ def __init__( dev_files: the name of the dev files, if empty, dev data is sampled from train column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" to split only on tabs comment_symbol: if set, lines that begin with this symbol are treated as comments + encoding: file encoding (default "utf-8") document_separator_token: If provided, sentences that function as document boundaries are so marked skip_first_line: set to True if your dataset has a header line in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index fd46321e91..f3492178f9 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -12,7 +12,7 @@ import torch import transformers -from semver import Version +from packaging.version import Version from torch.jit import ScriptModule from transformers import ( CONFIG_MAPPING, @@ -65,7 +65,7 @@ def pad_sequence_embeddings(all_hidden_states: List[torch.Tensor]) -> torch.Tens @torch.jit.script_if_tracing def truncate_hidden_states(hidden_states: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor: - return hidden_states[:, :, : input_ids.size()[1]] + 
return hidden_states[:, :, : input_ids.size(1)] @torch.jit.script_if_tracing @@ -95,14 +95,12 @@ def combine_strided_tensors( if selected_sentences.size(0) > 1: start_part = selected_sentences[0, : half_stride + 1] mid_part = selected_sentences[:, half_stride + 1 : max_length - 1 - half_stride] - mid_part = torch.reshape(mid_part, (mid_part.shape[0] * mid_part.shape[1],) + mid_part.shape[2:]) - end_part = selected_sentences[selected_sentences.shape[0] - 1, max_length - half_stride - 1 :] + mid_part = torch.reshape(mid_part, (mid_part.size(0) * mid_part.size(1),) + mid_part.size()[2:]) + end_part = selected_sentences[selected_sentences.size(0) - 1, max_length - half_stride - 1 :] sentence_hidden_state = torch.cat((start_part, mid_part, end_part), dim=0) - sentence_hidden_states[sentence_id, : sentence_hidden_state.shape[0]] = torch.cat( - (start_part, mid_part, end_part), dim=0 - ) + sentence_hidden_states[sentence_id, : sentence_hidden_state.size(0)] = sentence_hidden_state else: - sentence_hidden_states[sentence_id, : selected_sentences.shape[1]] = selected_sentences[0, :] + sentence_hidden_states[sentence_id, : selected_sentences.size(1)] = selected_sentences[0, :] return sentence_hidden_states @@ -171,11 +169,30 @@ def fill_mean_token_embeddings( word_ids: torch.Tensor, token_lengths: torch.Tensor, ): - for i in torch.arange(all_token_embeddings.shape[0]): - for _id in torch.arange(token_lengths[i]): # type: ignore[call-overload] - all_token_embeddings[i, _id, :] = torch.nan_to_num( - sentence_hidden_states[i][word_ids[i] == _id].mean(dim=0) - ) + batch_size, max_tokens, embedding_dim = all_token_embeddings.shape + mask = word_ids >= 0 + + # sum embeddings for each token + all_token_embeddings.scatter_add_( + 1, + word_ids.clamp(min=0).unsqueeze(-1).expand(-1, -1, embedding_dim), + sentence_hidden_states * mask.unsqueeze(-1).float(), + ) + + # calculate the mean of subtokens + subtoken_counts = torch.zeros_like(all_token_embeddings[:, :, 0]) + subtoken_counts.scatter_add_(1, word_ids.clamp(min=0), mask.float()) + all_token_embeddings = torch.where( + subtoken_counts.unsqueeze(-1) > 0, + all_token_embeddings / subtoken_counts.unsqueeze(-1), + torch.zeros_like(all_token_embeddings), + ) + + # Create a mask for valid tokens based on token_lengths + token_mask = torch.arange(max_tokens, device=token_lengths.device)[None, :] < token_lengths[:, None] + all_token_embeddings = all_token_embeddings * token_mask.unsqueeze(-1) + all_token_embeddings = torch.nan_to_num(all_token_embeddings) + return all_token_embeddings @@ -1056,7 +1073,7 @@ def __init__( model, add_prefix_space=True, **transformers_tokenizer_kwargs, **kwargs ) try: - self.feature_extractor = AutoFeatureExtractor.from_pretrained(model, apply_ocr=False) + self.feature_extractor = AutoFeatureExtractor.from_pretrained(model, apply_ocr=False, **kwargs) except OSError: self.feature_extractor = None else: @@ -1222,7 +1239,7 @@ def embedding_length(self) -> int: def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): - if transformers.__version__ >= Version(4, 31, 0): + if Version(transformers.__version__) >= Version("4.31.0"): assert isinstance(state_dict, dict) state_dict.pop(f"{prefix}model.embeddings.position_ids", None) super()._load_from_state_dict( @@ -1307,7 +1324,7 @@ def __setstate__(self, state): self.__dict__[key] = embedding.__dict__[key] if model_state_dict: - if transformers.__version__ >= Version(4, 31, 0): + if Version(transformers.__version__) >= 
Version("4.31.0"): model_state_dict.pop("embeddings.position_ids", None) self.model.load_state_dict(model_state_dict) diff --git a/flair/file_utils.py b/flair/file_utils.py index f7f20a20f3..7a0118822b 100644 --- a/flair/file_utils.py +++ b/flair/file_utils.py @@ -239,7 +239,7 @@ def get_from_cache(url: str, cache_dir: Path) -> Path: return cache_path # make HEAD request to check ETag - response = requests.head(url, headers={"User-Agent": "Flair"}, allow_redirects=True) + response = requests.head(url, headers={"User-Agent": "Flair"}, allow_redirects=True, proxies=url_proxies) if response.status_code != 200: raise OSError(f"HEAD request failed for url {url} with status code {response.status_code}.") diff --git a/flair/models/entity_mention_linking.py b/flair/models/entity_mention_linking.py index 25a67cde8e..cecf2d9e57 100644 --- a/flair/models/entity_mention_linking.py +++ b/flair/models/entity_mention_linking.py @@ -1056,12 +1056,13 @@ def evaluate( embedding_storage_mode: str = "none", mini_batch_size: int = 32, main_evaluation_metric: Tuple[str, str] = ("accuracy", "f1-score"), - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, gold_label_dictionary: Optional[Dictionary] = None, return_loss: bool = True, k: int = 1, **kwargs, ) -> Result: + exclude_labels = exclude_labels if exclude_labels is not None else [] if gold_label_dictionary is not None: raise NotImplementedError("evaluating an EntityMentionLinker with a gold_label_dictionary is not supported") diff --git a/flair/models/pairwise_regression_model.py b/flair/models/pairwise_regression_model.py index c657a4fe6d..c3f34e0f69 100644 --- a/flair/models/pairwise_regression_model.py +++ b/flair/models/pairwise_regression_model.py @@ -1,6 +1,5 @@ -import typing from pathlib import Path -from typing import Any, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import torch from torch import nn @@ -12,7 +11,7 @@ from flair.data import Corpus, Dictionary, Sentence, TextPair, _iter_dataset from flair.datasets import DataLoader, FlairDatapointDataset from flair.nn.model import ReduceTransformerVocabMixin -from flair.training_utils import MetricRegression, Result, store_embeddings +from flair.training_utils import EmbeddingStorageMode, MetricRegression, Result, store_embeddings class TextPairRegressor(flair.nn.Model[TextPair], ReduceTransformerVocabMixin): @@ -91,7 +90,7 @@ def label_type(self): def get_used_tokens( self, corpus: Corpus, context_length: int = 0, respect_document_boundaries: bool = True - ) -> typing.Iterable[List[str]]: + ) -> Iterable[List[str]]: for sentence_pair in _iter_dataset(corpus.get_all_sentences()): yield [t.text for t in sentence_pair.first] yield [t.text for t in sentence_pair.first.left_context(context_length, respect_document_boundaries)] @@ -204,10 +203,16 @@ def _get_state_dict(self): return model_state @classmethod - def _init_model_with_state_dict(cls, state, **kwargs): - # add DefaultClassifier arguments + def _init_model_with_state_dict(cls, state: Dict[str, Any], **kwargs): + """Initializes a TextPairRegressor model from a state dictionary (exported by _get_state_dict). + + Requires keys 'state_dict', 'document_embeddings', and 'label_type' in the state dictionary. 
+ """ + if "document_embeddings" in state: + state["embeddings"] = state.pop("document_embeddings") # need to rename this parameter + # add Model arguments for arg in [ - "document_embeddings", + "embeddings", "label_type", "embed_separately", "dropout", @@ -276,14 +281,15 @@ def evaluate( data_points: Union[List[TextPair], Dataset], gold_label_type: str, out_path: Union[str, Path, None] = None, - embedding_storage_mode: str = "none", + embedding_storage_mode: EmbeddingStorageMode = "none", mini_batch_size: int = 32, - main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), - exclude_labels: List[str] = [], + main_evaluation_metric: Tuple[str, str] = ("correlation", "pearson"), + exclude_labels: Optional[List[str]] = None, gold_label_dictionary: Optional[Dictionary] = None, return_loss: bool = True, **kwargs, ) -> Result: + exclude_labels = exclude_labels if exclude_labels is not None else [] # read Dataset into data loader, if list of sentences passed, make Dataset first if not isinstance(data_points, Dataset): data_points = FlairDatapointDataset(data_points) diff --git a/flair/models/relation_classifier_model.py b/flair/models/relation_classifier_model.py index 468196baf0..53ccabac36 100644 --- a/flair/models/relation_classifier_model.py +++ b/flair/models/relation_classifier_model.py @@ -34,6 +34,7 @@ from flair.datasets import DataLoader, FlairDatapointDataset from flair.embeddings import DocumentEmbeddings, TransformerDocumentEmbeddings from flair.tokenization import SpaceTokenizer +from flair.training_utils import EmbeddingStorageMode logger: logging.Logger = logging.getLogger("flair") @@ -602,7 +603,7 @@ def predict( verbose: bool = False, label_name: Optional[str] = None, return_loss: bool = False, - embedding_storage_mode: str = "none", + embedding_storage_mode: EmbeddingStorageMode = "none", ) -> Optional[Tuple[torch.Tensor, int]]: """Predicts the class labels for the given sentence(s). 
diff --git a/flair/models/text_regression_model.py b/flair/models/text_regression_model.py index 351dc9bed8..894ce3087e 100644 --- a/flair/models/text_regression_model.py +++ b/flair/models/text_regression_model.py @@ -14,7 +14,7 @@ from flair.datasets import DataLoader, FlairDatapointDataset from flair.embeddings.base import load_embeddings from flair.nn.model import ReduceTransformerVocabMixin -from flair.training_utils import MetricRegression, Result, store_embeddings +from flair.training_utils import EmbeddingStorageMode, MetricRegression, Result, store_embeddings log = logging.getLogger("flair") @@ -78,7 +78,7 @@ def predict( mini_batch_size: int = 32, verbose: bool = False, label_name: Optional[str] = None, - embedding_storage_mode="none", + embedding_storage_mode: EmbeddingStorageMode = "none", ) -> List[Sentence]: if label_name is None: label_name = self.label_name if self.label_name is not None else "label" @@ -135,14 +135,15 @@ def evaluate( data_points: Union[List[Sentence], Dataset], gold_label_type: str, out_path: Optional[Union[str, Path]] = None, - embedding_storage_mode: str = "none", + embedding_storage_mode: EmbeddingStorageMode = "none", mini_batch_size: int = 32, main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, gold_label_dictionary: Optional[Dictionary] = None, return_loss: bool = True, **kwargs, ) -> Result: + exclude_labels = exclude_labels if exclude_labels is not None else [] # read Dataset into data loader, if list of sentences passed, make Dataset first if not isinstance(data_points, Dataset): data_points = FlairDatapointDataset(data_points) diff --git a/flair/nn/model.py b/flair/nn/model.py index 88f51f443b..eeb5b7c84a 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -8,6 +8,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union import torch.nn +from torch import Tensor from torch.nn.modules.loss import _Loss from torch.utils.data.dataset import Dataset from tqdm import tqdm @@ -19,7 +20,7 @@ from flair.embeddings import Embeddings from flair.embeddings.base import load_embeddings from flair.file_utils import Tqdm, load_torch_state -from flair.training_utils import Result, store_embeddings +from flair.training_utils import EmbeddingStorageMode, Result, store_embeddings log = logging.getLogger("flair") @@ -34,7 +35,7 @@ class Model(torch.nn.Module, typing.Generic[DT], ABC): @property @abstractmethod - def label_type(self): + def label_type(self) -> str: """Each model predicts labels of a certain type.""" raise NotImplementedError @@ -52,10 +53,10 @@ def evaluate( data_points: Union[List[DT], Dataset], gold_label_type: str, out_path: Optional[Union[str, Path]] = None, - embedding_storage_mode: str = "none", + embedding_storage_mode: EmbeddingStorageMode = "none", mini_batch_size: int = 32, main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, gold_label_dictionary: Optional[Dictionary] = None, return_loss: bool = True, **kwargs, @@ -80,19 +81,18 @@ def evaluate( Returns: The evaluation results. 
""" + exclude_labels = exclude_labels if exclude_labels is not None else [] raise NotImplementedError - def _get_state_dict(self): + def _get_state_dict(self) -> Dict: """Returns the state dictionary for this model.""" - state_dict = {"state_dict": self.state_dict()} - # Always include the name of the Model class for which the state dict holds - state_dict["__cls__"] = self.__class__.__name__ + state_dict = {"state_dict": self.state_dict(), "__cls__": self.__class__.__name__} return state_dict @classmethod - def _init_model_with_state_dict(cls, state, **kwargs): + def _init_model_with_state_dict(cls, state: Dict[str, Any], **kwargs): """Initialize the model from a state dictionary.""" if "embeddings" in kwargs: embeddings = kwargs.pop("embeddings") @@ -107,10 +107,11 @@ def _init_model_with_state_dict(cls, state, **kwargs): return model @staticmethod - def _fetch_model(model_name) -> str: + def _fetch_model(model_name): + # this seems to just return model name, not a model with that name return model_name - def save(self, model_file: Union[str, Path], checkpoint: bool = False): + def save(self, model_file: Union[str, Path], checkpoint: bool = False) -> None: """Saves the current model to the provided file. Args: @@ -146,8 +147,9 @@ def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "Model": new_model_path = model_cls._fetch_model(model_path) if new_model_path != model_path: return model_cls.load(new_model_path) - except Exception: - # skip any invalid loadings, e.g. not found on huggingface hub + except Exception as e: + log.debug(e) + # skip any invalid loadings, e.g. not found on HuggingFace hub continue # if the model cannot be fetched, load as a file @@ -172,13 +174,12 @@ def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "Model": # older (flair 11.3 and below) models do not contain cls information. In this case, try all subclasses for model_cls in subclasses: - # if str(model_cls) == "": continue try: model = model_cls.load(state) return model except Exception as e: print(e) - # skip any invalid loadings, e.g. not found on huggingface hub + # skip any invalid loadings, e.g. not found on HuggingFace hub continue raise ValueError(f"Could not find any model with name '{model_path}'") @@ -253,14 +254,16 @@ def evaluate( data_points: Union[List[DT], Dataset], gold_label_type: str, out_path: Optional[Union[str, Path]] = None, - embedding_storage_mode: str = "none", + embedding_storage_mode: EmbeddingStorageMode = "none", mini_batch_size: int = 32, main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, gold_label_dictionary: Optional[Dictionary] = None, return_loss: bool = True, **kwargs, ) -> Result: + exclude_labels = exclude_labels if exclude_labels is not None else [] + import numpy as np import sklearn @@ -351,7 +354,7 @@ def evaluate( predicted_values_span_aligned = [] for span in all_spans: list_of_gold_values_for_span = all_true_values.get(span, ["O"]) - # delete exluded labels if exclude_labels is given + # delete excluded labels if exclude_labels is given for excluded_label in exclude_labels: if excluded_label in list_of_gold_values_for_span: list_of_gold_values_for_span.remove(excluded_label) @@ -445,7 +448,7 @@ def evaluate( labels=labels, ) - # compute accuracy separately as it is not always in classification_report (e.. when micro avg exists) + # compute accuracy separately as it is not always in classification_report (e.g. 
when micro avg exists) accuracy_score = round(sklearn.metrics.accuracy_score(y_true, y_pred), 4) # if there is only one label, then "micro avg" = "macro avg" @@ -456,10 +459,13 @@ def evaluate( # Otherwise, it is identical to the "macro avg". In this case, we add it to the report. if "micro avg" not in classification_report_dict: classification_report_dict["micro avg"] = {} - for precision_recall_f1 in classification_report_dict["macro avg"]: - classification_report_dict["micro avg"][precision_recall_f1] = classification_report_dict[ - "accuracy" - ] + for metric_key in classification_report_dict["macro avg"]: + if metric_key != "support": + classification_report_dict["micro avg"][metric_key] = classification_report_dict["accuracy"] + else: + classification_report_dict["micro avg"][metric_key] = classification_report_dict["macro avg"][ + "support" + ] detailed_result = ( "\nResults:" @@ -513,8 +519,8 @@ def predict( return_probabilities_for_all_classes: bool = False, verbose: bool = False, label_name: Optional[str] = None, - return_loss=False, - embedding_storage_mode="none", + return_loss: bool = False, + embedding_storage_mode: EmbeddingStorageMode = "none", ): """Predicts the class labels for the given sentences. @@ -531,7 +537,7 @@ def predict( """ raise NotImplementedError - def _print_predictions(self, batch, gold_label_type): + def _print_predictions(self, batch: List[DT], gold_label_type: str) -> List[str]: lines = [] for datapoint in batch: # check if there is a label mismatch @@ -691,7 +697,7 @@ def multi_label_threshold(self, x): # setter method if "default" in x: self._multi_label_threshold = x else: - raise Exception('multi_label_threshold dict should have a "default" key') + raise ValueError('multi_label_threshold dict should have a "default" key') else: self._multi_label_threshold = {"default": x} @@ -720,7 +726,7 @@ def _prepare_label_tensor(self, prediction_data_points: List[DT2]) -> torch.Tens device=flair.device, ) - def _encode_data_points(self, sentences: List[DT], data_points: List[DT2]): + def _encode_data_points(self, sentences: List[DT], data_points: List[DT2]) -> Tensor: # embed sentences if self.should_embed_sentence: self.embeddings.embed(sentences) @@ -737,7 +743,8 @@ def _encode_data_points(self, sentences: List[DT], data_points: List[DT2]): return data_point_tensor - def _mask_scores(self, scores, data_points): + def _mask_scores(self, scores: Tensor, data_points) -> Tensor: + """Classes that inherit from DefaultClassifier may optionally mask scores.""" return scores def forward_loss(self, sentences: List[DT]) -> Tuple[torch.Tensor, int]: @@ -791,8 +798,8 @@ def predict( return_probabilities_for_all_classes: bool = False, verbose: bool = False, label_name: Optional[str] = None, - return_loss=False, - embedding_storage_mode="none", + return_loss: bool = False, + embedding_storage_mode: EmbeddingStorageMode = "none", ): """Predicts the class labels for the given sentences. The labels are directly added to the sentences. 
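A recurring change in this diff replaces mutable default arguments such as exclude_labels: List[str] = [] with Optional[List[str]] = None plus a None check at the top of the function body. The toy functions below (not Flair code) show the pitfall being avoided: a default list is created once at definition time and then shared, and mutated, across calls.

def buggy(labels=[]):
    # the same list object is reused for every call that omits the argument
    labels.append("O")
    return labels

def fixed(labels=None):
    labels = labels if labels is not None else []  # fresh list per call
    labels.append("O")
    return labels

a = buggy()
b = buggy()
print(a is b, b)          # True ['O', 'O'] -- state leaked between calls
print(fixed(), fixed())   # ['O'] ['O']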
diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index fb8590841b..03e6edc083 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -32,7 +32,7 @@ TrainingInterrupt, WeightExtractorPlugin, ) -from flair.training_utils import identify_dynamic_embeddings, log_line, store_embeddings +from flair.training_utils import EmbeddingStorageMode, identify_dynamic_embeddings, log_line, store_embeddings log = logging.getLogger("flair") @@ -147,13 +147,13 @@ def train( monitor_train_sample: float = 0.0, use_final_model_for_eval: bool = False, gold_label_dictionary_for_eval: Optional[Dictionary] = None, - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, # sampling and shuffling sampler=None, shuffle: bool = True, shuffle_first_epoch: bool = True, # evaluation and monitoring - embeddings_storage_mode: str = "cpu", + embeddings_storage_mode: EmbeddingStorageMode = "cpu", epoch: int = 0, # when and what to save save_final_model: bool = True, @@ -168,6 +168,7 @@ def train( attach_default_scheduler: bool = True, **kwargs, ): + exclude_labels = exclude_labels if exclude_labels is not None else [] if plugins is None: plugins = [] @@ -220,13 +221,13 @@ def fine_tune( monitor_train_sample: float = 0.0, use_final_model_for_eval: bool = True, gold_label_dictionary_for_eval: Optional[Dictionary] = None, - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, # sampling and shuffling sampler=None, shuffle: bool = True, shuffle_first_epoch: bool = True, # evaluation and monitoring - embeddings_storage_mode: str = "none", + embeddings_storage_mode: EmbeddingStorageMode = "none", epoch: int = 0, # when and what to save save_final_model: bool = True, @@ -243,6 +244,7 @@ def fine_tune( attach_default_scheduler: bool = True, **kwargs, ): + exclude_labels = exclude_labels if exclude_labels is not None else [] # annealing logic if plugins is None: plugins = [] @@ -313,13 +315,13 @@ def train_custom( monitor_train_sample: float = 0.0, use_final_model_for_eval: bool = False, gold_label_dictionary_for_eval: Optional[Dictionary] = None, - exclude_labels: List[str] = [], + exclude_labels: Optional[List[str]] = None, # sampling and shuffling sampler: Optional[FlairSampler] = None, shuffle: bool = True, shuffle_first_epoch: bool = True, # evaluation and monitoring - embeddings_storage_mode: str = "cpu", + embeddings_storage_mode: EmbeddingStorageMode = "cpu", epoch: int = 0, # when and what to save save_final_model: bool = True, @@ -332,7 +334,7 @@ def train_custom( # amp use_amp: bool = False, # plugins - plugins: List[TrainerPlugin] = [], + plugins: Optional[List[TrainerPlugin]] = None, **kwargs, ) -> dict: """Trains any class that implements the flair.nn.Model interface. @@ -381,6 +383,9 @@ def train_custom( A dictionary with at least the key "test_score" containing the final evaluation score. 
Some plugins add additional information to this dictionary, such as the :class:`flair.trainers.plugins.MetricHistoryPlugin` """ + exclude_labels = exclude_labels if exclude_labels is not None else [] + plugins = plugins if plugins is not None else [] + # Create output folder base_path = Path(base_path) base_path.mkdir(exist_ok=True, parents=True) diff --git a/flair/training_utils.py b/flair/training_utils.py index 2c3ce9d5f4..0b4ef91cbf 100644 --- a/flair/training_utils.py +++ b/flair/training_utils.py @@ -5,7 +5,7 @@ from functools import reduce from math import inf from pathlib import Path -from typing import Dict, List, Optional, Union +from typing import Dict, List, Literal, Optional, Union from scipy.stats import pearsonr, spearmanr from sklearn.metrics import mean_absolute_error, mean_squared_error @@ -15,6 +15,7 @@ import flair from flair.data import DT, Dictionary, Sentence, _iter_dataset +EmbeddingStorageMode = Literal["none", "cpu", "gpu"] log = logging.getLogger("flair") @@ -23,10 +24,11 @@ def __init__( self, main_score: float, detailed_results: str, - classification_report: dict = {}, - scores: dict = {}, + classification_report: Optional[dict] = None, + scores: Optional[Dict] = None, ) -> None: - assert "loss" in scores, "No loss provided." + classification_report = classification_report if classification_report is not None else {} + assert scores is not None and "loss" in scores, "No loss provided." self.main_score: float = main_score self.scores = scores @@ -363,7 +365,9 @@ def add_file_handler(log, output_file): def store_embeddings( - data_points: Union[List[DT], Dataset], storage_mode: str, dynamic_embeddings: Optional[List[str]] = None + data_points: Union[List[DT], Dataset], + storage_mode: EmbeddingStorageMode, + dynamic_embeddings: Optional[List[str]] = None, ): if isinstance(data_points, Dataset): data_points = list(_iter_dataset(data_points)) @@ -387,7 +391,7 @@ def store_embeddings( data_point.to("cpu", pin_memory=pin_memory) -def identify_dynamic_embeddings(data_points: List[DT]): +def identify_dynamic_embeddings(data_points: List[DT]) -> Optional[List]: dynamic_embeddings = [] all_embeddings = [] for data_point in data_points: diff --git a/pyproject.toml b/pyproject.toml index b3ed1eaedd..78d1692a09 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ filterwarnings = [ 'ignore:distutils Version classes are deprecated.', # faiss uses deprecated distutils. 'ignore:`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.', # transformers calls deprecated hf_hub "ignore:`torch.cuda.amp.GradScaler", # GradScaler changes in torch 2.3.0 but we want to be backwards compatible. + "ignore:`clean_up_tokenization_spaces` was not set", # Default behavior changes in transformers v4.45, raising irrelevant FutureWarning for serialized models. 
] markers = [ "integration", diff --git a/requirements.txt b/requirements.txt index 7f159bcd14..bb5ecafd45 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,5 +22,4 @@ tqdm>=4.63.0 transformer-smaller-training-vocab>=0.2.3 transformers[sentencepiece]>=4.18.0,<5.0.0 wikipedia-api>=0.5.7 -semver<4.0.0,>=3.0.0 bioc<3.0.0,>=2.0.0 diff --git a/tests/models/test_tars_classifier.py b/tests/models/test_tars_classifier.py index 818ead4a39..20bff503bd 100644 --- a/tests/models/test_tars_classifier.py +++ b/tests/models/test_tars_classifier.py @@ -12,7 +12,7 @@ class TestTarsClassifier(BaseModelTest): train_label_type = "class" model_args = {"task_name": "2_CLASS"} training_args = {"mini_batch_size": 1, "max_epochs": 2} - pretrained_model = "tars-base" + # pretrained_model = "tars-base" # disabled due to too much space requirements. @pytest.fixture() def corpus(self, tasks_base_path): diff --git a/tests/models/test_tars_ner.py b/tests/models/test_tars_ner.py index 57c4b8cc9c..dd510f8fa4 100644 --- a/tests/models/test_tars_ner.py +++ b/tests/models/test_tars_ner.py @@ -12,7 +12,7 @@ class TestTarsTagger(BaseModelTest): train_label_type = "ner" model_args = {"task_name": "2_NER"} training_args = {"mini_batch_size": 1, "max_epochs": 2} - pretrained_model = "tars-ner" + # pretrained_model = "tars-ner" # disabled due to too much space requirements. @pytest.fixture() def corpus(self, tasks_base_path): diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 1f54a9ed91..2c76b40678 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -436,7 +436,7 @@ def test_write_to_and_load_from_directory(tasks_base_path): assert loaded_corpus.train[0].to_tagged_string() == corpus.train[0].to_tagged_string() -@pytest.mark.integration() +@pytest.mark.skip() def test_hipe_2022_corpus(tasks_base_path): # This test covers the complete HIPE 2022 dataset. # https://github.com/hipe-eval/HIPE-2022-data @@ -700,7 +700,7 @@ def test_hipe_2022(dataset_version="v2.1", add_document_separator=True): test_hipe_2022(dataset_version="v2.1", add_document_separator=False) -@pytest.mark.integration() +@pytest.mark.skip() def test_icdar_europeana_corpus(tasks_base_path): # This test covers the complete ICDAR Europeana corpus: # https://github.com/stefan-it/historic-domain-adaptation-icdar @@ -718,7 +718,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str): check_number_sentences(len(corpus.test), gold_stats[language]["test"], "test") -@pytest.mark.integration() +@pytest.mark.skip() def test_masakhane_corpus(tasks_base_path): # This test covers the complete MasakhaNER dataset, including support for v1 and v2. supported_versions = ["v1", "v2"] @@ -802,7 +802,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version) -@pytest.mark.integration() +@pytest.mark.skip() def test_nermud_corpus(tasks_base_path): # This test covers the NERMuD dataset. 
Official stats can be found here: # https://github.com/dhfbk/KIND/tree/main/evalita-2023 @@ -821,6 +821,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str): check_number_sentences(len(corpus.dev), stats["dev"], "dev") +@pytest.mark.skip() def test_german_ler_corpus(tasks_base_path): corpus = flair.datasets.NER_GERMAN_LEGAL() @@ -830,7 +831,7 @@ def test_german_ler_corpus(tasks_base_path): assert len(corpus.test) == 6673, "Mismatch in number of sentences for test split" -@pytest.mark.integration() +@pytest.mark.skip() def test_masakha_pos_corpus(tasks_base_path): # This test covers the complete MasakhaPOS dataset. supported_versions = ["v1"] @@ -899,7 +900,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version) -@pytest.mark.integration() +@pytest.mark.skip() def test_german_mobie(tasks_base_path): corpus = flair.datasets.NER_GERMAN_MOBIE() @@ -984,7 +985,7 @@ def test_jsonl_corpus_loads_metadata(tasks_base_path): assert dataset.sentences[2].get_metadata("from") == 125 -@pytest.mark.integration() +@pytest.mark.skip() def test_ontonotes_download(): from urllib.parse import urlparse @@ -992,6 +993,7 @@ def test_ontonotes_download(): assert all([res.scheme, res.netloc]) +@pytest.mark.skip() def test_ontonotes_extraction(tasks_base_path): import os import tempfile diff --git a/tests/test_datasets_biomedical.py b/tests/test_datasets_biomedical.py index 4099bb9288..0264b08394 100644 --- a/tests/test_datasets_biomedical.py +++ b/tests/test_datasets_biomedical.py @@ -1,21 +1,12 @@ -import inspect import logging import os import tempfile -from operator import itemgetter from pathlib import Path -from typing import Callable, List, Optional, Type +from typing import List, Optional -import pytest -from tqdm import tqdm - -import flair -from flair.data import Sentence, Token, _iter_dataset -from flair.datasets import ColumnCorpus, biomedical from flair.datasets.biomedical import ( CoNLLWriter, Entity, - HunerDataset, InternalBioNerDataset, filter_nested_entities, ) @@ -26,80 +17,6 @@ logger.propagate = True -def has_balanced_parantheses(text: str) -> bool: - stack = [] - opening = ["(", "[", "{"] - closing = [")", "]", "}"] - for c in text: - if c in opening: - stack.append(c) - elif c in closing: - if not stack: - return False - last_paren = stack.pop() - if opening.index(last_paren) != closing.index(c): - return False - - return len(stack) == 0 - - -def gene_predicate(member): - return inspect.isclass(member) and "HUNER_GENE_" in str(member) - - -def chemical_predicate(member): - return inspect.isclass(member) and "HUNER_CHEMICAL_" in str(member) - - -def disease_predicate(member): - return inspect.isclass(member) and "HUNER_DISEASE_" in str(member) - - -def species_predicate(member): - return inspect.isclass(member) and "HUNER_SPECIES_" in str(member) - - -def cellline_predicate(member): - return inspect.isclass(member) and "HUNER_CELL_LINE_" in str(member) - - -CELLLINE_DATASETS = [ - i[1] for i in sorted(inspect.getmembers(biomedical, predicate=cellline_predicate), key=itemgetter(0)) -] -CHEMICAL_DATASETS = [ - i[1] for i in sorted(inspect.getmembers(biomedical, predicate=chemical_predicate), key=itemgetter(0)) -] -DISEASE_DATASETS = [ - i[1] for i in sorted(inspect.getmembers(biomedical, predicate=disease_predicate), key=itemgetter(0)) -] -GENE_DATASETS = [i[1] for i in sorted(inspect.getmembers(biomedical, predicate=gene_predicate), key=itemgetter(0))] 
-SPECIES_DATASETS = [ - i[1] for i in sorted(inspect.getmembers(biomedical, predicate=species_predicate), key=itemgetter(0)) -] -ALL_DATASETS = CELLLINE_DATASETS + CHEMICAL_DATASETS + DISEASE_DATASETS + GENE_DATASETS + SPECIES_DATASETS - - -def simple_tokenizer(text: str) -> List[str]: - tokens: List[str] = [] - word = "" - index = -1 - for index, char in enumerate(text): - if char == " " or char == "-": - if len(word) > 0: - tokens.append(word) - - word = "" - else: - word += char - - # increment for last token in sentence if not followed by whitespace - index += 1 - if len(word) > 0: - tokens.append(word) - - return tokens - - def test_write_to_conll(): text = "This is entity1 entity2 and a long entity3" dataset = InternalBioNerDataset( @@ -220,163 +137,3 @@ def test_filter_nested_entities(caplog): sorted(entities, key=lambda x: str(x)), ): assert str(e1) == str(e2) - - -def sanity_check_all_corpora(check: Callable[[ColumnCorpus], None]): - for _, CorpusType in tqdm(ALL_DATASETS): - corpus = CorpusType() - check(corpus) - - -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -def test_sanity_not_starting_with_minus(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - entities_starting_with_minus = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - entities = sentence.get_spans("ner") - for entity in entities: - if str(entity.tokens[0].text).startswith("-"): - entities_starting_with_minus.append(" ".join([t.text for t in entity.tokens])) - - assert len(entities_starting_with_minus) == 0, "|".join(entities_starting_with_minus) - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_no_repeating_Bs(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - longest_repeat_tokens: List[Token] = [] - repeat_tokens: List[Token] = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - for token in sentence.tokens: - if token.get_labels()[0].value.startswith("B") or token.get_labels()[0].value.startswith("S"): - repeat_tokens.append(token) - else: - if len(repeat_tokens) > len(longest_repeat_tokens): - longest_repeat_tokens = repeat_tokens - repeat_tokens = [] - - assert len(longest_repeat_tokens) < 4 - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_no_long_entities(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - longest_entity: List[str] = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - entities = sentence.get_spans("ner") - for entity in entities: - if len(entity.tokens) > len(longest_entity): - longest_entity = [t.text for t in entity.tokens] - - assert len(longest_entity) < 10, " ".join(longest_entity) - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_no_unmatched_parentheses(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - unbalanced_entities = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - entities = sentence.get_spans("ner") - for entity in entities: - entity_text = "".join(t.text for t in entity.tokens) - if not 
has_balanced_parantheses(entity_text): - unbalanced_entities.append(entity_text) - - assert unbalanced_entities == [] - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_not_too_many_entities(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - n_entities_per_sentence = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - entities = sentence.get_spans("ner") - n_entities_per_sentence.append(len(entities)) - avg_entities_per_sentence = sum(n_entities_per_sentence) / len(n_entities_per_sentence) - - assert avg_entities_per_sentence <= 5 - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_no_misaligned_entities(CorpusType: Type[HunerDataset]): - dataset_name = CorpusType.__class__.__name__.lower() - base_path = flair.cache_root / "datasets" - data_folder = base_path / dataset_name - - corpus = CorpusType() - internal = corpus.to_internal(data_folder) - for doc_id, _doc_text in internal.documents.items(): - misaligned_starts = [] - misaligned_ends: List[int] = [] - - entities = internal.entities_per_document[doc_id] - entity_starts = [i.char_span.start for i in entities] - entity_ends = [i.char_span.stop for i in entities] - - for start in entity_starts: - if start not in entity_starts: - misaligned_starts.append(start) - - for end in entity_ends: - if end not in entity_ends: - misaligned_starts.append(end) - - assert len(misaligned_starts) <= len(entities) // 10 - assert len(misaligned_ends) <= len(entities) // 10 - - -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_scispacy_tokenization(): - from flair.tokenization import SciSpacyTokenizer - - spacy_tokenizer = SciSpacyTokenizer() - - sentence = Sentence("HBeAg(+) patients", use_tokenizer=spacy_tokenizer) - assert len(sentence) == 5 - assert sentence[0].text == "HBeAg" - assert sentence[0].start_position == 0 - assert sentence[1].text == "(" - assert sentence[1].start_position == 5 - assert sentence[2].text == "+" - assert sentence[2].start_position == 6 - assert sentence[3].text == ")" - assert sentence[3].start_position == 7 - assert sentence[4].text == "patients" - assert sentence[4].start_position == 9 - - sentence = Sentence("HBeAg(+)/HBsAg(+)", use_tokenizer=spacy_tokenizer) - assert len(sentence) == 9 - - assert sentence[0].text == "HBeAg" - assert sentence[0].start_position == 0 - assert sentence[1].text == "(" - assert sentence[1].start_position == 5 - assert sentence[2].text == "+" - assert sentence[2].start_position == 6 - assert sentence[3].text == ")" - assert sentence[3].start_position == 7 - assert sentence[4].text == "/" - assert sentence[4].start_position == 8 - assert sentence[5].text == "HBsAg" - assert sentence[5].start_position == 9 - assert sentence[6].text == "(" - assert sentence[6].start_position == 14 - assert sentence[7].text == "+" - assert sentence[7].start_position == 15 - assert sentence[8].text == ")" - assert sentence[8].start_position == 16 - - sentence = Sentence("doxorubicin (DOX)-induced", use_tokenizer=spacy_tokenizer) - - assert len(sentence) == 5 - assert sentence[0].text == "doxorubicin" - assert sentence[1].text == "(" - assert sentence[2].text == "DOX" - assert sentence[3].text == ")" - assert sentence[4].text == "-induced"