Merge branch 'master' into flairNLPgh-3488/save-column-corpus-to-files

chelseagzr · Jul 30, 2024 · 757f0ca · 757f0ca
2 parents 205e46d + e17ab12
commit 757f0ca
Show file tree

Hide file tree

Showing 16 changed files with 37 additions and 80 deletions.
diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@ document embeddings, including our proposed [Flair embeddings](https://www.aclwe
 * **A PyTorch NLP framework.** Our framework builds directly on [PyTorch](https://pytorch.org/), making it easy to
 train your own models and experiment with new approaches using Flair embeddings and classes.
 
-Now at [version 0.13.1](https://github.com/flairNLP/flair/releases)!
+Now at [version 0.14.0](https://github.com/flairNLP/flair/releases)!
 
 
 ## State-of-the-Art Models
@@ -127,6 +127,7 @@ In particular:
 - [Tutorial 1: Basic tagging](https://flairnlp.github.io/docs/category/tutorial-1-basic-tagging) → how to tag your text 
 - [Tutorial 2: Training models](https://flairnlp.github.io/docs/category/tutorial-2-training-models) → how to train your own state-of-the-art NLP models 
 - [Tutorial 3: Embeddings](https://flairnlp.github.io/docs/category/tutorial-3-embeddings) → how to produce embeddings for words and documents
+- [Tutorial 4: Biomedical text](https://flairnlp.github.io/docs/category/tutorial-4-biomedical-text) → how to analyse biomedical text data
 
 There is also a dedicated landing page for our [biomedical NER and datasets](/resources/docs/HUNFLAIR.md) with
 installation instructions and tutorials.

diff --git a/docs/conf.py b/docs/conf.py
@@ -5,8 +5,8 @@
 # -- Project information -----------------------------------------------------
 from sphinx_github_style import get_linkcode_resolve
 
-version = "0.13.1"
-release = "0.13.1"
+version = "0.14.0"
+release = "0.14.0"
 project = "flair"
 author = importlib_metadata.metadata(project)["Author"]
 copyright = f"2023 {author}"

diff --git a/docs/tutorial/tutorial-basics/entity-mention-linking.md b/docs/tutorial/tutorial-basics/entity-mention-linking.md
@@ -23,7 +23,7 @@ sentence = Sentence(
 ner_tagger = Classifier.load("hunflair2")
 ner_tagger.predict(sentence)
 
-nen_tagger = EntityMentionLinker.load("disease-linker-no-ab3p")
+nen_tagger = EntityMentionLinker.load("disease-linker")
 nen_tagger.predict(sentence)
 
 for tag in sentence.get_labels():

diff --git a/docs/tutorial/tutorial-basics/other-models.md b/docs/tutorial/tutorial-basics/other-models.md
@@ -145,7 +145,6 @@ We end this section with a list of all other models we currently ship with Flair
 | '[frame](https://huggingface.co/flair/frame-english)'  |   Frame Detection |  English | Propbank 3.0     |  **97.54** (F1) |
 | '[frame-fast](https://huggingface.co/flair/frame-english-fast)'  |  Frame Detection |  English | Propbank 3.0     |  **97.31** (F1) | (fast model)
 | 'negation-speculation'  | Negation / speculation |English |  Bioscope | **80.2** (F1) |
-| 'communicative-functions' |  detecting function of sentence in research paper (BETA) |  English| scholarly papers |  |
 | 'de-historic-indirect' | historical indirect speech | German | @redewiedergabe project |  **87.94** (F1) | [redewiedergabe](https://github.com/redewiedergabe/tagger) | |
 | 'de-historic-direct' | historical direct speech |  German | @redewiedergabe project |  **87.94** (F1) | [redewiedergabe](https://github.com/redewiedergabe/tagger) | |
 | 'de-historic-reported' | historical reported speech | German |  @redewiedergabe project |  **87.94** (F1) | [redewiedergabe](https://github.com/redewiedergabe/tagger) | |

diff --git a/docs/tutorial/tutorial-basics/part-of-speech-tagging.md b/docs/tutorial/tutorial-basics/part-of-speech-tagging.md
@@ -105,7 +105,7 @@ tagger.predict(sentence)
 print(sentence)
 ```
 
-## Tagging universal parts-of-speech (uPoS)
+## Tagging parts-of-speech in any language
 
 Universal parts-of-speech are a set of minimal syntactic units that exist across languages. For instance, most languages
 will have VERBs or NOUNs. 

diff --git a/docs/tutorial/tutorial-basics/tagging-entities.md b/docs/tutorial/tutorial-basics/tagging-entities.md
@@ -2,7 +2,7 @@
 
 This tutorial shows you how to do named entity recognition, showcases various NER models, and provides a full list of all NER models in Flair.
 
-## Tagging entities with our standard model
+## Tagging entities with our standard model
 
 Our standard model uses Flair embeddings and was trained over the English CoNLL-03 task and can recognize 4 different entity types. It offers a good tradeoff between accuracy and speed.
 
@@ -32,7 +32,7 @@ Sentence: "George Washington went to Washington ." → ["George Washington"/PER,
 
 The printout tells us that two entities are labeled in this sentence: "George Washington" as PER (person) and "Washington" as LOC (location).
 
-## Tagging entities with our best model
+## Tagging entities with our best model
 
 Our best 4-class model is trained using a very large transformer. Use it if accuracy matters most to you and speed and memory usage matter less.
 

diff --git a/docs/tutorial/tutorial-basics/tagging-sentiment.md b/docs/tutorial/tutorial-basics/tagging-sentiment.md
@@ -2,7 +2,7 @@
 
 This tutorial shows you how to do sentiment analysis in Flair.
 
-## Tagging sentiment with our standard model
+## Tagging sentiment with our standard model
 
 Our standard sentiment analysis model uses distilBERT embeddings and was trained over a mix of corpora, notably
 the Amazon review corpus, and can thus handle a variety of domains and language.

diff --git a/flair/__init__.py b/flair/__init__.py
@@ -34,7 +34,7 @@
     device = torch.device("cpu")
 
 # global variable: version
-__version__ = "0.13.1"
+__version__ = "0.14.0"
 """The current version of the flair library installed."""
 
 # global variable: arrow symbol

diff --git a/flair/datasets/treebanks.py b/flair/datasets/treebanks.py
@@ -122,11 +122,12 @@ def __getitem__(self, index: int = 0) -> Sentence:
         else:
             with open(str(self.path_to_conll_file), encoding="utf-8") as file:
                 file.seek(self.indices[index])
-                sentence = self._read_next_sentence(file)
+                sentence_or_none = self._read_next_sentence(file)
+                sentence = sentence_or_none if isinstance(sentence_or_none, Sentence) else Sentence("")
 
         return sentence
 
-    def _read_next_sentence(self, file) -> Sentence:
+    def _read_next_sentence(self, file) -> Optional[Sentence]:
         line = file.readline()
         tokens: List[Token] = []
 
@@ -139,13 +140,15 @@ def _read_next_sentence(self, file) -> Sentence:
         current_multiword_first_token = 0
         current_multiword_last_token = 0
 
+        newline_reached = False
         while line:
             line = line.strip()
             fields: List[str] = re.split("\t+", line)
 
             # end of sentence
             if line == "":
                 if len(tokens) > 0:
+                    newline_reached = True
                     break
 
             # comments or ellipsis
@@ -205,20 +208,18 @@ def _read_next_sentence(self, file) -> Sentence:
                 if token_idx <= current_multiword_last_token:
                     current_multiword_sequence += token.text
 
-                # print(token)
-                # print(current_multiword_last_token)
-                # print(current_multiword_first_token)
                 # if multi-word equals component tokens, there should be no whitespace
                 if token_idx == current_multiword_last_token and current_multiword_sequence == current_multiword_text:
                     # go through all tokens in subword and set whitespace_after information
                     for i in range(current_multiword_last_token - current_multiword_first_token):
-                        # print(i)
                         tokens[-(i + 1)].whitespace_after = 0
                 tokens.append(token)
 
             line = file.readline()
 
-        return Sentence(tokens)
+        if newline_reached or len(tokens) > 0:
+            return Sentence(tokens)
+        return None
 
 
 class UD_ENGLISH(UniversalDependenciesCorpus):

diff --git a/flair/models/language_model.py b/flair/models/language_model.py
@@ -189,7 +189,7 @@ def initialize(matrix):
 
     @classmethod
     def load_language_model(cls, model_file: Union[Path, str], has_decoder=True):
-        state = torch.load(str(model_file), map_location=flair.device)
+        state = torch.load(str(model_file), map_location=flair.device, weights_only=False)
 
         document_delimiter = state.get("document_delimiter", "\n")
         has_decoder = state.get("has_decoder", True) and has_decoder
@@ -213,7 +213,7 @@ def load_language_model(cls, model_file: Union[Path, str], has_decoder=True):
 
     @classmethod
     def load_checkpoint(cls, model_file: Union[Path, str]):
-        state = torch.load(str(model_file), map_location=flair.device)
+        state = torch.load(str(model_file), map_location=flair.device, weights_only=False)
 
         epoch = state.get("epoch")
         split = state.get("split")

diff --git a/flair/models/multitask_model.py b/flair/models/multitask_model.py
@@ -250,7 +250,6 @@ def _fetch_model(model_name) -> str:
         hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"
 
         # biomedical models
-        model_map["bioner"] = "/".join([hu_path, "bioner", "hunflair.pt"])
         model_map["hunflair"] = "/".join([hu_path, "bioner", "hunflair.pt"])
         model_map["hunflair-paper"] = "/".join([hu_path, "bioner", "hunflair-paper.pt"])
 

diff --git a/flair/models/prefixed_tagger.py b/flair/models/prefixed_tagger.py
@@ -321,7 +321,7 @@ def augment_sentences(
 
     @staticmethod
     def _fetch_model(model_name) -> str:
-        huggingface_model_map = {"hunflair2": "hunflair/hunflair2-ner"}
+        huggingface_model_map = {"hunflair2": "hunflair/hunflair2-ner", "bioner": "hunflair/hunflair2-ner"}
 
         # check if model name is a valid local file
         if Path(model_name).exists():

diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py
@@ -13,7 +13,7 @@
 from flair.data import Dictionary, Label, Sentence, Span, get_spans_from_bio
 from flair.datasets import DataLoader, FlairDatapointDataset
 from flair.embeddings import TokenEmbeddings
-from flair.file_utils import cached_path, hf_download, unzip_file
+from flair.file_utils import cached_path, hf_download
 from flair.models.sequence_tagger_utils.crf import CRF
 from flair.models.sequence_tagger_utils.viterbi import ViterbiDecoder, ViterbiLoss
 from flair.training_utils import store_embeddings
@@ -573,9 +573,9 @@ def _standard_inference(self, features: torch.Tensor, batch: List[Sentence], pro
 
         return predictions, all_tags
 
-    def _all_scores_for_token(self, sentences: List[Sentence], scores: torch.Tensor, lengths: List[int]):
+    def _all_scores_for_token(self, sentences: List[Sentence], score_tensor: torch.Tensor, lengths: List[int]):
         """Returns all scores for each tag in tag dictionary."""
-        scores = scores.numpy()
+        scores = score_tensor.numpy()
         tokens = [token for sentence in sentences for token in sentence]
         prob_all_tags = [
             [
@@ -686,6 +686,11 @@ def _fetch_model(model_name) -> str:
             "ner-ukrainian": "dchaplinsky/flair-uk-ner",
             # Language-specific POS models
             "pos-ukrainian": "dchaplinsky/flair-uk-pos",
+            # Historic German
+            "de-historic-direct": "aehrm/redewiedergabe-direct",
+            "de-historic-indirect": "aehrm/redewiedergabe-indirect",
+            "de-historic-reported": "aehrm/redewiedergabe-reported",
+            "de-historic-free-indirect": "aehrm/redewiedergabe-freeindirect",
         }
 
         hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"
@@ -757,59 +762,6 @@ def _fetch_model(model_name) -> str:
                     "information."
                 )
 
-        # special handling for the taggers by the @redewiegergabe project (TODO: move to model hub)
-        elif model_name == "de-historic-indirect":
-            model_file = flair.cache_root / cache_dir / "indirect" / "final-model.pt"
-            if not model_file.exists():
-                cached_path(
-                    "http://www.redewiedergabe.de/models/indirect.zip",
-                    cache_dir=cache_dir,
-                )
-                unzip_file(
-                    flair.cache_root / cache_dir / "indirect.zip",
-                    flair.cache_root / cache_dir,
-                )
-            model_path = str(flair.cache_root / cache_dir / "indirect" / "final-model.pt")
-
-        elif model_name == "de-historic-direct":
-            model_file = flair.cache_root / cache_dir / "direct" / "final-model.pt"
-            if not model_file.exists():
-                cached_path(
-                    "http://www.redewiedergabe.de/models/direct.zip",
-                    cache_dir=cache_dir,
-                )
-                unzip_file(
-                    flair.cache_root / cache_dir / "direct.zip",
-                    flair.cache_root / cache_dir,
-                )
-            model_path = str(flair.cache_root / cache_dir / "direct" / "final-model.pt")
-
-        elif model_name == "de-historic-reported":
-            model_file = flair.cache_root / cache_dir / "reported" / "final-model.pt"
-            if not model_file.exists():
-                cached_path(
-                    "http://www.redewiedergabe.de/models/reported.zip",
-                    cache_dir=cache_dir,
-                )
-                unzip_file(
-                    flair.cache_root / cache_dir / "reported.zip",
-                    flair.cache_root / cache_dir,
-                )
-            model_path = str(flair.cache_root / cache_dir / "reported" / "final-model.pt")
-
-        elif model_name == "de-historic-free-indirect":
-            model_file = flair.cache_root / cache_dir / "freeIndirect" / "final-model.pt"
-            if not model_file.exists():
-                cached_path(
-                    "http://www.redewiedergabe.de/models/freeIndirect.zip",
-                    cache_dir=cache_dir,
-                )
-                unzip_file(
-                    flair.cache_root / cache_dir / "freeIndirect.zip",
-                    flair.cache_root / cache_dir,
-                )
-            model_path = str(flair.cache_root / cache_dir / "freeIndirect" / "final-model.pt")
-
         # for all other cases (not local file or special download location), use HF model hub
         else:
             model_path = hf_download(model_name)

diff --git a/flair/models/sequence_tagger_utils/viterbi.py b/flair/models/sequence_tagger_utils/viterbi.py
@@ -226,10 +226,14 @@ def decode(
         return tags, all_tags
 
     def _all_scores_for_token(
-        self, scores: torch.Tensor, tag_sequences: torch.Tensor, lengths: torch.IntTensor, sentences: List[Sentence]
+        self,
+        score_tensor: torch.Tensor,
+        tag_sequences: torch.Tensor,
+        lengths: torch.IntTensor,
+        sentences: List[Sentence],
     ):
         """Returns all scores for each tag in tag dictionary."""
-        scores = scores.numpy()
+        scores = score_tensor.numpy()
         for i_batch, (batch, tag_seq) in enumerate(zip(scores, tag_sequences)):
             for i, (tag_id, tag_scores) in enumerate(zip(tag_seq, batch)):
                 tag_id_int = tag_id if isinstance(tag_id, int) else int(tag_id.item())

diff --git a/pyproject.toml b/pyproject.toml
@@ -30,6 +30,7 @@ filterwarnings = [
     'ignore:Deprecated call to `pkg_resources',  # huggingface has deprecated calls.
     'ignore:distutils Version classes are deprecated.',  # faiss uses deprecated distutils.
     'ignore:`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.',  # transformers calls deprecated hf_hub
+    "ignore:`torch.cuda.amp.GradScaler",  # GradScaler changes in torch 2.3.0 but we want to be backwards compatible.
 ]
 markers = [
     "integration",

diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 
 setup(
     name="flair",
-    version="0.13.1",
+    version="0.14.0",
     description="A very simple framework for state-of-the-art NLP",
     long_description=Path("README.md").read_text(encoding="utf-8"),
     long_description_content_type="text/markdown",