From 321d8bfd3f025d9db0a67477206088d7e2deb8d0 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 17:51:27 +0200 Subject: [PATCH 01/35] unify get_spans method of SequenceTagger and MultiTagger --- flair/data.py | 22 +++++++++++++++++++--- flair/models/sequence_tagger_model.py | 9 +-------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/flair/data.py b/flair/data.py index bf1f80fd1..be5478b87 100644 --- a/flair/data.py +++ b/flair/data.py @@ -614,9 +614,7 @@ def get_label_names(self): label_names.append(label.value) return label_names - def get_spans(self, label_type: str, min_score=-1) -> List[Span]: - - spans: List[Span] = [] + def _add_spans_internal(self, spans: List[Span], label_type: str, min_score): current_span = [] @@ -688,6 +686,24 @@ def get_spans(self, label_type: str, min_score=-1) -> List[Span]: return spans + def get_spans(self, label_type: Optional[str] = None, min_score=-1) -> List[Span]: + + spans: List[Span] = [] + + # if label type is explicitly specified, get spans for this label type + if label_type: + return self._add_spans_internal(spans, label_type, min_score) + + # else determine all label types in sentence and get all spans + label_types = [] + for token in self: + for annotation in token.annotation_layers.keys(): + if annotation not in label_types: label_types.append(annotation) + + for label_type in label_types: + self._add_spans_internal(spans, label_type, min_score) + return spans + @property def embedding(self): return self.get_embedding() diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 812ede20f..f024610c8 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -1361,11 +1361,4 @@ def load(cls, model_names: Union[List[str], str]): taggers[model_name] = model models.append(model) - return cls(taggers) - - def get_all_spans(self, sentence: Sentence): - spans = [] - for name in self.name_to_tagger: - 
spans.extend(sentence.get_spans(name)) - - return spans + return cls(taggers) \ No newline at end of file From 3b4ce84a7b2ee8dd3624f9a8c2cb52f0e59dfe7e Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 17:54:02 +0200 Subject: [PATCH 02/35] update documentation --- resources/docs/HUNFLAIR.md | 2 +- resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md index 5937232b8..aae0e007a 100644 --- a/resources/docs/HUNFLAIR.md +++ b/resources/docs/HUNFLAIR.md @@ -47,7 +47,7 @@ tagger.predict(sentence) ``` Done! The Sentence now has entity annotations. Let's print the entities found by the tagger: ```python -for entity in tagger.get_all_spans(sentence): +for entity in sentence.get_spans(): print(entity) ``` This should print: diff --git a/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md b/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md index 1bf8e4d23..d6ddd42fa 100644 --- a/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md +++ b/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md @@ -16,7 +16,7 @@ This will add predicted tags to the tokens in the sentence. 
Lets use a sentence ```python from flair.data import Sentence -sentence = Sentence("Behavioral Abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome") +sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome") # predict NER tags tagger.predict(sentence) @@ -83,7 +83,7 @@ To use the tokenizer we just have to pass it as parameter to when instancing a s ```python from flair.tokenization import SciSpacyTokenizer -sentence = Sentence("Behavioral Abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome", +sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome", use_tokenizer=SciSpacyTokenizer()) ``` From 76a165d1358289844039f07afcc3783e187910cc Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 18:32:50 +0200 Subject: [PATCH 03/35] bump version number and update readme --- README.md | 12 +++++++++--- flair/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 80f3f12dd..c7c4d6884 100644 --- a/README.md +++ b/README.md @@ -14,10 +14,13 @@ Flair is: * **A powerful NLP library.** Flair allows you to apply our state-of-the-art natural language processing (NLP) models to your text, such as named entity recognition (NER), part-of-speech tagging (PoS), - sense disambiguation and classification. - + sense disambiguation and classification, with support for many languages. + * **Multilingual.** Thanks to the Flair community, we support a rapidly growing number of languages. We also now include '*one model, many languages*' taggers, i.e. single models that predict PoS or NER tags for input text in various languages. + +* **A biomedical NER library.** Flair has special support for [biomedical data](/resources/docs/HUNFLAIR.md) with +state-of-the-art models for biomedical NER and support for over 32 biomedical datasets. 
* **A text embedding library.** Flair has simple interfaces that allow you to use and combine different word and document embeddings, including our proposed **[Flair embeddings](https://drive.google.com/file/d/17yVpFA7MmXaQFTe-HDpZuqw9fJlmzg56/view?usp=sharing)**, BERT embeddings and ELMo embeddings. @@ -25,7 +28,7 @@ document embeddings, including our proposed **[Flair embeddings](https://drive.g * **A PyTorch NLP framework.** Our framework builds directly on [PyTorch](https://pytorch.org/), making it easy to train your own models and experiment with new approaches using Flair embeddings and classes. -Now at [version 0.5.1](https://github.com/flairNLP/flair/releases)! +Now at [version 0.6](https://github.com/flairNLP/flair/releases)! ## Comparison with State-of-the-Art @@ -126,6 +129,9 @@ The tutorials explain how the base NLP classes work, how you can load pre-traine text, how you can embed your text with different word or document embeddings, and how you can train your own language models, sequence labeling models, and text classification models. Let us know if anything is unclear. +There is also a dedicated landing page for our **[biomedical NER and datasets](/resources/docs/HUNFLAIR.md)** with +installation instructions and tutorials. 
+ There are also good third-party articles and posts that illustrate how to use Flair: * [How to build a text classifier with Flair](https://towardsdatascience.com/text-classification-with-state-of-the-art-nlp-library-flair-b541d7add21f) * [How to build a microservice with Flair and Flask](https://shekhargulati.com/2019/01/04/building-a-sentiment-analysis-python-microservice-with-flair-and-flask/) diff --git a/flair/__init__.py b/flair/__init__.py index eba084572..ff7f52c4c 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -24,7 +24,7 @@ import logging.config -__version__ = "0.5.1" +__version__ = "0.6" logging.config.dictConfig( { diff --git a/setup.py b/setup.py index aeecd37a3..62243b6e5 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flair", - version="0.5.1", + version="0.6", description="A very simple framework for state-of-the-art NLP", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From fba08459b9feed19689a2de310af713bf93f3e34 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 18:34:40 +0200 Subject: [PATCH 04/35] Update readme --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index c7c4d6884..09f84ac87 100644 --- a/README.md +++ b/README.md @@ -14,10 +14,7 @@ Flair is: * **A powerful NLP library.** Flair allows you to apply our state-of-the-art natural language processing (NLP) models to your text, such as named entity recognition (NER), part-of-speech tagging (PoS), - sense disambiguation and classification, with support for many languages. - -* **Multilingual.** Thanks to the Flair community, we support a rapidly growing number of languages. We also now include -'*one model, many languages*' taggers, i.e. single models that predict PoS or NER tags for input text in various languages. + sense disambiguation and classification, with support for a rapidly growing number of languages. 
* **A biomedical NER library.** Flair has special support for [biomedical data](/resources/docs/HUNFLAIR.md) with state-of-the-art models for biomedical NER and support for over 32 biomedical datasets. From 2a11dfe5c848e490bded19fb747cc4cf9a5ea329 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 21:46:04 +0200 Subject: [PATCH 05/35] update documentation --- resources/docs/HUNFLAIR.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md index aae0e007a..217d8b6bd 100644 --- a/resources/docs/HUNFLAIR.md +++ b/resources/docs/HUNFLAIR.md @@ -6,8 +6,6 @@ builds on pretrained domain-specific language models and outperforms other biome NER tools on unseen corpora. Furthermore, it contains harmonized versions of [31 biomedical NER data sets](HUNFLAIR_CORPORA.md). - - Content: [Quick Start](#quick-start) | [BioNER-Tool Comparison](#comparison-to-other-biomedical-ner-tools) | @@ -34,15 +32,16 @@ pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/e Let's run named entity recognition (NER) over an example sentence. All you need to do is make a Sentence, load a pre-trained model and use it to predict tags for the sentence: ```python -import flair +from flair.data import Sentence +from flair.models import MultiTagger from flair.tokenization import SciSpacyTokenizer -sentence = flair.data.Sentence( - "Behavioral Abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome", +sentence = Sentence( + "Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome", use_tokenizer=SciSpacyTokenizer() ) -tagger = flair.models.MultiTagger.load("hunflair") +tagger = MultiTagger.load("hunflair") tagger.predict(sentence) ``` Done! The Sentence now has entity annotations. Let's print the entities found by the tagger: @@ -86,7 +85,6 @@ of the gold standard data. We allow a shift by max one character. 
[GNormPus](https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/gnormplus/) for Gene and Species, and [DNorm](https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/DNorm.html) for Disease - Here's how to [reproduce these numbers](XXX) using Flair. You can also find detailed evaluations and discussions in our paper. ## Tutorials From 19c5c05b38bea0fac7b4fa6c895c1b8ab7ec9173 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 21:47:55 +0200 Subject: [PATCH 06/35] update documentation --- resources/docs/HUNFLAIR.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md index 217d8b6bd..7a4cfdc16 100644 --- a/resources/docs/HUNFLAIR.md +++ b/resources/docs/HUNFLAIR.md @@ -36,12 +36,16 @@ from flair.data import Sentence from flair.models import MultiTagger from flair.tokenization import SciSpacyTokenizer +# make a sentence (tokenize with SciSpaCy) sentence = Sentence( "Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome", use_tokenizer=SciSpacyTokenizer() ) +# load biomedical tagger tagger = MultiTagger.load("hunflair") + +# tag sentence tagger.predict(sentence) ``` Done! The Sentence now has entity annotations. 
Let's print the entities found by the tagger: From 89faf9698c39a66806fba70a3954d8d2d23e037e Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 21:49:35 +0200 Subject: [PATCH 07/35] update documentation --- resources/docs/HUNFLAIR.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md index 7a4cfdc16..adbb2a5ae 100644 --- a/resources/docs/HUNFLAIR.md +++ b/resources/docs/HUNFLAIR.md @@ -38,9 +38,9 @@ from flair.tokenization import SciSpacyTokenizer # make a sentence (tokenize with SciSpaCy) sentence = Sentence( - "Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome", - use_tokenizer=SciSpacyTokenizer() -) + "Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome", + use_tokenizer=SciSpacyTokenizer() + ) # load biomedical tagger tagger = MultiTagger.load("hunflair") From f90ec8e05eb99374897a8915daf787c5efb29951 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 21:50:13 +0200 Subject: [PATCH 08/35] update documentation --- resources/docs/HUNFLAIR.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md index adbb2a5ae..ea75e707c 100644 --- a/resources/docs/HUNFLAIR.md +++ b/resources/docs/HUNFLAIR.md @@ -37,10 +37,8 @@ from flair.models import MultiTagger from flair.tokenization import SciSpacyTokenizer # make a sentence (tokenize with SciSpaCy) -sentence = Sentence( - "Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome", - use_tokenizer=SciSpacyTokenizer() - ) +sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome", + use_tokenizer=SciSpacyTokenizer()) # load biomedical tagger tagger = MultiTagger.load("hunflair") From 51b41a98f17f052dd14430350b450f116836e047 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 21:51:07 +0200 Subject: [PATCH 09/35] update documentation --- 
resources/docs/HUNFLAIR.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md index ea75e707c..874b55cb2 100644 --- a/resources/docs/HUNFLAIR.md +++ b/resources/docs/HUNFLAIR.md @@ -36,7 +36,7 @@ from flair.data import Sentence from flair.models import MultiTagger from flair.tokenization import SciSpacyTokenizer -# make a sentence (tokenize with SciSpaCy) +# make a sentence and tokenize with SciSpaCy sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome", use_tokenizer=SciSpacyTokenizer()) From 550b97052827f6884b2fd0f26650405ab135ebce Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 22:06:22 +0200 Subject: [PATCH 10/35] update documentation for release --- README.md | 2 +- resources/docs/TUTORIAL_1_BASICS.md | 34 ++++++++++++++++++----------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 09f84ac87..f9e3d872f 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ models to your text, such as named entity recognition (NER), part-of-speech tagg state-of-the-art models for biomedical NER and support for over 32 biomedical datasets. * **A text embedding library.** Flair has simple interfaces that allow you to use and combine different word and -document embeddings, including our proposed **[Flair embeddings](https://drive.google.com/file/d/17yVpFA7MmXaQFTe-HDpZuqw9fJlmzg56/view?usp=sharing)**, BERT embeddings and ELMo embeddings. +document embeddings, including our proposed **[Flair embeddings](https://www.aclweb.org/anthology/C18-1139/)**, BERT embeddings and ELMo embeddings. * **A PyTorch NLP framework.** Our framework builds directly on [PyTorch](https://pytorch.org/), making it easy to train your own models and experiment with new approaches using Flair embeddings and classes. 
diff --git a/resources/docs/TUTORIAL_1_BASICS.md b/resources/docs/TUTORIAL_1_BASICS.md index 7d5e12bbf..93d226eaa 100644 --- a/resources/docs/TUTORIAL_1_BASICS.md +++ b/resources/docs/TUTORIAL_1_BASICS.md @@ -13,8 +13,8 @@ Let's start by making a `Sentence` object for an example sentence. # The sentence objects holds a sentence that we may want to embed or tag from flair.data import Sentence -# Make a sentence object by passing a whitespace tokenized string -sentence = Sentence('The grass is green .') +# Make a sentence object by passing a string +sentence = Sentence('The grass is green.') # Print the object to see what's in there print(sentence) @@ -62,44 +62,52 @@ Token: 5 . ## Tokenization -In some use cases, you might not have your text already tokenized. For this case, we added a simple tokenizer using the +When you create a `Sentence` as above, the text is automatically tokenized using the lightweight [segtok library](https://pypi.org/project/segtok/). -If you want to use this tokenizer, simply set the `use_tokenizer` flag when instantiating your `Sentence` with an untokenized string: +If you *do not* want to use this tokenizer, simply set the `use_tokenizer` flag to `False` +when instantiating your `Sentence` with an untokenized string: ```python from flair.data import Sentence # Make a sentence object by passing an untokenized string and the 'use_tokenizer' flag -sentence = Sentence('The grass is green.', use_tokenizer=True) +sentence = Sentence('The grass is green.', use_tokenizer=False) # Print the object to see what's in there print(sentence) ``` +In this case, no tokenization is performed and the text is split on whitespaces, thus resulting in only 4 tokens here. -### Adding Custom Tokenizers +### Using a Different Tokenizer -You can also pass custom tokenizers to the initialization method. 
Instead of passing a boolean `True` value to the `use_tokenizer` parameter, you can pass an implementation of `Tokenizer` class, like this: +You can also pass custom tokenizers to the initialization method. For instance, if you want to tokenize a Japanese +sentence you can use the 'janome' tokenizer instead, like this: ```python from flair.data import Sentence -from flair.tokenization import SegtokTokenizer +from flair.tokenization import JapaneseTokenizer -# Make a sentence object by passing an untokenized string and a tokenizer -sentence = Sentence('The grass is green.', use_tokenizer=SegtokTokenizer()) +# init japanese tokenizer +tokenizer = JapaneseTokenizer("janome") -# Print the object to see what's in there +# make sentence (and tokenize) +sentence = Sentence("私はベルリンが好き", use_tokenizer=tokenizer) + +# output tokenized sentence print(sentence) ``` This should print: ```console -Sentence: "The grass is green ." - 5 Tokens +Sentence: "私 は ベルリン が 好き" [− Tokens: 5] ``` -The second way allows you to write your own tokenization routine. Check the code of `flair.data.Tokenizer` and it's implementations (e.g. `flair.tokenization.SegtokTokenizer` or `flair.tokenization.SpacyTokenizer`) to have an idea of how to add your own tokenization. +You can write your own tokenization routine. Check the code of `flair.data.Tokenizer` and its implementations + (e.g. `flair.tokenization.SegtokTokenizer` or `flair.tokenization.SpacyTokenizer`) to get an idea of how to add + your own tokenization method. 
## Adding Labels From e078206580764f670ed482641ded4d4f0be221b4 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 22:20:12 +0200 Subject: [PATCH 11/35] update documentation --- resources/docs/TUTORIAL_2_TAGGING.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/resources/docs/TUTORIAL_2_TAGGING.md b/resources/docs/TUTORIAL_2_TAGGING.md index b45079011..a0e0ed6e7 100644 --- a/resources/docs/TUTORIAL_2_TAGGING.md +++ b/resources/docs/TUTORIAL_2_TAGGING.md @@ -69,6 +69,27 @@ This should print: ]} ``` + +### Multi-Tagging + +Sometimes you want to predict several types of annotation at once, for instance NER and part-of-speech (POS) tagging. +For this, you can use our new `MultiTagger` object, like this: + +```python +# load tagger for POS and NER +tagger = MultiTagger.load(['pos', 'ner']) + +# make example sentence +sentence = Sentence("George Washington went to Washington.") + +# predict with both models +tagger.predict(sentence) + +print(sentence) +``` + +The sentence now has two types of annotation: POS and NER. + ### List of Pre-Trained Sequence Tagger Models You choose which pre-trained model you load by passing the appropriate From 0764edfcca515f92820be359801634af0abc45d9 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 22:21:06 +0200 Subject: [PATCH 12/35] update documentation --- resources/docs/TUTORIAL_2_TAGGING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/docs/TUTORIAL_2_TAGGING.md b/resources/docs/TUTORIAL_2_TAGGING.md index a0e0ed6e7..9ff4b9f4f 100644 --- a/resources/docs/TUTORIAL_2_TAGGING.md +++ b/resources/docs/TUTORIAL_2_TAGGING.md @@ -72,7 +72,7 @@ This should print: ### Multi-Tagging -Sometimes you want to predict several types of annotation at once, for instance NER and part-of-speech (POS) tagging. +Sometimes you want to predict several types of annotation at once, for instance NER and part-of-speech (POS) tags. 
For this, you can use our new `MultiTagger` object, like this: ```python From 3fd53f301cf10abb2a4de512a1fbc316f0f8a859 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 22:51:35 +0200 Subject: [PATCH 13/35] update documentation --- resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md b/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md index d6ddd42fa..b291d7c50 100644 --- a/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md +++ b/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md @@ -103,11 +103,13 @@ Again we can apply the integration of the [SciSpaCy](https://allenai.github.io/s ```python from flair.tokenization import SciSpacySentenceSplitter -# Split the text into sentences +# initialize the sentence splitter splitter = SciSpacySentenceSplitter() + +# split text into a list of Sentence objects sentences = splitter.split(abstract) -# Apply HunFlair to each sentence +# you can apply the HunFlair tagger directly to this list tagger.predict(sentences) ``` We can access the annotations of the single sentences by just iterating over the list: From 366ab2c415a95165e6ac0749b3fc44084130358e Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 22:55:23 +0200 Subject: [PATCH 14/35] update documentation --- resources/docs/TUTORIAL_1_BASICS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/docs/TUTORIAL_1_BASICS.md b/resources/docs/TUTORIAL_1_BASICS.md index 93d226eaa..aaef612bd 100644 --- a/resources/docs/TUTORIAL_1_BASICS.md +++ b/resources/docs/TUTORIAL_1_BASICS.md @@ -93,10 +93,10 @@ from flair.tokenization import JapaneseTokenizer tokenizer = JapaneseTokenizer("janome") # make sentence (and tokenize) -sentence = Sentence("私はベルリンが好き", use_tokenizer=tokenizer) +japanese_sentence = Sentence("私はベルリンが好き", use_tokenizer=tokenizer) # output tokenized sentence -print(sentence) +print(japanese_sentence) ``` This should print: 
From feb426f0514a7bdb7c61196c0ab890741ca97e72 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 22:56:24 +0200 Subject: [PATCH 15/35] update documentation --- resources/docs/TUTORIAL_1_BASICS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/docs/TUTORIAL_1_BASICS.md b/resources/docs/TUTORIAL_1_BASICS.md index aaef612bd..02429a8cf 100644 --- a/resources/docs/TUTORIAL_1_BASICS.md +++ b/resources/docs/TUTORIAL_1_BASICS.md @@ -72,10 +72,10 @@ when instantiating your `Sentence` with an untokenized string: from flair.data import Sentence # Make a sentence object by passing an untokenized string and the 'use_tokenizer' flag -sentence = Sentence('The grass is green.', use_tokenizer=False) +untokenized_sentence = Sentence('The grass is green.', use_tokenizer=False) # Print the object to see what's in there -print(sentence) +print(untokenized_sentence) ``` In this case, no tokenization is performed and the text is split on whitespaces, thus resulting in only 4 tokens here. From 673360eb81f4c2d4ac51ec080731eb63e5ec0981 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 22:59:01 +0200 Subject: [PATCH 16/35] update documentation --- resources/docs/TUTORIAL_1_BASICS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/docs/TUTORIAL_1_BASICS.md b/resources/docs/TUTORIAL_1_BASICS.md index 02429a8cf..540faec2d 100644 --- a/resources/docs/TUTORIAL_1_BASICS.md +++ b/resources/docs/TUTORIAL_1_BASICS.md @@ -241,6 +241,7 @@ This should print: France is the current world cup winner. 
- classified as "sports" with score 1.0 - classified as "soccer" with score 1.0 + - classified as "English" with score 1.0 ``` If you are interested only in the labels of one layer of annotation, you can access them like this: From 52e456c15c7d8ab6bc0e382629134f2bcf2d2f12 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 23:21:25 +0200 Subject: [PATCH 17/35] fix multi tagger error --- flair/models/sequence_tagger_model.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index f024610c8..5a1258ff6 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -1282,6 +1282,7 @@ def predict( return_loss=return_loss, embedding_storage_mode="cpu", ) + print(sentences) # clear embeddings after predicting for sentence in sentences: @@ -1321,8 +1322,13 @@ def load(cls, model_names: Union[List[str], str]): # if the model uses StackedEmbedding, make a new stack with previous objects if type(model.embeddings) == StackedEmbeddings: + # sort embeddings by key alphabetically new_stack = [] - for embedding in model.embeddings.embeddings: + d = model.embeddings.get_named_embeddings_dict() + import collections + od = collections.OrderedDict(sorted(d.items())) + + for k, embedding in od.items(): # check previous embeddings and add if found embedding_found = False From c7d9304864393320364a7e4c11143202813b65e2 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 23:25:52 +0200 Subject: [PATCH 18/35] fix multi tagger error --- flair/models/sequence_tagger_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 5a1258ff6..4aa4991e1 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -1282,7 +1282,6 @@ def predict( return_loss=return_loss, embedding_storage_mode="cpu", ) - print(sentences) # clear 
embeddings after predicting for sentence in sentences: From 4b982106e7687999e23784f95ffc95e097d7c008 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 23:34:28 +0200 Subject: [PATCH 19/35] fix output for frame tagger --- flair/data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flair/data.py b/flair/data.py index be5478b87..db30c85ff 100644 --- a/flair/data.py +++ b/flair/data.py @@ -771,6 +771,8 @@ def to_tagged_string(self, main_tag=None) -> str: if token.get_labels(label_type)[0].value == "O": continue + if token.get_labels(label_type)[0].value == "_": + continue tags.append(token.get_labels(label_type)[0].value) all_tags = "<" + "/".join(tags) + ">" From 25c5c01191ee7e11c322a10b95b69cc3632870e7 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 23:38:32 +0200 Subject: [PATCH 20/35] update documentation --- resources/docs/TUTORIAL_2_TAGGING.md | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/resources/docs/TUTORIAL_2_TAGGING.md b/resources/docs/TUTORIAL_2_TAGGING.md index 9ff4b9f4f..b994f23a3 100644 --- a/resources/docs/TUTORIAL_2_TAGGING.md +++ b/resources/docs/TUTORIAL_2_TAGGING.md @@ -76,6 +76,8 @@ Sometimes you want to predict several types of annotation at once, for instance For this, you can use our new `MultiTagger` object, like this: ```python +from flair.models import MultiTagger + # load tagger for POS and NER tagger = MultiTagger.load(['pos', 'ner']) @@ -273,17 +275,20 @@ list of `Sentence` objects to the `.predict()` method. For instance, you can use the sentence splitter of segtok to split your text: ```python +from flair.models import SequenceTagger +from flair.tokenization import SegtokSentenceSplitter -# your text of many sentences +# example text with many sentences text = "This is a sentence. This is another sentence. I love Berlin." 
-# use a library to split into sentences -from segtok.segmenter import split_single +# initialize sentence splitter +splitter = SegtokSentenceSplitter() -sentences = [Sentence(sent, use_tokenizer=True) for sent in split_single(text)] +# use splitter to split text into list of sentences +sentences = splitter.split(text) -# predict tags for list of sentences -tagger: SequenceTagger = SequenceTagger.load('ner') +# predict tags for sentences +tagger = SequenceTagger.load('ner') tagger.predict(sentences) # iterate through sentences and print predicted labels From cbe6f92ddda21a595a7ca1afb86c5900c38e861e Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 23:49:52 +0200 Subject: [PATCH 21/35] update documentation --- resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md b/resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md index f6ac1f62c..78110ad81 100644 --- a/resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md +++ b/resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md @@ -144,8 +144,7 @@ embedding.embed(sentence) You can find a full list of their pretained models [here](https://docs.google.com/spreadsheets/d/14QplCdTCDwEmTqrn1LH4yrbKvdogK4oQvYO1K1aPR5M/edit#gid=0). **Note**: To use this embedding, you need to install `sentence-transformers` -with `pip install sentence-transformers`. This library currently requires an older version of `transformers`, -so installing it will uninstall the latest `transformers`, causing other transformer embeddings to break. +with `pip install sentence-transformers`. 
## Next From 9cadd8500e9857a28dbd7d045d92e9119e0bddc8 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 23:51:35 +0200 Subject: [PATCH 22/35] update documentation --- resources/docs/TUTORIAL_6_CORPUS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index ab7253949..7f0462889 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -68,7 +68,7 @@ corpus like this: ```python import flair.datasets -downsampled_corpus = flair.datasets.UD_ENGLISH() +corpus = flair.datasets.UD_ENGLISH() ``` then you can downsample the corpus, simply like this: From 73017d7f51e41614e96ae33e14ddda3789e788ab Mon Sep 17 00:00:00 2001 From: alanakbik Date: Thu, 13 Aug 2020 23:57:17 +0200 Subject: [PATCH 23/35] update documentation --- resources/docs/TUTORIAL_6_CORPUS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index 7f0462889..f0ebfaa57 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -180,6 +180,10 @@ data the first time you call the corresponding constructor ID. The following dat | 'WIKINER_POLISH' | Polish | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_RUSSIAN' | Russian | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | +#### Biomedical Named Entity Recognition + +We support 31 biomedical NER datasets, listed [here](HUNFLAIR_CORPORA.md). 
+ #### Universal Dependency Treebanks | ID(s) | Languages | Description | From a0f43c7c138046a94c2214e6724f01fd144a0851 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20S=C3=A4nger?= Date: Fri, 14 Aug 2020 12:30:29 +0200 Subject: [PATCH 24/35] Update documentation --- resources/docs/HUNFLAIR.md | 14 +++--- resources/docs/HUNFLAIR_CORPORA.md | 6 +-- resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md | 47 ++++++++++++------- 3 files changed, 42 insertions(+), 25 deletions(-) diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md index 874b55cb2..48dfbf5b2 100644 --- a/resources/docs/HUNFLAIR.md +++ b/resources/docs/HUNFLAIR.md @@ -53,10 +53,10 @@ for entity in sentence.get_spans(): ``` This should print: ~~~ -Span [5]: "Fmr1" [− Labels: Gene (0.6896)] -Span [1,2]: "Behavioral Abnormalities" [− Labels: Disease (0.706)] -Span [10,11,12]: "Fragile X Syndrome" [− Labels: Disease (0.9863)] -Span [7]: "Mouse" [− Labels: Species (0.9517)] +Span [1,2]: "Behavioral abnormalities" [− Labels: Disease (0.6736)] +Span [10,11,12]: "Fragile X Syndrome" [− Labels: Disease (0.99)] +Span [5]: "Fmr1" [− Labels: Gene (0.838)] +Span [7]: "Mouse" [− Labels: Species (0.9979)] ~~~ ## Comparison to other biomedical NER tools @@ -82,12 +82,14 @@ or any of the competitor tools. All results are F1 scores using partial matching of predicted text offsets with the original char offsets of the gold standard data. We allow a shift by max one character. -1: Misc displays the results of multiple taggers: +1: Misc displays the results of multiple taggers: [tmChem](https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/tmchem/) for Chemical, [GNormPus](https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/gnormplus/) for Gene and Species, and [DNorm](https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/DNorm.html) for Disease + -Here's how to [reproduce these numbers](XXX) using Flair. You can also find detailed evaluations and discussions in our paper. 
+Here's how to [reproduce these numbers](https://github.com/hu-ner/hunflair-experiments) using Flair. +You can also find detailed evaluations and discussions in our paper. ## Tutorials We provide a set of quick tutorials to get you started with HunFlair: diff --git a/resources/docs/HUNFLAIR_CORPORA.md b/resources/docs/HUNFLAIR_CORPORA.md index 596ba0546..b8a3ae545 100644 --- a/resources/docs/HUNFLAIR_CORPORA.md +++ b/resources/docs/HUNFLAIR_CORPORA.md @@ -28,9 +28,9 @@ data set implementations can be found in `flair.datasets.biomedical`. | GPRO | `GPRO` | Gene/Protein | [Website](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/gpro-detailed-task-description/) | | CRAFT (v2.0) | `CRAFT` | Chemical, Gene, Species | [Paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-13-161) | | CRAFT (v4.0.1) | `CRAFT_V4` | Chemical, Gene, Species | [Website](https://github.com/UCDenver-ccp/CRAFT) | -| GELLUS | `GELLUS` | Cell line | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4708107/) | -| IEPA | `IEPA` | Gene/Protein | [Paper](hhttps://www.ncbi.nlm.nih.gov/pubmed/11928487) | -| JNLPBA | `JNLPBA` | Disease | [Paper](https://www.aclweb.org/anthology/W04-1213.pdf) | +| GELLUS | `GELLUS` | Cell line | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4708107/) | +| IEPA | `IEPA` | Gene/Protein | [Paper](https://www.ncbi.nlm.nih.gov/pubmed/11928487) | +| JNLPBA | `JNLPBA` | Disease | [Paper](https://www.aclweb.org/anthology/W04-1213.pdf) | | LINNEAUS | `LINNEAUS` | Species | [Paper](https://www.ncbi.nlm.nih.gov/pubmed/20149233) | | LocText | `LOCTEXT` | Disease | [Paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2021-9) | | miRNA | `MIRNA` | Disease | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4602280/) | diff --git a/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md b/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md index b291d7c50..4e22b4acf 100644 --- 
a/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md +++ b/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md @@ -1,6 +1,6 @@ # HunFlair Tutorial 1: Tagging -This is part 1 of the tutorial, in which we show how to use our pre-trained models to tag your text. +This is part 1 of the tutorial, in which we show how to use our pre-trained HunFlair models to tag your text. ### Tagging with Pre-trained HunFlair-Models Let's use the pre-trained HunFlair model for biomedical named entity recognition (NER). @@ -12,7 +12,8 @@ from flair.models import MultiTagger tagger = MultiTagger.load("hunflair") ``` All you need to do is use the predict() method of the tagger on a sentence. -This will add predicted tags to the tokens in the sentence. Lets use a sentence with two named entities: +This will add predicted tags to the tokens in the sentence. +Let's use a sentence with four named entities: ```python from flair.data import Sentence @@ -24,11 +25,15 @@ tagger.predict(sentence) # print sentence with predicted tags print(sentence.to_tagged_string()) ``` - This should print: ~~~ Behavioral Abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome ~~~ +The output contains the words of the original text extended by tags indicating whether +the word is the beginning (B), inside (I) or end (E) of an entity. +For example, "Fragile" is the first word of the disease "Fragile X Syndrome". +Entities consisting of just one word are marked with a special single tag (S). +For example, "Mouse" refers to a species entity. 
### Getting Annotated Spans Often named entities consist of multiple words spanning a certain text span in the input text, such as @@ -40,8 +45,8 @@ for disease in sentence.get_spans("hunflair-disease"): ``` This should print: ~~~ -Span [1,2]: "Behavioral Abnormalities" [− Labels: Disease (0.706)] -Span [10,11,12]: "Fragile X Syndrome" [− Labels: Disease (0.9863)] +Span [1,2]: "Behavioral abnormalities" [− Labels: Disease (0.6736)] +Span [10,11,12]: "Fragile X Syndrome" [− Labels: Disease (0.99)] ~~~ Which indicates that "_Behavioral Abnormalities_" or "_Fragile X Syndrome_" are both disease. @@ -54,18 +59,29 @@ print(sentence.to_dict("hunflair-disease")) This should print: ~~~ { - 'text': 'Behavioral Abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome', - 'labels': [], - 'entities': [ - {'text': 'Behavioral Abnormalities', 'start_pos': 0, 'end_pos': 24, 'labels': [Disease (0.706)]}, - {'text': 'Fragile X Syndrome', 'start_pos': 56, 'end_pos': 74, 'labels': [Disease (0.9863)]} - ] + 'text': 'Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome', + 'labels': [], + 'entities': [ + { 'text': 'Behavioral abnormalities', 'start_pos': 0, 'end_pos': 24, 'labels': [Disease (0.6736)]}, + {'text': 'Fragile X Syndrome', 'start_pos': 56, 'end_pos': 74, 'labels': [Disease (0.99)]} + ] } ~~~ You can retrieve all annotated entities of the other entity types in analogous way using `hunflair-cellline` for cell lines, `hunflair-chemical` for chemicals, `hunflair-gene` for genes and proteins, and `hunflair-species` -for species. +for species. 
To get all entities in one you can run: +```python +for entity in sentence.get_spans(): + print(entity) +``` +This should print: +~~~ +Span [1,2]: "Behavioral abnormalities" [− Labels: Disease (0.6736)] +Span [10,11,12]: "Fragile X Syndrome" [− Labels: Disease (0.99)] +Span [5]: "Fmr1" [− Labels: Gene (0.838)] +Span [7]: "Mouse" [− Labels: Species (0.9979)] +~~~ ### Using a Biomedical Tokenizer Tokenization, i.e. separating a text into tokens / words, is an important issue in natural language processing @@ -117,12 +133,11 @@ We can access the annotations of the single sentences by just iterating over the for sentence in sentences: print(sentence.to_tagged_string()) ``` -This sould print: +This should print: ~~~ Fragile X syndrome ( FXS ) is a developmental disorder caused by a mutation in the X - linked FMR1 gene , coding for the FMRP protein which is largely involved in synaptic function . -FXS patients present several behavioral abnormalities , including hyperactivity , anxiety , sensory hyper - responsiveness , and cognitive deficits . -Autistic symptoms , e.g. , altered social interaction and communication , are also often observed : FXS is indeed the most common monogenic cause of autism . -... +FXS patients present several behavioral abnormalities , including hyperactivity , anxiety , sensory hyper - responsiveness , and cognitive deficits . +Autistic symptoms , e.g. , altered social interaction and communication , are also often observed : FXS is indeed the most common monogenic cause of autism . 
~~~ From 1f8262f4346258d2f84dc894ce355f622f1f631d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20S=C3=A4nger?= Date: Fri, 14 Aug 2020 12:40:50 +0200 Subject: [PATCH 25/35] Minor fixes to the HunFlair documentation --- resources/docs/HUNFLAIR.md | 16 ++++++++-------- resources/docs/HUNFLAIR_CORPORA.md | 2 +- resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md index 48dfbf5b2..acb24949f 100644 --- a/resources/docs/HUNFLAIR.md +++ b/resources/docs/HUNFLAIR.md @@ -1,7 +1,7 @@ # HunFlair -HunFlair is a state-of-the-art NER tagger for biomedical texts. It comes with -models for genes/proteins, chemicals, diseases, species and cell lines. HunFlair +*HunFlair* is a state-of-the-art NER tagger for biomedical texts. It comes with +models for genes/proteins, chemicals, diseases, species and cell lines. *HunFlair* builds on pretrained domain-specific language models and outperforms other biomedical NER tools on unseen corpora. Furthermore, it contains harmonized versions of [31 biomedical NER data sets](HUNFLAIR_CORPORA.md). @@ -15,7 +15,7 @@ NER data sets](HUNFLAIR_CORPORA.md). ## Quick Start #### Requirements and Installation -HunFlair is based on Flair 0.6+ and Python 3.6+. +*HunFlair* is based on Flair 0.6+ and Python 3.6+. If you do not have Python 3.6, install it first. [Here is how for Ubuntu 16.04](https://vsupalov.com/developing-with-python3-6-on-ubuntu-16-04/). Then, in your favorite virtual environment, simply do: ``` @@ -61,11 +61,11 @@ Span [7]: "Mouse" [− Labels: Species (0.9979)] ## Comparison to other biomedical NER tools Tools for biomedical NER are typically trained and evaluated on rather small gold standard data sets. 
-However, they are applied "in the wild", i.e., to a much larger collection of texts, often varying in +However, they are applied "in the wild" to a much larger collection of texts, often varying in topic, entity distribution, genre (e.g. patents vs. scientific articles) and text type (e.g. abstract vs. full text), which can lead to severe drops in performance. -HunFlair outperforms other biomedical NER tools on corpora not used for training of neither HunFlair +*HunFlair* outperforms other biomedical NER tools on corpora not used for training of either *HunFlair* or any of the competitor tools. | Corpus | Entity Type | Misc[1](#f1) | SciSpaCy | HUNER | HunFlair | @@ -89,14 +89,14 @@ of the gold standard data. We allow a shift by max one character. [DNorm](https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/DNorm.html) for Disease -Here's how to [reproduce these numbers](https://github.com/hu-ner/hunflair-experiments) using Flair. -You can also find detailed evaluations and discussions in our paper. +Here's how to [reproduce these numbers](https://github.com/hu-ner/hunflair-experiments) using Flair. +You can find detailed evaluations and discussions in [our paper](http://arxiv.org/abs/XXX). 
## Tutorials -We provide a set of quick tutorials to get you started with HunFlair: +We provide a set of quick tutorials to get you started with *HunFlair*: * [Tutorial 1: Tagging](HUNFLAIR_TUTORIAL_1_TAGGING.md) ## Citing HunFlair -Please cite the following paper when using HunFlair: +Please cite the following paper when using *HunFlair*: ~~~ @article{weber2020hunflair, author = {Weber, Leon and S{\"a}nger, Mario and M{\"u}nchmeyer, Jannes and Habibi, Maryam and Leser, Ulf and Akbik, Alan}, diff --git a/resources/docs/HUNFLAIR_CORPORA.md b/resources/docs/HUNFLAIR_CORPORA.md index b8a3ae545..e1fe104f4 100644 --- a/resources/docs/HUNFLAIR_CORPORA.md +++ b/resources/docs/HUNFLAIR_CORPORA.md @@ -4,7 +4,7 @@ Here you can find an overview about biomedical NER data sets integrated in *HunF __Content:__ [Overview](#overview) | [HUNER Data Sets](#huner-data-sets) | [BioBERT Evaluation Splits](#biobert-evaluation-splits) ## Overview -HunFlair integrates 31 biomedical named entity recognition (NER) data sets and provides +*HunFlair* integrates 31 biomedical named entity recognition (NER) data sets and provides them in an unified format to foster the development and evaluation of new NER models. All data set implementations can be found in `flair.datasets.biomedical`. diff --git a/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md b/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md index 4e22b4acf..874398470 100644 --- a/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md +++ b/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md @@ -1,9 +1,9 @@ # HunFlair Tutorial 1: Tagging -This is part 1 of the tutorial, in which we show how to use our pre-trained HunFlair models to tag your text. +This is part 1 of the tutorial, in which we show how to use our pre-trained *HunFlair* models to tag your text. ### Tagging with Pre-trained HunFlair-Models -Let's use the pre-trained HunFlair model for biomedical named entity recognition (NER). 
+Let's use the pre-trained *HunFlair* model for biomedical named entity recognition (NER). This model was trained over 24 biomedical NER data sets and can recognize 5 different entity types, i.e. cell lines, chemicals, disease, gene / proteins and species. ```python @@ -88,7 +88,7 @@ Tokenization, i.e. separating a text into tokens / words, is an important issue in general and biomedical text mining in particular. So far, we used a tokenizer for general domain text. This can be unfavourable if applied to biomedical texts. -HunFlair integrates [SciSpaCy](https://allenai.github.io/scispacy/), a library specially designed to work with scientific text. +*HunFlair* integrates [SciSpaCy](https://allenai.github.io/scispacy/), a library specially designed to work with scientific text. To use the library we first have to install it and download one of it's models: ~~~ pip install scispacy==0.2.5 From 7f07f329a6a19ccd06c38b57a1a1811f79db7481 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20S=C3=A4nger?= Date: Fri, 14 Aug 2020 15:20:30 +0200 Subject: [PATCH 26/35] Add model training tutorial --- resources/docs/HUNFLAIR.md | 1 + .../docs/HUNFLAIR_TUTORIAL_2_TRAINING.md | 162 ++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md index acb24949f..53f652ab8 100644 --- a/resources/docs/HUNFLAIR.md +++ b/resources/docs/HUNFLAIR.md @@ -94,6 +94,7 @@ You can find detailed evaluations and discussions in [our paper](http://arxiv.or ## Tutorials We provide a set of quick tutorials to get you started with *HunFlair*: * [Tutorial 1: Tagging](HUNFLAIR_TUTORIAL_1_TAGGING.md) +* [Tutorial 2: Training biomedical NER models](HUNFLAIR_TUTORIAL_2_TRAINING.md) ## Citing HunFlair Please cite the following paper when using *HunFlair*: diff --git a/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md b/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md new file mode 100644 
index 000000000..9c8508afc --- /dev/null +++ b/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md @@ -0,0 +1,162 @@ +# HunFlair Tutorial 2: Training NER models + +This part of the tutorial shows how you can train your own biomedical named entity recognition models +using state-of-the-art word embeddings. + +For this tutorial, we assume that you're familiar with the [base types](/resources/docs/TUTORIAL_1_BASICS.md) of Flair +and how [word embeddings](/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md) and +[flair embeddings](/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md) work. +You should also know how to [load a corpus](/resources/docs/TUTORIAL_6_CORPUS.md). + +## Train a biomedical NER model from scratch +Here is example code for a biomedical NER model trained over `NCBI_DISEASE` corpus, using word embeddings +and flair embeddings based on biomedical abstracts from PubMed and full-texts from PMC. +```python +from flair.datasets import NCBI_DISEASE + +# 1. get the corpus +corpus = NCBI_DISEASE() +print(corpus) + +# 2. make the tag dictionary from the corpus +tag_dictionary = corpus.make_tag_dictionary(tag_type="ner") + +# 3. initialize embeddings +from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings + +embedding_types = [ + + # word embeddings trained on PubMed and PMC + WordEmbeddings("glove"), + + # flair embeddings trained on PubMed and PMC + FlairEmbeddings('pm_pmc-forward/best-lm.pt'), + FlairEmbeddings('pm_pmc-backward/best-lm.pt'), +] + + +embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types) + +# 4. initialize sequence tagger +from flair.models import SequenceTagger + +tagger: SequenceTagger = SequenceTagger( + hidden_size=256, + embeddings=embeddings, + tag_dictionary=tag_dictionary, + tag_type="ner", + use_crf=True, + locked_dropout=0.5 +) + +# 5. 
initialize trainer +from flair.trainers import ModelTrainer + +trainer: ModelTrainer = ModelTrainer(tagger, corpus) + +trainer.train( + base_path="taggers/ncbi-disease", + train_with_dev=False, + max_epochs=200, + learning_rate=0.1, + mini_batch_size=32 +) +``` +Once the model is trained you can use it to predict tags for new sentences. +Just call the predict method of the model. +```python +# load the model you trained +model = SequenceTagger.load("taggers/ncbi-disease/best-model.pt") + +# create example sentence +from flair.data import Sentence +sentence = Sentence("Women who smoke 20 cigarettes a day are four times more likely to develop breast cancer.") + +# predict tags and print +model.predict(sentence) + +print(sentence.to_tagged_string()) +``` +If the model works well, it will correctly tag "breast cancer" as disease in this example: +~~~ +Women who smoke 20 cigarettes a day are four times more likely to develop breast cancer . +~~~ + +## Fine-tuning HunFlair models +In addition to training a model completely from scratch, there is also the opportunity to just fine-tune +the *HunFlair* models (or any other pre-trained model) to your target domain / corpus. +This can be advantageous because the pre-trained models are based on a much broader data base, +which may allow a better and faster adaptation to the target domain. In the following example +we fine-tune the `hunflair-disease` model on the `NCBI_DISEASE` corpus: +```python +# 1. load your target corpus +from flair.datasets import NCBI_DISEASE +corpus = NCBI_DISEASE() + +# 2. load the pre-trained sequence tagger +from flair.models import SequenceTagger +tagger: SequenceTagger = SequenceTagger.load("hunflair-disease") + +# 3. initialize trainer +from flair.trainers import ModelTrainer +trainer: ModelTrainer = ModelTrainer(tagger, corpus) + +# 4. 
fine-tune on the target corpus +trainer.train( + base_path="taggers/hunflair-disease-finetuned-ncbi", + train_with_dev=False, + max_epochs=200, + learning_rate=0.1, + mini_batch_size=32 +) +``` +## Training HunFlair from scratch +*HunFlair* consists of distinct models for the entity types cell line, chemical, disease, gene/protein +and species. For each entity multiple corpora are used to train the model for the specific entity. +The following code example illustrates the training process of *HunFlair* for *cell line*: +```python +from flair.datasets import HUNER_CELL_LINE + +# 1. get all corpora for a specific entity type +from flair.models import SequenceTagger +corpus = HUNER_CELL_LINE() + +# 2. initialize embeddings +from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings +embedding_types = [ + WordEmbeddings("pubmed_pmc_wiki_sg_1M.gensim"), + FlairEmbeddings('pm_pmc-forward/best-lm.pt'), + FlairEmbeddings('pm_pmc-backward/best-lm.pt'), + +] + +embeddings = StackedEmbeddings(embeddings=embedding_types) + +# 3. initialize sequence tagger +tag_dictionary = corpus.make_tag_dictionary(tag_type="ner") + +tagger = SequenceTagger( + hidden_size=256, + embeddings=embeddings, + tag_dictionary=tag_dictionary, + tag_type="ner", + use_crf=True, + locked_dropout=0.5 +) + +# 4. train the model +from flair.trainers import ModelTrainer +trainer = ModelTrainer(tagger, corpus) + +trainer.train( + base_path="taggers/hunflair-cell-line", + train_with_dev=False, + max_epochs=200, + learning_rate=0.1, + mini_batch_size=32 +) +``` +Analogously, distinct models can be trained for chemicals, diseases, genes/proteins and species using +`HUNER_CHEMICAL`, `HUNER_DISEASE`, `HUNER_GENE`, `HUNER_SPECIES`, respectively. 
+ + From 24d3aa5225d7eda6d5327eaf688c27c0543b425f Mon Sep 17 00:00:00 2001 From: Leon Weber Date: Fri, 14 Aug 2020 15:40:50 +0200 Subject: [PATCH 27/35] Update HUNFLAIR_CORPORA.md Fix erroneous entity types --- resources/docs/HUNFLAIR_CORPORA.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/resources/docs/HUNFLAIR_CORPORA.md b/resources/docs/HUNFLAIR_CORPORA.md index e1fe104f4..bdecd67bb 100644 --- a/resources/docs/HUNFLAIR_CORPORA.md +++ b/resources/docs/HUNFLAIR_CORPORA.md @@ -12,31 +12,31 @@ data set implementations can be found in `flair.datasets.biomedical`. | --- | --- | --- | --- | | AnatEM | `ANAT_EM` | Anatomical entities | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3957068/), [Website](http://nactem.ac.uk/anatomytagger/#AnatEM) | | Arizona Disease | `AZDZ` | Disease | [Website](http://diego.asu.edu/index.php) | -| BioCreative II GM | `BC2GM` | Disease | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/) | +| BioCreative II GM | `BC2GM` | Gene | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/) | | BioCreative V CDR task | `CDR` | Chemical, Disease | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4860626/), [Website](https://github.com/JHnlp/BioCreative-V-CDR-Corpus) | | BioInfer | `BIO_INFER` | Gene/Protein | [Paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-8-50) | | BioNLP'2013 Cancer Genetics (ST) | `BIONLP2013_CG` | Chemical, Disease, Gene/Protein, Species | [Paper](https://www.aclweb.org/anthology/W13-2008/) | | BioNLP'2013 Pathway Curation (ST)| `BIONLP2013_PC` | Chemical, Gene/Proteins | [Paper](http://diego.asu.edu/index.php) | | BioSemantics* | `BIOSEMANTICS` | Chemical, Disease | [Paper](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0107477), [Website](https://biosemantics.erasmusmc.nl/index.php/resources/chemical-patent-corpus)| -| CellFinder | `CELL_FINDER` | Cell line, Gene Species | 
[Paper](https://pdfs.semanticscholar.org/38e3/75aeeeb1937d03c3c80128a70d8e7a74441f.pdf) | +| CellFinder | `CELL_FINDER` | Cell line, Gene, Species | [Paper](https://pdfs.semanticscholar.org/38e3/75aeeeb1937d03c3c80128a70d8e7a74441f.pdf) | | CEMP | `CEMP` | Chemical | [Website](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/cemp-detailed-task-description/) | | CHEBI | `CHEBI` | Chemical, Gene, Species | [Paper](http://www.lrec-conf.org/proceedings/lrec2018/pdf/229.pdf) | | CHEMDNER | `CHEMDNER` | Chemical | [Paper](https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-7-S1-S2) | | CLL | `CLL` | Cell line | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4708107/) | -| DECA | `DECA` | Gene/Protein | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2828111/) | -| FSU | `FSU` | Gene/Protein | [Paper](https://www.aclweb.org/anthology/W10-1838/) | -| GPRO | `GPRO` | Gene/Protein | [Website](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/gpro-detailed-task-description/) | +| DECA | `DECA` | Gene | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2828111/) | +| FSU | `FSU` | Gene | [Paper](https://www.aclweb.org/anthology/W10-1838/) | +| GPRO | `GPRO` | Gene | [Website](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/gpro-detailed-task-description/) | | CRAFT (v2.0) | `CRAFT` | Chemical, Gene, Species | [Paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-13-161) | | CRAFT (v4.0.1) | `CRAFT_V4` | Chemical, Gene, Species | [Website](https://github.com/UCDenver-ccp/CRAFT) | | GELLUS | `GELLUS` | Cell line | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4708107/) | -| IEPA | `IEPA` | Gene/Protein | [Paper](https://www.ncbi.nlm.nih.gov/pubmed/11928487) | -| JNLPBA | `JNLPBA` | Disease | [Paper](https://www.aclweb.org/anthology/W04-1213.pdf) | +| IEPA | `IEPA` | Gene | [Paper](https://www.ncbi.nlm.nih.gov/pubmed/11928487) | +| JNLPBA | `JNLPBA` | Cell line, Gene | 
[Paper](https://www.aclweb.org/anthology/W04-1213.pdf) | | LINNEAUS | `LINNEAUS` | Species | [Paper](https://www.ncbi.nlm.nih.gov/pubmed/20149233) | -| LocText | `LOCTEXT` | Disease | [Paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2021-9) | -| miRNA | `MIRNA` | Disease | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4602280/) | +| LocText | `LOCTEXT` | Gene, Species | [Paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2021-9) | +| miRNA | `MIRNA` | Disease, Gene, Species | [Paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4602280/) | | NCBI Disease | `NCBI_DISEASE` | Disease | [Paper](https://www.ncbi.nlm.nih.gov/pubmed/24393765) | -| Osiris v1.2 | `OSIRIS` | Gene/Protein, Mutation | [Paper](https://www.ncbi.nlm.nih.gov/pubmed/18251998) | -| Plant-Disease-Relations | `PDR` | Disease, Plant | [Paper](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0221582), [Website](http://gcancer.org/pdr/) | +| Osiris v1.2 | `OSIRIS` | Gene | [Paper](https://www.ncbi.nlm.nih.gov/pubmed/18251998) | +| Plant-Disease-Relations | `PDR` | Disease | [Paper](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0221582), [Website](http://gcancer.org/pdr/) | | S800 | `S800` | Species | [Paper](http://www.plosone.org/article/info:doi%2F10.1371%2Fjournal.pone.0065390) | | SCAI Chemicals | `SCAI_CHEMICALS` | Chemical | [Paper](https://pub.uni-bielefeld.de/record/2603498) | | SCAI Disease | `SCAI_DISEASE` | Disease | [Paper](https://pub.uni-bielefeld.de/record/2603398) | From 1fa845fcd1189d901afd3cff208437f350b70a8b Mon Sep 17 00:00:00 2001 From: alanakbik Date: Fri, 14 Aug 2020 16:58:18 +0200 Subject: [PATCH 28/35] Add biomedical embeddings --- flair/embeddings/token.py | 68 ++++++++++++++------------------------- 1 file changed, 24 insertions(+), 44 deletions(-) diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index ff1c2bb2e..ec6d8347a 100644 --- 
a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -117,52 +117,38 @@ def __init__(self, embeddings: str, field: str = None): """ self.embeddings = embeddings - old_base_path = ( - "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/" - ) - base_path = ( - "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/" - ) - embeddings_path_v4 = ( - "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/" - ) + old_base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/" + base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/" + embeddings_path_v4 = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/" embeddings_path_v4_1 = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4.1/" + hu_path: str = "https://flair.informatik.hu-berlin.de/resources/embeddings/" cache_dir = Path("embeddings") # GLOVE embeddings if embeddings.lower() == "glove" or embeddings.lower() == "en-glove": cached_path(f"{old_base_path}glove.gensim.vectors.npy", cache_dir=cache_dir) - embeddings = cached_path( - f"{old_base_path}glove.gensim", cache_dir=cache_dir - ) + embeddings = cached_path(f"{old_base_path}glove.gensim", cache_dir=cache_dir) # TURIAN embeddings elif embeddings.lower() == "turian" or embeddings.lower() == "en-turian": - cached_path( - f"{embeddings_path_v4_1}turian.vectors.npy", cache_dir=cache_dir - ) - embeddings = cached_path( - f"{embeddings_path_v4_1}turian", cache_dir=cache_dir - ) + cached_path(f"{embeddings_path_v4_1}turian.vectors.npy", cache_dir=cache_dir) + embeddings = cached_path(f"{embeddings_path_v4_1}turian", cache_dir=cache_dir) # KOMNINOS embeddings elif embeddings.lower() == "extvec" or embeddings.lower() == "en-extvec": - cached_path( - f"{old_base_path}extvec.gensim.vectors.npy", cache_dir=cache_dir - ) - embeddings = cached_path( - f"{old_base_path}extvec.gensim", cache_dir=cache_dir - ) + 
cached_path(f"{old_base_path}extvec.gensim.vectors.npy", cache_dir=cache_dir) + embeddings = cached_path(f"{old_base_path}extvec.gensim", cache_dir=cache_dir) + + # pubmed embeddings + elif embeddings.lower() == "pubmed" or embeddings.lower() == "en-pubmed": + cached_path(f"{hu_path}pubmed_pmc_wiki_sg_1M.gensim.vectors.npy", cache_dir=cache_dir) + embeddings = cached_path(f"{hu_path}pubmed_pmc_wiki_sg_1M.gensim", cache_dir=cache_dir) # FT-CRAWL embeddings elif embeddings.lower() == "crawl" or embeddings.lower() == "en-crawl": - cached_path( - f"{base_path}en-fasttext-crawl-300d-1M.vectors.npy", cache_dir=cache_dir - ) - embeddings = cached_path( - f"{base_path}en-fasttext-crawl-300d-1M", cache_dir=cache_dir - ) + cached_path(f"{base_path}en-fasttext-crawl-300d-1M.vectors.npy", cache_dir=cache_dir) + embeddings = cached_path(f"{base_path}en-fasttext-crawl-300d-1M", cache_dir=cache_dir) # FT-CRAWL embeddings elif ( @@ -170,21 +156,13 @@ def __init__(self, embeddings: str, field: str = None): or embeddings.lower() == "en-news" or embeddings.lower() == "en" ): - cached_path( - f"{base_path}en-fasttext-news-300d-1M.vectors.npy", cache_dir=cache_dir - ) - embeddings = cached_path( - f"{base_path}en-fasttext-news-300d-1M", cache_dir=cache_dir - ) + cached_path(f"{base_path}en-fasttext-news-300d-1M.vectors.npy", cache_dir=cache_dir) + embeddings = cached_path(f"{base_path}en-fasttext-news-300d-1M", cache_dir=cache_dir) # twitter embeddings elif embeddings.lower() == "twitter" or embeddings.lower() == "en-twitter": - cached_path( - f"{old_base_path}twitter.gensim.vectors.npy", cache_dir=cache_dir - ) - embeddings = cached_path( - f"{old_base_path}twitter.gensim", cache_dir=cache_dir - ) + cached_path(f"{old_base_path}twitter.gensim.vectors.npy", cache_dir=cache_dir) + embeddings = cached_path(f"{old_base_path}twitter.gensim", cache_dir=cache_dir) # two-letter language code wiki embeddings elif len(embeddings.lower()) == 2: @@ -540,8 +518,10 @@ def __init__(self, 
"pt-forward": f"{aws_path}/embeddings-v0.4/lm-pt-forward.pt", "pt-backward": f"{aws_path}/embeddings-v0.4/lm-pt-backward.pt", # Pubmed - "pubmed-forward": f"{aws_path}/embeddings-v0.4.1/pubmed-2015-fw-lm.pt", - "pubmed-backward": f"{aws_path}/embeddings-v0.4.1/pubmed-2015-bw-lm.pt", + "pubmed-forward": f"{hu_path}/embeddings/pm_pmc-forward/pubmed-forward.pt", + "pubmed-backward": f"{hu_path}/embeddings/pm_pmc-backward/pubmed-backward.pt", + "pubmed-2015-forward": f"{aws_path}/embeddings-v0.4.1/pubmed-2015-fw-lm.pt", + "pubmed-2015-backward": f"{aws_path}/embeddings-v0.4.1/pubmed-2015-bw-lm.pt", # Slovenian "sl-forward": f"{aws_path}/embeddings-stefan-it/lm-sl-opus-large-forward-v0.1.pt", "sl-backward": f"{aws_path}/embeddings-stefan-it/lm-sl-opus-large-backward-v0.1.pt", From aaa90234e85f272490bf71701bc53e24a92db9d6 Mon Sep 17 00:00:00 2001 From: Leon Weber Date: Fri, 14 Aug 2020 17:13:27 +0200 Subject: [PATCH 29/35] Mention the new LM and word embeddings in the docs --- resources/docs/HUNFLAIR.md | 4 +++- resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md | 12 ++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md index 53f652ab8..62ae91dca 100644 --- a/resources/docs/HUNFLAIR.md +++ b/resources/docs/HUNFLAIR.md @@ -4,7 +4,9 @@ models for genes/proteins, chemicals, diseases, species and cell lines. *HunFlair* builds on pretrained domain-specific language models and outperforms other biomedical NER tools on unseen corpora. Furthermore, it contains harmonized versions of [31 biomedical -NER data sets](HUNFLAIR_CORPORA.md). +NER data sets](HUNFLAIR_CORPORA.md) and comes with a Flair language model ("pubmed-X") and +FastText embeddings ("pubmed") that were trained on roughly 3 million full texts and about +25 million abstracts from the biomedical domain. 
Content: [Quick Start](#quick-start) | diff --git a/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md b/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md index 9c8508afc..7ecc7987d 100644 --- a/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md +++ b/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md @@ -27,11 +27,11 @@ from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings embedding_types = [ # word embeddings trained on PubMed and PMC - WordEmbeddings("glove"), + WordEmbeddings("pubmed"), # flair embeddings trained on PubMed and PMC - FlairEmbeddings('pm_pmc-forward/best-lm.pt'), - FlairEmbeddings('pm_pmc-backward/best-lm.pt'), + FlairEmbeddings('pubmed-forward'), + FlairEmbeddings('pubmed-backward'), ] @@ -124,9 +124,9 @@ corpus = HUNER_CELL_LINE() # 2. initialize embeddings from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings embedding_types = [ - WordEmbeddings("pubmed_pmc_wiki_sg_1M.gensim"), - FlairEmbeddings('pm_pmc-forward/best-lm.pt'), - FlairEmbeddings('pm_pmc-backward/best-lm.pt'), + WordEmbeddings("pubmed"), + FlairEmbeddings('pubmed-forward'), + FlairEmbeddings('pubmed-backward'), ] From 89b35a1e11d6ca55fe52ba370a4e91039af95efe Mon Sep 17 00:00:00 2001 From: alanakbik Date: Sun, 16 Aug 2020 22:17:14 +0200 Subject: [PATCH 30/35] Add NER model with pooled embeddings --- flair/models/sequence_tagger_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 4aa4991e1..2fd2598a5 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -978,6 +978,10 @@ def _fetch_model(model_name) -> str: [aws_resource_path_v04, "NER-conll03-english", "en-ner-conll03-v0.4.pt"] ) + model_map["ner-pooled"] = "/".join( + [hu_path, "NER-conll03-english-pooled", "en-ner-conll03-pooled-v0.5.pt"] + ) + model_map["ner-fast"] = "/".join( [ aws_resource_path_v04, From b5fb7578877949c8f2a4a43c675c9b4ec3716392 Mon Sep 
17 00:00:00 2001 From: Leon Weber Date: Mon, 17 Aug 2020 08:13:07 +0200 Subject: [PATCH 31/35] GH-1513: Add page on how to reproduce paper results --- resources/docs/HUNFLAIR.md | 2 +- resources/docs/HUNFLAIR_EXPERIMENTS.md | 103 +++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 resources/docs/HUNFLAIR_EXPERIMENTS.md diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md index 62ae91dca..b23ac25b6 100644 --- a/resources/docs/HUNFLAIR.md +++ b/resources/docs/HUNFLAIR.md @@ -90,7 +90,7 @@ of the gold standard data. We allow a shift by max one character. [DNorm](https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/DNorm.html) for Disease -Here's how to [reproduce these numbers](https://github.com/hu-ner/hunflair-experiments) using Flair. +Here's how to [reproduce these numbers](HUNFLAIR_EXPERIMENTS.md) using Flair. You can find detailed evaluations and discussions in [our paper](http://arxiv.org/abs/XXX). ## Tutorials diff --git a/resources/docs/HUNFLAIR_EXPERIMENTS.md b/resources/docs/HUNFLAIR_EXPERIMENTS.md new file mode 100644 index 000000000..e3d09dae5 --- /dev/null +++ b/resources/docs/HUNFLAIR_EXPERIMENTS.md @@ -0,0 +1,103 @@ +# Training the models from the paper +Here's how we trained the models that we evaluated in the paper ("hunflair-paper") + +```python +# 1. 
define corpora +from flair.datasets import biomedical + +CELLLINE_CORPORA = [ + biomedical.HUNER_CELL_LINE_CELL_FINDER(), + biomedical.HUNER_CELL_LINE_CLL(), + biomedical.HUNER_CELL_LINE_GELLUS(), + biomedical.HUNER_CELL_LINE_JNLPBA() +] + +CHEMICAL_CORPORA = [ + biomedical.HUNER_CHEMICAL_CDR(), + biomedical.HUNER_CHEMICAL_CEMP(), + biomedical.HUNER_CHEMICAL_CHEBI(), + biomedical.HUNER_CHEMICAL_CHEMDNER(), + biomedical.HUNER_CHEMICAL_SCAI() +] + +DISEASE_CORPORA = [ + biomedical.HUNER_DISEASE_CDR(), + biomedical.HUNER_DISEASE_MIRNA(), + biomedical.HUNER_DISEASE_NCBI(), + biomedical.HUNER_DISEASE_SCAI(), + biomedical.HUNER_DISEASE_VARIOME() +] + +GENE_CORPORA = [ + biomedical.HUNER_GENE_BC2GM(), + biomedical.HUNER_GENE_BIO_INFER(), + biomedical.HUNER_GENE_CELL_FINDER(), + biomedical.HUNER_GENE_CHEBI(), + biomedical.HUNER_GENE_DECA(), + biomedical.HUNER_GENE_FSU(), + biomedical.HUNER_GENE_GPRO(), + biomedical.HUNER_GENE_IEPA(), + biomedical.HUNER_GENE_JNLPBA(), + biomedical.HUNER_GENE_LOCTEXT(), + biomedical.HUNER_GENE_MIRNA(), + biomedical.HUNER_GENE_OSIRIS(), + biomedical.HUNER_GENE_VARIOME() +] + + +SPECIES_CORPORA = [ + biomedical.HUNER_SPECIES_CELL_FINDER(), + biomedical.HUNER_SPECIES_CHEBI(), + biomedical.HUNER_SPECIES_LINNEAUS(), + biomedical.HUNER_SPECIES_LOCTEXT(), + biomedical.HUNER_SPECIES_MIRNA(), + biomedical.HUNER_SPECIES_S800(), + biomedical.HUNER_SPECIES_VARIOME() +] + +# 2. initialize embeddings +from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings +embedding_types = [ + WordEmbeddings("pubmed_pmc_wiki_sg_1M.gensim"), + FlairEmbeddings('pm_pmc-forward/best-lm.pt'), + FlairEmbeddings('pm_pmc-backward/best-lm.pt'), + +] + +embeddings = StackedEmbeddings(embeddings=embedding_types) + +# 3. 
Initialize corpus +# We also train on the test portions of the corpora, because we evaluate on held-out corpora +from flair.data import MultiCorpus +from torch.utils.data import ConcatDataset +corpus = MultiCorpus(GENE_CORPORA) +corpus._train = ConcatDataset([corpus._train, corpus._test]) + +# 4. Initialize sequence tagger +from flair.models import SequenceTagger +tag_dictionary = corpus.make_tag_dictionary(tag_type="ner") + +tagger = SequenceTagger( + hidden_size=256, + embeddings=embeddings, + tag_dictionary=tag_dictionary, + tag_type="ner", + use_crf=True, + locked_dropout=0.5 +) + +# 5. train the model +from flair.trainers import ModelTrainer +trainer = ModelTrainer(tagger, corpus) + +trainer.train( + base_path="taggers/hunflair-gene", + train_with_dev=False, + max_epochs=200, + learning_rate=0.1, + mini_batch_size=32 +) +``` + +The taggers for the other entity types are trained analogously. +Details on the evaluation can be found in a [dedicated github repository](https://github.com/hu-ner/hunflair-experiments). \ No newline at end of file From 11b845384a71aa9c21c9f5cec2a6fa9d1773cf09 Mon Sep 17 00:00:00 2001 From: Leon Weber Date: Mon, 17 Aug 2020 08:17:21 +0200 Subject: [PATCH 32/35] GH-1513: Use uploaded versions of embeddings --- resources/docs/HUNFLAIR_EXPERIMENTS.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/resources/docs/HUNFLAIR_EXPERIMENTS.md b/resources/docs/HUNFLAIR_EXPERIMENTS.md index e3d09dae5..736bdd8e0 100644 --- a/resources/docs/HUNFLAIR_EXPERIMENTS.md +++ b/resources/docs/HUNFLAIR_EXPERIMENTS.md @@ -58,9 +58,9 @@ SPECIES_CORPORA = [ # 2. 
initialize embeddings from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings embedding_types = [ - WordEmbeddings("pubmed_pmc_wiki_sg_1M.gensim"), - FlairEmbeddings('pm_pmc-forward/best-lm.pt'), - FlairEmbeddings('pm_pmc-backward/best-lm.pt'), + WordEmbeddings("pubmed"), + FlairEmbeddings("pubmed-forward"), + FlairEmbeddings("pubmed-backward"), ] From 75625ffe3e4d842fd64ce1a2a02c6f9de0d8d0a1 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Mon, 17 Aug 2020 10:13:35 +0200 Subject: [PATCH 33/35] deactivate weight writing by default --- flair/trainers/trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index c729bf98b..b04c85d09 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -84,6 +84,7 @@ def train( batch_growth_annealing: bool = False, shuffle: bool = True, param_selection_mode: bool = False, + write_weights: bool = False, num_workers: int = 6, sampler=None, use_amp: bool = False, @@ -405,7 +406,7 @@ def train( ) batch_time = 0 iteration = self.epoch * total_number_of_batches + batch_no - if not param_selection_mode: + if not param_selection_mode and write_weights: weight_extractor.extract_weights( self.model.state_dict(), iteration ) From b90eea09512b2b5d4b887f2f0a3580fcf8a5ae36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20S=C3=A4nger?= Date: Mon, 17 Aug 2020 12:20:49 +0200 Subject: [PATCH 34/35] Minor changes to documentation --- resources/docs/HUNFLAIR_EXPERIMENTS.md | 4 ++-- resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md | 2 ++ resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md | 8 ++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/resources/docs/HUNFLAIR_EXPERIMENTS.md b/resources/docs/HUNFLAIR_EXPERIMENTS.md index 736bdd8e0..e44ce4b5e 100644 --- a/resources/docs/HUNFLAIR_EXPERIMENTS.md +++ b/resources/docs/HUNFLAIR_EXPERIMENTS.md @@ -1,5 +1,5 @@ # Training the models from the paper -Here's how we trained the 
models that we evaluated in the paper ("hunflair-paper") +Here's how we trained the models that we evaluated in [our paper](http://arxiv.org/abs/XXX). ```python # 1. define corpora @@ -100,4 +100,4 @@ trainer.train( ``` The taggers for the other entity types are trained analogously. -Details on the evaluation can be found in a [dedicated github repository](https://github.com/hu-ner/hunflair-experiments). \ No newline at end of file +Details on the evaluation can be found in a [dedicated github repository](https://github.com/hu-ner/hunflair-experiments). diff --git a/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md b/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md index 874398470..91cd4c9e9 100644 --- a/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md +++ b/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md @@ -140,6 +140,8 @@ FXS patients present several behavioral abnormalities symptoms , e.g. , altered social interaction and communication , are also often observed : FXS is indeed the most common monogenic cause of autism . ~~~ +### Next +Now, let us look at how to [train your own biomedical models](HUNFLAIR_TUTORIAL_2_TRAINING.md) to tag your text. 
diff --git a/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md b/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md index 7ecc7987d..2e79e9a09 100644 --- a/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md +++ b/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md @@ -30,8 +30,8 @@ embedding_types = [ WordEmbeddings("pubmed"), # flair embeddings trained on PubMed and PMC - FlairEmbeddings('pubmed-forward'), - FlairEmbeddings('pubmed-backward'), + FlairEmbeddings("pubmed-forward"), + FlairEmbeddings("pubmed-backward"), ] @@ -125,8 +125,8 @@ corpus = HUNER_CELL_LINE() from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings embedding_types = [ WordEmbeddings("pubmed"), - FlairEmbeddings('pubmed-forward'), - FlairEmbeddings('pubmed-backward'), + FlairEmbeddings("pubmed-forward"), + FlairEmbeddings("pubmed-backward"), ] From 0675683dd04b3be7f840ce0e1f79ac22cf0bdfe6 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Mon, 17 Aug 2020 13:14:06 +0200 Subject: [PATCH 35/35] Update documentation --- resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md | 2 +- resources/docs/TUTORIAL_2_TAGGING.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md b/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md index 7ecc7987d..18839f00a 100644 --- a/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md +++ b/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md @@ -82,7 +82,7 @@ If the model works well, it will correctly tag "breast cancer" as disease in thi Women who smoke 20 cigarettes a day are four times more likely to develop breast cancer . ~~~ -## Fine-tunning HunFlair models +## Fine-tuning HunFlair models Next to training a model completely from scratch, there is also the opportunity to just fine-tune the *HunFlair* models (or any other pre-trained model) to your target domain / corpus. 
This can be advantageous because the pre-trained models are based on a much broader data base, diff --git a/resources/docs/TUTORIAL_2_TAGGING.md b/resources/docs/TUTORIAL_2_TAGGING.md index b994f23a3..419b74f49 100644 --- a/resources/docs/TUTORIAL_2_TAGGING.md +++ b/resources/docs/TUTORIAL_2_TAGGING.md @@ -103,6 +103,7 @@ are provided: | ID | Task | Training Dataset | Accuracy | | ------------- | ------------- |------------- |------------- | | 'ner' | 4-class Named Entity Recognition | Conll-03 | **93.03** (F1) | +| 'ner-pooled' | 4-class Named Entity Recognition (memory inefficient) | Conll-03 | **93.24** (F1) | | 'ner-ontonotes' | [18-class](https://spacy.io/api/annotation#named-entities) Named Entity Recognition | Ontonotes | **89.06** (F1) | | 'chunk' | Syntactic Chunking | Conll-2000 | **96.47** (F1) | | 'pos' | Part-of-Speech Tagging (fine-grained) | Ontonotes | **98.19** (Accuracy) |