Merge pull request #6 from zalandoresearch/v0.1
V0.1
Alan Akbik authored Jul 13, 2018
2 parents f396139 + 092388c commit bc4dfe3
Showing 16 changed files with 682 additions and 1,332 deletions.
README.md (21 changes: 8 additions & 13 deletions)
@@ -2,28 +2,23 @@

![alt text](resources/docs/flair_logo.svg)

-- a very simple framework for **state-of-the-art NLP**. Developed by [Zalando Research](https://research.zalando.com/).
+A very simple framework for **state-of-the-art NLP**. Developed by [Zalando Research](https://research.zalando.com/).

---

Flair uses **hyper-powerful word embeddings** to achieve state-of-the-art accuracies
on a range of natural language processing (NLP) tasks.

Flair is:

* **A powerful syntactic / semantic tagger.** Flair allows you to apply our state-of-the-art models for named entity recognition (NER),
part-of-speech tagging (PoS) and chunking to your text.

-* **A word embedding library.** There are many different types of word embeddings out there, with wildly different properties.
-Flair packages many of them behind a simple interface, so you can mix and match embeddings for your experiments.
+* **A text embedding library.** Flair has simple interfaces that allow you to use and combine different word embeddings.
 In particular, you can try out our proposed
-*[contextual string embeddings](https://drive.google.com/file/d/17yVpFA7MmXaQFTe-HDpZuqw9fJlmzg56/view?usp=sharing)*,
+**[contextual string embeddings](https://drive.google.com/file/d/17yVpFA7MmXaQFTe-HDpZuqw9fJlmzg56/view?usp=sharing)**
to build your own state-of-the-art NLP methods.

* **A Pytorch NLP framework.** Our framework builds directly on [Pytorch](https://pytorch.org/), making it easy to train your own models and
experiment with new approaches using Flair embeddings and classes.

Embedding your text for state-of-the-art NLP has never been easier.

## Comparison with State-of-the-Art

@@ -73,13 +68,13 @@
a pre-trained model and use it to predict tags for the sentence:

```python
from flair.data import Sentence
-from flair.tagging_model import SequenceTaggerLSTM
+from flair.tagging_model import SequenceTagger

# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
-tagger = SequenceTaggerLSTM.load('ner')
+tagger = SequenceTagger.load('ner')

# run NER over sentence
tagger.predict(sentence)
```

@@ -88,15 +83,15 @@
Done! The `Sentence` now has entity annotations. Print the sentence to see what the tagger found.

```python
-print('Analysing %s' % sentence)
+print(sentence)
print('The following NER tags are found:')
-print(sentence.to_tag_string())
+print(sentence.to_tagged_string())
```

This should print:

```console
-Analysing Sentence: "I love Berlin ." - 4 Tokens
+Sentence: "I love Berlin ." - 4 Tokens

The following NER tags are found:

```
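Stitching the updated README snippets together gives one end-to-end script against the renamed v0.1 API. A minimal sketch: it uses only the calls shown in the diff above, and the `S-LOC` tag in the final comment is illustrative:

```python
from flair.data import Sentence
from flair.tagging_model import SequenceTagger

# make a sentence from whitespace-tokenized text
sentence = Sentence('I love Berlin .')

# load the pre-trained NER tagger and tag the sentence in place
tagger = SequenceTagger.load('ner')
tagger.predict(sentence)

# prints: Sentence: "I love Berlin ." - 4 Tokens (via the new __repr__)
print(sentence)

# tokens interleaved with predicted tags, e.g.: I love Berlin <S-LOC> .
print(sentence.to_tagged_string())
```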
flair/data.py (64 changes: 45 additions & 19 deletions)
@@ -126,7 +126,7 @@ def __init__(self, text: str = None, use_tokenizer: bool = False, labels: List[str] = None):

         self.labels: List[str] = labels

-        self.embeddings: Dict = {}
+        self._embeddings: Dict = {}

         # optionally, directly instantiate with sentence tokens
         if text is not None:
@@ -164,15 +164,24 @@ def add_token(self, token: Token):
         token.idx = len(self.tokens)

     def set_embedding(self, name: str, vector):
-        self.embeddings[name] = vector
+        self._embeddings[name] = vector

-    def clear_embeddings(self):
-        self.embeddings: Dict = {}
+    def clear_embeddings(self, also_clear_word_embeddings: bool = True):
+
+        self._embeddings: Dict = {}
+
+        if also_clear_word_embeddings:
+            for token in self:
+                token.clear_embeddings()
+
+    def cpu_embeddings(self):
+        for name, vector in self._embeddings.items():
+            self._embeddings[name] = vector.cpu()

     def get_embedding(self) -> torch.autograd.Variable:
         embeddings = []
-        for embed in sorted(self.embeddings.keys()):
-            embedding = self.embeddings[embed]
+        for embed in sorted(self._embeddings.keys()):
+            embedding = self._embeddings[embed]
             embeddings.append(embedding)

         return torch.cat(embeddings, dim=0)
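For context, a minimal sketch of how the renamed `_embeddings` store and the new `clear_embeddings` / `cpu_embeddings` methods fit together; the embedding name and vector below are made up for illustration:

```python
import torch

from flair.data import Sentence

sentence = Sentence('I love Berlin .')

# store a sentence-level embedding under an arbitrary (illustrative) name
sentence.set_embedding('illustrative-embedding', torch.ones(3))

# concatenation of all stored embeddings, sorted by name
vector = sentence.get_embedding()

# move any stored vectors to CPU memory
sentence.cpu_embeddings()

# drop sentence-level embeddings but keep the per-token ones
sentence.clear_embeddings(also_clear_word_embeddings=False)
```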
@@ -181,24 +190,41 @@ def get_embedding(self) -> torch.autograd.Variable:
     def embedding(self):
         return self.get_embedding()

-    def to_tag_string(self, tag_type: str = 'tag') -> str:
+    def to_tagged_string(self) -> str:

         list = []
         for token in self.tokens:
             list.append(token.text)
-            if token.get_tag(tag_type) == '' or token.get_tag(tag_type) == 'O': continue
-            list.append('<' + token.get_tag(tag_type) + '>')
-        return ' '.join(list)
-
-    def to_ner_string(self) -> str:
-        list = []
-        for token in self.tokens:
-            if token.get_tag('ner') == 'O' or token.get_tag('ner') == '':
-                list.append(token.text)
-            else:
-                list.append(token.text)
-                list.append('<' + token.get_tag('ner') + '>')
+            tags = []
+            for tag_type in token.tags.keys():
+
+                if token.get_tag(tag_type) == '' or token.get_tag(tag_type) == 'O': continue
+                tags.append(token.get_tag(tag_type))
+            all_tags = '<' + '/'.join(tags) + '>'
+            if all_tags != '<>':
+                list.append(all_tags)
         return ' '.join(list)
+
+    # def to_tag_string(self, tag_type: str = 'tag') -> str:
+    #
+    #     list = []
+    #     for token in self.tokens:
+    #         list.append(token.text)
+    #         if token.get_tag(tag_type) == '' or token.get_tag(tag_type) == 'O': continue
+    #         list.append('<' + token.get_tag(tag_type) + '>')
+    #     return ' '.join(list)
+    #
+    # def to_ner_string(self) -> str:
+    #     list = []
+    #     for token in self.tokens:
+    #         if token.get_tag('ner') == 'O' or token.get_tag('ner') == '':
+    #             list.append(token.text)
+    #         else:
+    #             list.append(token.text)
+    #             list.append('<' + token.get_tag('ner') + '>')
+    #     return ' '.join(list)

     def convert_tag_scheme(self, tag_type: str = 'ner', target_scheme: str = 'iob'):

         tags: List[str] = []
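The new `to_tagged_string()` merges all tag types on a token into a single bracket joined by `/`, instead of handling one tag type at a time. A small sketch of the intended behavior, with hand-assigned, purely illustrative tag values:

```python
from flair.data import Sentence

sentence = Sentence('I love Berlin .')

# tag 'Berlin' with two different tag types
for token in sentence:
    if token.text == 'Berlin':
        token.add_tag('ner', 'S-LOC')
        token.add_tag('pos', 'NNP')

# tags of all types are merged per token, in insertion order:
# I love Berlin <S-LOC/NNP> .
print(sentence.to_tagged_string())
```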
@@ -217,7 +243,7 @@ def convert_tag_scheme(self, tag_type: str = 'ner', target_scheme: str = 'iob'):
             self.tokens[index].add_tag(tag_type, tag)

     def __repr__(self):
-        return ' '.join([x.text for x in self.tokens])
+        return 'Sentence: "' + ' '.join([t.text for t in self.tokens]) + '" - %d Tokens' % len(self)

     def __copy__(self):
         s = Sentence()
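The `__repr__` change is what produces the sentence header seen in the README's expected output; a quick check, assuming only the constructor shown above:

```python
from flair.data import Sentence

# the new __repr__ reports the text and the token count
print(Sentence('I love Berlin .'))
# Sentence: "I love Berlin ." - 4 Tokens
```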