Merge pull request #6 from zalandoresearch/v0.1
V0.1
Alan Akbik authored Jul 13, 2018
2 parents f396139 + 092388c commit bc4dfe3
Showing 16 changed files with 682 additions and 1,332 deletions.
README.md (21 changes: 8 additions & 13 deletions)
@@ -2,28 +2,23 @@

![alt text](resources/docs/flair_logo.svg)

-- a very simple framework for **state-of-the-art NLP**. Developed by [Zalando Research](https://research.zalando.com/).
+A very simple framework for **state-of-the-art NLP**. Developed by [Zalando Research](https://research.zalando.com/).

---

Flair uses **hyper-powerful word embeddings** to achieve state-of-the-art accuracies
on a range of natural language processing (NLP) tasks.

Flair is:

* **A powerful syntactic / semantic tagger.** Flair allows you to apply our state-of-the-art models for named entity recognition (NER),
part-of-speech tagging (PoS) and chunking to your text.

-* **A word embedding library.** There are many different types of word embeddings out there, with wildly different properties.
-Flair packages many of them behind a simple interface, so you can mix and match embeddings for your experiments.
+* **A text embedding library.** Flair has simple interfaces that allow you to use and combine different word embeddings.
 In particular, you can try out our proposed
-*[contextual string embeddings](https://drive.google.com/file/d/17yVpFA7MmXaQFTe-HDpZuqw9fJlmzg56/view?usp=sharing)*,
+**[contextual string embeddings](https://drive.google.com/file/d/17yVpFA7MmXaQFTe-HDpZuqw9fJlmzg56/view?usp=sharing)**
to build your own state-of-the-art NLP methods.

* **A Pytorch NLP framework.** Our framework builds directly on [Pytorch](https://pytorch.org/), making it easy to train your own models and
experiment with new approaches using Flair embeddings and classes.

Embedding your text for state-of-the-art NLP has never been easier.

## Comparison with State-of-the-Art

@@ -73,13 +68,13 @@
a pre-trained model and use it to predict tags for the sentence:

```python
from flair.data import Sentence
-from flair.tagging_model import SequenceTaggerLSTM
+from flair.tagging_model import SequenceTagger

# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
-tagger = SequenceTaggerLSTM.load('ner')
+tagger = SequenceTagger.load('ner')

# run NER over sentence
tagger.predict(sentence)
```

@@ -88,15 +83,15 @@
Done! The `Sentence` now has entity annotations. Print the sentence to see what the tagger found.

```python
-print('Analysing %s' % sentence)
+print(sentence)
print('The following NER tags are found:')
-print(sentence.to_tag_string())
+print(sentence.to_tagged_string())
```

This should print:

```console
-Analysing Sentence: "I love Berlin ." - 4 Tokens
+Sentence: "I love Berlin ." - 4 Tokens

The following NER tags are found:

```
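Stitching the updated README snippets together gives one end-to-end script against the renamed v0.1 API. A minimal sketch: it uses only the calls shown in the diff above, and the `S-LOC` tag in the final comment is illustrative:

```python
from flair.data import Sentence
from flair.tagging_model import SequenceTagger

# make a sentence from whitespace-tokenized text
sentence = Sentence('I love Berlin .')

# load the pre-trained NER tagger and tag the sentence in place
tagger = SequenceTagger.load('ner')
tagger.predict(sentence)

# prints: Sentence: "I love Berlin ." - 4 Tokens (via the new __repr__)
print(sentence)

# tokens interleaved with predicted tags, e.g.: I love Berlin <S-LOC> .
print(sentence.to_tagged_string())
```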
flair/data.py (64 changes: 45 additions & 19 deletions)
@@ -126,7 +126,7 @@ def __init__(self, text: str = None, use_tokenizer: bool = False, labels: List[str] = None):

         self.labels: List[str] = labels

-        self.embeddings: Dict = {}
+        self._embeddings: Dict = {}

         # optionally, directly instantiate with sentence tokens
         if text is not None:
@@ -164,15 +164,24 @@ def add_token(self, token: Token):
         token.idx = len(self.tokens)

     def set_embedding(self, name: str, vector):
-        self.embeddings[name] = vector
+        self._embeddings[name] = vector

-    def clear_embeddings(self):
-        self.embeddings: Dict = {}
+    def clear_embeddings(self, also_clear_word_embeddings: bool = True):
+
+        self._embeddings: Dict = {}
+
+        if also_clear_word_embeddings:
+            for token in self:
+                token.clear_embeddings()
+
+    def cpu_embeddings(self):
+        for name, vector in self._embeddings.items():
+            self._embeddings[name] = vector.cpu()

     def get_embedding(self) -> torch.autograd.Variable:
         embeddings = []
-        for embed in sorted(self.embeddings.keys()):
-            embedding = self.embeddings[embed]
+        for embed in sorted(self._embeddings.keys()):
+            embedding = self._embeddings[embed]
             embeddings.append(embedding)

         return torch.cat(embeddings, dim=0)
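For context, a minimal sketch of how the renamed `_embeddings` store and the new `clear_embeddings` / `cpu_embeddings` methods fit together; the embedding name and vector below are made up for illustration:

```python
import torch

from flair.data import Sentence

sentence = Sentence('I love Berlin .')

# store a sentence-level embedding under an arbitrary (illustrative) name
sentence.set_embedding('illustrative-embedding', torch.ones(3))

# concatenation of all stored embeddings, sorted by name
vector = sentence.get_embedding()

# move any stored vectors to CPU memory
sentence.cpu_embeddings()

# drop sentence-level embeddings but keep the per-token ones
sentence.clear_embeddings(also_clear_word_embeddings=False)
```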
@@ -181,24 +190,41 @@ def get_embedding(self) -> torch.autograd.Variable:
     def embedding(self):
         return self.get_embedding()

-    def to_tag_string(self, tag_type: str = 'tag') -> str:
+    def to_tagged_string(self) -> str:

         list = []
         for token in self.tokens:
             list.append(token.text)
-            if token.get_tag(tag_type) == '' or token.get_tag(tag_type) == 'O': continue
-            list.append('<' + token.get_tag(tag_type) + '>')
-        return ' '.join(list)
-
-    def to_ner_string(self) -> str:
-        list = []
-        for token in self.tokens:
-            if token.get_tag('ner') == 'O' or token.get_tag('ner') == '':
-                list.append(token.text)
-            else:
-                list.append(token.text)
-                list.append('<' + token.get_tag('ner') + '>')
+            tags = []
+            for tag_type in token.tags.keys():
+
+                if token.get_tag(tag_type) == '' or token.get_tag(tag_type) == 'O': continue
+                tags.append(token.get_tag(tag_type))
+            all_tags = '<' + '/'.join(tags) + '>'
+            if all_tags != '<>':
+                list.append(all_tags)
         return ' '.join(list)
+
+    # def to_tag_string(self, tag_type: str = 'tag') -> str:
+    #
+    #     list = []
+    #     for token in self.tokens:
+    #         list.append(token.text)
+    #         if token.get_tag(tag_type) == '' or token.get_tag(tag_type) == 'O': continue
+    #         list.append('<' + token.get_tag(tag_type) + '>')
+    #     return ' '.join(list)
+    #
+    # def to_ner_string(self) -> str:
+    #     list = []
+    #     for token in self.tokens:
+    #         if token.get_tag('ner') == 'O' or token.get_tag('ner') == '':
+    #             list.append(token.text)
+    #         else:
+    #             list.append(token.text)
+    #             list.append('<' + token.get_tag('ner') + '>')
+    #     return ' '.join(list)

     def convert_tag_scheme(self, tag_type: str = 'ner', target_scheme: str = 'iob'):

         tags: List[str] = []
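The new `to_tagged_string()` merges all tag types on a token into a single bracket joined by `/`, instead of handling one tag type at a time. A small sketch of the intended behavior, with hand-assigned, purely illustrative tag values:

```python
from flair.data import Sentence

sentence = Sentence('I love Berlin .')

# tag 'Berlin' with two different tag types
for token in sentence:
    if token.text == 'Berlin':
        token.add_tag('ner', 'S-LOC')
        token.add_tag('pos', 'NNP')

# tags of all types are merged per token, in insertion order:
# I love Berlin <S-LOC/NNP> .
print(sentence.to_tagged_string())
```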
@@ -217,7 +243,7 @@ def convert_tag_scheme(self, tag_type: str = 'ner', target_scheme: str = 'iob'):
             self.tokens[index].add_tag(tag_type, tag)

     def __repr__(self):
-        return ' '.join([x.text for x in self.tokens])
+        return 'Sentence: "' + ' '.join([t.text for t in self.tokens]) + '" - %d Tokens' % len(self)

     def __copy__(self):
         s = Sentence()
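The `__repr__` change is what produces the sentence header seen in the README's expected output; a quick check, assuming only the constructor shown above:

```python
from flair.data import Sentence

# the new __repr__ reports the text and the token count
print(Sentence('I love Berlin .'))
# Sentence: "I love Berlin ." - 4 Tokens
```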