Commit
Merge pull request #3339 from flairNLP/more_docs
add some more documentation and (rather empty) glossary page
alanakbik authored Oct 23, 2023
2 parents c5c1bf8 + 1a86265 commit cbe750f
Showing 5 changed files with 67 additions and 8 deletions.
8 changes: 8 additions & 0 deletions docs/_static/glossary.svg
(New SVG image; not rendered in the diff view.)
7 changes: 7 additions & 0 deletions docs/glossary/index.rst
@@ -0,0 +1,7 @@
+Glossary
+========
+
+.. glossary::
+
+    Sentence
+        a sentence is a text-unit consisting of tokens, labels and possibly metadata. Notice that a sentence is not limited in size, hence a Sentence itself could hold either a full document, a paragraph, a simple phrase or a linguistic token.
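To make the new glossary entry concrete, here is a minimal sketch using flair's public Sentence class (the example text is illustrative):

from flair.data import Sentence

# A Sentence is a text unit holding tokens, labels and metadata; it can
# wrap anything from a short phrase to a full document.
sentence = Sentence("The grass is green.")

for token in sentence:   # iterate over the tokens of the unit
    print(token.text)

print(len(sentence))     # token count, not character count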
26 changes: 22 additions & 4 deletions docs/index.rst
@@ -7,7 +7,7 @@ flair
**Version**: |version|

**Useful links**:
-`Getting started <gtutorial/index/intro.html>`_ |
+`Getting started <tutorial/intro.html>`_ |
`Source Repository <https://github.com/flairNLP/flair>`_ |
`Issue Tracker <https://github.com/flairNLP/flair/issues>`_ |

@@ -55,9 +55,9 @@ Flair is a very simple framework for state-of-the-art Natural Language Processing
Contributor's Guide
^^^^^^^^^^^^^^^^^^^

-Want to add to the codebase? Can help add translation or a flowchart to the
+Want to add to the codebase? Can help add to the
 documentation? The contributing guidelines will guide you through the
-process of improving NumPy.
+process of improving Flair.

+++

@@ -68,10 +68,28 @@ Flair is a very simple framework for state-of-the-art Natural Language Processing

To the contributor's guide

+.. grid-item-card::
+    :img-top: ./_static/glossary.svg
+
+    Glossary
+    ^^^^^^^^
+
+    Not sure what the exact meaning of certain terms is? Find their definition in the Glossary.
+
+    +++
+
+    .. button-ref:: glossary/index
+        :expand:
+        :color: secondary
+        :click-parent:
+
+        To the glossary

 .. toctree::
    :maxdepth: 3
    :hidden:

    Tutorials <tutorial/index>
    API reference <api/index>
-   Contributing <contributing/index>
+   Contributing <contributing/index>
+   Glossary <glossary/index>
2 changes: 1 addition & 1 deletion flair/data.py
@@ -720,7 +720,7 @@ def __init__(
:class:`flair.tokenization.SegTokTokenizer`. If `use_tokenizer` is set to False,
:class:`flair.tokenization.SpaceTokenizer` will be used instead. The tokenizer will be ignored,
if `text` refers to pretokenized tokens.
-language_code: Language of the sentence. If not provided, [langdetect](https://pypi.org/project/langdetect/)
+language_code: Language of the sentence. If not provided, `langdetect <https://pypi.org/project/langdetect/>`_
will be called when the language_code is accessed for the first time.
start_position: Start char offset of the sentence in the superordinate document.
"""
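A short sketch of the behavior this docstring describes, assuming flair's public Sentence API and the optional langdetect package:

from flair.data import Sentence

# Pass the language explicitly ...
explicit = Sentence("Der Himmel ist blau.", language_code="de")

# ... or omit it: langdetect is only invoked the first time
# the language code is actually accessed.
lazy = Sentence("Der Himmel ist blau.")
print(lazy.get_language_code())  # e.g. "de"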
32 changes: 29 additions & 3 deletions flair/embeddings/transformer.py
@@ -8,7 +8,7 @@
from abc import abstractmethod
from io import BytesIO
from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast
+from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union, cast

import torch
import transformers
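The change from plain str annotations to typing.Literal (here and in the signature below) lets a static type checker catch invalid option strings; a minimal sketch of the pattern, with hypothetical names:

from typing import Literal

SubtokenPooling = Literal["first", "last", "first_last", "mean"]

def pool(strategy: SubtokenPooling = "first") -> str:
    # mypy or pyright rejects pool("frist") at check time,
    # whereas a plain `str` annotation would accept any value.
    return strategy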
@@ -982,8 +982,8 @@ def __init__(
fine_tune: bool = True,
layers: str = "-1",
layer_mean: bool = True,
-subtoken_pooling: str = "first",
-cls_pooling: str = "cls",
+subtoken_pooling: Literal["first", "last", "first_last", "mean"] = "first",
+cls_pooling: Literal["cls", "max", "mean"] = "cls",
is_token_embedding: bool = True,
is_document_embedding: bool = True,
allow_long_sentences: bool = False,
@@ -999,6 +999,32 @@ def __init__(
use_context_separator: bool = True,
**kwargs,
) -> None:
"""Instantiate transformers embeddings.
Allows using transformers as TokenEmbeddings and DocumentEmbeddings or both.
Args:
model: name of transformer model (see `huggingface hub <https://huggingface.co/models>`_ for options)
fine_tune: If True, the weights of the transformers embedding will be updated during training.
layers: Specify which layers should be extracted for the embeddings. Expects either "all" to extract all layers or a comma separated list of indices (e.g. "-1,-2,-3,-4" for the last 4 layers)
layer_mean: If True, the extracted layers will be averaged. Otherwise, they will be concatenated.
subtoken_pooling: Specify how multiple sub-tokens will be aggregated for a token-embedding.
cls_pooling: Specify how the document-embeddings will be extracted.
is_token_embedding: If True, this embeddings can be handled as token-embeddings.
is_document_embedding: If True, this embeddings can be handled document-embeddings.
allow_long_sentences: If True, too long sentences will be patched and strided and afterwards combined.
use_context: If True, predicting multiple sentences at once, will use the previous and next sentences for context.
respect_document_boundaries: If True, the context calculation will stop if a sentence represents a context boundary.
context_dropout: Integer percentage (0-100) to specify how often the context won't be used during training.
saved_config: Pretrained config used when loading embeddings. Always use None.
tokenizer_data: Tokenizer data used when loading embeddings. Always use None.
feature_extractor_data: Feature extractor data used when loading embeddings. Always use None.
name: The name for the embeddings. Per default the name will be used from the used transformers model.
force_max_length: If True, the tokenizer will always pad the sequences to maximum length.
needs_manual_ocr: If True, bounding boxes will be calculated manually. This is used for models like `layoutlm <https://huggingface.co/docs/transformers/model_doc/layoutlm>`_ where the tokenizer doesn't compute the bounding boxes itself.
use_context_separator: If True, the embedding will hold an additional token to allow the model to distingulish between context and prediction.
**kwargs: Further values forwarded to the transformers config
"""
self.instance_parameters = self.get_instance_parameters(locals=locals())
del self.instance_parameters["saved_config"]
del self.instance_parameters["tokenizer_data"]
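A hedged sketch of the documented parameters in use, via the public TransformerWordEmbeddings wrapper rather than the base class edited here; the model name is just an illustrative Hugging Face hub id:

from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings

embeddings = TransformerWordEmbeddings(
    "bert-base-uncased",        # model: any Hugging Face hub name
    layers="-1,-2,-3,-4",       # extract the last four layers ...
    layer_mean=True,            # ... and average them
    subtoken_pooling="first",   # one of "first", "last", "first_last", "mean"
    fine_tune=False,
    allow_long_sentences=True,
)

sentence = Sentence("Berlin is a city.")
embeddings.embed(sentence)
print(sentence[0].embedding.shape)  # one vector per token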
