Skip to content

Commit

Permalink
fix: added annotations for training data (#1742)
Browse files Browse the repository at this point in the history
* fix: Added annotations for arctic embed models

* added google and bge

* added cohere

* Added e5

* added bge based model2vec

* annotated oAI

* format and update annotations
  • Loading branch information
KennethEnevoldsen authored Jan 11, 2025
1 parent cc27c78 commit 3f093c8
Show file tree
Hide file tree
Showing 9 changed files with 609 additions and 74 deletions.
183 changes: 183 additions & 0 deletions mteb/models/arctic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,32 @@
use_instructions=True,
adapted_from="sentence-transformers/all-MiniLM-L6-v2",
superseded_by=None,
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)


Expand All @@ -128,6 +154,32 @@
use_instructions=True,
adapted_from="intfloat/e5-small-unsupervised",
superseded_by=None,
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)


Expand All @@ -153,6 +205,32 @@
use_instructions=True,
adapted_from="intfloat/e5-base-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5",
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)

arctic_embed_m_long = ModelMeta(
Expand All @@ -178,6 +256,33 @@
use_instructions=True,
adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0",
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# trained on stack exchange, unsure if sources match
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)

arctic_embed_l = ModelMeta(
Expand All @@ -202,6 +307,32 @@
use_instructions=True,
adapted_from="intfloat/e5-base-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0",
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)

arctic_embed_m_v1_5 = ModelMeta(
Expand Down Expand Up @@ -254,6 +385,32 @@
use_instructions=True,
adapted_from="Alibaba-NLP/gte-multilingual-base",
superseded_by=None,
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)

arctic_embed_l_v2_0 = ModelMeta(
Expand All @@ -278,4 +435,30 @@
use_instructions=True,
adapted_from="BAAI/bge-m3-retromae",
superseded_by=None,
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)
93 changes: 93 additions & 0 deletions mteb/models/bge_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,37 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None,  # seemingly released (at least for some models), but the link is broken
training_datasets={
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
},
)

bge_base_en_v1_5 = ModelMeta(
Expand All @@ -50,6 +81,37 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None,  # seemingly released (at least for some models), but the link is broken
training_datasets={
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
},
)

bge_large_en_v1_5 = ModelMeta(
Expand All @@ -73,4 +135,35 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None,  # seemingly released (at least for some models), but the link is broken
training_datasets={
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
},
)
12 changes: 12 additions & 0 deletions mteb/models/cohere_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ def encode(
similarity_fn_name="cosine",
framework=["API"],
use_instructions=True,
public_training_data=False, # assumed
public_training_code=False, # assumed
training_datasets=None,
)

cohere_eng_3 = ModelMeta(
Expand All @@ -257,6 +260,9 @@ def encode(
similarity_fn_name="cosine",
framework=["API"],
use_instructions=True,
public_training_data=False, # assumed
public_training_code=False, # assumed
training_datasets=None,
)

cohere_mult_light_3 = ModelMeta(
Expand All @@ -279,6 +285,9 @@ def encode(
similarity_fn_name="cosine",
framework=["API"],
use_instructions=True,
public_training_data=False, # assumed
public_training_code=False, # assumed
training_datasets=None,
)

cohere_eng_light_3 = ModelMeta(
Expand All @@ -301,4 +310,7 @@ def encode(
similarity_fn_name="cosine",
framework=["API"],
use_instructions=True,
public_training_data=False, # assumed
public_training_code=False, # assumed
training_datasets=None,
)
Loading

0 comments on commit 3f093c8

Please sign in to comment.