Skip to content

Commit

Permalink
fix: added annotations for training data (#1742)
Browse files Browse the repository at this point in the history
* fix: Added annotations for arctic embed models

* added google and bge

* added cohere

* Added e5

* added bge based model2vec

* annotated oAI

* format and update annotations
  • Loading branch information
KennethEnevoldsen authored Jan 11, 2025
1 parent cc27c78 commit 3f093c8
Show file tree
Hide file tree
Showing 9 changed files with 609 additions and 74 deletions.
183 changes: 183 additions & 0 deletions mteb/models/arctic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,32 @@
use_instructions=True,
adapted_from="sentence-transformers/all-MiniLM-L6-v2",
superseded_by=None,
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)


Expand All @@ -128,6 +154,32 @@
use_instructions=True,
adapted_from="intfloat/e5-small-unsupervised",
superseded_by=None,
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)


Expand All @@ -153,6 +205,32 @@
use_instructions=True,
adapted_from="intfloat/e5-base-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5",
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)

arctic_embed_m_long = ModelMeta(
Expand All @@ -178,6 +256,33 @@
use_instructions=True,
adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0",
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# trained on stack exchange, unsure if sources match
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)

arctic_embed_l = ModelMeta(
Expand All @@ -202,6 +307,32 @@
use_instructions=True,
adapted_from="intfloat/e5-base-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0",
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)

arctic_embed_m_v1_5 = ModelMeta(
Expand Down Expand Up @@ -254,6 +385,32 @@
use_instructions=True,
adapted_from="Alibaba-NLP/gte-multilingual-base",
superseded_by=None,
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)

arctic_embed_l_v2_0 = ModelMeta(
Expand All @@ -278,4 +435,30 @@
use_instructions=True,
adapted_from="BAAI/bge-m3-retromae",
superseded_by=None,
public_training_data=False, # couldn't find
public_training_code=False, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified, so assuming everything
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# not in MTEB
# trained on stack exchange (title-body)
# "stackexchange": [],
# potentially means that:
# "StackExchangeClusteringP2P": ["test"],
# "StackExchangeClusteringP2P.v2": ["test"],
# "StackExchangeClustering": ["test"],
# "StackExchangeClustering.v2": ["test"],
# not in MTEB
# "paq": [],
# "s2orc": [],
# "other": [], # undisclosed including webdata
}, # also use synthetic
)
93 changes: 93 additions & 0 deletions mteb/models/bge_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,37 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None,  # seemingly released (at least for some models), but the link is broken
training_datasets={
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
},
)

bge_base_en_v1_5 = ModelMeta(
Expand All @@ -50,6 +81,37 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None,  # seemingly released (at least for some models), but the link is broken
training_datasets={
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
},
)

bge_large_en_v1_5 = ModelMeta(
Expand All @@ -73,4 +135,35 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None,  # seemingly released (at least for some models), but the link is broken
training_datasets={
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
},
)
12 changes: 12 additions & 0 deletions mteb/models/cohere_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ def encode(
similarity_fn_name="cosine",
framework=["API"],
use_instructions=True,
public_training_data=False, # assumed
public_training_code=False, # assumed
training_datasets=None,
)

cohere_eng_3 = ModelMeta(
Expand All @@ -257,6 +260,9 @@ def encode(
similarity_fn_name="cosine",
framework=["API"],
use_instructions=True,
public_training_data=False, # assumed
public_training_code=False, # assumed
training_datasets=None,
)

cohere_mult_light_3 = ModelMeta(
Expand All @@ -279,6 +285,9 @@ def encode(
similarity_fn_name="cosine",
framework=["API"],
use_instructions=True,
public_training_data=False, # assumed
public_training_code=False, # assumed
training_datasets=None,
)

cohere_eng_light_3 = ModelMeta(
Expand All @@ -301,4 +310,7 @@ def encode(
similarity_fn_name="cosine",
framework=["API"],
use_instructions=True,
public_training_data=False, # assumed
public_training_code=False, # assumed
training_datasets=None,
)
Loading

0 comments on commit 3f093c8

Please sign in to comment.