[v2] Refactor evaluators and Abstasks #1707

Merged · 19 commits · Jan 12, 2025
21 changes: 15 additions & 6 deletions mteb/abstasks/AbsTask.py
@@ -11,6 +11,7 @@
import numpy as np
import torch
import tqdm
import transformers
from datasets import Dataset, DatasetDict
from sklearn.preprocessing import MultiLabelBinarizer

@@ -25,6 +26,13 @@
# ^ e.g {'main_score': 0.5, 'hf_subset': 'en-de', 'languages': ['eng-Latn', 'deu-Latn']}


def set_seed(seed: int) -> tuple[random.Random, np.random.Generator]:
torch.manual_seed(seed)
np.random.seed(seed)
transformers.set_seed(seed)
return random.Random(seed), np.random.default_rng(seed)


def _multilabel_subsampling(
dataset_dict: DatasetDict,
seed: int,
@@ -63,14 +71,14 @@ class AbsTask(ABC):
and Dataset is a datasets.Dataset object. "hf subset" is the data subset on Huggingface typically used to denote the language e.g.
datasets.load_dataset("data", "en"). If the dataset does not have a subset this is simply "default".
abstask_prompt: The potential prompt of the abstask
superseeded_by: Denotes the task that this task is superseeded by. Used to issue warning to users of outdated datasets, while maintaining
superseded_by: Denotes the task that this task is superseeded by. Used to issue warning to users of outdated datasets, while maintaining
reproducibility of existing benchmarks.
"""

metadata: TaskMetadata
abstask_prompt: str | None = None
_eval_splits: list[str] | None = None
superseded_by: None | str = None
superseded_by: str | None = None
dataset: dict[HFSubset, DatasetDict] | None = None # type: ignore
data_loaded: bool = False
is_multilingual: bool = False
@@ -85,10 +93,7 @@ def __init__(self, seed: int = 42, **kwargs: Any):
self.save_suffix = kwargs.get("save_suffix", "")

self.seed = seed
random.seed(self.seed)
np.random.seed(self.seed)
torch.manual_seed(self.seed)
torch.cuda.manual_seed_all(self.seed)
self.rng_state, self.np_rng = set_seed(seed)

def check_if_dataset_is_superseded(self):
"""Check if the dataset is superseded by a newer version"""
@@ -146,6 +151,7 @@ def evaluate(
scores[hf_subset] = self._evaluate_subset(
model, data_split, encode_kwargs=encode_kwargs, **kwargs
)
self._add_main_score(scores[hf_subset])
return scores

@abstractmethod
@@ -329,6 +335,9 @@ def filter_languages(
self.hf_subsets = subsets_to_keep
return self

def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def _upload_dataset_to_hub(self, repo_name: str, fields: list[str]) -> None:
if self.is_multilingual:
for config in self.metadata.eval_langs:
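A rough usage sketch of the new module-level helper (illustrative only; the seed value and array sizes below are assumptions, not part of this diff): set_seed seeds the global torch, numpy and transformers RNGs and returns per-task generators, so subclasses can sample reproducibly without relying on global state.

rng_state, np_rng = set_seed(42)        # seeds torch, numpy and transformers globally
idxs = np_rng.permutation(1000)[:8]     # reproducible subsample via the per-task numpy Generator
coin = rng_state.random()               # reproducible draw from the per-task random.Random instance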
11 changes: 2 additions & 9 deletions mteb/abstasks/AbsTaskBitextMining.py
@@ -61,9 +61,6 @@ class AbsTaskBitextMining(AbsTask):
parallel_subsets = False
abstask_prompt = "Retrieve parallel sentences."

def __init__(self, **kwargs):
super().__init__(**kwargs)

def evaluate(
self,
model: Encoder,
@@ -94,7 +91,7 @@ def evaluate(
else:
for hf_subet in hf_subsets:
logger.info(
f"\nTask: {self.metadata.name}, split: {split}, subset: {hf_subet}. Running..."
f"Task: {self.metadata.name}, split: {split}, subset: {hf_subet}. Running..."
)

if hf_subet not in self.dataset and hf_subet == "default":
Expand All @@ -103,8 +100,7 @@ def evaluate(
data_split = self.dataset[hf_subet][split]
scores[hf_subet] = self._evaluate_subset(
model,
data_split, # type: ignore
subsets=["sentence1", "sentence2"],
data_split,
encode_kwargs=encode_kwargs,
**kwargs,
)
@@ -142,9 +138,6 @@ def _evaluate_subset(
self._add_main_score(metrics)
return metrics

def _add_main_score(self, scores) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def _calculate_metrics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
) -> BitextDescriptiveStatistics:
148 changes: 63 additions & 85 deletions mteb/abstasks/AbsTaskClassification.py
@@ -5,12 +5,11 @@
from typing import Any

import numpy as np
from datasets import Dataset, DatasetDict

from mteb.encoder_interface import Encoder

from ..evaluation.evaluators import (
kNNClassificationEvaluator,
kNNClassificationEvaluatorPytorch,
logRegClassificationEvaluator,
)
from ..load_results.task_results import HFSubset, ScoresDict
@@ -24,17 +23,20 @@ class ClassificationDescriptiveStatistics(DescriptiveStatistics):
"""Descriptive statistics for Classification
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
number_texts_intersect_with_train: Number of texts in the train split
min_text_length: Minimum length of text
average_text_length: Average length of text
max_text_length: Maximum length of text
unique_text: Number of unique texts
unique_labels: Number of unique labels
labels: dict of label frequencies
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
number_texts_intersect_with_train: Number of texts in the train split
min_text_length: Minimum length of text
average_text_length: Average length of text
max_text_length: Maximum length of text
unique_texts: Number of unique texts
min_labels_per_text: Minimum number of labels per text
average_label_per_text: Average number of labels per text
max_labels_per_text: Maximum number of labels per text
unique_labels: Number of unique labels
labels: dict of label frequencies
"""

num_samples: int
@@ -44,14 +46,17 @@
min_text_length: int
average_text_length: float
max_text_length: int
unique_text: int
unique_texts: int

min_labels_per_text: int
average_label_per_text: float
max_labels_per_text: int
unique_labels: int
labels: dict[str, dict[str, int]]


class AbsTaskClassification(AbsTask):
"""Abstract class for kNN classification tasks
"""Abstract class for classification tasks
The similarity is computed between pairs and the results are ranked.
self.load_data() must generate a huggingface dataset with a split matching self.metadata.eval_splits, and assign it to self.dataset. It
@@ -64,40 +69,21 @@ class AbsTaskClassification(AbsTask):
"""

evaluator = logRegClassificationEvaluator
abstask_prompt = "Classify user passages."
samples_per_label: int = 8
n_experiments: int = 10

def __init__(
self,
method: str = "logReg",
n_experiments: int | None = None,
k: int = 3,
**kwargs,
):
super().__init__(**kwargs)
self.method = method

# Bootstrap parameters
self.n_experiments: int = ( # type: ignore
n_experiments if n_experiments is not None else self.n_experiments
)

# kNN parameters
self.k = k

def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None:
scores["main_score"] = scores[self.metadata.main_score]
k: int = 3
train_split = "train"

def evaluate(
self,
model,
eval_split: str = "test",
train_split: str = "train",
model: Encoder,
split: str = "test",
subsets_to_run: list[HFSubset] | None = None,
*,
encode_kwargs: dict[str, Any] = {},
**kwargs,
**kwargs: Any,
) -> dict[HFSubset, ScoresDict]:
if not self.data_loaded:
self.load_data()
@@ -109,7 +95,7 @@

for hf_subset in hf_subsets:
logger.info(
f"\nTask: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
f"Task: {self.metadata.name}, split: {split}, subset: {hf_subset}. Running..."
)

if hf_subset not in self.dataset and hf_subset == "default":
@@ -119,8 +105,7 @@
scores[hf_subset] = self._evaluate_subset(
model,
ds,
eval_split,
train_split,
eval_split_name=split,
encode_kwargs=encode_kwargs,
**kwargs,
)
@@ -131,14 +116,13 @@
def _evaluate_subset(
self,
model: Encoder,
dataset,
eval_split: str = "test",
train_split: str = "train",
dataset: DatasetDict | Dataset,
eval_split_name: str,
encode_kwargs: dict[str, Any] = {},
**kwargs,
) -> ScoresDict:
train_split = dataset[train_split]
eval_split = dataset[eval_split]
train_split = dataset[self.train_split]
eval_split = dataset[eval_split_name]
params = {"k": self.k}
params.update(kwargs)

@@ -159,40 +143,17 @@
idxs,
)

if self.method == "kNN":
evaluator = kNNClassificationEvaluator(
X_sampled,
y_sampled,
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
encode_kwargs=encode_kwargs,
**params,
)
elif self.method == "kNN-pytorch":
evaluator = kNNClassificationEvaluatorPytorch(
X_sampled,
y_sampled,
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
encode_kwargs=encode_kwargs,
**params,
)
elif self.method == "logReg":
evaluator = logRegClassificationEvaluator(
X_sampled,
y_sampled,
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
encode_kwargs=encode_kwargs,
**params,
)
else:
raise ValueError(f"Method {self.method} not supported")

scores_exp, test_cache = evaluator(model, test_cache=test_cache)
evaluator = self.evaluator(
X_sampled,
y_sampled,
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
**params,
)
scores_exp, test_cache = evaluator(
model, encode_kwargs=encode_kwargs, test_cache=test_cache
)
scores.append(scores_exp)

avg_scores: dict[str, Any] = {
@@ -242,7 +203,18 @@ def _calculate_metrics_from_split(

text_len = [len(t) for t in text]
total_text_len = sum(text_len)
label_count = Counter(label)
if isinstance(label[0], int):
label_len = [1] * len(label)
total_label_len = len(label)
total_labels = label
else:
# multilabel classification
label_len = [len(l) for l in label]
total_label_len = sum(label_len)
total_labels = []
for l in label:
total_labels.extend(l if len(l) > 0 else [None])
label_count = Counter(total_labels)
num_texts_in_train = (
len(set(text) & set(train_text)) if split != "train" else None
)
@@ -253,10 +225,16 @@
min_text_length=min(text_len),
average_text_length=total_text_len / len(text),
max_text_length=max(text_len),
unique_text=len(set(text)),
unique_texts=len(set(text)),
min_labels_per_text=min(label_len),
average_label_per_text=total_label_len / len(label),
max_labels_per_text=max(label_len),
unique_labels=len(label_count),
labels={
str(label): {"count": count} for label, count in label_count.items()
str(label): {
"count": value,
}
for label, value in label_count.items()
},
)

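With the constructor arguments (method, n_experiments, k) removed, classification behaviour is now configured through class attributes, and the evaluator itself is swapped by overriding the evaluator attribute. A minimal sketch of what a subclass override could look like (the task name, attribute values and custom evaluator are hypothetical, not code from this PR; any evaluator with the logRegClassificationEvaluator call signature should fit):

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification

class DemoClassification(AbsTaskClassification):  # hypothetical task; real tasks also define metadata and load_data
    n_experiments = 5             # formerly an __init__ argument
    samples_per_label = 16        # few-shot samples drawn per label and experiment
    k = 3                         # only consumed by kNN-style evaluators
    train_split = "validation"    # split used for few-shot sampling instead of "train"
    # evaluator = MyEvaluator     # optional: any class matching logRegClassificationEvaluator's interface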
6 changes: 0 additions & 6 deletions mteb/abstasks/AbsTaskClustering.py
@@ -64,12 +64,6 @@ class AbsTaskClustering(AbsTask):

abstask_prompt = "Identify categories in user passages."

def __init__(self, **kwargs):
super().__init__(**kwargs)

def _add_main_score(self, scores) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def _evaluate_subset(
self,
model: Encoder,
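With the duplicated __init__ and _add_main_score overrides removed, every task type now inherits _add_main_score from AbsTask. Conceptually (toy numbers, and a task whose metadata.main_score is assumed to be "v_measure"; not output from this PR):

scores = {"v_measure": 0.71, "nmi": 0.69}
# task._add_main_score(scores) copies the metric named by metadata.main_score, so afterwards:
# scores == {"v_measure": 0.71, "nmi": 0.69, "main_score": 0.71}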