fix meta_eval after refactor and add new meta_mmlu_instruct task for 3.2 #862

Merged 2 commits on Jan 28, 2025
@@ -22,15 +22,16 @@ Given those differences, the numbers from this recipe can not be compared to the

## Environment setups

Please install lm-evaluation-harness and our llama-recipe repo by following:
Please install lm-evaluation-harness and our llama-cookbook repo by following:

```
git clone [email protected]:meta-llama/llama-recipes.git
cd llama-recipes
git clone [email protected]:meta-llama/llama-cookbook.git
cd llama-cookbook
pip install -U pip setuptools
pip install -e .
pip install -U antlr4_python3_runtime==4.11
pip install lm-eval[math,ifeval,sentencepiece,vllm]==0.4.3
cd tools/benchmarks/llm_eval_harness/meta_eval
cd end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval
```

To access our [3.1 evals Hugging Face collection](https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f), you must:
@@ -1,6 +1,6 @@
model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
model_name: "meta-llama/Llama-3.2-3B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."

evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection.
evals_dataset: "meta-llama/Llama-3.2-3B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection.
# Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals","meta-llama/Llama-3.2-1B-evals","meta-llama/Llama-3.2-3B-evals", "meta-llama/Llama-3.2-1B-Instruct-evals", "meta-llama/Llama-3.2-3B-Instruct-evals"]

tasks: "meta_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
@@ -0,0 +1,29 @@
task: meta_mmlu_instruct
dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
dataset_name: Llama-3.1-8B-Instruct-evals__mmlu__details
test_split: latest
output_type: generate_until
process_docs: !function utils.process_docs_instruct
doc_to_text: !function utils.doc_to_text_instruct
doc_to_target: gold
filter_list:
- name: "strict-match"
filter:
- function: "regex"
group_select: -1
regex_pattern: ' ([A-D])'
- function: "take_first"
generation_kwargs:
until: []
do_sample: false
temperature: 0
max_gen_toks: 1024
num_fewshot: 0
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
metadata:
version: 1.0
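
For intuition, the `strict-match` filter in this new task behaves roughly like the sketch below (my own illustration, not harness internals): `regex_pattern: ' ([A-D])'` collects every space-prefixed answer letter in the generation, `group_select: -1` keeps the last one, and `take_first` returns that single surviving candidate.

```
import re

def strict_match(generation: str) -> str | None:
    # regex_pattern: ' ([A-D])' -> every space-prefixed answer letter
    matches = re.findall(r" ([A-D])", generation)
    # group_select: -1 -> keep only the last occurrence
    return matches[-1] if matches else None

# Earlier mentions of other options are ignored; the final letter wins.
print(strict_match("It could be A, but the best answer is C"))  # C
```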
@@ -3,12 +3,12 @@ dataset_path: meta-llama/Llama-3.1-8B-evals
dataset_name: Llama-3.1-8B-evals__mmlu__details
test_split: latest
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: !function utils.doc_to_text
process_docs: !function utils.process_docs_pretrain
doc_to_text: !function utils.doc_to_text_pretrain
doc_to_target: !function utils.doc_to_target
doc_to_choice: ["A", "B", "C", "D"]
# 5-shot prompts are already included in the dataset
# So no need to generate
num_fewshot: 0
metadata:
version: 1.0
version: 1.0
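
Since this pretrain variant keeps `output_type: multiple_choice`, nothing is generated at all; scoring compares the model's likelihood of each of the four letters. A simplified sketch of that mechanism (assuming a hypothetical `loglikelihood` callable, not the harness's actual code):

```
def score_multiple_choice(loglikelihood, prompt, choices=("A", "B", "C", "D")):
    # Append each candidate letter to the prompt and pick the one the
    # model finds most likely; doc_to_target supplies the gold letter.
    return max(choices, key=lambda c: loglikelihood(prompt + " " + c))
```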
@@ -1,12 +1,22 @@
import string

import datasets

def doc_to_text(doc: dict) -> str:

def doc_to_text_pretrain(doc: dict) -> str:
# Strip out the last two characters, which is a space and the answer
# E.g., "Answer: B" -> "Answer:"
return doc["input_final_prompts"][0][:-2]


def doc_to_text_instruct(doc: dict) -> str:
# Unlike the pretrain variant, keep the full prompt: instruct rows have no
# trailing answer to strip, and the strict-match filter extracts the letter
# from the generation instead.
return doc["input_final_prompts"][0]


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def process_docs_pretrain(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc: dict) -> dict:
# input_correct_responses is in format of: "Answer: B"
answer = doc["input_correct_responses"][0]
@@ -21,11 +31,43 @@ def _process_doc(doc: dict) -> dict:
return out_doc

dataset = dataset.select_columns(
["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash",
"input_choice_list"])
[
"input_question",
"input_correct_responses",
"input_final_prompts",
"is_correct",
"input_question_hash",
"input_choice_list",
]
)
dataset = dataset.rename_column("is_correct", "previously_is_correct")
dataset = dataset.map(_process_doc)
return dataset.map(_process_doc)


def process_docs_instruct(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc: dict) -> dict:
out_doc = {
"problem": doc["input_question"],
"gold": doc["input_correct_responses"][0],
}
return out_doc

dataset = dataset.select_columns(
[
"input_question",
"input_correct_responses",
"input_final_prompts",
"is_correct",
"input_question_hash",
"input_choice_list",
"output_prediction_text",
]
)
dataset = dataset.rename_column("is_correct", "previously_is_correct")
dataset = dataset.map(_process_doc)
return dataset.map(_process_doc)


def doc_to_target(doc: dict) -> str:
return doc["gold"]
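
The two paths above differ only in whether the trailing answer is stripped. A toy illustration with fabricated strings (not real dataset rows):

```
# Pretrain rows bake the gold answer into the 5-shot prompt, e.g. "...Answer: B";
# doc_to_text_pretrain drops the trailing " B" so the multiple_choice harness
# can score the letters itself.
pretrain_prompt = "Question: ...\nAnswer: B"
assert pretrain_prompt[:-2] == "Question: ...\nAnswer:"

# Instruct rows are full chat prompts with no answer appended, so
# doc_to_text_instruct returns them unchanged and the strict-match filter
# later pulls the letter out of the generated text.
```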
@@ -1,21 +1,31 @@
import string


import datasets



def doc_to_text(doc: dict) -> str:
return doc["input_final_prompts"][0]


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc: dict) -> dict:
out_doc = {
"problem": doc["input_question"],
"gold": doc["input_correct_responses"][0],
}
return out_doc
dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"])
dataset = dataset.rename_column("is_correct","previously_is_correct")

dataset = dataset.select_columns(
[
"input_question",
"input_correct_responses",
"input_final_prompts",
"is_correct",
"input_question_hash",
"input_choice_list",
"output_prediction_text",
]
)
dataset = dataset.rename_column("is_correct", "previously_is_correct")
dataset = dataset.map(_process_doc)
return dataset.map(_process_doc)
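
The shared `select_columns` → `rename_column` → `map` pipeline can be exercised on its own; a minimal, self-contained sketch with one fabricated row (all field values are placeholders):

```
from datasets import Dataset

rows = [{
    "input_question": "What is 2 + 2?",
    "input_correct_responses": ["Answer: B"],
    "input_final_prompts": ["..."],
    "is_correct": True,
    "input_question_hash": "deadbeef",
    "input_choice_list": {"A": "3", "B": "4", "C": "5", "D": "6"},
    "output_prediction_text": ["..."],
}]

ds = Dataset.from_list(rows)
ds = ds.rename_column("is_correct", "previously_is_correct")
ds = ds.map(lambda doc: {"problem": doc["input_question"],
                         "gold": doc["input_correct_responses"][0]})
print(ds[0]["problem"], ds[0]["gold"])  # What is 2 + 2? Answer: B
```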
@@ -3,43 +3,46 @@

import argparse
import errno
import shutil
import glob
import os
import shutil
from pathlib import Path

import nltk
import yaml
from datasets import Dataset, load_dataset

LLAMA_3_1_INSTRUCT_EVALS=[
LLAMA_3_1_INSTRUCT_EVALS = [
"meta-llama/Llama-3.1-8B-Instruct-evals",
"meta-llama/Llama-3.1-70B-Instruct-evals",
"meta-llama/Llama-3.1-405B-Instruct-evals",
]
LLAMA_3_1_PRETRAIN_EVALS=[
LLAMA_3_1_PRETRAIN_EVALS = [
"meta-llama/Llama-3.1-8B-evals",
"meta-llama/Llama-3.1-70B-evals",
"meta-llama/Llama-3.1-405B-evals",
]
LLAMA_3_2_INSTRUCT_EVALS=[
LLAMA_3_2_INSTRUCT_EVALS = [
"meta-llama/Llama-3.2-1B-Instruct-evals",
"meta-llama/Llama-3.2-3B-Instruct-evals",
]
LLAMA_3_2_PRETRAIN_EVALS=[
LLAMA_3_2_PRETRAIN_EVALS = [
"meta-llama/Llama-3.2-1B-evals",
"meta-llama/Llama-3.2-3B-evals",
]


# get the ifeval from the evals dataset and join it with the original ifeval datasets
def get_ifeval_data(model_name, output_dir):
print(f"preparing the ifeval data using {model_name}'s evals dataset")
if model_name not in [
"Llama-3.1-8B-Instruct",
"Llama-3.1-70B-Instruct",
"Llama-3.1-405B-Instruct",
"Llama-3.3-70B-Instruct",
]:
raise ValueError(
"Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for IFEval"
"Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct, Llama-3.3-70B-Instruct models are supported for IFEval"
)
original_dataset_name = "wis-k/instruction-following-eval"
meta_dataset_name = f"meta-llama/{model_name}-evals"
@@ -80,11 +83,12 @@ def get_math_hard_data(model_name, output_dir):
"Llama-3.1-8B-Instruct",
"Llama-3.1-70B-Instruct",
"Llama-3.1-405B-Instruct",
"Llama-3.3-70B-Instruct",
]:
raise ValueError(
"Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for MATH_hard"
"Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct, Llama-3.3-70B-Instruct models are supported for MATH_hard"
)
original_dataset_name = "lighteval/MATH-Hard"
original_dataset_name = "DigitalLearningGmbH/MATH-lighteval"
meta_dataset_name = f"meta-llama/{model_name}-evals"
meta_data = load_dataset(
meta_dataset_name,
@@ -95,6 +99,7 @@ def get_math_hard_data(model_name, output_dir):
joined = join_meta_and_original_math_data(meta_data, math_data)
joined.to_parquet(output_dir + "/joined_math_hard.parquet")


def get_math_data(model_name, output_dir):
print(f"preparing the math data using {model_name}'s evals dataset")
if model_name not in [
@@ -104,7 +109,7 @@ def get_math_data(model_name, output_dir):
raise ValueError(
"Only Llama-3.2-1B-Instruct and Llama-3.2-3B-Instruct models are supported for MATH"
)
original_dataset_name = "lighteval/MATH"
original_dataset_name = "DigitalLearningGmbH/MATH-lighteval"
meta_dataset_name = f"meta-llama/{model_name}-evals"
meta_data = load_dataset(
meta_dataset_name,
@@ -115,6 +120,7 @@ def get_math_data(model_name, output_dir):
joined = join_meta_and_original_math_data(meta_data, math_data)
joined.to_parquet(output_dir + "/joined_math.parquet")


def join_meta_and_original_math_data(meta_data, math_data):
meta_df = meta_data.to_pandas()
math_df = math_data.to_pandas()
@@ -138,6 +144,7 @@ def join_meta_and_original_math_data(meta_data, math_data):
)
return joined


# get the question from the ifeval dataset
def get_question(example):
try:
@@ -181,25 +188,30 @@ def change_yaml(args, base_name):
if args.evals_dataset in LLAMA_3_1_PRETRAIN_EVALS:
meta_pretrain["task"] = ["meta_bbh", "meta_mmlu_pro_pretrain"]
elif args.evals_dataset in LLAMA_3_2_PRETRAIN_EVALS:
meta_pretrain["task"] = ["meta_mmlu"]
meta_pretrain["task"] = ["meta_mmlu_pretrain"]
with open(args.work_dir + "/meta_pretrain.yaml", "w") as yaml_file:
yaml.dump(meta_pretrain, yaml_file)

# Update tasks in meta_instruct.yaml
with open(args.template_dir + "/meta_instruct.yaml", "r") as yaml_file:
meta_instruct = yaml.safe_load(yaml_file)
if args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
meta_instruct["task"] = ["meta_ifeval", "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct"]
meta_instruct["task"] = [
"meta_ifeval",
"meta_math_hard",
"meta_gpqa_cot",
"meta_mmlu_pro_instruct",
]
elif args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
meta_instruct["task"] = ["meta_mmlu", "meta_math", "meta_gpqa"]
meta_instruct["task"] = ["meta_mmlu_instruct", "meta_math", "meta_gpqa"]
with open(args.work_dir + "/meta_instruct.yaml", "w") as yaml_file:
yaml.dump(meta_instruct, yaml_file)


# copy the files and change the yaml file to use the correct model name
def copy_and_prepare(args):
# nltk punkt_tab package is needed
nltk.download('punkt_tab')
nltk.download("punkt_tab")
copy_dir(args.template_dir, args.work_dir)
# Use the template yaml to get the correct model name in work_dir yaml
base_name = (
@@ -227,7 +239,9 @@ def prepare_datasets(args):
if "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
get_ifeval_data(model_name, args.work_dir)
get_math_hard_data(model_name, args.work_dir)
elif "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
elif (
"meta_instruct" in task_list and args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS
):
get_math_data(model_name, args.work_dir)
else:
if "meta_ifeval" in task_list:
@@ -264,10 +278,10 @@ def load_config(config_path: str = "./config.yaml"):
if not os.path.exists(args.template_dir):
raise ValueError("The template_dir does not exist, please check the path")
if args.evals_dataset not in (
LLAMA_3_1_INSTRUCT_EVALS +
LLAMA_3_1_PRETRAIN_EVALS +
LLAMA_3_2_INSTRUCT_EVALS +
LLAMA_3_2_PRETRAIN_EVALS
LLAMA_3_1_INSTRUCT_EVALS
+ LLAMA_3_1_PRETRAIN_EVALS
+ LLAMA_3_2_INSTRUCT_EVALS
+ LLAMA_3_2_PRETRAIN_EVALS
):
raise ValueError(
"The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 or 3.2 Evals collection."
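
End to end, the script validates a small YAML config against the dataset lists above before copying templates and preparing data. A hedged sketch of that handshake, reusing field names from the `eval_config.yaml` hunk earlier in this diff:

```
import yaml

cfg = yaml.safe_load("""
model_name: meta-llama/Llama-3.2-3B-Instruct
evals_dataset: meta-llama/Llama-3.2-3B-Instruct-evals
tasks: meta_instruct
""")

LLAMA_3_2_INSTRUCT_EVALS = [
    "meta-llama/Llama-3.2-1B-Instruct-evals",
    "meta-llama/Llama-3.2-3B-Instruct-evals",
]

# Mirrors the validation above: an unknown evals dataset fails fast, and a
# 3.2 instruct dataset maps "meta_instruct" to the new 3.2 task list.
assert cfg["evals_dataset"] in LLAMA_3_2_INSTRUCT_EVALS
print(["meta_mmlu_instruct", "meta_math", "meta_gpqa"])
```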