diff --git a/.github/actions/nm-lm-eval-smoke/action.yml b/.github/actions/nm-lm-eval-smoke/action.yml deleted file mode 100644 index 527909bf68786..0000000000000 --- a/.github/actions/nm-lm-eval-smoke/action.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: run lm-eval accuracy smoke test -description: 'run lm-eval accuracy smoke test' -inputs: - python: - description: 'python version, e.g. 3.10.12' - required: true - venv: - description: 'name for python virtual environment' - required: true -runs: - using: composite - steps: - - id: lm-eval - run: | - # move source directories - mv vllm vllm-ignore || echo "no 'vllm' folder to move" - mv csrc csrc-ignore || echo "no 'csrc' folder to move" - - if [ -n "${{ inputs.venv }}" ]; then - COMMIT=${{ github.sha }} - VENV="${{ inputs.venv }}-${COMMIT:0:7}" - source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate - fi - - pip3 install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 - pip3 install optimum auto-gptq - - SUCCESS=0 - python .github/scripts/lm_eval_compare_hf_vs_vllm.py --hf_pretrained nm-testing/zephyr-beta-7b-gptq-g128 --vllm_pretrained nm-testing/zephyr-beta-7b-marlin-g128 || SUCCESS=$? - echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT" - exit ${SUCCESS} - shell: bash diff --git a/.github/actions/nm-lm-eval-accuracy/action.yml b/.github/actions/nm-lm-eval/action.yml similarity index 64% rename from .github/actions/nm-lm-eval-accuracy/action.yml rename to .github/actions/nm-lm-eval/action.yml index ee8c78c8855a3..f7f0b07fcf080 100644 --- a/.github/actions/nm-lm-eval-accuracy/action.yml +++ b/.github/actions/nm-lm-eval/action.yml @@ -1,5 +1,5 @@ -name: run lm-eval full accuracy test -description: 'run lm-eval full accuracy test' +name: run lm-eval accuracy test +description: 'run lm-eval accuracy test' inputs: python: description: 'python version, e.g. 3.10.12' @@ -7,15 +7,14 @@ inputs: venv: description: 'name for python virtual environment' required: true + lm_eval_configuration: + description: 'file containing test configuration' + required: true runs: using: composite steps: - id: lm-eval run: | - # move source directories - mv vllm vllm-ignore || echo "no 'vllm' folder to move" - mv csrc csrc-ignore || echo "no 'csrc' folder to move" - if [ -n "${{ inputs.venv }}" ]; then COMMIT=${{ github.sha }} VENV="${{ inputs.venv }}-${COMMIT:0:7}" @@ -26,7 +25,7 @@ runs: pip3 install pytest openai==1.3.9 SUCCESS=0 - pytest -v tests/accuracy/test_lm_eval_correctness.py || SUCCESS=$? - echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT" + ./.github/scripts/nm-run-lm-eval-vllm.sh -c ${{ inputs.lm_eval_configuration }} || SUCCESS=$? 
+ echo "lm_eval=${SUCCESS}" >> "$GITHUB_OUTPUT" exit ${SUCCESS} shell: bash diff --git a/.github/scripts/lm_eval_compare_hf_vs_vllm.py b/.github/scripts/lm_eval_compare_hf_vs_vllm.py deleted file mode 100644 index d8e256631e9a7..0000000000000 --- a/.github/scripts/lm_eval_compare_hf_vs_vllm.py +++ /dev/null @@ -1,125 +0,0 @@ -import argparse -import os -from typing import Dict, List, Tuple - -import lm_eval -import lm_eval.models.utils -import numpy as np -import scipy.stats - -os.environ["TOKENIZERS_PARALLELISM"] = "false" - - -def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]: - acc1, acc2 = res1["acc,none"], res2["acc,none"] - st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"] - Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2)) - # Determining the p-value - p_value = 2 * scipy.stats.norm.sf(abs(Z)) # two-tailed test - return Z, p_value - - -def print_results(data_to_print: List = None, - results_dict: Dict = None, - alpha: float = None): - model1_data, model2_data = data_to_print - for task in model1_data: - print(f"Task: {task}") - print(f"HF Accuracy: {model1_data[task]['acc,none']}") - print(f"vLLM Accuracy: {model2_data[task]['acc,none']}") - print(f"HF StdErr: {model1_data[task]['acc_stderr,none']}") - print(f"vLLM StdErr: {model2_data[task]['acc_stderr,none']}") - z = results_dict[task]["z"] - p_value = results_dict[task]["p_value"] - result = "PASS" if p_value > alpha else "FAIL" - print(f"Z-Score: {z}, P-Value: {p_value}, p > {alpha}: {result}\n") - - -def check_passing_score(results_dict: Dict = None, - alpha: float = None) -> bool: - for task in results_dict: - p_value = results_dict[task]["p_value"] - if p_value <= alpha: - return False - return True - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--hf_pretrained", - default="EleutherAI/pythia-70m", - help="name of model to compare as baseline") - parser.add_argument("--vllm_pretrained", - default="EleutherAI/pythia-70m", - help="name of model to compare as difference") - parser.add_argument("--hf_args", - help="huggingface model args =", - default="") - parser.add_argument("--vllm_args", - help="vllm model args =", - default="") - parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag") - parser.add_argument( - "--limit", - type=float, - default=100, - ) - parser.add_argument( - "--alpha", - type=float, - default=0.05, - help="Significance level for two-tailed z-test", - ) - parser.add_argument( - "--device", - type=str, - default="cuda", - ) - parser.add_argument( - "--batch", - type=str, - default=4, - ) - parser.add_argument( - "--verbosity", - type=str, - default="INFO", - help="Logging verbosity", - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - tasks = args.tasks.split(",") - print("Tasks:", tasks) - hf_args, vllm_args = "," + args.hf_args, "," + args.vllm_args - results_hf = lm_eval.simple_evaluate( - model="hf", - model_args=f"pretrained={args.hf_pretrained}" + hf_args, - tasks=tasks, - limit=args.limit, - device=args.device, - batch_size=args.batch, - ) - lm_eval.models.utils.clear_torch_cache() - print("Memory stats cleared") - results_vllm = lm_eval.simple_evaluate( - model="vllm", - model_args=f"pretrained={args.vllm_pretrained}" + vllm_args, - tasks=tasks, - limit=args.limit, - device=args.device, - batch_size=args.batch, - ) - all_res = {} - for task1, task2 in zip(results_hf["results"].items(), - results_vllm["results"].items()): - assert task1[0] == task2[0] - z, p_value = 
calculate_z_value(task1[1], task2[1])
-        all_res[task1[0]] = {"z": z, "p_value": p_value}
-    print_results([results_hf["results"], results_vllm["results"]], all_res,
-                  args.alpha)
-    if not check_passing_score(all_res, args.alpha):
-        print("Accuracy test failed!")
-        exit(1)
diff --git a/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh b/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
new file mode 100755
index 0000000000000..76f7100af5949
--- /dev/null
+++ b/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on GSM8k for transformers.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+
+usage() {
+    echo
+    echo "Runs lm eval harness on GSM8k using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-lm-eval workflow."
+    echo
+    echo "usage: ${0} "
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -b    - batch size to run the evaluation at"
+    echo "  -d    - device to use (e.g. cuda, cuda:0, auto, cpu)"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo
+}
+
+while getopts "m:b:d:l:f:" OPT; do
+  case ${OPT} in
+    m )
+      MODEL="$OPTARG"
+      ;;
+    b )
+      BATCH_SIZE="$OPTARG"
+      ;;
+    d )
+      DEVICE="$OPTARG"
+      ;;
+    l )
+      LIMIT="$OPTARG"
+      ;;
+    f )
+      FEWSHOT="$OPTARG"
+      ;;
+    \? )
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+lm_eval --model hf \
+  --model_args pretrained=$MODEL \
+  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
+  --batch_size $BATCH_SIZE --device $DEVICE
diff --git a/.github/scripts/nm-run-lm-eval-vllm.sh b/.github/scripts/nm-run-lm-eval-vllm.sh
new file mode 100755
index 0000000000000..c68d6d1d7697f
--- /dev/null
+++ b/.github/scripts/nm-run-lm-eval-vllm.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# We can use this script to run lm-eval on GSM8k against a vllm server and compare to the precomputed baseline.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+
+usage() {
+    echo
+    echo "Runs lm eval harness on GSM8k using vllm server and compares to "
+    echo "precomputed baseline (measured by HF transformers)."
+    echo
+    echo "usage: ${0} "
+    echo
+    echo "  -c    - path to the test data config (e.g. neuralmagic/lm-eval/YOUR_CONFIG.yaml)"
+    echo
+}
+
+while getopts "c:" OPT; do
+  case ${OPT} in
+    c )
+      CONFIG="$OPTARG"
+      ;;
+    \?
) + usage + exit 1 + ;; + esac +done + +LM_EVAL_TEST_DATA_FILE=$CONFIG pytest -v tests/accuracy/test_lm_eval_correctness.py diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml index 7847043da5ed1..4d3f6fd5f34cb 100644 --- a/.github/workflows/nm-build-test.yml +++ b/.github/workflows/nm-build-test.yml @@ -66,6 +66,19 @@ on: description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" type: string default: "false" + # lm-eval related parameters + lm_eval_label: + description: "requested runner label (specifies instance)" + type: string + default: "" + lm_eval_timeout: + description: "time limit for lm_eval in minutes" + type: string + default: "60" + lm_eval_configuration: + description: "configuration for lm-eval test (see neuralmagic/lm-eval)" + type: string + default: "" jobs: @@ -134,16 +147,14 @@ jobs: push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" secrets: inherit - TEST-ACCURACY-FULL: + LM-EVAL-SOLO: needs: [BUILD] - if: ${{ inputs.wf_category == 'WEEKLY' || inputs.wf_category == 'RELEASE' }} - uses: ./.github/workflows/nm-test-accuracy-full.yml + uses: ./.github/workflows/nm-lm-eval.yml with: - label: ${{ inputs.test_label_multi }} - timeout: ${{ inputs.benchmark_timeout }} + label: ${{ inputs.lm_eval_label }} + timeout: ${{ inputs.lm_eval_timeout }} gitref: ${{ inputs.gitref }} - Gi_per_thread: ${{ inputs.Gi_per_thread }} - nvcc_threads: ${{ inputs.nvcc_threads }} python: ${{ inputs.python }} whl: ${{ needs.BUILD.outputs.whl }} + lm_eval_configuration: ${{ inputs.lm_eval_configuration }} secrets: inherit diff --git a/.github/workflows/nm-test-accuracy-full.yml b/.github/workflows/nm-lm-eval.yml similarity index 83% rename from .github/workflows/nm-test-accuracy-full.yml rename to .github/workflows/nm-lm-eval.yml index ae3ebee62203e..90b7ec61a0a3a 100644 --- a/.github/workflows/nm-test-accuracy-full.yml +++ b/.github/workflows/nm-lm-eval.yml @@ -15,14 +15,6 @@ on: description: "git commit hash or branch name" type: string required: true - Gi_per_thread: - description: 'requested GiB to reserve per thread' - type: string - required: true - nvcc_threads: - description: "number of threads nvcc build threads" - type: string - required: true python: description: "python version, e.g. 3.10.12" type: string @@ -31,6 +23,10 @@ on: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true + lm_eval_configuration: + description: 'file containing tests configuration (see: nm-vllm/neuralmagic/lm-eval)' + type: string + required: true # makes workflow manually callable workflow_dispatch: @@ -47,14 +43,6 @@ on: description: "git commit hash or branch name" type: string required: true - Gi_per_thread: - description: 'requested GiB to reserve per thread' - type: string - required: true - nvcc_threads: - description: "number of threads nvcc build threads" - type: string - required: true python: description: "python version, e.g. 
3.10.12" type: string @@ -63,9 +51,13 @@ on: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true + lm_eval_configuration: + description: 'file containing tests configuration (see: nm-vllm/neuralmagic/lm-eval)' + type: string + required: true jobs: - TEST-ACCURACY-FULL: + LM-EVAL: runs-on: ${{ inputs.label }} timeout-minutes: ${{ fromJSON(inputs.timeout) }} @@ -77,6 +69,12 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ inputs.python }} + + - name: install automation components + run: | + sudo apt-get update --fix-missing + sudo apt-get install -y git-all + sudo apt-get install -y curl - name: checkout repository code uses: actions/checkout@v4 @@ -114,7 +112,8 @@ jobs: venv: - name: run lm-eval-accuracy - uses: ./.github/actions/nm-lm-eval-accuracy/ + uses: ./.github/actions/nm-lm-eval/ with: python: ${{ inputs.python }} venv: + lm_eval_configuration: ${{ inputs.lm_eval_configuration }} diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml index e90f99b34ee9a..17b9624cfe4e0 100644 --- a/.github/workflows/nm-nightly.yml +++ b/.github/workflows/nm-nightly.yml @@ -33,6 +33,10 @@ jobs: benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt benchmark_timeout: 720 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" + + lm_eval_label: gcp-k8s-l4-solo + lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml + lm_eval_timeout: 60 secrets: inherit PYTHON-3-9: @@ -51,6 +55,10 @@ jobs: benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt benchmark_timeout: 720 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" + + lm_eval_label: gcp-k8s-l4-solo + lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml + lm_eval_timeout: 60 secrets: inherit PYTHON-3-10: @@ -69,6 +77,10 @@ jobs: benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt benchmark_timeout: 720 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" + + lm_eval_label: gcp-k8s-l4-solo + lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml + lm_eval_timeout: 60 secrets: inherit PYTHON-3-11: @@ -88,4 +100,8 @@ jobs: benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt benchmark_timeout: 720 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" + + lm_eval_label: gcp-k8s-l4-solo + lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml + lm_eval_timeout: 60 secrets: inherit diff --git a/.github/workflows/nm-release.yml b/.github/workflows/nm-release.yml index f5c9056cbc5d7..ac1e4488ad1f9 100644 --- a/.github/workflows/nm-release.yml +++ b/.github/workflows/nm-release.yml @@ -29,6 +29,10 @@ jobs: benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt benchmark_timeout: 720 push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} + + lm_eval_label: gcp-k8s-l4-solo + lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml + lm_eval_timeout: 60 secrets: inherit PYTHON-3-9: @@ -47,6 +51,10 @@ jobs: benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt benchmark_timeout: 720 
push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }}
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit

   PYTHON-3-10:
@@ -65,6 +73,10 @@ jobs:
       benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt
       benchmark_timeout: 720
       push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }}
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit

   PYTHON-3-11:
@@ -83,4 +95,8 @@ jobs:
       benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt
       benchmark_timeout: 720
       push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }}
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit
diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml
index 3c1fe246756a4..37a350c461e38 100644
--- a/.github/workflows/nm-remote-push.yml
+++ b/.github/workflows/nm-remote-push.yml
@@ -26,6 +26,10 @@ jobs:
       benchmark_label: gcp-k8s-l4-solo
       benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
       benchmark_timeout: 480
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit

   BUILD-TEST-3-9:
@@ -42,6 +46,10 @@ jobs:
       benchmark_label: gcp-k8s-l4-solo
       benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
       benchmark_timeout: 480
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit

   BUILD-TEST-3-10:
@@ -58,6 +66,10 @@ jobs:
       benchmark_label: gcp-k8s-l4-solo
       benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
       benchmark_timeout: 480
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit

   BUILD-TEST-3-11:
@@ -74,4 +86,8 @@ jobs:
       benchmark_label: gcp-k8s-l4-solo
       benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
       benchmark_timeout: 480
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit
diff --git a/.github/workflows/nm-weekly.yml b/.github/workflows/nm-weekly.yml
index d92a2619ef359..c764fa9e884b4 100644
--- a/.github/workflows/nm-weekly.yml
+++ b/.github/workflows/nm-weekly.yml
@@ -33,4 +33,8 @@ jobs:
       benchmark_config_list_file: ./.github/data/nm_benchmark_weekly_configs_list.txt
       benchmark_timeout: 720
       push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit
diff --git a/neuralmagic/lm-eval/full-small-models.yaml b/neuralmagic/lm-eval/full-small-models.yaml
new file mode 100644
index 0000000000000..129ea4c5bf99c
--- /dev/null
+++ b/neuralmagic/lm-eval/full-small-models.yaml
@@ -0,0 +1,23 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -d cuda -l 250 -f 5
+- model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
+  tasks:
+  - name: "gsm8k"
+    metrics:
+    - name: "exact_match,strict-match"
+      value: 0.74
+    -
name: "exact_match,flexible-extract" + value: 0.74 + limit: 250 + num_fewshot: 5 + +# ./nm-run-lm-eval-gsm-hf-baseline -m TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ -b 32 -d cuda -l 250 -f 5 +- model_name: "TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ" + tasks: + - name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.684 + - name: "exact_match,flexible-extract" + value: 0.688 + limit: 250 + num_fewshot: 5 diff --git a/neuralmagic/lm-eval/smoke-small-models.yaml b/neuralmagic/lm-eval/smoke-small-models.yaml new file mode 100644 index 0000000000000..c9ecd35bf793d --- /dev/null +++ b/neuralmagic/lm-eval/smoke-small-models.yaml @@ -0,0 +1,23 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -d cuda -l 250 -f 5 +- model_name: "meta-llama/Meta-Llama-3-8B-Instruct" + tasks: + - name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.74 + - name: "exact_match,flexible-extract" + value: 0.74 + limit: 250 + num_fewshot: 5 + +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ -b 32 -d cuda -l 250 -f 5 +- model_name: "TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ" + tasks: + - name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.684 + - name: "exact_match,flexible-extract" + value: 0.688 + limit: 250 + num_fewshot: 5 diff --git a/tests/accuracy/lm-eval-tasks.yaml b/tests/accuracy/lm-eval-tasks.yaml deleted file mode 100644 index 70c3e8881b676..0000000000000 --- a/tests/accuracy/lm-eval-tasks.yaml +++ /dev/null @@ -1,105 +0,0 @@ -# Llama 2 7B: FP16, FP16 sparse, marlin -# NOTE: This model is superseded by Llama 3 -# - model_name: "NousResearch/Llama-2-7b-chat-hf" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: "exact_match,strict-match" -# value: 0.2266868840030326 -# - name: "exact_match,flexible-extract" -# value: 0.22820318423047764 -- model_name: "neuralmagic/Llama-2-7b-pruned50-retrained-ultrachat" - tasks: - - name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.09855951478392722 - - name: "exact_match,flexible-extract" - value: 0.10083396512509477 -# - model_name: "neuralmagic/llama-2-7b-chat-marlin" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: "exact_match,strict-match" -# value: 0.14101592115238817 -# - name: "exact_match,flexible-extract" -# value: 0.1652767247915087 - -# Mistral 7B: FP16, FP16 sparse, marlin -- model_name: "teknium/OpenHermes-2.5-Mistral-7B" - tasks: - - name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.6004548900682335 - - name: "exact_match,flexible-extract" - value: 0.6482183472327521 -- model_name: "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50" - tasks: - - name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.4935557240333586 - - name: "exact_match,flexible-extract" - value: 0.5269143290371494 - extra_args: - --sparsity: "sparse_w16a16" -# - model_name: "neuralmagic/OpenHermes-2.5-Mistral-7B-marlin" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: "exact_match,strict-match" -# value: 0.4935557240333586 -# - name: "exact_match,flexible-extract" -# value: 0.5868081880212282 - -# Llama 3: FP16, FP8 -- model_name: "NousResearch/Meta-Llama-3-8B-Instruct" - tasks: - - name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.7566 - - name: "exact_match,flexible-extract" - value: 0.7551 -# NOTE: Needs to run on a system with CUDA compute capability >= 8.9 -# - model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: 
"exact_match,strict-match" -# value: 0.7445 -# - name: "exact_match,flexible-extract" -# value: 0.7445 - -# Phi 2: marlin -# - model_name: "neuralmagic/phi-2-super-marlin" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: "exact_match,strict-match" -# value: 0.49962092494313876 -# - name: "exact_match,flexible-extract" -# value: 0.5041698256254739 - -# Llama 2 7B: 2:4 marlin -# - model_name: "nm-testing/Llama-2-7b-pruned2.4-Marlin" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: "exact_match,strict-match" -# value: 0.2214 -# - name: "exact_match,flexible-extract" -# value: 0.0425 - -# Mixtral: FP16 -# g5.12xlarge runner (4x 24GB A10 GPUs) has insufficient VRAM -# - model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: "exact_match,strict-match" -# value: 0.6550416982562547 -# - name: "exact_match,flexible-extract" -# value: 0.6603487490523123 -# enable_tensor_parallel: true diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py index 4c1ac9638a10a..93da626a2de03 100644 --- a/tests/accuracy/test_lm_eval_correctness.py +++ b/tests/accuracy/test_lm_eval_correctness.py @@ -1,4 +1,5 @@ import logging +import os from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, TypedDict @@ -47,7 +48,11 @@ class EvalTaskDefinition(EvalTaskDefinitionOpts): tasks: List[Task] -TEST_DATA_FILE = Path(__file__).parent / "lm-eval-tasks.yaml" +TEST_DATA_FILE = os.environ.get("LM_EVAL_TEST_DATA_FILE", None) +if TEST_DATA_FILE is None: + raise ValueError("LM_EVAL_TEST_DATA_FILE env variable is not set.") +TEST_DATA_FILE = Path(TEST_DATA_FILE) + TEST_DATA: List[EvalTaskDefinition] = [ pytest.param(eval_def, id=eval_def["model_name"]) for eval_def in yaml.safe_load(TEST_DATA_FILE.read_text(encoding="utf-8")) @@ -69,7 +74,7 @@ def test_lm_eval_correctness( vllm_args = { "--model": model_name, "--disable-log-requests": None, - "--max-model-len": 2048, + "--max-model-len": 4096, } if eval_data.get("enable_tensor_parallel") is True: @@ -88,13 +93,17 @@ def test_lm_eval_correctness( logger.info("launching server") with ServerContext(vllm_args, logger=logger) as _: - task_names = [t["name"] for t in eval_data["tasks"]] + task_names = [task["name"] for task in eval_data["tasks"]] + limit = eval_data["limit"] + new_fewshot = eval_data["num_fewshot"] logger.info("getting results for task_names=%s", task_names) results = lm_eval.simple_evaluate( model="local-completions", model_args=openai_args, tasks=task_names, - batch_size=64, + batch_size=32, + num_fewshot=new_fewshot, + limit=limit, ) logger.info("clearing torch cache")