[Rel Eng] Dial In LM Eval Tests Phase 1 (#289)
WAIT UNTIL UPSTREAM SYNC LANDS TO MERGE

SUMMARY:
* refactored lm-eval workflows to use a single script for generating a
baseline
* refactored lm-eval workflows to accept a config file so we can parameterize the different-length runs (a sketch of such a config appears after this list)
* added configuration for `remote-push` -> running `llama-3-8b` on 250
GSM prompts
* removed lm-eval-smoke so that we have a single pathway for running lm-eval tests
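
For reference, a minimal sketch of the kind of config file these workflows consume, assuming a simple YAML schema. The field names and values below are illustrative guesses, not taken from this commit; the actual files live under neuralmagic/lm-eval/ and are read by tests/accuracy/test_lm_eval_correctness.py through the LM_EVAL_TEST_DATA_FILE environment variable (see the nm-run-lm-eval-vllm.sh script further down).

```yaml
# Illustrative lm-eval config sketch; field names are assumptions, not from this commit.
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"   # assumed model stub
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.75        # placeholder baseline score (measured with HF transformers)
      - name: "exact_match,flexible-extract"
        value: 0.75        # placeholder
limit: 250                  # number of GSM prompts, per the summary above
num_fewshot: 5              # assumed fewshot setting
```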
robertgshaw2-neuralmagic authored Jun 21, 2024
1 parent 39e484e commit 7c46a95
Showing 15 changed files with 233 additions and 299 deletions.
32 changes: 0 additions & 32 deletions .github/actions/nm-lm-eval-smoke/action.yml

This file was deleted.

@@ -1,21 +1,20 @@
name: run lm-eval full accuracy test
description: 'run lm-eval full accuracy test'
name: run lm-eval accuracy test
description: 'run lm-eval accuracy test'
inputs:
python:
description: 'python version, e.g. 3.10.12'
required: true
venv:
description: 'name for python virtual environment'
required: true
lm_eval_configuration:
description: 'file containing test configuration'
required: true
runs:
using: composite
steps:
- id: lm-eval
run: |
# move source directories
mv vllm vllm-ignore || echo "no 'vllm' folder to move"
mv csrc csrc-ignore || echo "no 'csrc' folder to move"
if [ -n "${{ inputs.venv }}" ]; then
COMMIT=${{ github.sha }}
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
@@ -26,7 +25,7 @@ runs:
pip3 install pytest openai==1.3.9
SUCCESS=0
pytest -v tests/accuracy/test_lm_eval_correctness.py || SUCCESS=$?
echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT"
./.github/scripts/nm-run-lm-eval-vllm.sh -c ${{ inputs.lm_eval_configuration }} || SUCCESS=$?
echo "lm_eval=${SUCCESS}" >> "$GITHUB_OUTPUT"
exit ${SUCCESS}
shell: bash
125 changes: 0 additions & 125 deletions .github/scripts/lm_eval_compare_hf_vs_vllm.py

This file was deleted.

50 changes: 50 additions & 0 deletions .github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
@@ -0,0 +1,50 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10

usage() {
echo
echo "Runs lm eval harness on GSM8k using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -b - batch size to run the evaluation at"
echo " -d - device to use (e.g. cuda, cuda:0, auto, cpu)"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo
}

while getopts "m:b:d:l:f:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
d )
DEVICE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done

lm_eval --model hf \
--model_args pretrained=$MODEL \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE --device $DEVICE
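
As a usage illustration only: the model stub, device, and fewshot count below are assumptions, while the 250-sample limit follows the summary above.

```bash
# Hypothetical invocation of the HF baseline script; flags correspond to the
# getopts options defined above (-m model, -b batch size, -d device, -l limit, -f fewshot).
./.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh \
  -m meta-llama/Meta-Llama-3-8B-Instruct \
  -b auto \
  -d cuda \
  -l 250 \
  -f 5
```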
30 changes: 30 additions & 0 deletions .github/scripts/nm-run-lm-eval-vllm.sh
@@ -0,0 +1,30 @@
#!/bin/bash
# We can use this script to run the lm-eval GSM accuracy test against vllm and compare the results to a precomputed HF transformers baseline.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10

usage() {
echo
echo "Runs lm eval harness on GSM8k using vllm server and compares to "
echo "precomputed baseline (measured by HF transformers)."
echo
echo "usage: ${0} <options>"
echo
echo " -c - path to the test data config (e.g. neuralmagic/lm-eval/YOUR_CONFIG.yaml)"
echo
}

while getopts "c:" OPT; do
case ${OPT} in
c )
CONFIG="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done

LM_EVAL_TEST_DATA_FILE=$CONFIG pytest -v tests/accuracy/test_lm_eval_correctness.py
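
A usage sketch; the YAML filename below is a placeholder, actual configs live under neuralmagic/lm-eval/.

```bash
# Hypothetical invocation; -c points at the test data config consumed via LM_EVAL_TEST_DATA_FILE.
./.github/scripts/nm-run-lm-eval-vllm.sh \
  -c neuralmagic/lm-eval/llama-3-8b-gsm-250.yaml
```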
25 changes: 18 additions & 7 deletions .github/workflows/nm-build-test.yml
@@ -66,6 +66,19 @@ on:
description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI"
type: string
default: "false"
# lm-eval related parameters
lm_eval_label:
description: "requested runner label (specifies instance)"
type: string
default: ""
lm_eval_timeout:
description: "time limit for lm_eval in minutes"
type: string
default: "60"
lm_eval_configuration:
description: "configuration for lm-eval test (see neuralmagic/lm-eval)"
type: string
default: ""

jobs:

@@ -134,16 +147,14 @@ jobs:
push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
secrets: inherit

TEST-ACCURACY-FULL:
LM-EVAL-SOLO:
needs: [BUILD]
if: ${{ inputs.wf_category == 'WEEKLY' || inputs.wf_category == 'RELEASE' }}
uses: ./.github/workflows/nm-test-accuracy-full.yml
uses: ./.github/workflows/nm-lm-eval.yml
with:
label: ${{ inputs.test_label_multi }}
timeout: ${{ inputs.benchmark_timeout }}
label: ${{ inputs.lm_eval_label }}
timeout: ${{ inputs.lm_eval_timeout }}
gitref: ${{ inputs.gitref }}
Gi_per_thread: ${{ inputs.Gi_per_thread }}
nvcc_threads: ${{ inputs.nvcc_threads }}
python: ${{ inputs.python }}
whl: ${{ needs.BUILD.outputs.whl }}
lm_eval_configuration: ${{ inputs.lm_eval_configuration }}
secrets: inherit
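
For context, a top-level workflow that calls nm-build-test.yml would presumably supply the new lm-eval inputs roughly as sketched below; the runner label and config path are placeholders, not values from this commit.

```yaml
# Hypothetical caller of nm-build-test.yml; runner label and config path are placeholders.
jobs:
  BUILD-TEST:
    uses: ./.github/workflows/nm-build-test.yml
    with:
      # ... other build and test inputs elided ...
      lm_eval_label: k8s-a100-solo                                        # assumed runner label
      lm_eval_timeout: "60"
      lm_eval_configuration: neuralmagic/lm-eval/llama-3-8b-gsm-250.yaml  # placeholder path
    secrets: inherit
```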
@@ -15,14 +15,6 @@ on:
description: "git commit hash or branch name"
type: string
required: true
Gi_per_thread:
description: 'requested GiB to reserve per thread'
type: string
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -31,6 +23,10 @@ on:
description: "whl to test (variable appears late binding so unusable outside 'download artifact')"
type: string
required: true
lm_eval_configuration:
description: 'file containing tests configuration (see: nm-vllm/neuralmagic/lm-eval)'
type: string
required: true

# makes workflow manually callable
workflow_dispatch:
@@ -47,14 +43,6 @@ on:
description: "git commit hash or branch name"
type: string
required: true
Gi_per_thread:
description: 'requested GiB to reserve per thread'
type: string
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -63,9 +51,13 @@ on:
description: "whl to test (variable appears late binding so unusable outside 'download artifact')"
type: string
required: true
lm_eval_configuration:
description: 'file containing tests configuration (see: nm-vllm/neuralmagic/lm-eval)'
type: string
required: true

jobs:
TEST-ACCURACY-FULL:
LM-EVAL:

runs-on: ${{ inputs.label }}
timeout-minutes: ${{ fromJSON(inputs.timeout) }}
@@ -77,6 +69,12 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: ${{ inputs.python }}

- name: install automation components
run: |
sudo apt-get update --fix-missing
sudo apt-get install -y git-all
sudo apt-get install -y curl
- name: checkout repository code
uses: actions/checkout@v4
@@ -114,7 +112,8 @@ jobs:
venv:

- name: run lm-eval-accuracy
uses: ./.github/actions/nm-lm-eval-accuracy/
uses: ./.github/actions/nm-lm-eval/
with:
python: ${{ inputs.python }}
venv:
lm_eval_configuration: ${{ inputs.lm_eval_configuration }}