[Rel Eng] Dial In LM Eval Tests Phase 1 (#289)
WAIT UNTIL UPSTREAM SYNC LANDS TO MERGE

SUMMARY:
* refactored lm-eval workflows to use a single script for generating a
baseline
* refactored lm-eval workflows to accept a config file so we can parameterize the different-length runs (a sketch of such a config appears after this list)
* added configuration for `remote-push` -> running `llama-3-8b` on 250
GSM prompts
* removed lm-eval-smoke so that we have a single pathway for running lm-eval tests
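
For reference, a minimal sketch of the kind of config file these workflows consume, assuming a simple YAML schema. The field names and values below are illustrative guesses, not taken from this commit; the actual files live under neuralmagic/lm-eval/ and are read by tests/accuracy/test_lm_eval_correctness.py through the LM_EVAL_TEST_DATA_FILE environment variable (see the nm-run-lm-eval-vllm.sh script further down).

```yaml
# Illustrative lm-eval config sketch; field names are assumptions, not from this commit.
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"   # assumed model stub
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.75        # placeholder baseline score (measured with HF transformers)
      - name: "exact_match,flexible-extract"
        value: 0.75        # placeholder
limit: 250                  # number of GSM prompts, per the summary above
num_fewshot: 5              # assumed fewshot setting
```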
robertgshaw2-neuralmagic authored Jun 21, 2024
1 parent 39e484e commit 7c46a95
Showing 15 changed files with 233 additions and 299 deletions.
32 changes: 0 additions & 32 deletions .github/actions/nm-lm-eval-smoke/action.yml

This file was deleted.

@@ -1,21 +1,20 @@
name: run lm-eval full accuracy test
description: 'run lm-eval full accuracy test'
name: run lm-eval accuracy test
description: 'run lm-eval accuracy test'
inputs:
python:
description: 'python version, e.g. 3.10.12'
required: true
venv:
description: 'name for python virtual environment'
required: true
lm_eval_configuration:
description: 'file containing test configuration'
required: true
runs:
using: composite
steps:
- id: lm-eval
run: |
# move source directories
mv vllm vllm-ignore || echo "no 'vllm' folder to move"
mv csrc csrc-ignore || echo "no 'csrc' folder to move"
if [ -n "${{ inputs.venv }}" ]; then
COMMIT=${{ github.sha }}
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
@@ -26,7 +25,7 @@ runs:
pip3 install pytest openai==1.3.9
SUCCESS=0
pytest -v tests/accuracy/test_lm_eval_correctness.py || SUCCESS=$?
echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT"
./.github/scripts/nm-run-lm-eval-vllm.sh -c ${{ inputs.lm_eval_configuration }} || SUCCESS=$?
echo "lm_eval=${SUCCESS}" >> "$GITHUB_OUTPUT"
exit ${SUCCESS}
shell: bash
125 changes: 0 additions & 125 deletions .github/scripts/lm_eval_compare_hf_vs_vllm.py

This file was deleted.

50 changes: 50 additions & 0 deletions .github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
@@ -0,0 +1,50 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10

usage() {
echo
echo "Runs lm eval harness on GSM8k using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -b - batch size to run the evaluation at"
echo " -d - device to use (e.g. cuda, cuda:0, auto, cpu)"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo
}

while getopts "m:b:d:l:f:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
d )
DEVICE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done

lm_eval --model hf \
--model_args pretrained=$MODEL \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE --device $DEVICE
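
As a usage illustration only: the model stub, device, and fewshot count below are assumptions, while the 250-sample limit follows the summary above.

```bash
# Hypothetical invocation of the HF baseline script; flags correspond to the
# getopts options defined above (-m model, -b batch size, -d device, -l limit, -f fewshot).
./.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh \
  -m meta-llama/Meta-Llama-3-8B-Instruct \
  -b auto \
  -d cuda \
  -l 250 \
  -f 5
```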
30 changes: 30 additions & 0 deletions .github/scripts/nm-run-lm-eval-vllm.sh
@@ -0,0 +1,30 @@
#!/bin/bash
# We can use this script to run the lm-eval GSM accuracy test against vllm and compare the results to a precomputed HF transformers baseline.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10

usage() {
echo
echo "Runs lm eval harness on GSM8k using vllm server and compares to "
echo "precomputed baseline (measured by HF transformers)."
echo
echo "usage: ${0} <options>"
echo
echo " -c - path to the test data config (e.g. neuralmagic/lm-eval/YOUR_CONFIG.yaml)"
echo
}

while getopts "c:" OPT; do
case ${OPT} in
c )
CONFIG="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done

LM_EVAL_TEST_DATA_FILE=$CONFIG pytest -v tests/accuracy/test_lm_eval_correctness.py
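
A usage sketch; the YAML filename below is a placeholder, actual configs live under neuralmagic/lm-eval/.

```bash
# Hypothetical invocation; -c points at the test data config consumed via LM_EVAL_TEST_DATA_FILE.
./.github/scripts/nm-run-lm-eval-vllm.sh \
  -c neuralmagic/lm-eval/llama-3-8b-gsm-250.yaml
```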
25 changes: 18 additions & 7 deletions .github/workflows/nm-build-test.yml
@@ -66,6 +66,19 @@ on:
description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI"
type: string
default: "false"
# lm-eval related parameters
lm_eval_label:
description: "requested runner label (specifies instance)"
type: string
default: ""
lm_eval_timeout:
description: "time limit for lm_eval in minutes"
type: string
default: "60"
lm_eval_configuration:
description: "configuration for lm-eval test (see neuralmagic/lm-eval)"
type: string
default: ""

jobs:

@@ -134,16 +147,14 @@ jobs:
push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
secrets: inherit

TEST-ACCURACY-FULL:
LM-EVAL-SOLO:
needs: [BUILD]
if: ${{ inputs.wf_category == 'WEEKLY' || inputs.wf_category == 'RELEASE' }}
uses: ./.github/workflows/nm-test-accuracy-full.yml
uses: ./.github/workflows/nm-lm-eval.yml
with:
label: ${{ inputs.test_label_multi }}
timeout: ${{ inputs.benchmark_timeout }}
label: ${{ inputs.lm_eval_label }}
timeout: ${{ inputs.lm_eval_timeout }}
gitref: ${{ inputs.gitref }}
Gi_per_thread: ${{ inputs.Gi_per_thread }}
nvcc_threads: ${{ inputs.nvcc_threads }}
python: ${{ inputs.python }}
whl: ${{ needs.BUILD.outputs.whl }}
lm_eval_configuration: ${{ inputs.lm_eval_configuration }}
secrets: inherit
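
For context, a top-level workflow that calls nm-build-test.yml would presumably supply the new lm-eval inputs roughly as sketched below; the runner label and config path are placeholders, not values from this commit.

```yaml
# Hypothetical caller of nm-build-test.yml; runner label and config path are placeholders.
jobs:
  BUILD-TEST:
    uses: ./.github/workflows/nm-build-test.yml
    with:
      # ... other build and test inputs elided ...
      lm_eval_label: k8s-a100-solo                                        # assumed runner label
      lm_eval_timeout: "60"
      lm_eval_configuration: neuralmagic/lm-eval/llama-3-8b-gsm-250.yaml  # placeholder path
    secrets: inherit
```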
@@ -15,14 +15,6 @@ on:
description: "git commit hash or branch name"
type: string
required: true
Gi_per_thread:
description: 'requested GiB to reserve per thread'
type: string
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -31,6 +23,10 @@ on:
description: "whl to test (variable appears late binding so unusable outside 'download artifact')"
type: string
required: true
lm_eval_configuration:
description: 'file containing tests configuration (see: nm-vllm/neuralmagic/lm-eval)'
type: string
required: true

# makes workflow manually callable
workflow_dispatch:
@@ -47,14 +43,6 @@ on:
description: "git commit hash or branch name"
type: string
required: true
Gi_per_thread:
description: 'requested GiB to reserve per thread'
type: string
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -63,9 +51,13 @@ on:
description: "whl to test (variable appears late binding so unusable outside 'download artifact')"
type: string
required: true
lm_eval_configuration:
description: 'file containing tests configuration (see: nm-vllm/neuralmagic/lm-eval)'
type: string
required: true

jobs:
TEST-ACCURACY-FULL:
LM-EVAL:

runs-on: ${{ inputs.label }}
timeout-minutes: ${{ fromJSON(inputs.timeout) }}
@@ -77,6 +69,12 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: ${{ inputs.python }}

- name: install automation components
run: |
sudo apt-get update --fix-missing
sudo apt-get install -y git-all
sudo apt-get install -y curl
- name: checkout repository code
uses: actions/checkout@v4
@@ -114,7 +112,8 @@ jobs:
venv:

- name: run lm-eval-accuracy
uses: ./.github/actions/nm-lm-eval-accuracy/
uses: ./.github/actions/nm-lm-eval/
with:
python: ${{ inputs.python }}
venv:
lm_eval_configuration: ${{ inputs.lm_eval_configuration }}