diff --git a/.github/actions/nm-lm-eval-smoke/action.yml b/.github/actions/nm-lm-eval-smoke/action.yml deleted file mode 100644 index 527909bf68786..0000000000000 --- a/.github/actions/nm-lm-eval-smoke/action.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: run lm-eval accuracy smoke test -description: 'run lm-eval accuracy smoke test' -inputs: - python: - description: 'python version, e.g. 3.10.12' - required: true - venv: - description: 'name for python virtual environment' - required: true -runs: - using: composite - steps: - - id: lm-eval - run: | - # move source directories - mv vllm vllm-ignore || echo "no 'vllm' folder to move" - mv csrc csrc-ignore || echo "no 'csrc' folder to move" - - if [ -n "${{ inputs.venv }}" ]; then - COMMIT=${{ github.sha }} - VENV="${{ inputs.venv }}-${COMMIT:0:7}" - source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate - fi - - pip3 install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 - pip3 install optimum auto-gptq - - SUCCESS=0 - python .github/scripts/lm_eval_compare_hf_vs_vllm.py --hf_pretrained nm-testing/zephyr-beta-7b-gptq-g128 --vllm_pretrained nm-testing/zephyr-beta-7b-marlin-g128 || SUCCESS=$? - echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT" - exit ${SUCCESS} - shell: bash diff --git a/.github/actions/nm-lm-eval-accuracy/action.yml b/.github/actions/nm-lm-eval/action.yml similarity index 64% rename from .github/actions/nm-lm-eval-accuracy/action.yml rename to .github/actions/nm-lm-eval/action.yml index ee8c78c8855a3..f7f0b07fcf080 100644 --- a/.github/actions/nm-lm-eval-accuracy/action.yml +++ b/.github/actions/nm-lm-eval/action.yml @@ -1,5 +1,5 @@ -name: run lm-eval full accuracy test -description: 'run lm-eval full accuracy test' +name: run lm-eval accuracy test +description: 'run lm-eval accuracy test' inputs: python: description: 'python version, e.g. 3.10.12' @@ -7,15 +7,14 @@ inputs: venv: description: 'name for python virtual environment' required: true + lm_eval_configuration: + description: 'file containing test configuration' + required: true runs: using: composite steps: - id: lm-eval run: | - # move source directories - mv vllm vllm-ignore || echo "no 'vllm' folder to move" - mv csrc csrc-ignore || echo "no 'csrc' folder to move" - if [ -n "${{ inputs.venv }}" ]; then COMMIT=${{ github.sha }} VENV="${{ inputs.venv }}-${COMMIT:0:7}" @@ -26,7 +25,7 @@ runs: pip3 install pytest openai==1.3.9 SUCCESS=0 - pytest -v tests/accuracy/test_lm_eval_correctness.py || SUCCESS=$? - echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT" + ./.github/scripts/nm-run-lm-eval-vllm.sh -c ${{ inputs.lm_eval_configuration }} || SUCCESS=$? 
+ echo "lm_eval=${SUCCESS}" >> "$GITHUB_OUTPUT" exit ${SUCCESS} shell: bash diff --git a/.github/scripts/lm_eval_compare_hf_vs_vllm.py b/.github/scripts/lm_eval_compare_hf_vs_vllm.py deleted file mode 100644 index d8e256631e9a7..0000000000000 --- a/.github/scripts/lm_eval_compare_hf_vs_vllm.py +++ /dev/null @@ -1,125 +0,0 @@ -import argparse -import os -from typing import Dict, List, Tuple - -import lm_eval -import lm_eval.models.utils -import numpy as np -import scipy.stats - -os.environ["TOKENIZERS_PARALLELISM"] = "false" - - -def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]: - acc1, acc2 = res1["acc,none"], res2["acc,none"] - st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"] - Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2)) - # Determining the p-value - p_value = 2 * scipy.stats.norm.sf(abs(Z)) # two-tailed test - return Z, p_value - - -def print_results(data_to_print: List = None, - results_dict: Dict = None, - alpha: float = None): - model1_data, model2_data = data_to_print - for task in model1_data: - print(f"Task: {task}") - print(f"HF Accuracy: {model1_data[task]['acc,none']}") - print(f"vLLM Accuracy: {model2_data[task]['acc,none']}") - print(f"HF StdErr: {model1_data[task]['acc_stderr,none']}") - print(f"vLLM StdErr: {model2_data[task]['acc_stderr,none']}") - z = results_dict[task]["z"] - p_value = results_dict[task]["p_value"] - result = "PASS" if p_value > alpha else "FAIL" - print(f"Z-Score: {z}, P-Value: {p_value}, p > {alpha}: {result}\n") - - -def check_passing_score(results_dict: Dict = None, - alpha: float = None) -> bool: - for task in results_dict: - p_value = results_dict[task]["p_value"] - if p_value <= alpha: - return False - return True - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--hf_pretrained", - default="EleutherAI/pythia-70m", - help="name of model to compare as baseline") - parser.add_argument("--vllm_pretrained", - default="EleutherAI/pythia-70m", - help="name of model to compare as difference") - parser.add_argument("--hf_args", - help="huggingface model args =", - default="") - parser.add_argument("--vllm_args", - help="vllm model args =", - default="") - parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag") - parser.add_argument( - "--limit", - type=float, - default=100, - ) - parser.add_argument( - "--alpha", - type=float, - default=0.05, - help="Significance level for two-tailed z-test", - ) - parser.add_argument( - "--device", - type=str, - default="cuda", - ) - parser.add_argument( - "--batch", - type=str, - default=4, - ) - parser.add_argument( - "--verbosity", - type=str, - default="INFO", - help="Logging verbosity", - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - tasks = args.tasks.split(",") - print("Tasks:", tasks) - hf_args, vllm_args = "," + args.hf_args, "," + args.vllm_args - results_hf = lm_eval.simple_evaluate( - model="hf", - model_args=f"pretrained={args.hf_pretrained}" + hf_args, - tasks=tasks, - limit=args.limit, - device=args.device, - batch_size=args.batch, - ) - lm_eval.models.utils.clear_torch_cache() - print("Memory stats cleared") - results_vllm = lm_eval.simple_evaluate( - model="vllm", - model_args=f"pretrained={args.vllm_pretrained}" + vllm_args, - tasks=tasks, - limit=args.limit, - device=args.device, - batch_size=args.batch, - ) - all_res = {} - for task1, task2 in zip(results_hf["results"].items(), - results_vllm["results"].items()): - assert task1[0] == task2[0] - z, p_value = 
calculate_z_value(task1[1], task2[1])
-        all_res[task1[0]] = {"z": z, "p_value": p_value}
-    print_results([results_hf["results"], results_vllm["results"]], all_res,
-                  args.alpha)
-    if not check_passing_score(all_res, args.alpha):
-        print("Accuracy test failed!")
-        exit(1)
diff --git a/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh b/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
new file mode 100755
index 0000000000000..76f7100af5949
--- /dev/null
+++ b/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on GSM8k for transformers.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+
+usage() {
+    echo
+    echo "Runs lm eval harness on GSM8k using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-lm-eval workflow."
+    echo
+    echo "usage: ${0} "
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -b    - batch size to run the evaluation at"
+    echo "  -d    - device to use (e.g. cuda, cuda:0, auto, cpu)"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo
+}
+
+while getopts "m:b:d:l:f:" OPT; do
+  case ${OPT} in
+    m )
+      MODEL="$OPTARG"
+      ;;
+    b )
+      BATCH_SIZE="$OPTARG"
+      ;;
+    d )
+      DEVICE="$OPTARG"
+      ;;
+    l )
+      LIMIT="$OPTARG"
+      ;;
+    f )
+      FEWSHOT="$OPTARG"
+      ;;
+    \? )
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+lm_eval --model hf \
+  --model_args pretrained=$MODEL \
+  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
+  --batch_size $BATCH_SIZE --device $DEVICE
diff --git a/.github/scripts/nm-run-lm-eval-vllm.sh b/.github/scripts/nm-run-lm-eval-vllm.sh
new file mode 100755
index 0000000000000..c68d6d1d7697f
--- /dev/null
+++ b/.github/scripts/nm-run-lm-eval-vllm.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# We can use this script to run lm-eval on GSM8k against a vllm server and compare to the precomputed baseline.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+
+usage() {
+    echo
+    echo "Runs lm eval harness on GSM8k using vllm server and compares to "
+    echo "precomputed baseline (measured by HF transformers)."
+    echo
+    echo "usage: ${0} "
+    echo
+    echo "  -c    - path to the test data config (e.g. neuralmagic/lm-eval/YOUR_CONFIG.yaml)"
+    echo
+}
+
+while getopts "c:" OPT; do
+  case ${OPT} in
+    c )
+      CONFIG="$OPTARG"
+      ;;
+    \?
) + usage + exit 1 + ;; + esac +done + +LM_EVAL_TEST_DATA_FILE=$CONFIG pytest -v tests/accuracy/test_lm_eval_correctness.py diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml index 7847043da5ed1..4d3f6fd5f34cb 100644 --- a/.github/workflows/nm-build-test.yml +++ b/.github/workflows/nm-build-test.yml @@ -66,6 +66,19 @@ on: description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" type: string default: "false" + # lm-eval related parameters + lm_eval_label: + description: "requested runner label (specifies instance)" + type: string + default: "" + lm_eval_timeout: + description: "time limit for lm_eval in minutes" + type: string + default: "60" + lm_eval_configuration: + description: "configuration for lm-eval test (see neuralmagic/lm-eval)" + type: string + default: "" jobs: @@ -134,16 +147,14 @@ jobs: push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" secrets: inherit - TEST-ACCURACY-FULL: + LM-EVAL-SOLO: needs: [BUILD] - if: ${{ inputs.wf_category == 'WEEKLY' || inputs.wf_category == 'RELEASE' }} - uses: ./.github/workflows/nm-test-accuracy-full.yml + uses: ./.github/workflows/nm-lm-eval.yml with: - label: ${{ inputs.test_label_multi }} - timeout: ${{ inputs.benchmark_timeout }} + label: ${{ inputs.lm_eval_label }} + timeout: ${{ inputs.lm_eval_timeout }} gitref: ${{ inputs.gitref }} - Gi_per_thread: ${{ inputs.Gi_per_thread }} - nvcc_threads: ${{ inputs.nvcc_threads }} python: ${{ inputs.python }} whl: ${{ needs.BUILD.outputs.whl }} + lm_eval_configuration: ${{ inputs.lm_eval_configuration }} secrets: inherit diff --git a/.github/workflows/nm-test-accuracy-full.yml b/.github/workflows/nm-lm-eval.yml similarity index 83% rename from .github/workflows/nm-test-accuracy-full.yml rename to .github/workflows/nm-lm-eval.yml index ae3ebee62203e..90b7ec61a0a3a 100644 --- a/.github/workflows/nm-test-accuracy-full.yml +++ b/.github/workflows/nm-lm-eval.yml @@ -15,14 +15,6 @@ on: description: "git commit hash or branch name" type: string required: true - Gi_per_thread: - description: 'requested GiB to reserve per thread' - type: string - required: true - nvcc_threads: - description: "number of threads nvcc build threads" - type: string - required: true python: description: "python version, e.g. 3.10.12" type: string @@ -31,6 +23,10 @@ on: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true + lm_eval_configuration: + description: 'file containing tests configuration (see: nm-vllm/neuralmagic/lm-eval)' + type: string + required: true # makes workflow manually callable workflow_dispatch: @@ -47,14 +43,6 @@ on: description: "git commit hash or branch name" type: string required: true - Gi_per_thread: - description: 'requested GiB to reserve per thread' - type: string - required: true - nvcc_threads: - description: "number of threads nvcc build threads" - type: string - required: true python: description: "python version, e.g. 
3.10.12" type: string @@ -63,9 +51,13 @@ on: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true + lm_eval_configuration: + description: 'file containing tests configuration (see: nm-vllm/neuralmagic/lm-eval)' + type: string + required: true jobs: - TEST-ACCURACY-FULL: + LM-EVAL: runs-on: ${{ inputs.label }} timeout-minutes: ${{ fromJSON(inputs.timeout) }} @@ -77,6 +69,12 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ inputs.python }} + + - name: install automation components + run: | + sudo apt-get update --fix-missing + sudo apt-get install -y git-all + sudo apt-get install -y curl - name: checkout repository code uses: actions/checkout@v4 @@ -114,7 +112,8 @@ jobs: venv: - name: run lm-eval-accuracy - uses: ./.github/actions/nm-lm-eval-accuracy/ + uses: ./.github/actions/nm-lm-eval/ with: python: ${{ inputs.python }} venv: + lm_eval_configuration: ${{ inputs.lm_eval_configuration }} diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml index e90f99b34ee9a..17b9624cfe4e0 100644 --- a/.github/workflows/nm-nightly.yml +++ b/.github/workflows/nm-nightly.yml @@ -33,6 +33,10 @@ jobs: benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt benchmark_timeout: 720 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" + + lm_eval_label: gcp-k8s-l4-solo + lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml + lm_eval_timeout: 60 secrets: inherit PYTHON-3-9: @@ -51,6 +55,10 @@ jobs: benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt benchmark_timeout: 720 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" + + lm_eval_label: gcp-k8s-l4-solo + lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml + lm_eval_timeout: 60 secrets: inherit PYTHON-3-10: @@ -69,6 +77,10 @@ jobs: benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt benchmark_timeout: 720 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" + + lm_eval_label: gcp-k8s-l4-solo + lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml + lm_eval_timeout: 60 secrets: inherit PYTHON-3-11: @@ -88,4 +100,8 @@ jobs: benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt benchmark_timeout: 720 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" + + lm_eval_label: gcp-k8s-l4-solo + lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml + lm_eval_timeout: 60 secrets: inherit diff --git a/.github/workflows/nm-release.yml b/.github/workflows/nm-release.yml index f5c9056cbc5d7..ac1e4488ad1f9 100644 --- a/.github/workflows/nm-release.yml +++ b/.github/workflows/nm-release.yml @@ -29,6 +29,10 @@ jobs: benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt benchmark_timeout: 720 push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} + + lm_eval_label: gcp-k8s-l4-solo + lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml + lm_eval_timeout: 60 secrets: inherit PYTHON-3-9: @@ -47,6 +51,10 @@ jobs: benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt benchmark_timeout: 720 
push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }}
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit

   PYTHON-3-10:
@@ -65,6 +73,10 @@ jobs:
       benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt
       benchmark_timeout: 720
       push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }}
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit

   PYTHON-3-11:
@@ -83,4 +95,8 @@ jobs:
       benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt
       benchmark_timeout: 720
       push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }}
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit
diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml
index 3c1fe246756a4..37a350c461e38 100644
--- a/.github/workflows/nm-remote-push.yml
+++ b/.github/workflows/nm-remote-push.yml
@@ -26,6 +26,10 @@ jobs:
       benchmark_label: gcp-k8s-l4-solo
       benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
       benchmark_timeout: 480
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit

   BUILD-TEST-3-9:
@@ -42,6 +46,10 @@ jobs:
       benchmark_label: gcp-k8s-l4-solo
       benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
       benchmark_timeout: 480
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit

   BUILD-TEST-3-10:
@@ -58,6 +66,10 @@ jobs:
       benchmark_label: gcp-k8s-l4-solo
       benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
       benchmark_timeout: 480
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit

   BUILD-TEST-3-11:
@@ -74,4 +86,8 @@ jobs:
       benchmark_label: gcp-k8s-l4-solo
       benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
       benchmark_timeout: 480
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit
diff --git a/.github/workflows/nm-weekly.yml b/.github/workflows/nm-weekly.yml
index d92a2619ef359..c764fa9e884b4 100644
--- a/.github/workflows/nm-weekly.yml
+++ b/.github/workflows/nm-weekly.yml
@@ -33,4 +33,8 @@ jobs:
       benchmark_config_list_file: ./.github/data/nm_benchmark_weekly_configs_list.txt
       benchmark_timeout: 720
       push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
+
+      lm_eval_label: gcp-k8s-l4-solo
+      lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml
+      lm_eval_timeout: 60
       secrets: inherit
diff --git a/neuralmagic/lm-eval/full-small-models.yaml b/neuralmagic/lm-eval/full-small-models.yaml
new file mode 100644
index 0000000000000..129ea4c5bf99c
--- /dev/null
+++ b/neuralmagic/lm-eval/full-small-models.yaml
@@ -0,0 +1,23 @@
+# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -d cuda -l 250 -f 5
+- model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
+  tasks:
+  - name: "gsm8k"
+    metrics:
+    - name: "exact_match,strict-match"
+      value: 0.74
+    -
name: "exact_match,flexible-extract" + value: 0.74 + limit: 250 + num_fewshot: 5 + +# ./nm-run-lm-eval-gsm-hf-baseline -m TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ -b 32 -d cuda -l 250 -f 5 +- model_name: "TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ" + tasks: + - name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.684 + - name: "exact_match,flexible-extract" + value: 0.688 + limit: 250 + num_fewshot: 5 diff --git a/neuralmagic/lm-eval/smoke-small-models.yaml b/neuralmagic/lm-eval/smoke-small-models.yaml new file mode 100644 index 0000000000000..c9ecd35bf793d --- /dev/null +++ b/neuralmagic/lm-eval/smoke-small-models.yaml @@ -0,0 +1,23 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -d cuda -l 250 -f 5 +- model_name: "meta-llama/Meta-Llama-3-8B-Instruct" + tasks: + - name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.74 + - name: "exact_match,flexible-extract" + value: 0.74 + limit: 250 + num_fewshot: 5 + +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ -b 32 -d cuda -l 250 -f 5 +- model_name: "TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ" + tasks: + - name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.684 + - name: "exact_match,flexible-extract" + value: 0.688 + limit: 250 + num_fewshot: 5 diff --git a/tests/accuracy/lm-eval-tasks.yaml b/tests/accuracy/lm-eval-tasks.yaml deleted file mode 100644 index 70c3e8881b676..0000000000000 --- a/tests/accuracy/lm-eval-tasks.yaml +++ /dev/null @@ -1,105 +0,0 @@ -# Llama 2 7B: FP16, FP16 sparse, marlin -# NOTE: This model is superseded by Llama 3 -# - model_name: "NousResearch/Llama-2-7b-chat-hf" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: "exact_match,strict-match" -# value: 0.2266868840030326 -# - name: "exact_match,flexible-extract" -# value: 0.22820318423047764 -- model_name: "neuralmagic/Llama-2-7b-pruned50-retrained-ultrachat" - tasks: - - name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.09855951478392722 - - name: "exact_match,flexible-extract" - value: 0.10083396512509477 -# - model_name: "neuralmagic/llama-2-7b-chat-marlin" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: "exact_match,strict-match" -# value: 0.14101592115238817 -# - name: "exact_match,flexible-extract" -# value: 0.1652767247915087 - -# Mistral 7B: FP16, FP16 sparse, marlin -- model_name: "teknium/OpenHermes-2.5-Mistral-7B" - tasks: - - name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.6004548900682335 - - name: "exact_match,flexible-extract" - value: 0.6482183472327521 -- model_name: "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50" - tasks: - - name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.4935557240333586 - - name: "exact_match,flexible-extract" - value: 0.5269143290371494 - extra_args: - --sparsity: "sparse_w16a16" -# - model_name: "neuralmagic/OpenHermes-2.5-Mistral-7B-marlin" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: "exact_match,strict-match" -# value: 0.4935557240333586 -# - name: "exact_match,flexible-extract" -# value: 0.5868081880212282 - -# Llama 3: FP16, FP8 -- model_name: "NousResearch/Meta-Llama-3-8B-Instruct" - tasks: - - name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.7566 - - name: "exact_match,flexible-extract" - value: 0.7551 -# NOTE: Needs to run on a system with CUDA compute capability >= 8.9 -# - model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: 
"exact_match,strict-match" -# value: 0.7445 -# - name: "exact_match,flexible-extract" -# value: 0.7445 - -# Phi 2: marlin -# - model_name: "neuralmagic/phi-2-super-marlin" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: "exact_match,strict-match" -# value: 0.49962092494313876 -# - name: "exact_match,flexible-extract" -# value: 0.5041698256254739 - -# Llama 2 7B: 2:4 marlin -# - model_name: "nm-testing/Llama-2-7b-pruned2.4-Marlin" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: "exact_match,strict-match" -# value: 0.2214 -# - name: "exact_match,flexible-extract" -# value: 0.0425 - -# Mixtral: FP16 -# g5.12xlarge runner (4x 24GB A10 GPUs) has insufficient VRAM -# - model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" -# tasks: -# - name: "gsm8k" -# metrics: -# - name: "exact_match,strict-match" -# value: 0.6550416982562547 -# - name: "exact_match,flexible-extract" -# value: 0.6603487490523123 -# enable_tensor_parallel: true diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py index 4c1ac9638a10a..93da626a2de03 100644 --- a/tests/accuracy/test_lm_eval_correctness.py +++ b/tests/accuracy/test_lm_eval_correctness.py @@ -1,4 +1,5 @@ import logging +import os from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, TypedDict @@ -47,7 +48,11 @@ class EvalTaskDefinition(EvalTaskDefinitionOpts): tasks: List[Task] -TEST_DATA_FILE = Path(__file__).parent / "lm-eval-tasks.yaml" +TEST_DATA_FILE = os.environ.get("LM_EVAL_TEST_DATA_FILE", None) +if TEST_DATA_FILE is None: + raise ValueError("LM_EVAL_TEST_DATA_FILE env variable is not set.") +TEST_DATA_FILE = Path(TEST_DATA_FILE) + TEST_DATA: List[EvalTaskDefinition] = [ pytest.param(eval_def, id=eval_def["model_name"]) for eval_def in yaml.safe_load(TEST_DATA_FILE.read_text(encoding="utf-8")) @@ -69,7 +74,7 @@ def test_lm_eval_correctness( vllm_args = { "--model": model_name, "--disable-log-requests": None, - "--max-model-len": 2048, + "--max-model-len": 4096, } if eval_data.get("enable_tensor_parallel") is True: @@ -88,13 +93,17 @@ def test_lm_eval_correctness( logger.info("launching server") with ServerContext(vllm_args, logger=logger) as _: - task_names = [t["name"] for t in eval_data["tasks"]] + task_names = [task["name"] for task in eval_data["tasks"]] + limit = eval_data["limit"] + new_fewshot = eval_data["num_fewshot"] logger.info("getting results for task_names=%s", task_names) results = lm_eval.simple_evaluate( model="local-completions", model_args=openai_args, tasks=task_names, - batch_size=64, + batch_size=32, + num_fewshot=new_fewshot, + limit=limit, ) logger.info("clearing torch cache")