diff --git a/.gitignore b/.gitignore
index 9545a7977..eba8bb341 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ loadgen/build/
 libmlperf_loadgen.a
 __pycache__/
 generated/
+*.swp
diff --git a/language/llama3-405b/README.md b/language/llama3-405b/README.md
index dcc5344c4..26e2876a0 100644
--- a/language/llama3-405b/README.md
+++ b/language/llama3-405b/README.md
@@ -9,31 +9,61 @@ Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3-405b) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.
-
+
 ## Prepare environment
 
-Copy the mlperf.conf file to this folder.
-```
-cp ../../mlperf.conf .
+### Local Environment Run
+
+The following steps were tested on Ubuntu 22.04 with Python 3.10.
+
+- **Prerequisite for GPU runs:** Install the NVIDIA driver and CUDA 12.1.
+
+The following links contain the commands for installing the [NVIDIA Driver](https://developer.nvidia.com/datacenter-driver-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_local) and [CUDA](https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_local).
+
+- **Prerequisite:** Install conda.
+
+```bash
+mkdir -p ~/miniconda3
+wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
+bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
+rm ~/miniconda3/miniconda.sh
+~/miniconda3/bin/conda init
 ```
-For a CPU-only run:
+- Set the following helper variables:
+```bash
+export ROOT=$PWD/inference
+export LLAMA_FOLDER=$PWD/inference/language/llama3-405b
+export LOADGEN_FOLDER=$PWD/inference/loadgen
+export DATASET_FOLDER=$PWD/inference/language/llama3-405b/dataset
+```
+- Clone the inference repository:
+```bash
+git clone --recurse-submodules https://github.com/mlcommons/inference.git \
+ --depth 1
 ```
-conda create -n llama3-405b python=3.9
+
+- Create a conda environment:
+```bash
+conda create -y -n llama3-405b python=3.10
 conda activate llama3-405b
+conda install -y -c conda-forge libstdcxx-ng=12
+```
+- Install requirements and loadgen:
+```bash
+cd $LLAMA_FOLDER
 # Install packages
 pip install -r requirements.txt
+```
 
-export CUR_DIR=${PWD}
-cd /loadgen
-
-
-python -m pip install .
+```bash
+cd $LOADGEN_FOLDER
+pip install -e .
 ```
 
-For a GPU-based run:
+### Docker Run
 
 A dockerfile is provided, along with scripts to help launch it. First, add any docker volume mounts you want in `launch.sh`. There is a section at the top of the file that looks like:
@@ -54,10 +84,13 @@ MOUNTS=(
 /raid/data:/raid/data
 )
 ```
 
-Once you have added all your mounts, launch the container with `bash launch.sh`.
+Once you have added all your mounts, build and launch the container with `bash launch.sh`.
 
-Inside the container, set up the environment with `bash build.sh`. This will install all the dependencies from the
-CPU-only setup, as well as any GPU versions for applicable libraries like PyTorch.
+Now install all the dependencies:
+```
+pip install -r requirements.txt
+pip install -e ../../loadgen
+```
 
 ## Get Model
@@ -73,7 +106,7 @@ TODO: Host model and grant access to submitters
 export CHECKPOINT_PATH=Meta-Llama-3.1-405B-Instruct
 git lfs install
 git clone https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct ${CHECKPOINT_PATH}
-
+cd ${CHECKPOINT_PATH} && git checkout be673f326cab4cd22ccfef76109faf68e41aa5f1
 ```
 
 ## Get Dataset
@@ -109,6 +142,7 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/llama3_405b/mlperf_llama
 ```
 python -u main.py --scenario Offline \
         --model-path ${CHECKPOINT_PATH} \
+        --batch-size 16 \
         --dtype float16 \
         --user-conf user.conf \
         --total-sample-count 8312 \
@@ -123,6 +157,7 @@ python -u main.py --scenario Server \
         --model-path ${CHECKPOINT_PATH} \
+        --batch-size 16 \
         --dtype float16 \
         --user-conf user.conf \
         --total-sample-count 8312 \
@@ -145,6 +180,7 @@ mkdir -p "run_outputs" # The script will dump all the outputs to 'run_outputs'.
 
 python -u main.py --scenario Offline \
         --model-path ${CHECKPOINT_PATH} \
+        --batch-size 16 \
         --accuracy \
         --dtype float16 \
         --user-conf user.conf \
@@ -172,6 +208,7 @@ OUTPUT_LOG_DIR=server-accuracy-logs
 
 python -u main.py --scenario Server \
         --model-path ${CHECKPOINT_PATH} \
+        --batch-size 16 \
         --accuracy \
         --dtype float16 \
         --user-conf user.conf \
@@ -201,4 +238,4 @@ Running the GPU implementation in FP16 precision resulted in the following FP16
 'tokens_per_sample': 684.68,
 }
 ```
-
+The accuracy target is 99% of the reference value for rougeL and exact_match, and 90% for tokens_per_sample.
diff --git a/language/llama3-405b/run_accuracy.sh b/language/llama3-405b/run_accuracy.sh
index 075245913..f1a8be404 100644
--- a/language/llama3-405b/run_accuracy.sh
+++ b/language/llama3-405b/run_accuracy.sh
@@ -5,6 +5,7 @@ mkdir -p "run_outputs"
 
 python3 -u main.py --scenario Offline \
         --model-path ${CHECKPOINT_PATH} \
+        --batch-size 16 \
         --accuracy \
         --mlperf-conf mlperf.conf \
         --user-conf user.conf \
@@ -17,5 +18,3 @@ python3 evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \
         --mlperf-accuracy-file offline_accuracy_loadgen_logs/mlperf_log_accuracy.json \
         --dataset-file ${DATASET_PATH} \
         --dtype int32
-
-python3 consolidate_results.py --dataset-path ${DATASET_PATH} --model-dir ${CHECKPOINT_PATH}
diff --git a/language/llama3-405b/run_offline.sh b/language/llama3-405b/run_offline.sh
index 89fa9e45f..b5ad1ded5 100644
--- a/language/llama3-405b/run_offline.sh
+++ b/language/llama3-405b/run_offline.sh
@@ -1,10 +1,13 @@
 CHECKPOINT_PATH="${CHECKPOINT_PATH:Meta-Llama-3.1-405B-Instruct}"
-DATASET_PATH="${DATASET_PATH:-open-orca-val-set.pkl}"
+DATASET_PATH="${DATASET_PATH:-mlperf_llama3.1_405b_dataset_8318.pkl}"
 
 python -u main.py --scenario Offline \
-    --model-path ${CHECKPOINT_PATH} \
-    --mlperf-conf mlperf.conf \
-    --user-conf user.conf \
-    --total-sample-count 8312 \
-    --dataset-path ${DATASET_PATH} \
-    --device cpu 2>&1 | tee server_log.log
+        --model-path ${CHECKPOINT_PATH} \
+        --batch-size 16 \
+        --dtype float16 \
+        --user-conf user.conf \
+        --total-sample-count 8312 \
+        --dataset-path ${DATASET_PATH} \
+        --output-log-dir output \
+        --tensor-parallel-size ${GPU_COUNT} \
+        --vllm 2>&1 | tee offline.log
diff --git a/language/llama3-405b/run_server.sh b/language/llama3-405b/run_server.sh
index fe2a31c43..7735b417e 100644
--- a/language/llama3-405b/run_server.sh
+++ b/language/llama3-405b/run_server.sh
@@ -1,12 +1,15 @@
 CHECKPOINT_PATH="${CHECKPOINT_PATH:Meta-Llama-3.1-405B-Instruct}"
-DATASET_PATH="${DATASET_PATH:-open-orca-val-set.pkl}" +DATASET_PATH="${DATASET_PATH:mlperf_llama3.1_405b_dataset_8318.pkl}" python -u main.py --scenario Server \ - --model-path ${CHECKPOINT_PATH} \ - --mlperf-conf mlperf.conf \ - --user-conf user.conf \ - --total-sample-count 8312 \ - --dataset-path ${DATASET_PATH} \ - --device cpu 2>&1 | tee server_log.log + --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ + --dtype float16 \ + --user-conf user.conf \ + --total-sample-count 8312 \ + --dataset-path ${DATASET_PATH} \ + --output-log-dir output \ + --tensor-parallel-size ${GPU_COUNT} \ + --vllm 2>&1 | tee server.log diff --git a/language/llama3-405b/with_the_same_user b/language/llama3-405b/with_the_same_user new file mode 100755 index 000000000..cfa57902f --- /dev/null +++ b/language/llama3-405b/with_the_same_user @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# wkong: manually set the user info in env first + +set -ex + +if [ -z "$@" ]; then + COMMAND=(bash) +else + COMMAND=("$@") +fi + +apt-get update && apt-get install -y sudo + +getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" +getent passwd "${CI_BUILD_UID}" || adduser --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" --disabled-password --quiet "${CI_BUILD_USER}" + +usermod -a -G dip "${CI_BUILD_USER}" +usermod -a -G sudo "${CI_BUILD_USER}" +usermod -a -G root "${CI_BUILD_USER}" + +echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +sudo -H -u "#${CI_BUILD_UID}" --preserve-env \ + PATH="${PATH}" \ + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \ + PYTHONPATH="${PYTHONPATH}" \ + ${COMMAND[@]}