From ea450005d1b2248cdf164f09415b1280f0de6b2c Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez
Date: Mon, 13 Jan 2025 17:31:20 -0500
Subject: [PATCH] Update README to have interactive server scenario + remove
 unnecessary changes

---
 language/llama2-70b/README.md | 6 +++---
 language/llama2-70b/main.py   | 1 -
 language/llama2-70b/user.conf | 7 -------
 3 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md
index 760a5c01bf..5268863ac6 100644
--- a/language/llama2-70b/README.md
+++ b/language/llama2-70b/README.md
@@ -250,16 +250,16 @@ This was run on a DGX-H100 node. Total runtime was ~4.5 days.
 
 For official, Llama2-70b submissions it is also possible to submit in the interactive category. This sets a more strict latency requirements for Time to First Token (ttft) and Time per Output Token (tpot). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `ttft <= 40ms`
 
-In order to run interactive category, it is sufficient to set the flag `--lg-model-name` as `llama2-70b-interactive` when calling the `main.py` to run the benchmark. For example, to run the offline scenario in interactive mode:
+In order to run interactive category, it is sufficient to set the flag `--lg-model-name` as `llama2-70b-interactive` when calling the `main.py` to run the benchmark. For example, to run the server scenario in interactive mode:
 ```
-python -u main.py --scenario Offline \
+python -u main.py --scenario Server \
         --model-path ${CHECKPOINT_PATH} \
         --mlperf-conf mlperf.conf \
         --user-conf user.conf \
         --total-sample-count 24576 \
         --device cpu \
         --dataset-path ${DATASET_PATH} \
-        --output-log-dir offline-logs \
+        --output-log-dir server-logs \
         --lg-model-name llama2-70b-interactive
 ```
 
diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py
index c5c56c8aea..84ccf849ad 100644
--- a/language/llama2-70b/main.py
+++ b/language/llama2-70b/main.py
@@ -152,7 +152,6 @@ def main():
     settings = lg.TestSettings()
     settings.scenario = scenario_map[args.scenario.lower()]
     # mlperf.conf is automatically loaded by the loadgen
-    # settings.FromConfig(args.mlperf_conf, args.lg_model_name, args.scenario)
     settings.FromConfig(args.user_conf, args.lg_model_name, args.scenario)
 
     if args.accuracy:
diff --git a/language/llama2-70b/user.conf b/language/llama2-70b/user.conf
index 15fe568957..bb97c437a9 100644
--- a/language/llama2-70b/user.conf
+++ b/language/llama2-70b/user.conf
@@ -9,10 +9,3 @@
 *.Server.target_qps = 0.5
 *.Server.min_duration = 120000
 *.Server.min_query_count = 100
-
-llama2-70b.Server.sample_concatenate_permutation = 1
-
-# Target Latencies for low latency setting.
-# llama2-70b-interactive.Server.target_latency = 0
-# llama2-70b-interactive.Server.ttft_latency = 450
-# llama2-70b-interactive.Server.tpot_latency = 40
\ No newline at end of file
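For reference, the interactive latency targets described in the README text above correspond to the loadgen config keys that appear in the commented-out lines this patch removes from user.conf. A minimal sketch of what explicit overrides would look like is shown below; it assumes the `llama2-70b-interactive.Server.*` keys remain recognized by loadgen, and since the bundled mlperf.conf is loaded automatically, such overrides are normally unnecessary.

```
# Hypothetical user.conf overrides for the interactive server scenario.
# Values mirror the commented-out lines removed by this patch
# (450 ms time to first token, 40 ms time per output token); loadgen
# already enforces these via mlperf.conf, so setting them here is optional.
llama2-70b-interactive.Server.ttft_latency = 450
llama2-70b-interactive.Server.tpot_latency = 40
```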