
Commit

Update README to have interactive server scenario + remove unnecessary changes
pgmpablo157321 committed Jan 16, 2025
1 parent e0df19a commit 9d322cb
Showing 3 changed files with 3 additions and 11 deletions.
6 changes: 3 additions & 3 deletions language/llama2-70b/README.md
@@ -250,16 +250,16 @@ This was run on a DGX-H100 node. Total runtime was ~4.5 days.

For official Llama2-70b submissions it is also possible to submit in the interactive category. This sets stricter latency requirements for Time to First Token (ttft) and Time per Output Token (tpot). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `tpot <= 40ms`
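As an illustration of those two limits, here is a hypothetical helper (not part of the benchmark harness; the names and the measurement convention are assumptions) that checks one query's token-arrival timestamps against them:

```python
# Hypothetical helper, not part of the MLPerf harness: check one query's
# token arrival times (in seconds) against the interactive-category limits.
TTFT_LIMIT_S = 0.450  # time to first token
TPOT_LIMIT_S = 0.040  # time per output token

def meets_interactive_limits(issue_time, token_times):
    """issue_time: when the query was issued; token_times: arrival of each token."""
    if not token_times:
        return False
    ttft = token_times[0] - issue_time
    if len(token_times) > 1:
        # mean inter-token gap after the first token
        tpot = (token_times[-1] - token_times[0]) / (len(token_times) - 1)
    else:
        tpot = 0.0
    return ttft <= TTFT_LIMIT_S and tpot <= TPOT_LIMIT_S
```

Loadgen measures and enforces these constraints itself during a run; a sketch like this is only useful for sanity-checking your own traces.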

-In order to run interactive category, it is sufficient to set the flag `--lg-model-name` as `llama2-70b-interactive` when calling the `main.py` to run the benchmark. For example, to run the offline scenario in interactive mode:
+To run the interactive category, it is sufficient to set the flag `--lg-model-name` to `llama2-70b-interactive` when calling `main.py`. For example, to run the server scenario in interactive mode:

```diff
-python -u main.py --scenario Offline \
+python -u main.py --scenario Server \
 		--model-path ${CHECKPOINT_PATH} \
 		--mlperf-conf mlperf.conf \
 		--user-conf user.conf \
 		--total-sample-count 24576 \
 		--device cpu \
 		--dataset-path ${DATASET_PATH} \
-		--output-log-dir offline-logs \
+		--output-log-dir server-logs \
 		--lg-model-name llama2-70b-interactive
```
1 change: 0 additions & 1 deletion language/llama2-70b/main.py
@@ -152,7 +152,6 @@ def main():
```diff
     settings = lg.TestSettings()
     settings.scenario = scenario_map[args.scenario.lower()]
     # mlperf.conf is automatically loaded by the loadgen
-    # settings.FromConfig(args.mlperf_conf, args.lg_model_name, args.scenario)
     settings.FromConfig(args.user_conf, args.lg_model_name, args.scenario)

     if args.accuracy:
```
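For context on the `scenario_map` lookup in the hunk above, a hypothetical pure-Python stand-in (the real `main.py` maps CLI names to `mlperf_loadgen` scenario enum values, which are not shown in this diff):

```python
from enum import Enum

# Hypothetical stand-in for loadgen's scenario enum, for illustration only.
class TestScenario(Enum):
    Offline = "Offline"
    Server = "Server"

scenario_map = {
    "offline": TestScenario.Offline,
    "server": TestScenario.Server,
}

def resolve_scenario(cli_name: str) -> TestScenario:
    # mirrors: settings.scenario = scenario_map[args.scenario.lower()]
    return scenario_map[cli_name.lower()]
```

Lower-casing the CLI argument before the lookup is what lets `--scenario Server` and `--scenario server` both resolve.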
7 changes: 0 additions & 7 deletions language/llama2-70b/user.conf
@@ -9,10 +9,3 @@
```diff
 *.Server.target_qps = 0.5
 *.Server.min_duration = 120000
 *.Server.min_query_count = 100
-
-llama2-70b.Server.sample_concatenate_permutation = 1
-
-# Target Latencies for low latency setting.
-# llama2-70b-interactive.Server.target_latency = 0
-# llama2-70b-interactive.Server.ttft_latency = 450
-# llama2-70b-interactive.Server.tpot_latency = 40
```
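The `user.conf` entries above follow loadgen's simple `model.scenario.key = value` line format, where `*` matches any model. A minimal parser sketch to illustrate how such keys resolve (an assumption-laden illustration, not loadgen's actual implementation):

```python
def parse_conf(text):
    """Parse 'model.scenario.key = value' lines; '#' starts a comment."""
    settings = {}
    for raw in text.splitlines():
        line = raw.split("#", 1)[0].strip()  # drop comments and blanks
        if not line:
            continue
        key, _, value = line.partition("=")
        model, scenario, field = key.strip().split(".", 2)
        settings[(model, scenario, field)] = value.strip()
    return settings

def lookup(settings, model, scenario, field):
    # an exact model entry overrides the '*' wildcard
    return settings.get((model, scenario, field),
                        settings.get(("*", scenario, field)))
```

With entries like `*.Server.target_qps = 0.5`, a lookup for any model under the `Server` scenario falls through to the wildcard row.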
