add docs for llama3 + inference version upgrade (#2020)
* add docs for llama3 + inference version upgrade

* add output path and hf token
anandhu-eng authored Jan 7, 2025
1 parent e3ea5ef commit 2c67b24
Showing 4 changed files with 63 additions and 2 deletions.
41 changes: 41 additions & 0 deletions docs/benchmarks/language/get-llama3_1-405b-data.md
@@ -0,0 +1,41 @@
---
hide:
- toc
---

# Text Summarization using LLAMA3.1-405B

## Dataset

The benchmark implementation run command automatically downloads the validation and calibration datasets and performs the necessary preprocessing. If you want to download only the datasets, use the commands below.

=== "Validation"

### Get Validation Dataset
```
cm run script --tags=get,dataset,mlperf,inference,llama3,_validation --outdirname=<path to download> -j
```

=== "Calibration"

### Get Calibration Dataset
```
cm run script --tags=get,dataset,mlperf,inference,llama3,_calibration --outdirname=<path to download> -j
```
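For instance, a concrete invocation that fetches the validation split might look like the following, where the `$HOME/llama3-data` output path is purely illustrative:

```
# illustrative output path; any writable directory works
cm run script --tags=get,dataset,mlperf,inference,llama3,_validation --outdirname=$HOME/llama3-data -j
```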

## Model
The benchmark implementation run command automatically downloads the required model and performs the necessary conversions. If you want to download only the official model, use the commands below.

Get the Official MLPerf LLAMA3.1-405B Model

=== "Pytorch"

### Pytorch
```
cm run script --tags=get,ml-model,llama3 --outdirname=<path to download> --hf_token=<huggingface access token> -j
```

!!! tip

Downloading the llama3.1-405B model from Hugging Face requires an [**access token**](https://huggingface.co/settings/tokens), which can be generated for your account. Additionally, ensure that your account has been granted access to the [llama3.1-405B](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct) model.
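For example, a sketch of a full model download, where the `HF_TOKEN` shell variable and the output path are illustrative placeholders rather than names required by the tooling:

```
# HF_TOKEN is an illustrative variable name, not required by the tooling
export HF_TOKEN=<your huggingface access token>
cm run script --tags=get,ml-model,llama3 --outdirname=$HOME/llama3-model --hf_token=$HF_TOKEN -j
```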

13 changes: 13 additions & 0 deletions docs/benchmarks/language/llama3_1-405b.md
@@ -0,0 +1,13 @@
---
hide:
- toc
---

# Text Summarization using LLAMA3.1-405B

=== "MLCommons-Python"
## MLPerf Reference Implementation in Python

{{ mlperf_inference_implementation_readme (4, "llama3_1-405b-99", "reference", devices=["CPU","CUDA"]) }}

{{ mlperf_inference_implementation_readme (4, "llama3_1-405b-99.9", "reference", devices=["CPU","CUDA"]) }}
10 changes: 8 additions & 2 deletions main.py
@@ -28,7 +28,7 @@ def mlperf_inference_implementation_readme(
     content = ""
 
     execution_envs = ["Docker", "Native"]
-    code_version = "r4.1-dev"
+    code_version = "r5.0-dev"
     implementation_run_options = []
 
     if model == "rnnt":
@@ -50,6 +50,8 @@ def mlperf_inference_implementation_readme(
         frameworks = ["Onnxruntime", "Pytorch"]
     elif "bert" in model.lower():
         frameworks = ["Pytorch", "Deepsparse"]
+    elif "llama3" in model.lower():
+        frameworks = ["Pytorch"]
     else:
         frameworks = ["Pytorch"]
 
@@ -127,6 +129,7 @@ def mlperf_inference_implementation_readme(
             "dlrm" in model.lower()
             or "llama2" in model.lower()
             or "mixtral" in model.lower()
+            or "llama3" in model.lower()
         ):
             categories = ["Datacenter"]
         else:
@@ -499,6 +502,7 @@ def get_common_info(spaces, implementation, model):
     info += f"\n{pre_space}!!! tip\n\n"
     info += f"{pre_space} - Number of threads could be adjusted using `--threads=#`, where `#` is the desired number of threads. This option works only if the implementation in use supports threading.\n\n"
     info += f"{pre_space} - Batch size could be adjusted using `--batch_size=#`, where `#` is the desired batch size. This option works only if the implementation in use supports the given batch size.\n\n"
+    info += f"{pre_space} - `_r4.1-dev` can be given instead of `_r5.0-dev` to run the benchmark with MLPerf version 4.1.\n\n"
     if model == "rgat":
         info += f"{pre_space} - Add `--env.CM_DATASET_IGBH_PATH=<Path to IGBH dataset>` if you have already downloaded the dataset. The path will be automatically mounted when using docker run.\n\n"
         info += f"{pre_space} - Add `--env.CM_ML_MODEL_RGAT_CHECKPOINT_PATH=<Path to R-GAT model checkpoint>` if you have already downloaded the model. The path will be automatically mounted when using docker run.\n\n"
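Taken together, the generated tips mean these flags can be appended to a benchmark run command; a hedged sketch, with the tag set and flag values patterned on the docs' other run commands rather than fixed by this commit:

```
# illustrative run command; tags and values are examples only
cm run script --tags=run-mlperf,inference,_r4.1-dev --model=llama3_1-405b-99 --threads=8 --batch_size=1
```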
@@ -522,7 +526,9 @@ def get_docker_info(spaces, model, implementation,
 
     if model == "sdxl":
         info += f"{pre_space} - `--env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container launches. \n\n"
-
+    elif "llama3" in model.lower():
+        info += f"{pre_space} - `--env.CM_MLPERF_MODEL_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container launches. \n\n"
+        info += f"{pre_space} - `--env.CM_MLPERF_DATASET_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container launches. \n\n"
     if implementation.lower() == "nvidia":
         info += f"{pre_space} - Default batch size is assigned based on [GPU memory](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1129) or the [specified GPU](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1370). Please click *More options* for *docker launch* or *run command* to see how to specify the GPU name.\n\n"
         info += f"{pre_space} - When run with `--all_models=yes`, all the benchmark models of the NVIDIA implementation can be executed within the same container.\n\n"
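As a usage sketch, the new host-download options would be passed to the docker launch like any other flag; only the two `--env` flags below are introduced by this commit, and the rest of the command shape is illustrative:

```
# illustrative docker launch; only the two --env flags come from this commit
cm run script --tags=run-mlperf,inference,_r5.0-dev --model=llama3_1-405b-99 --docker \
    --env.CM_MLPERF_MODEL_LLAMA3_DOWNLOAD_TO_HOST=yes \
    --env.CM_MLPERF_DATASET_LLAMA3_DOWNLOAD_TO_HOST=yes
```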
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -39,6 +39,7 @@ nav:
       - IndySCC24: benchmarks/language/reproducibility/indyscc24-bert.md
       - GPT-J: benchmarks/language/gpt-j.md
       - LLAMA2-70B: benchmarks/language/llama2-70b.md
+      - LLAMA3-405B: benchmarks/language/llama3_1-405b.md
       - MIXTRAL-8x7B: benchmarks/language/mixtral-8x7b.md
     - Recommendation:
       - DLRM-v2: benchmarks/recommendation/dlrm-v2.md