Add MLCube implementation for llama2 #749

Open · wants to merge 5 commits into `master`
2 changes: 2 additions & 0 deletions llama2_70b_lora/.dockerignore
@@ -0,0 +1,2 @@
mlcube/workspace/*
!mlcube/workspace/rclone.conf
17 changes: 17 additions & 0 deletions llama2_70b_lora/Dockerfile_mlcube
@@ -0,0 +1,17 @@
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.01-py3
FROM ${FROM_IMAGE_NAME}

# Install curl, needed to fetch the Rclone install script
RUN apt-get update \
    && apt-get install -y curl

# Install Rclone (used by the data/model download scripts)
RUN curl https://rclone.org/install.sh | bash

# Install Python dependencies
COPY requirements.txt /
RUN pip install --no-cache-dir -r /requirements.txt
RUN pip install flash-attn==2.4.1 --no-build-isolation

COPY . /workspace/ft-llm

RUN chmod -R +x /workspace/ft-llm

WORKDIR /workspace/ft-llm
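If you ever need to build this image outside MLCube, the settings in `mlcube/mlcube.yaml` (`build_context: "../"`, `build_file: "Dockerfile_mlcube"`, image `mlcommons/llama2_70b_lora:0.0.1`) translate to roughly the following sketch, run from the `llama2_70b_lora` directory:

```bash
# Manual build sketch; the MLCube Docker runner performs the equivalent
# build automatically when -Pdocker.build_strategy=always is passed.
docker build -f Dockerfile_mlcube -t mlcommons/llama2_70b_lora:0.0.1 .
```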
15 changes: 15 additions & 0 deletions llama2_70b_lora/configs/demo_config.yaml
@@ -0,0 +1,15 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: NO
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
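This is a single-process, single-GPU Hugging Face Accelerate config with bf16 mixed precision. It is consumed via `accelerate launch --config_file`, as `run_demo.sh` does later in this PR; schematically (the training flags are placeholders, see the scripts for the real set):

```bash
# Schematic usage only; run_demo.sh supplies the full set of training flags.
accelerate launch --config_file configs/demo_config.yaml scripts/train.py <training flags...>
```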
2 changes: 2 additions & 0 deletions llama2_70b_lora/mlcube/.gitignore
@@ -0,0 +1,2 @@
workspace/*
!workspace/README.md
90 changes: 90 additions & 0 deletions llama2_70b_lora/mlcube/README.md
@@ -0,0 +1,90 @@
# MLCube for Llama 2

See the MLCube™ GitHub [repository](https://github.com/mlcommons/mlcube) and the MLCube™ [wiki](https://mlcommons.github.io/mlcube/).

## Project setup

Docker must be installed on your system before proceeding.

```bash
# Create Python environment and install MLCube Docker runner
virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker
# Fetch the implementation from GitHub
git clone https://github.com/mlcommons/training && cd ./training
git fetch origin pull/749/head:feature/mlcube_llama2 && git checkout feature/mlcube_llama2
cd ./llama2_70b_lora/mlcube
```

Inside the `mlcube` directory, run the following command to list the implemented tasks:

```shell
mlcube describe
```

### Extra requirements

Install Rclone on your system by following [these instructions](https://rclone.org/install/).

MLCommons hosts the model for download exclusively for MLCommons members. You must first agree to the [confidentiality notice](https://docs.google.com/forms/d/e/1FAIpQLSc_8VIvRmXM3I8KQaYnKf7gy27Z63BBoI_I1u02f4lw6rBp3g/viewform).

After submitting the form, you will be redirected to a Drive folder containing a file called `CLI Download Instructions`. Follow the instructions in that file up to step `#3 Authenticate Rclone with Google Drive`.

Once that step is complete, your Rclone configuration file will contain the credentials needed to download the dataset and models. To find where this file is located, run:

```bash
rclone config file
```

**Default:** `~/.config/rclone/rclone.conf`

Finally, copy that file into the `workspace` folder (located in the same directory as this README) and name it `rclone.conf`.
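For orientation, `rclone.conf` is a small INI-style file; a Google Drive remote typically has a shape like the sketch below. The field values here are illustrative; the real token is issued during `rclone config`, and the remote name `mlc-llama2` is the one the download scripts reference:

```ini
[mlc-llama2]
type = drive
scope = drive.readonly
# Issued during `rclone config`; never commit real credentials.
token = {"access_token":"<redacted>","token_type":"Bearer","refresh_token":"<redacted>","expiry":"<timestamp>"}
```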

### MLCube tasks

* Core tasks:

Download dataset.

```shell
mlcube run --task=download_data -Pdocker.build_strategy=always
```

Train.

```shell
mlcube run --task=train -Pdocker.build_strategy=always
```

* Demo tasks:

Here is a video explaining the demo steps:

[![Demo walkthrough video](https://img.youtube.com/vi/1Y9q-nltI8U/0.jpg)](https://www.youtube.com/watch?v=1Y9q-nltI8U)

Download demo dataset.

```shell
mlcube run --task=download_demo -Pdocker.build_strategy=always
```

Train demo.

```shell
mlcube run --task=demo -Pdocker.build_strategy=always
```

### Execute the complete pipeline

You can execute the complete pipeline with a single command.

* Core pipeline:

```shell
mlcube run --task=download_data,train -Pdocker.build_strategy=always
```

* Demo pipeline:

```shell
mlcube run --task=download_demo,demo -Pdocker.build_strategy=always
```
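By default, task artifacts live under `mlcube/workspace/`. If your data should live elsewhere, the MLCube runner's `--workspace` flag points tasks at a different workspace directory; for example (the path is illustrative):

```shell
mlcube run --task=download_data,train --workspace=/data/llama2_workspace \
    -Pdocker.build_strategy=always
```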
59 changes: 59 additions & 0 deletions llama2_70b_lora/mlcube/mlcube.yaml
@@ -0,0 +1,59 @@
name: llama2
description: llama2 70b lora
authors:
  - { name: "MLCommons Best Practices Working Group" }

platform:
  accelerator_count: 1

docker:
  # Image name.
  image: mlcommons/llama2_70b_lora:0.0.1
  # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
  build_context: "../"
  # Docker file name within docker build context, default is `Dockerfile`.
  build_file: "Dockerfile_mlcube"
  # GPU arguments
  gpu_args: "--gpus=all --shm-size=1G"

tasks:
  download_data:
    entrypoint: ./scripts/download_data.sh -a
    parameters:
      inputs:
        rclone_config: rclone.conf
      outputs:
        data_dir: data/
        model_dir: model/
  train:
    entrypoint: ./run_and_time.sh -a
    parameters:
      inputs:
        data_dir: data/
        model_dir: model/
        config_path:
          type: file
          default: ../../configs/default_config.yaml
      outputs:
        log_dir: logs/
        result_dir: result/
  download_demo:
    entrypoint: ./scripts/download_demo.sh -a
    parameters:
      inputs:
        rclone_config: rclone.conf
      outputs:
        data_dir: demo_data/
        model_dir: demo_model/
  demo:
    entrypoint: ./run_demo.sh -a
    parameters:
      inputs:
        data_dir: demo_data/
        model_dir: demo_model/
        config_path:
          type: file
          default: ../../configs/demo_config.yaml
      outputs:
        log_dir: demo_logs/
        result_dir: demo_result/
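As the shell scripts below show (each has a "Handle MLCube parameters" loop), MLCube forwards a task's `inputs` and `outputs` to its entrypoint as `--<name>=<path>` flags after resolving the workspace paths. The `train` task therefore reduces, roughly, to an invocation like this sketch (the paths are illustrative; the runner chooses the real mounts):

```bash
# Illustrative only: the MLCube Docker runner resolves workspace paths and
# passes them as flags, which run_and_time.sh parses in its while-loop.
./run_and_time.sh \
    --data_dir=/path/to/workspace/data \
    --model_dir=/path/to/workspace/model \
    --config_path=/path/to/configs/default_config.yaml \
    --log_dir=/path/to/workspace/logs \
    --result_dir=/path/to/workspace/result
```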
13 changes: 13 additions & 0 deletions llama2_70b_lora/mlcube/workspace/README.md
@@ -0,0 +1,13 @@
# Rclone configuration file

Follow the instructions from: [main MLCube README](../README.md#extra-requirements).

Place your Rclone configuration file in the same path as this README (workspace folder). To check where this file is located run the command:

```bash
rclone config file
```

**Default:** `~/.config/rclone/rclone.conf`

Name the file `rclone.conf`.
73 changes: 73 additions & 0 deletions llama2_70b_lora/run_and_time.sh
@@ -0,0 +1,73 @@
#!/bin/bash

set +x
set -e

# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

# Set variables
: "${DATA_DIR:=./dataset}"
: "${MODEL_DIR:=./models/llama-v2-fused-qkv}"
: "${RESULT_DIR:=./workspace/results}"
: "${CONFIG_PATH:=./configs/default_config.yaml}"
: "${LOG_DIR:=./workspace/logs}"

# Handle MLCube parameters
while [ $# -gt 0 ]; do
    case "$1" in
    --data_dir=*)
        DATA_DIR="${1#*=}"
        ;;
    --model_dir=*)
        MODEL_DIR="${1#*=}"
        ;;
    --result_dir=*)
        RESULT_DIR="${1#*=}"
        ;;
    --config_path=*)
        CONFIG_PATH="${1#*=}"
        ;;
    --log_dir=*)
        LOG_DIR="${1#*=}"
        ;;
    *) ;;
    esac
    shift
done

# run benchmark
echo "running benchmark"

accelerate launch --config_file "$CONFIG_PATH" scripts/train.py \
    --dataset_path "$DATA_DIR/scrolls_gov_report_8k" \
    --model_path "$MODEL_DIR/Llama2-70b-fused-qkv-mlperf" \
    --max_seq_len 8192 \
    --bf16 True \
    --logging_steps 24 \
    --eval_steps 48 \
    --output_dir "$RESULT_DIR/llama-70b_scrolls_gov_report_r16_$1" \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --lr_scheduler_type "cosine" \
    --learning_rate 4e-4 \
    --weight_decay 0.0001 \
    --warmup_ratio 0 \
    --max_grad_norm 0.3 \
    --use_gradient_checkpointing True \
    --target_eval_loss 0.925 \
    --use_peft_lora True \
    --lora_r 16 \
    --lora_alpha 32 \
    --lora_dropout 0.1 \
    --max_steps 1024 \
    --use_flash_attn \
    --seed 1234 \
    --lora_target_modules "qkv_proj,o_proj" |& tee "$LOG_DIR/train_console.log"

# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"

# report elapsed time from the epoch timestamps captured above
runtime=$((end - start))
echo "RUNTIME: $runtime seconds"
71 changes: 71 additions & 0 deletions llama2_70b_lora/run_demo.sh
@@ -0,0 +1,71 @@
#!/bin/bash

set +x
set -e

# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

# Set variables
: "${DATA_DIR:=./dataset}"
: "${MODEL_DIR:=./models/Llama-2-7b-chat-hf}"
: "${RESULT_DIR:=./workspace/results}"
: "${CONFIG_PATH:=./configs/default_config.yaml}"
: "${LOG_DIR:=./workspace/logs}"

# Handle MLCube parameters
while [ $# -gt 0 ]; do
    case "$1" in
    --data_dir=*)
        DATA_DIR="${1#*=}"
        ;;
    --model_dir=*)
        MODEL_DIR="${1#*=}"
        ;;
    --result_dir=*)
        RESULT_DIR="${1#*=}"
        ;;
    --config_path=*)
        CONFIG_PATH="${1#*=}"
        ;;
    --log_dir=*)
        LOG_DIR="${1#*=}"
        ;;
    *) ;;
    esac
    shift
done

# run benchmark
echo "running benchmark"

accelerate launch --config_file "$CONFIG_PATH" scripts/train.py \
    --dataset_path "$DATA_DIR/scrolls_gov_report_8k" \
    --model_path "$MODEL_DIR/Llama-2-7b-chat-hf" \
    --max_seq_len 8192 \
    --bf16 True \
    --logging_steps 1 \
    --eval_steps 1 \
    --output_dir "$RESULT_DIR/llama-70b_scrolls_gov_report_r16_$1" \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --lr_scheduler_type "cosine" \
    --learning_rate 4e-4 \
    --weight_decay 0.0001 \
    --warmup_ratio 0 \
    --max_grad_norm 0.3 \
    --use_gradient_checkpointing True \
    --target_eval_loss 0.925 \
    --use_peft_lora True \
    --lora_r 16 \
    --lora_alpha 32 \
    --lora_dropout 0.1 \
    --max_steps 2 \
    --lora_target_modules "qkv_proj,o_proj" |& tee "$LOG_DIR/train_console.log"

# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"

# report elapsed time from the epoch timestamps captured above
runtime=$((end - start))
echo "RUNTIME: $runtime seconds"
39 changes: 39 additions & 0 deletions llama2_70b_lora/scripts/download_data.sh
@@ -0,0 +1,39 @@
#!/bin/bash

DATA_DIR="./data"
MODEL_DIR="./model"
RCLONE_CONFIG="./rclone.conf"

# Capture MLCube parameter
while [ $# -gt 0 ]; do
    case "$1" in
    --data_dir=*)
        DATA_DIR="${1#*=}"
        ;;
    --model_dir=*)
        MODEL_DIR="${1#*=}"
        ;;
    --rclone_config=*)
        RCLONE_CONFIG="${1#*=}"
        ;;
    *) ;;
    esac
    shift
done

# Install the Rclone config where rclone expects it
mkdir -p ~/.config/rclone/
cp "$RCLONE_CONFIG" ~/.config/rclone/rclone.conf

# Download the dataset
mkdir -p "$DATA_DIR"
cd "$DATA_DIR"
rclone copy mlc-llama2:training/scrolls_gov_report_8k ./scrolls_gov_report_8k -P
cd -

# Download the model
mkdir -p "$MODEL_DIR"
cd "$MODEL_DIR"
rclone copy mlc-llama2:Llama2-70b-fused-qkv-mlperf ./Llama2-70b-fused-qkv-mlperf -P
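After the script finishes, the dataset and model should be present under the configured output directories; a quick, illustrative sanity check, assuming the default workspace layout under `llama2_70b_lora/mlcube/`:

```bash
# Both listings should show the downloaded files rather than empty directories.
ls mlcube/workspace/data/scrolls_gov_report_8k
ls mlcube/workspace/model/Llama2-70b-fused-qkv-mlperf
```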