From 62b06bf4af8271199873e001570c46c88da619a1 Mon Sep 17 00:00:00 2001
From: Richard Gong
Date: Wed, 14 Feb 2024 21:56:03 +0000
Subject: [PATCH 1/3] Working Mixtral support

---
 config/mixtral.yml | 105 +++++++++++++++++++++++++++++++++++++++++++++
 src/common.py      |   4 +-
 src/gui.py         |   6 +--
 src/train.py       |   3 +-
 4 files changed, 111 insertions(+), 7 deletions(-)
 create mode 100644 config/mixtral.yml

diff --git a/config/mixtral.yml b/config/mixtral.yml
new file mode 100644
index 0000000..a0978e5
--- /dev/null
+++ b/config/mixtral.yml
@@ -0,0 +1,105 @@
+base_model: mistralai/Mixtral-8x7B-v0.1
+model_type: AutoModelForCausalLM
+tokenizer_type: LlamaTokenizer
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  # This will be the path used for the data when it is saved to the Volume in the cloud.
+  - path: data.jsonl
+    ds_type: json
+    type:
+      # JSONL file contains question, context, answer fields per line.
+      # This gets mapped to instruction, input, output axolotl tags.
+      field_instruction: question
+      field_input: context
+      field_output: answer
+      # Format is used by axolotl to generate the prompt.
+      format: |-
+        [INST] Using the schema context below, generate a SQL query that answers the question.
+        {input}
+        {instruction} [/INST]
+
+dataset_prepared_path:
+val_set_size: 100
+output_dir: ./lora-out
+
+## You can optionally freeze the entire model and unfreeze a subset of parameters
+unfrozen_parameters:
+# - lm_head.*
+# - model.embed_tokens.*
+# - model.layers.2[0-9]+.block_sparse_moe.gate.*
+# - model.layers.2[0-9]+.block_sparse_moe.experts.*
+# - model.layers.3[0-9]+.block_sparse_moe.gate.*
+# - model.layers.3[0-9]+.block_sparse_moe.experts.*
+
+model_config:
+  output_router_logits: true
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: false
+
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+#lora_target_modules:
+# - gate
+# - q_proj
+# - k_proj
+# - v_proj
+# - o_proj
+# - w1
+# - w2
+# - w3
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 1
+micro_batch_size: 4
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+loss_watchdog_threshold: 5.0
+loss_watchdog_patience: 3
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
+saves_per_epoch: 1
+debug:
+deepspeed: /root/axolotl/deepspeed_configs/zero2.json
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
diff --git a/src/common.py b/src/common.py
index 3ac368b..c640c90 100644
--- a/src/common.py
+++ b/src/common.py
@@ -4,9 +4,9 @@
 
 APP_NAME = "example-axolotl"
 
-# Axolotl image hash corresponding to 0.4.0 release
+# Axolotl image hash corresponding to 0.4.0 release (2024-02-14)
 AXOLOTL_REGISTRY_SHA = (
-    "af4d878e9fbc90c7ba30fa78ce4d6d95b1ccba398ab944efbd322d7c0d6313c8"
+    "d5b941ba2293534c01c23202c8fc459fd2a169871fa5e6c45cb00f363d474b6a"
 )
 
 axolotl_image = (
diff --git a/src/gui.py b/src/gui.py
index b3a511b..317ca06 100644
--- a/src/gui.py
+++ b/src/gui.py
@@ -175,9 +175,9 @@ def get_model_choices():
 
 @stub.local_entrypoint()
 def main():
-    dir = os.path.dirname(__file__)
-    with open(f"{dir}/config.yml", "r") as cfg, open(
-        f"{dir}/my_data.jsonl", "r"
+    parent = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    with open(f"{parent}/config/mixtral.yml", "r") as cfg, open(
+        f"{parent}/data/sqlqa.jsonl", "r"
     ) as data:
         handle = gui.spawn(cfg.read(), data.read())
         url = stub.q.get()
diff --git a/src/train.py b/src/train.py
index db547bf..1f98306 100644
--- a/src/train.py
+++ b/src/train.py
@@ -10,8 +10,7 @@
 )
 
 N_GPUS = int(os.environ.get("N_GPUS", 2))
-GPU_MEM = int(os.environ.get("GPU_MEM", 40))
-GPU_CONFIG = modal.gpu.A100(count=N_GPUS, memory=GPU_MEM)
+GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
 
 
 def print_common_training_issues(config):

From 7aeb8b91d137fbc8a356b0e18308b7f1c7e9431e Mon Sep 17 00:00:00 2001
From: Richard Gong
Date: Wed, 14 Feb 2024 21:57:02 +0000
Subject: [PATCH 2/3] Add to CI

---
 .github/workflows/ci-cd.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml
index e617b06..129b996 100644
--- a/.github/workflows/ci-cd.yml
+++ b/.github/workflows/ci-cd.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        config: ["codellama", "llama-2", "mistral"]
+        config: ["codellama", "llama-2", "mistral", "mixtral"]
     env:
       MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
       MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}

From 84408b2e14c535fd3a19412c7499d1536c62ab7e Mon Sep 17 00:00:00 2001
From: Richard Gong
Date: Wed, 14 Feb 2024 22:17:58 +0000
Subject: [PATCH 3/3] re-enable sample packing

---
 config/mixtral.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/mixtral.yml b/config/mixtral.yml
index a0978e5..359a857 100644
--- a/config/mixtral.yml
+++ b/config/mixtral.yml
@@ -43,9 +43,9 @@ adapter: qlora
 lora_model_dir:
 
 sequence_len: 4096
-sample_packing: false
+sample_packing: true
 eval_sample_packing: false
-pad_to_sequence_len: false
+pad_to_sequence_len: true
 
 lora_r: 16
 lora_alpha: 32