modal-labs · mwaskom · Feb 6, 2024 · Feb 5, 2024 · Feb 5, 2024 · Feb 5, 2024
diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml
@@ -1,16 +1,20 @@
 name: CI/CD
 
-on: workflow_dispatch
+on: pull_request
 
 jobs:
   test:
     environment: CI
-    name: Deploy
+    name: Test
     runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        config: ["codellama", "llama-2", "mistral"]
     env:
       MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
       MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
-      MODAL_ENVIRONMENT: ci-cd
+      MODAL_ENVIRONMENT: CI-CD
 
     steps:
       - name: Checkout Repository
@@ -19,13 +23,21 @@ jobs:
       - name: Install Python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.10"
+          python-version: "3.12"
 
       - name: Install Modal
         run: |
           python -m pip install --upgrade pip
-          pip install modal
+          pip install modal pyyaml
+
+      # Shrink the config and dataset in place so the CI training run stays small.
+      - name: Prep config and data for CI
+        run: |
+          python ci/prep_for_ci.py --config=config/${{ matrix.config }}.yml --data=data/sqlqa.jsonl
+          echo "$(wc -l < data/sqlqa.jsonl) lines in test data"
+          echo Config:
+          cat config/${{ matrix.config }}.yml
 
       - name: Run training job on Modal
         run: |
-          modal run src.train --config=config/codellama.yml --data=data/sqlqa.jsonl
+          GPU_MEM=40 modal run src.train --config=config/${{ matrix.config }}.yml --data=data/sqlqa.jsonl
diff --git a/ci/prep_for_ci.py b/ci/prep_for_ci.py
@@ -0,0 +1,45 @@
+from itertools import islice
+
+import click
+import yaml
+
+# Number of leading dataset lines kept for the CI run.
+MAX_DATA_LINES = 1000
+
+
+@click.command()
+@click.option(
+    "--config", required=True, help="YAML training config, rewritten in place."
+)
+@click.option(
+    "--data", required=True, help="Line-oriented dataset, truncated in place."
+)
+def main(config: str, data: str):
+    """Set the config for lighter-weight training and truncate the dataset.
+
+    Both files are overwritten in place: the config at ``config`` gets the
+    CI overrides below, and the file at ``data`` is cut down to its first
+    ``MAX_DATA_LINES`` lines.
+    """
+    with open(config) as f:
+        cfg = yaml.safe_load(f)
+    cfg["sequence_len"] = 1024
+    cfg["val_set_size"] = 100
+    cfg["eval_batch_size"] = 2
+    cfg["micro_batch_size"] = 2
+    cfg["num_epochs"] = 2
+    # Drop any eval_steps setting carried by the original config.
+    cfg.pop("eval_steps", None)
+    with open(config, "w") as f:
+        yaml.dump(cfg, f)
+
+    # Read only the lines we keep, and finish reading before reopening the
+    # same path for writing (opening with "w" truncates the file).
+    with open(data) as f:
+        data_truncated = list(islice(f, MAX_DATA_LINES))
+    with open(data, "w") as f:
+        f.writelines(data_truncated)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/config/codellama.yml b/config/codellama.yml
@@ -4,7 +4,7 @@ tokenizer_type: CodeLlamaTokenizer
 is_llama_derived_model: true
 
 load_in_8bit: false
-bf16: true
+load_in_4bit: false
 strict: false
 
 datasets:
@@ -24,7 +24,7 @@ datasets:
         {instruction} [/INST] 
 
 dataset_prepared_path:
-val_set_size: 32 # must be at least micro_batch_size * N_GPUS, and more if eval packing.
+val_set_size: 0.05
 output_dir: ./lora-out
 
 sequence_len: 4096
@@ -54,6 +54,7 @@ learning_rate: 0.0002
 
 train_on_inputs: false
 group_by_length: false
+bf16: true
 fp16: false
 tf32: false
 

diff --git a/config/llama-2.yml b/config/llama-2.yml
@@ -8,14 +8,28 @@ load_in_4bit: false
 strict: false
 
 datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
+  # This will be the path used for the data when it is saved to the Volume in the cloud.
+  - path: my_data.jsonl
+    ds_type: json
+    type:
+      # JSONL file contains question, context, answer fields per line.
+      # This gets mapped to instruction, input, output axolotl tags.
+      field_instruction: question
+      field_input: context
+      field_output: answer
+      # Format is used by axolotl to generate the prompt.
+      format: |-
+        [INST] Using the schema context below, generate a SQL query that answers the question.
+        {input}
+        {instruction} [/INST]
+
 dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./lora-out
 
 sequence_len: 4096
 sample_packing: true
+eval_sample_packing: false
 pad_to_sequence_len: true
 
 adapter: lora

diff --git a/config/mistral.yml b/config/mistral.yml
@@ -4,9 +4,11 @@ tokenizer_type: LlamaTokenizer
 is_mistral_derived_model: true
 
 load_in_8bit: false
+load_in_4bit: false
 strict: false
 
 datasets:
+  # This will be the path used for the data when it is saved to the Volume in the cloud.
   - path: my_data.jsonl
     ds_type: json
     type:
@@ -22,7 +24,7 @@ datasets:
         {instruction} [/INST] 
 
 dataset_prepared_path:
-val_set_size: 0
+val_set_size: 32
 output_dir: ./lora-out
 
 sequence_len: 2048
@@ -38,13 +40,13 @@ lora_dropout: 0.05
 lora_target_linear: true
 lora_fan_in_fan_out:
 
-wandb_project: mistral-7b-axolotl-train
+wandb_project:
 wandb_entity:
-wandb_watch: gradients
+wandb_watch:
 wandb_run_id:
 
 gradient_accumulation_steps: 1
-micro_batch_size: 1
+micro_batch_size: 16
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
@@ -62,7 +64,7 @@ resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
-flash_attention: true
+flash_attention: false
 
 warmup_steps: 10
 save_steps:

diff --git a/src/common.py b/src/common.py
@@ -24,17 +24,16 @@
     .env(dict(HUGGINGFACE_HUB_CACHE="/pretrained", HF_HUB_ENABLE_HF_TRANSFER="1"))
 )
 
-vllm_image = (
-    Image.from_registry("nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10")
-    .pip_install(
-        "vllm==0.2.5",
-        "torch==2.1.2",
-        "torchvision==0.16.2",
-        "torchaudio==2.1.2"
-        )
+vllm_image = Image.from_registry(
+    "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10"
+).pip_install(
+    "vllm==0.2.5",
+    "torch==2.1.2",
+    "torchvision==0.16.2",
+    "torchaudio==2.1.2",
 )
 
-stub = Stub(APP_NAME, secrets=[Secret.from_name("huggingface")])
+stub = Stub(APP_NAME)  # NOTE(review): secrets=[Secret.from_name("huggingface")] removed; confirm nothing still needs it.
 
 # Volumes for pre-trained models and training runs.
 pretrained_volume = Volume.persisted("example-pretrained-vol")

diff --git a/src/train.py b/src/train.py
@@ -54,7 +54,10 @@ def run_cmd(cmd: str, run_folder: str):
     _allow_background_volume_commits=True,
 )
 def train(run_folder: str):
-    print(f"Starting training run in {run_folder}")
+    import torch
+
+    print(f"Starting training run in {run_folder}.")
+    print(f"Using {torch.cuda.device_count()} {torch.cuda.get_device_name()} GPU(s).")
 
     TRAIN_CMD = "accelerate launch -m axolotl.cli.train ./config.yml"
     run_cmd(TRAIN_CMD, run_folder)