From 62b06bf4af8271199873e001570c46c88da619a1 Mon Sep 17 00:00:00 2001
From: Richard Gong
Date: Wed, 14 Feb 2024 21:56:03 +0000
Subject: [PATCH 1/3] Working Mixtral support

---
 config/mixtral.yml | 105 +++++++++++++++++++++++++++++++++++++++++++++
 src/common.py      |   4 +-
 src/gui.py         |   6 +--
 src/train.py       |   3 +-
 4 files changed, 111 insertions(+), 7 deletions(-)
 create mode 100644 config/mixtral.yml

diff --git a/config/mixtral.yml b/config/mixtral.yml
new file mode 100644
index 0000000..a0978e5
--- /dev/null
+++ b/config/mixtral.yml
@@ -0,0 +1,105 @@
+base_model: mistralai/Mixtral-8x7B-v0.1
+model_type: AutoModelForCausalLM
+tokenizer_type: LlamaTokenizer
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  # This will be the path used for the data when it is saved to the Volume in the cloud.
+  - path: data.jsonl
+    ds_type: json
+    type:
+      # JSONL file contains question, context, answer fields per line.
+      # This gets mapped to instruction, input, output axolotl tags.
+      field_instruction: question
+      field_input: context
+      field_output: answer
+      # Format is used by axolotl to generate the prompt.
+      format: |-
+        [INST] Using the schema context below, generate a SQL query that answers the question.
+        {input}
+        {instruction} [/INST]
+
+dataset_prepared_path:
+val_set_size: 100
+output_dir: ./lora-out
+
+## You can optionally freeze the entire model and unfreeze a subset of parameters
+unfrozen_parameters:
+# - lm_head.*
+# - model.embed_tokens.*
+# - model.layers.2[0-9]+.block_sparse_moe.gate.*
+# - model.layers.2[0-9]+.block_sparse_moe.experts.*
+# - model.layers.3[0-9]+.block_sparse_moe.gate.*
+# - model.layers.3[0-9]+.block_sparse_moe.experts.*
+
+model_config:
+  output_router_logits: true
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: false
+
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+#lora_target_modules:
+# - gate
+# - q_proj
+# - k_proj
+# - v_proj
+# - o_proj
+# - w1
+# - w2
+# - w3
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 1
+micro_batch_size: 4
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+loss_watchdog_threshold: 5.0
+loss_watchdog_patience: 3
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
+saves_per_epoch: 1
+debug:
+deepspeed: /root/axolotl/deepspeed_configs/zero2.json
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
diff --git a/src/common.py b/src/common.py
index 3ac368b..c640c90 100644
--- a/src/common.py
+++ b/src/common.py
@@ -4,9 +4,9 @@
 
 APP_NAME = "example-axolotl"
 
-# Axolotl image hash corresponding to 0.4.0 release
+# Axolotl image hash corresponding to 0.4.0 release (2024-02-14)
 AXOLOTL_REGISTRY_SHA = (
-    "af4d878e9fbc90c7ba30fa78ce4d6d95b1ccba398ab944efbd322d7c0d6313c8"
+    "d5b941ba2293534c01c23202c8fc459fd2a169871fa5e6c45cb00f363d474b6a"
 )
 
 axolotl_image = (
diff --git a/src/gui.py b/src/gui.py
index b3a511b..317ca06 100644
--- a/src/gui.py
+++ b/src/gui.py
@@ -175,9 +175,9 @@ def get_model_choices():
 
 @stub.local_entrypoint()
 def main():
-    dir = os.path.dirname(__file__)
-    with open(f"{dir}/config.yml", "r") as cfg, open(
-        f"{dir}/my_data.jsonl", "r"
+    parent = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    with open(f"{parent}/config/mixtral.yml", "r") as cfg, open(
+        f"{parent}/data/sqlqa.jsonl", "r"
     ) as data:
         handle = gui.spawn(cfg.read(), data.read())
         url = stub.q.get()
diff --git a/src/train.py b/src/train.py
index db547bf..1f98306 100644
--- a/src/train.py
+++ b/src/train.py
@@ -10,8 +10,7 @@
 )
 
 N_GPUS = int(os.environ.get("N_GPUS", 2))
-GPU_MEM = int(os.environ.get("GPU_MEM", 40))
-GPU_CONFIG = modal.gpu.A100(count=N_GPUS, memory=GPU_MEM)
+GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
 
 
 def print_common_training_issues(config):

From 7aeb8b91d137fbc8a356b0e18308b7f1c7e9431e Mon Sep 17 00:00:00 2001
From: Richard Gong
Date: Wed, 14 Feb 2024 21:57:02 +0000
Subject: [PATCH 2/3] Add to CI

---
 .github/workflows/ci-cd.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml
index e617b06..129b996 100644
--- a/.github/workflows/ci-cd.yml
+++ b/.github/workflows/ci-cd.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        config: ["codellama", "llama-2", "mistral"]
+        config: ["codellama", "llama-2", "mistral", "mixtral"]
     env:
       MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
       MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}

From 84408b2e14c535fd3a19412c7499d1536c62ab7e Mon Sep 17 00:00:00 2001
From: Richard Gong
Date: Wed, 14 Feb 2024 22:17:58 +0000
Subject: [PATCH 3/3] re-enable sample packing

---
 config/mixtral.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/mixtral.yml b/config/mixtral.yml
index a0978e5..359a857 100644
--- a/config/mixtral.yml
+++ b/config/mixtral.yml
@@ -43,9 +43,9 @@ adapter: qlora
 lora_model_dir:
 
 sequence_len: 4096
-sample_packing: false
+sample_packing: true
 eval_sample_packing: false
-pad_to_sequence_len: false
+pad_to_sequence_len: true
 
 lora_r: 16
 lora_alpha: 32