From e2ea8c4f51d4d3c309da246cdd29d88f5890a2cd Mon Sep 17 00:00:00 2001 From: Davidzhangyuanhan <704464079@qq.com> Date: Sat, 12 Oct 2024 18:56:02 +0800 Subject: [PATCH] chore: Update training script for LLaVA-NeXT video models --- docs/LLaVA_Video_1003.md | 2 +- scripts/video/train/SO400M_Qwen2_72B_ov_to_video_am9.sh | 3 ++- scripts/video/train/SO400M_Qwen2_7B_ov_to_video_am9.sh | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/LLaVA_Video_1003.md b/docs/LLaVA_Video_1003.md index 754f9fd79..9bac6b3c5 100644 --- a/docs/LLaVA_Video_1003.md +++ b/docs/LLaVA_Video_1003.md @@ -84,7 +84,7 @@ print(text_outputs) ## Training -[[Scripts]](/Users/zhangyuanhan/Desktop/LLaVA-NeXT/scripts/video/train): Start training models on your single-image/multi-image/video data. +[[Scripts]](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/yhzhang/video_dev/scripts/video/train/SO400M_Qwen2_72B_ov_to_video_am9_aug6.sh): Start training models on your single-image/multi-image/video data. ## Evaluation Guidance diff --git a/scripts/video/train/SO400M_Qwen2_72B_ov_to_video_am9.sh b/scripts/video/train/SO400M_Qwen2_72B_ov_to_video_am9.sh index 9dbfdae05..5e053ffb5 100755 --- a/scripts/video/train/SO400M_Qwen2_72B_ov_to_video_am9.sh +++ b/scripts/video/train/SO400M_Qwen2_72B_ov_to_video_am9.sh @@ -31,7 +31,8 @@ echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}" echo "MID_RUN_NAME: ${MID_RUN_NAME}" -ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" --nnodes="${ARNOLD_WORKER_NUM}" --node_rank="${ARNOLD_ID}" --master_addr="${METIS_WORKER_0_HOST}" --master_port="${port_in_cmd}" \ +# ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" --nnodes="${ARNOLD_WORKER_NUM}" --node_rank="${ARNOLD_ID}" --master_addr="${METIS_WORKER_0_HOST}" --master_port="${port_in_cmd}" \ +deepspeed --master_port 30000 \ llava/train/train_mem.py \ --deepspeed scripts/zero3.json \ --model_name_or_path $PREV_STAGE_CHECKPOINT \ diff --git a/scripts/video/train/SO400M_Qwen2_7B_ov_to_video_am9.sh b/scripts/video/train/SO400M_Qwen2_7B_ov_to_video_am9.sh index 0a72bc134..14b915a0c 100755 --- a/scripts/video/train/SO400M_Qwen2_7B_ov_to_video_am9.sh +++ b/scripts/video/train/SO400M_Qwen2_7B_ov_to_video_am9.sh @@ -31,7 +31,8 @@ echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}" echo "MID_RUN_NAME: ${MID_RUN_NAME}" -ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" --nnodes="${ARNOLD_WORKER_NUM}" --node_rank="${ARNOLD_ID}" --master_addr="${METIS_WORKER_0_HOST}" --master_port="${port_in_cmd}" \ +# ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" --nnodes="${ARNOLD_WORKER_NUM}" --node_rank="${ARNOLD_ID}" --master_addr="${METIS_WORKER_0_HOST}" --master_port="${port_in_cmd}" \ +deepspeed --master_port 30000 \ llava/train/train_mem.py \ --deepspeed scripts/zero3.json \ --model_name_or_path $PREV_STAGE_CHECKPOINT \