# submitit_train_hparam_tuning.py (from a fork of pytorch/torchtitan)
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import submitit
def expand_hparams(hparams):
    """Turn a dict of parallel value lists into one override dict per job.

    Job ``i`` receives ``values[i]`` for every key, i.e. the lists are walked
    in lockstep (this is not a cartesian product).

    Args:
        hparams: Mapping from override name (e.g. ``"optimizer.lr"``) to the
            list of values to try, one entry per job.

    Returns:
        A list of ``{name: value}`` dicts, one per job. An empty ``hparams``
        yields ``[{}]`` so that a single job with no overrides is launched
        instead of failing on the missing first entry.

    Raises:
        ValueError: If the value lists do not all have the same length.
    """
    if not hparams:
        return [{}]

    lengths = {key: len(values) for key, values in hparams.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            f"all hparam value lists must have the same length, got {lengths}"
        )

    keys = list(hparams)
    return [dict(zip(keys, combo)) for combo in zip(*hparams.values())]


def build_command(n_gpus, train_config, overrides):
    """Build the ``torch.distributed.run`` argv for one training job.

    Args:
        n_gpus: Value passed to ``--nproc_per_node``.
        train_config: Path passed to ``--job.config_file``.
        overrides: Mapping of override name to value; each pair is appended
            as ``--<name> <value>`` after the config file argument.

    Returns:
        The command as a list of strings.
    """
    command = [
        "python3",
        "-m",
        "torch.distributed.run",
        "--nproc_per_node",
        f"{n_gpus}",
        "--rdzv_backend",
        "c10d",
        "--rdzv_endpoint",
        "localhost:0",
        "--local-ranks-filter",
        "0",
        "--role",
        "rank",
        "--tee",
        "3",
        "train.py",
        "--job.config_file",
        train_config,
    ]
    for key, value in overrides.items():
        # str() so numeric values can be used in the sweep as well; the
        # command is later joined with " ".join(), which needs strings.
        command.extend([f"--{key}", str(value)])
    return command


if __name__ == "__main__":
    executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j")
    n_gpus = 6
    node = "h100"
    executor.update_parameters(
        name="titan",
        timeout_min=6 * 60,
        gpus_per_node=n_gpus,
        nodes=1,
        mem_gb=80,
        cpus_per_task=n_gpus * 12,
        slurm_additional_parameters={"partition": node},
    )

    # Each key is an override name passed to train.py as ``--<key>``; the
    # i-th value of every list goes to the i-th job. Leave the dict empty to
    # submit a single job that uses the config file unchanged.
    hparams = {
        # "optimizer.lr": ["1.2e-3", "9e-4", "6e-4", "3e-4"],
        # "optimizer.lr": ["8e-4", "6e-4", "4e-4", "2e-4"],
        # "optimizer.lr": ["2.5e-4"],
        # "optimizer.lr": ["1e-4", "8e-5", "6e-5", "4e-5", "2e-5"],
        # "training.gradient_accumulation_steps": ["21", "25", "29", "33"],
        # "training.steps": ["31000", "26000", "22500", "20000"],
    }

    # train_config = './train_configs/chemlactica_125m.toml'
    # train_config = './train_configs/chemlactica_1.3b.toml'
    train_config = "./train_configs/llama3.2_1b.toml"
    # train_config = './train_configs/debug_model.toml'

    # Expand (and validate) the sweep before opening the batch so that a
    # malformed hparams dict is reported before any submission is queued.
    sweep = expand_hparams(hparams)

    jobs = []
    with executor.batch():
        for overrides in sweep:
            function = submitit.helpers.CommandFunction(
                build_command(n_gpus, train_config, overrides)
            )
            print(" ".join(function.command))
            # subprocess.run(function.command)
            job = executor.submit(function)
            jobs.append(job)