-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmy_slurm_test_sweep2.sh
executable file
·156 lines (118 loc) · 4.65 KB
/
my_slurm_test_sweep2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/bin/bash
# ====================
# Options for sbatch
# ====================
# Location for stdout log - see https://slurm.schedmd.com/sbatch.html#lbAH
#SBATCH --output=/home/%u/slurm_logs/slurm-%A_%a.out
# Location for stderr log (same file as stdout, so both streams share one log) - see https://slurm.schedmd.com/sbatch.html#lbAH
#SBATCH --error=/home/%u/slurm_logs/slurm-%A_%a.out
# Maximum number of nodes to use for the job
#SBATCH --nodes=1
# Generic resources to use - typically you'll want gpu:n to get n gpus
#SBATCH --gres=gpu:1
# Megabytes of RAM required. Check `cluster-status` for node configurations
#SBATCH --mem=14000
# Number of CPUs to use. Check `cluster-status` for node configurations
#SBATCH --cpus-per-task=4
# Maximum time for the job to run, format: days-hours:minutes:seconds
# #SBATCH --time=1-08:00:00
# Partition of the cluster to pick nodes from (check `sinfo`)
#SBATCH --partition=PGR-Standard
# Any nodes to exclude from selection
# #SBATCH --exclude=charles[05,12-18]
# Request specific node(s) by name
# #SBATCH --nodelist=damnii07
# =====================
# Logging information
# =====================
# Name of this model run. Later sections use it to build the checkpoint
# directories on the DFS and on scratch, and pass it to the test script.
MODEL_NAME='wholeinetrun'
# slurm info - more at https://slurm.schedmd.com/sbatch.html#lbAJ
printf 'Job running on %s\n' "${SLURM_JOB_NODELIST}"
# Record the wall-clock start time as dd/mm/YYYY HH:MM:SS.
dt="$(date '+%d/%m/%Y %H:%M:%S')"
printf 'Job started: %s\n' "${dt}"
# ===================
# Environment setup
# ===================
echo "Setting up bash enviroment"
# Make available all commands on $PATH as on headnode.
# This is sourced before `set -e` is enabled, so a non-zero status from a
# command inside ~/.bashrc does not abort the job.
# shellcheck source=/dev/null
source ~/.bashrc
# Make script bail out after first error
set -e
# Make your own folder on the node's scratch disk
echo "Making Scratch Disk"
SCRATCH_DISK=/disk/scratch
# ${USER:?} aborts with a message when USER is unset or empty, instead of
# silently using the shared scratch root itself as this job's directory.
SCRATCH_HOME="${SCRATCH_DISK}/${USER:?USER must be set to build the scratch path}"
mkdir -p "${SCRATCH_HOME}"
echo "Scratch Disk Created"
# Activate your conda environment
CONDA_ENV_NAME=tideep
echo "Activating conda environment: ${CONDA_ENV_NAME}"
conda activate "${CONDA_ENV_NAME}"
echo "Activated conda environment"
# =================================
# Move input data to scratch disk
# =================================
echo "Moving input data to the compute node's scratch space: $SCRATCH_DISK"
# Options shared by every rsync in this section: archive mode (--archive),
# skip files that are newer on the receiving side (--update), compress data
# in transit (--compress) and print transfer progress (--progress).
rsync_opts=(--archive --update --compress --progress)

# input data directory path on the DFS
src_path="/home/${USER}/datasets/INetData/FromHomeVal"
#caffe_src=/home/${USER}/git/dissertation/pretrained_models/checkpoints/siggraph_caffemodel
model_src="/home/${USER}/git/dissertation/checkpoints/${MODEL_NAME}"
resource_src="/home/${USER}/git/dissertation/resources"
mkdir -p "${model_src}" # make it if required

# input data directory path on the scratch disk of the node
dest_path="${SCRATCH_HOME}/datasets/INetData/FromHomeVal"
mkdir -p "${dest_path}" # make it if required
# The trailing slash on the source copies the directory's contents into the
# destination rather than creating a nested directory.
rsync "${rsync_opts[@]}" "${src_path}/" "${dest_path}"
echo "Rsync Data Completed"

# Checkpoints for this model: DFS -> scratch. cpoint_dir is the parent
# directory that is handed to the test script as --checkpoints_dir.
cpoint_path="${SCRATCH_HOME}/checkpoints/${MODEL_NAME}"
cpoint_dir="${SCRATCH_HOME}/checkpoints/"
mkdir -p "${cpoint_path}" # make it if required
#rsync --archive --update --compress --progress ${cpoint_path}/ ${model_src}
rsync "${rsync_opts[@]}" "${model_src}/" "${cpoint_path}"
echo "Rsync Models Completed"

# Resource files: DFS -> scratch.
resource_path="${SCRATCH_HOME}/resources"
mkdir -p "${resource_path}"
rsync "${rsync_opts[@]}" "${resource_src}/" "${resource_path}"
echo "Rsync Npz ValIndex Completed"
#code="${dest_path}/"
#for f in ${code}*.tar; do
# d=`basename "$f" .tar`
# dpath="${code}TrainFolders/$d"
# echo "${dpath}"
# if [[ ! -d "$dpath" ]]; then
# mkdir -p "${dpath}"
# tar --keep-newer-files -xf "$f" -C "${dpath}"
# fi
#done
echo "Forming Symlink Datafiles:"
# Directory on scratch that receives the reorganised dataset.
sorted_path="${SCRATCH_HOME}/dataset"
echo "OG Dataset Dir: ${dest_path}"
echo "Sorted Dataset Dir: ${sorted_path}"
# The helper script is given by relative path, so it is resolved against the
# job's working directory.
# NOTE(review): per the log message above it presumably builds a symlinked
# layout of the dataset under sorted_path - confirm in the Python script.
python make_ilsvrc_dataset_with_val.py \
  --in_path "${dest_path}" \
  --out_path "${sorted_path}" \
  --resource_path "${resource_path}"
# ==============================
# Finally, run the experiment!
# ==============================
echo "Starting python call"
# test_sweep.py is given by relative path, so it is resolved against the job's
# working directory. It receives the model name, the dataset directory, and
# the checkpoint and resource directories on scratch.
python test_sweep.py \
  --gpu_ids 0 \
  --name "${MODEL_NAME}" \
  --data_dir "${sorted_path}" \
  --checkpoints_dir "${cpoint_dir}" \
  --resources_dir "${resource_path}" \
  --weighted_mask
echo "Python ended"
# ======================================
# Move output data from scratch to DFS
# ======================================
# Copy everything under this model's checkpoint directory on scratch back to
# its checkpoint directory on the DFS, then remove the scratch copy.
# NOTE: `set -e` is enabled earlier in this script, so this section only runs
# when every preceding command succeeded; after a failure the results stay on
# the scratch disk and are not copied back.
echo "Moving output data back to DFS"
rsync --archive --update --compress --progress "${cpoint_path}/" "${model_src}"
echo "Rsync done"
echo "Removing Results From Scratch"
# ${cpoint_path:?} aborts instead of handing rm an empty path, and `--` stops
# the path from ever being parsed as an option.
rm -rv -- "${cpoint_path:?}"
echo "Remove done"
# =========================
# Post experiment logging
# =========================
# Blank line and separator, then the success banner.
printf '\n'
printf '%s\n' '============'
printf '%s\n' 'job finished successfully'
# Record the wall-clock finish time as dd/mm/YYYY HH:MM:SS.
dt="$(date '+%d/%m/%Y %H:%M:%S')"
printf 'Job finished: %s\n' "${dt}"