diff --git a/train/tr10-13B-ml/tr10-13B.slurm b/train/tr10-13B-ml/tr10-13B.slurm
index 0e6ecdd0..5b02c437 100644
--- a/train/tr10-13B-ml/tr10-13B.slurm
+++ b/train/tr10-13B-ml/tr10-13B.slurm
@@ -46,10 +46,13 @@ GLOBAL_BATCH_SIZE=2048
 
 NLAYERS=40
 NHIDDEN=5120
-NHEADS=32
+NHEADS=40
 SEQ_LEN=2048
 VOCAB_SIZE=150000
 
+TRAIN_TOKENS=300_000_000_000
+TRAIN_SAMPLES=$(python -c "print($TRAIN_TOKENS // $SEQ_LEN)")
+
 SAVE_INTERVAL=300
 
 OPTIMIZER_ARGS=" \
@@ -57,13 +60,14 @@ OPTIMIZER_ARGS=" \
     --adam-beta1 0.9 \
     --adam-beta2 0.95 \
     --adam-eps 1e-8 \
-    --lr 6e-5 \
+    --lr 1e-4 \
     --min-lr 6e-6 \
     --lr-decay-style cosine \
-    --lr-decay-samples 126_953_125 \
     --lr-warmup-samples 216_320 \
     --clip-grad 1.0 \
     --weight-decay 1e-1 \
+    --hidden-dropout 0.0 \
+    --attention-dropout 0.0 \
     "
 
 EXIT_OPTS=" \
@@ -80,7 +84,7 @@ GPT_ARGS=" \
     --micro-batch-size $MICRO_BATCH_SIZE \
     --rampup-batch-size 16 16 6_000_000 \
     --global-batch-size $GLOBAL_BATCH_SIZE \
-    --train-samples 300_000_000 \
+    --train-samples $TRAIN_SAMPLES \
     --tokenizer-type PretrainedFromHF \
     --tokenizer-name-or-path $TOKENIZER_NAME \
     --loss-scale 12 \
@@ -165,7 +169,7 @@ export CMD=" \
     --load $CHECKPOINT_PATH \
     --data-path $DATA_PATH \
     --data-impl mmap \
-    --split 900,100,0 \
+    --split 950,50,0 \
     --distributed-backend nccl \
     $DEEPSPEED_ARGS \
     "
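
A minimal sanity-check sketch of the arithmetic behind these changes, not part of the script itself. The variable names mirror the script's; the head-dimension check is an assumption based on the usual Megatron-style constraint that NHIDDEN must divide evenly by NHEADS.

# Sanity checks for the new hyperparameters (standalone Python sketch).

TRAIN_TOKENS = 300_000_000_000
SEQ_LEN = 2048
NHIDDEN = 5120
NHEADS = 40

# --train-samples is now derived from a 300B-token budget instead of the
# hardcoded 300_000_000 samples (which at SEQ_LEN=2048 would have been
# ~614B tokens); this mirrors the inline `python -c` call added above:
print(TRAIN_TOKENS // SEQ_LEN)  # 146484375

# 40 heads split the 5120-dim hidden state into 128-dim heads (32 gave 160):
assert NHIDDEN % NHEADS == 0
print(NHIDDEN // NHEADS)  # 128

# The new --split 950,50,0 allocates 95% of the data to train and 5% to
# validation, with no test split:
print([x / 1000 for x in (950, 50, 0)])  # [0.95, 0.05, 0.0]

Dropping --lr-decay-samples presumably lets the cosine schedule default to decaying over the full --train-samples; that fallback behavior is worth confirming against the Megatron-DeepSpeed version pinned for this run.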