Skip to content

Commit

Permalink
Added EP==0
Browse files Browse the repository at this point in the history
  • Loading branch information
TJ-Solergibert committed Sep 6, 2024
1 parent 4d61489 commit ef835e8
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions src/nanotron/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -865,8 +865,10 @@ def save_checkpoint(self) -> Path:
), # We only save the weights on DP==0
should_save_optimizer=True,
should_save_lr_scheduler=bool(
- dist.get_rank(self.parallel_context.dp_pg) == 0 and dist.get_rank(self.parallel_context.tp_pg)
- ), # We only save the lr_scheduler on DP==0 && TP==0
+ dist.get_rank(self.parallel_context.dp_pg) == 0
+ and dist.get_rank(self.parallel_context.tp_pg) == 0
+ and dist.get_rank(self.parallel_context.expert_pg) == 0
+ ), # We only save the lr_scheduler on DP==0 && TP==0 && EP==0
should_save_config=bool(
dist.get_rank(self.parallel_context.world_pg) == 0
), # We only save the config on world_rank==0
Expand Down

0 comments on commit ef835e8

Please sign in to comment.