microsoft · ys950902 · Aug 8, 2024 · Aug 9, 2024 · Sep 3, 2024 · Sep 27, 2024
@@ -1997,7 +1997,7 @@ def _overflow_clean_up(self, prev_scale):
     def _overflow_check_and_loss_scale_update(self):
 
         # First compute norm for all group so we know if there is overflow
-        if self.dtype == torch.float16:
+        if self.dtype in [torch.float16, torch.bfloat16]:
             self.check_overflow()
 
         #loss scaling related computation

@@ -1828,7 +1828,7 @@ def step(self, closure=None):
         see_memory_usage(f"In step before checking overflow")
 
         # First compute norm for all group so we know if there is overflow
-        if self.dtype == torch.float16:
+        if self.dtype in [torch.float16, torch.bfloat16]:
             self.check_overflow()
 
         prev_scale = self.loss_scale