Skip to content

Commit

Permalink
[Op] Prevent inconsistent number of Ops and devices during distributed training.
Browse files Browse the repository at this point in the history

Signed-off-by: JunqiHu <[email protected]>
  • Loading branch information
Mesilenceki committed Aug 31, 2023
1 parent 4983e02 commit c0ade86
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions tensorflow/python/training/saver.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,8 +550,14 @@ def _GroupByDevices(self, saveables):
"""
per_device = collections.defaultdict(lambda: [])
for saveable in saveables:
canonical_device = set(
pydev.canonical_name(spec.tensor.device) for spec in saveable.specs)
canonical_device = set()
for spec in saveable.specs:
device_name = pydev.canonical_name(spec.tensor.device)
device_idx = device_name.find("/device")
if device_idx != -1:
canonical_device.add(device_name[:device_idx])
else:
canonical_device.add(device_name)
if len(canonical_device) != 1:
raise ValueError("All tensors of a saveable object must be "
"on the same device: %s" % saveable.name)
Expand Down

0 comments on commit c0ade86

Please sign in to comment.