horovod_0.18.2.patch
Index: horovod/tensorflow/__init__.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- horovod/tensorflow/__init__.py (revision bb2134b427e0e0c5a83624d02fafa4f14de623d9)
+++ horovod/tensorflow/__init__.py (date 1586756466203)
@@ -23,7 +23,6 @@
check_extension('horovod.tensorflow', 'HOROVOD_WITH_TENSORFLOW', __file__, 'mpi_lib')
-from horovod.tensorflow.compression import Compression
from horovod.tensorflow.mpi_ops import allgather, broadcast, _allreduce
from horovod.tensorflow.mpi_ops import init, shutdown
from horovod.tensorflow.mpi_ops import size, local_size, rank, local_rank
@@ -35,8 +34,7 @@
import tensorflow as tf
-def allreduce(tensor, average=True, device_dense='', device_sparse='',
- compression=Compression.none):
+def allreduce(tensor, grace, average=True, device_dense='', device_sparse=''):
"""Perform an allreduce on a tf.Tensor or tf.IndexedSlices.
This function performs a bandwidth-optimal ring allreduce on the input
@@ -53,7 +51,7 @@
if Horovod was built with HOROVOD_GPU_ALLREDUCE.
device_sparse: Device to be used for sparse tensors. Uses GPU by default
if Horovod was built with HOROVOD_GPU_ALLGATHER.
- compression: Compression algorithm used to reduce the amount of data
+ grace: Compression algorithm used to reduce the amount of data
sent and received by each worker node. Defaults to not
using compression.
@@ -75,11 +73,7 @@
dense_shape=tensor.dense_shape)
else:
with tf.device(device_dense):
- horovod_size = tf.cast(size(), dtype=tensor.dtype)
- tensor_compressed, ctx = compression.compress(tensor)
- summed_tensor_compressed = _allreduce(tensor_compressed)
- summed_tensor = compression.decompress(summed_tensor_compressed, ctx)
- new_tensor = (summed_tensor / horovod_size) if average else summed_tensor
+ new_tensor = grace.step(tensor)
return new_tensor
@@ -193,8 +187,7 @@
@_cache
-def _make_allreduce_grads_fn(name, device_dense, device_sparse,
- compression, sparse_as_dense):
+def _make_allreduce_grads_fn(name, grace, device_dense, device_sparse, sparse_as_dense):
def allreduce_grads(grads):
with tf.name_scope(name + "_Allreduce"):
if sparse_as_dense:
@@ -203,9 +196,9 @@
else grad for grad in grads]
return [allreduce(grad,
+ grace=grace,
device_dense=device_dense,
- device_sparse=device_sparse,
- compression=compression)
+ device_sparse=device_sparse)
if grad is not None else grad
for grad in grads]
@@ -231,16 +224,15 @@
"""An optimizer that wraps another tf.Optimizer, using an allreduce to
average gradient values before applying gradients to model weights."""
- def __init__(self, optimizer, name=None, use_locking=False, device_dense='',
- device_sparse='', compression=Compression.none,
- sparse_as_dense=False):
+ def __init__(self, optimizer, grace, name=None, use_locking=False, device_dense='',
+ device_sparse='',
+ sparse_as_dense=False):
if name is None:
name = "Distributed{}".format(type(optimizer).__name__)
super(_DistributedOptimizer, self).__init__(name=name, use_locking=use_locking)
self._optimizer = optimizer
- self._allreduce_grads = _make_allreduce_grads_fn(
- name, device_dense, device_sparse, compression, sparse_as_dense)
+ self._allreduce_grads = _make_allreduce_grads_fn(name, grace, device_dense, device_sparse, sparse_as_dense)
def compute_gradients(self, *args, **kwargs):
"""Compute gradients of all trainable variables.
@@ -275,8 +267,7 @@
return self._optimizer.variables(*args, **kwargs)
-def DistributedOptimizer(optimizer, name=None, use_locking=False, device_dense='',
- device_sparse='', compression=Compression.none,
+def DistributedOptimizer(optimizer, grace, name=None, use_locking=False, device_dense='', device_sparse='',
sparse_as_dense=False):
"""Construct a new DistributedOptimizer, which uses another optimizer
under the hood for computing single-process gradient values and
@@ -299,7 +290,7 @@
device_sparse:
Device to be used for sparse tensors. Uses GPU by default
if Horovod was built with HOROVOD_GPU_ALLGATHER.
- compression:
+ grace:
Compression algorithm used during allreduce to reduce the amount
of data sent during each parameter update step. Defaults to
not using compression.
@@ -309,12 +300,11 @@
has high density. Defaults to false.
"""
if isinstance(optimizer, _LegacyOptimizer):
- return _DistributedOptimizer(optimizer, name, use_locking, device_dense,
- device_sparse, compression, sparse_as_dense)
+ return _DistributedOptimizer(optimizer, grace, name, use_locking, device_dense,
+ device_sparse, sparse_as_dense)
elif isinstance(optimizer, tf.keras.optimizers.Optimizer):
import horovod.tensorflow.keras as hvd_k
- return hvd_k.DistributedOptimizer(optimizer, name, device_dense, device_sparse,
- compression, sparse_as_dense)
+ return hvd_k.DistributedOptimizer(optimizer, grace, name, device_dense, device_sparse, sparse_as_dense)
else:
raise ValueError('Provided optimizer doesn\'t inherit from either legacy '
'TensorFlow or Keras optimizer: %s' % optimizer)
@@ -322,7 +312,7 @@
if hasattr(tf, 'GradientTape'):
class _DistributedGradientTape(tf.GradientTape):
- def __init__(self, tape, device_dense, device_sparse, compression, sparse_as_dense,
+ def __init__(self, tape, grace, device_dense, device_sparse, sparse_as_dense,
persistent=False, watch_accessed_variables=True):
if hasattr(tape, '_watch_accessed_variables'):
super(self.__class__, self).__init__(persistent, watch_accessed_variables)
@@ -331,7 +321,7 @@
self._tape = tape
self._allreduce_grads = _make_allreduce_grads_fn(
- 'DistributedGradientTape', device_dense, device_sparse, compression,
+ 'DistributedGradientTape', grace, device_dense, device_sparse,
sparse_as_dense)
def gradient(self, target, sources, output_gradients=None):
@@ -342,8 +332,7 @@
return gradients
- def DistributedGradientTape(gradtape, device_dense='', device_sparse='',
- compression=Compression.none, sparse_as_dense=False):
+ def DistributedGradientTape(gradtape, grace, device_dense='', device_sparse='', sparse_as_dense=False):
"""A tape that wraps another tf.GradientTape, using an allreduce to
average gradient values before applying gradients to model weights.
@@ -356,7 +345,7 @@
device_sparse:
Device to be used for sparse tensors. Uses GPU by default
if Horovod was built with HOROVOD_GPU_ALLGATHER.
- compression:
+ grace:
Compression algorithm used during allreduce to reduce the amount
of data sent during each parameter update step. Defaults to
not using compression.
@@ -368,9 +357,9 @@
cls = type(gradtape.__class__.__name__, (gradtape.__class__,),
dict(_DistributedGradientTape.__dict__))
if hasattr(gradtape, '_watch_accessed_variables'):
- return cls(gradtape._tape, device_dense, device_sparse, compression,
+ return cls(gradtape._tape, grace, device_dense, device_sparse,
sparse_as_dense, gradtape._persistent,
gradtape._watch_accessed_variables)
else:
- return cls(gradtape._tape, device_dense, device_sparse, compression,
+ return cls(gradtape._tape, grace, device_dense, device_sparse,
sparse_as_dense, gradtape._persistent)
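
After this change, horovod.tensorflow.allreduce() delegates the whole reduction to grace.step(tensor), so the object passed as grace only needs to expose a step() method that takes a gradient tensor and returns the reduced tensor. The sketch below is purely illustrative (the class name AveragingGrace is hypothetical; real compressors and communicators ship with the GRACE framework) and simply reproduces the plain-averaging path removed above:

import tensorflow as tf
from horovod.tensorflow.mpi_ops import _allreduce, size


class AveragingGrace:
    """Hypothetical grace object that averages gradients without compression."""

    def step(self, tensor):
        # Sum across all workers, then divide by the worker count,
        # mirroring the removed Compression.none code path.
        summed = _allreduce(tensor)
        return summed / tf.cast(size(), dtype=tensor.dtype)
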
Index: horovod/_keras/__init__.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- horovod/_keras/__init__.py (revision bb2134b427e0e0c5a83624d02fafa4f14de623d9)
+++ horovod/_keras/__init__.py (date 1586756466160)
@@ -17,17 +17,16 @@
import tensorflow as tf
-def create_distributed_optimizer(keras, optimizer, name, device_dense, device_sparse,
- compression, sparse_as_dense):
+def create_distributed_optimizer(keras, optimizer, grace, name, device_dense, device_sparse, sparse_as_dense):
class _DistributedOptimizer(keras.optimizers.Optimizer):
- def __init__(self, name, device_dense, device_sparse, compression, sparse_as_dense,
+ def __init__(self, name, grace, device_dense, device_sparse, sparse_as_dense,
config):
if name is None:
name = "Distributed%s" % self.__class__.__base__.__name__
self._name = name
self._device_dense = device_dense
self._device_sparse = device_sparse
- self._compression = compression
+ self._grace = grace
self._sparse_as_dense = sparse_as_dense
self._get_gradients_used = False
super(self.__class__, self).__init__(**config)
@@ -51,10 +50,9 @@
if self._sparse_as_dense and \
isinstance(grad, tf.IndexedSlices):
grad = tf.convert_to_tensor(grad)
- avg_grad = hvd.allreduce(grad,
+ avg_grad = hvd.allreduce(grad, self._grace,
device_dense=self._device_dense,
- device_sparse=self._device_sparse,
- compression=self._compression)
+ device_sparse=self._device_sparse)
averaged_gradients.append(avg_grad)
else:
averaged_gradients.append(None)
@@ -72,7 +70,7 @@
@classmethod
def from_config(cls, cfg):
- return cls(name, device_dense, device_sparse, compression, sparse_as_dense, cfg)
+ return cls(name, grace, device_dense, device_sparse, sparse_as_dense, cfg)
# We dynamically create a new class that inherits from the optimizer that was passed in.
# The goal is to override get_gradients() method with an allreduce implementation.
@@ -80,7 +78,7 @@
# model could be easily restored without Horovod.
cls = type(optimizer.__class__.__name__, (optimizer.__class__,),
dict(_DistributedOptimizer.__dict__))
- return cls(name, device_dense, device_sparse, compression, sparse_as_dense,
+ return cls(name, grace, device_dense, device_sparse, sparse_as_dense,
optimizer.get_config())
Index: horovod/tensorflow/keras/__init__.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- horovod/tensorflow/keras/__init__.py (revision bb2134b427e0e0c5a83624d02fafa4f14de623d9)
+++ horovod/tensorflow/keras/__init__.py (date 1587724061140)
@@ -32,15 +32,13 @@
from horovod.tensorflow import mpi_threads_supported, mpi_enabled, mpi_built
from horovod.tensorflow import gloo_enabled, gloo_built
from horovod.tensorflow import nccl_built, ddl_built, mlsl_built
-from horovod.tensorflow import Compression
import horovod._keras as _impl
from horovod.tensorflow.keras import callbacks
-def DistributedOptimizer(optimizer, name=None,
+def DistributedOptimizer(optimizer, grace, name=None,
device_dense='', device_sparse='',
- compression=Compression.none,
sparse_as_dense=False):
"""
An optimizer that wraps another keras.optimizers.Optimizer, using an allreduce to
@@ -55,15 +53,15 @@
if Horovod was build with HOROVOD_GPU_ALLREDUCE.
device_sparse: Device to be used for sparse tensors. Uses GPU by default
if Horovod was build with HOROVOD_GPU_ALLGATHER.
- compression: Compression algorithm used to reduce the amount of data
+ grace: Compression algorithm used to reduce the amount of data
sent and received by each worker node. Defaults to not
using compression.
sparse_as_dense: Treat all sparse gradients as dense tensors. This can
help improve performance and memory utilization if
the original sparse gradient has high density.
Defaults to false. """
- return _impl.create_distributed_optimizer(keras, optimizer, name,
- device_dense, device_sparse, compression,
+ return _impl.create_distributed_optimizer(keras, optimizer, grace, name,
+ device_dense, device_sparse,
sparse_as_dense)
@@ -120,7 +118,7 @@
return _impl.broadcast(K, value, root_rank, name)
-def load_model(filepath, custom_optimizers=None, custom_objects=None, compression=Compression.none):
+def load_model(filepath, grace, custom_optimizers=None, custom_objects=None):
"""
Loads a saved Keras model with a Horovod DistributedOptimizer.
@@ -140,7 +138,7 @@
during loading.
custom_objects: Optional dictionary mapping names (strings) to custom
classes or functions to be considered during deserialization.
- compression: Compression algorithm used to reduce the amount of data
+ grace: Compression algorithm used to reduce the amount of data
sent and received by each worker node. Defaults to not
using compression.
@@ -152,6 +150,6 @@
ValueError: In case of an invalid savefile.
"""
def wrap_optimizer(cls):
- return lambda **kwargs: DistributedOptimizer(cls(**kwargs), compression=compression)
+ return lambda **kwargs: DistributedOptimizer(cls(**kwargs), grace=grace)
return _impl.load_model(keras, wrap_optimizer, filepath, custom_optimizers, custom_objects)
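
With the Keras wrappers patched the same way, the grace object becomes a required positional argument wherever compression= used to be accepted. A hedged usage sketch (AveragingGrace is again a hypothetical plain-averaging stand-in, not part of the patch):

import tensorflow as tf
import horovod.tensorflow.keras as hvd
from horovod.tensorflow.mpi_ops import _allreduce, size


class AveragingGrace:
    """Hypothetical grace object: step() averages a gradient across workers."""

    def step(self, tensor):
        return _allreduce(tensor) / tf.cast(size(), dtype=tensor.dtype)


hvd.init()
opt = tf.keras.optimizers.SGD(0.01 * hvd.size())
# grace is now passed positionally; the compression keyword no longer exists.
opt = hvd.DistributedOptimizer(opt, AveragingGrace())
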
Index: horovod/torch/__init__.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- horovod/torch/__init__.py (revision bb2134b427e0e0c5a83624d02fafa4f14de623d9)
+++ horovod/torch/__init__.py (date 1587952345552)
@@ -17,8 +17,8 @@
from __future__ import division
from __future__ import print_function
-from contextlib import contextmanager
import warnings
+from contextlib import contextmanager
from horovod.common.util import check_extension
@@ -29,7 +29,6 @@
check_extension('horovod.torch', 'HOROVOD_WITH_PYTORCH',
__file__, 'mpi_lib', '_mpi_lib')
-from horovod.torch.compression import Compression
from horovod.torch.mpi_ops import allreduce, allreduce_async, allreduce_, allreduce_async_
from horovod.torch.mpi_ops import allgather, allgather_async
from horovod.torch.mpi_ops import broadcast, broadcast_async, broadcast_, broadcast_async_
@@ -45,10 +44,10 @@
class _DistributedOptimizer(torch.optim.Optimizer):
- def __init__(self, params, named_parameters, compression,
+ def __init__(self, params, named_parameters, grace,
backward_passes_per_step=1):
super(self.__class__, self).__init__(params)
- self._compression = compression
+ self.grace = grace
if named_parameters is not None:
named_parameters = list(named_parameters)
@@ -116,12 +115,10 @@
grad_acc.register_hook(self._make_hook(p))
self._grad_accs.append(grad_acc)
- def _allreduce_grad_async(self, p):
+ def _communicate_grad_async(self, p):
name = self._parameter_names.get(p)
tensor = p.grad
- tensor_compressed, ctx = self._compression.compress(tensor)
-
- handle = allreduce_async_(tensor_compressed, average=True, name=name)
+ handle, ctx = self.grace.send_step(tensor, name)
return handle, ctx
def _make_hook(self, p):
@@ -138,25 +135,27 @@
handle, ctx = None, None
self._allreduce_delay[p] -= 1
if self._allreduce_delay[p] == 0:
- handle, ctx = self._allreduce_grad_async(p)
+ handle, ctx = self._communicate_grad_async(p)
self._handles[p] = (handle, ctx)
+
return hook
def synchronize(self):
missing_p = self._requires_update - set(self._handles.keys())
for p in missing_p:
- handle, ctx = self._allreduce_grad_async(p)
- self._handles[p] = (handle, ctx)
+ handles, ctx = self._communicate_grad_async(p)
+ self._handles[p] = (handles, ctx)
for p, value in self._handles.items():
- handle, ctx = value
- if handle is None:
- handle, ctx = self._allreduce_grad_async(p)
- self._handles[p] = (handle, ctx)
- for p, (handle, _) in self._handles.items():
- output = synchronize(handle)
+ handles, ctx = value
+ if handles is None:
+ handles, ctx = self._communicate_grad_async(p)
+ self._handles[p] = (handles, ctx)
+ for p, value in self._handles.items():
+ handles, ctx = value
+ tensor = self.grace.receive_step(handles, ctx)
self._allreduce_delay[p] = self.backward_passes_per_step
- p.grad.set_(self._compression.decompress(output, ctx))
+ p.grad.set_(tensor)
self._handles.clear()
self._synchronized = True
@@ -202,8 +201,7 @@
return super(self.__class__, self).zero_grad()
-def DistributedOptimizer(optimizer, named_parameters=None,
- compression=Compression.none,
+def DistributedOptimizer(optimizer, grace, named_parameters=None,
backward_passes_per_step=1):
"""
An optimizer that wraps another torch.optim.Optimizer, using an allreduce to
@@ -235,7 +233,7 @@
optimizer: Optimizer to use for computing gradients and applying updates.
named_parameters: A mapping between parameter names and values. Used for naming of
allreduce operations. Typically just ``model.named_parameters()``.
- compression: Compression algorithm used during allreduce to reduce the amount
+ grace: Compression algorithm used during allreduce to reduce the amount
of data sent during the each parameter update step. Defaults to
not using compression.
backward_passes_per_step: Number of expected backward passes to perform
@@ -249,7 +247,7 @@
cls = type(optimizer.__class__.__name__, (optimizer.__class__,),
dict(_DistributedOptimizer.__dict__))
return cls(optimizer.param_groups, named_parameters,
- compression, backward_passes_per_step)
+ grace, backward_passes_per_step)
def broadcast_parameters(params, root_rank):
@@ -352,11 +350,13 @@
def _create_callback(pid, name, t, p):
def _from_tensor():
state_dict['state'][pid][name] = t(p.cpu().numpy()[0])
+
return _from_tensor
def _create_option_callback(index, option_key, option_tensor, dtypes):
def _from_tensor():
optimizer.param_groups[index][option_key] = _recursive_cast(option_tensor.cpu().numpy()[0], dtypes)
+
return _from_tensor
# Param groups are an ordered list, normally there is only one per model,
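
On the PyTorch side, the wrapped optimizer now calls grace.send_step(tensor, name) to launch the asynchronous communication and grace.receive_step(handle, ctx) to collect the reduced gradient. A minimal illustrative stub (hypothetical class name, not part of the patch) that satisfies this interface by reproducing the uncompressed allreduce path removed above:

from horovod.torch.mpi_ops import allreduce_async_, synchronize


class AveragingGrace:
    """Hypothetical grace object matching the send_step/receive_step interface."""

    def send_step(self, tensor, name):
        # Launch an in-place asynchronous averaging allreduce.
        handle = allreduce_async_(tensor, average=True, name=name)
        return handle, None  # no compression context is needed here

    def receive_step(self, handle, ctx):
        # Wait for the allreduce to finish and return the averaged gradient.
        return synchronize(handle)
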
Index: horovod/torch/mpi_ops.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- horovod/torch/mpi_ops.py (revision bb2134b427e0e0c5a83624d02fafa4f14de623d9)
+++ horovod/torch/mpi_ops.py (date 1584885364766)
@@ -27,17 +27,17 @@
if _v2_api:
from horovod.torch import mpi_lib_v2 as mpi_lib
from horovod.common.basics import HorovodBasics as _HorovodBasics
+
_NULL = ""
_basics = _HorovodBasics(__file__, 'mpi_lib_v2')
else:
from horovod.torch import mpi_lib_impl
from horovod.torch import mpi_lib
from horovod.common.basics import HorovodBasics as _HorovodBasics
+
_NULL = mpi_lib._ffi.NULL
_basics = _HorovodBasics(__file__, 'mpi_lib_impl', '_mpi_lib_impl')
-from horovod.torch.compression import Compression
-
# import basic methods
init = _basics.init
shutdown = _basics.shutdown
@@ -54,7 +54,6 @@
ddl_built = _basics.ddl_built
mlsl_built = _basics.mlsl_built
-
# Schema: handle -> input, output
# We keep input in order to make sure it does not get garbage collected
# before the operation is finished.
@@ -81,7 +80,7 @@
if tensor.dtype == torch.float16 and not _fp16_supported:
raise NotImplementedError(
'float16 allreduce is not supported for PyTorch version {} < 1.0.0'
- .format(torch.__version__))
+ .format(torch.__version__))
function = _check_function(_allreduce_function_factory, tensor)
handle = getattr(mpi_lib, function)(tensor, output, average,
@@ -128,7 +127,7 @@
return allreduce(grad_output, ctx.average), None, None
-def allreduce(tensor, average=True, name=None, compression=Compression.none):
+def allreduce(tensor, average=True, name=None):
"""
A function that performs averaging or summation of the input tensor over all the
Horovod processes. The input tensor is not modified.
@@ -147,17 +146,12 @@
average: A flag indicating whether to compute average or summation,
defaults to average.
name: A name of the reduction operation.
- compression: Compression algorithm used during allreduce to reduce the amount
- of data sent during the each parameter update step. Defaults to
- not using compression.
Returns:
A tensor of the same shape and type as `tensor`, averaged or summed across all
processes.
"""
- tensor_compressed, ctx = compression.compress(tensor)
- summed_tensor_compressed = HorovodAllreduce.apply(tensor_compressed, average, name)
- return compression.decompress(summed_tensor_compressed, ctx)
+ return HorovodAllreduce.apply(tensor, average, name)
def allreduce_async_(tensor, average=True, name=None):
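
With compression removed from the functional API as well, horovod.torch.allreduce() keeps its plain behaviour of averaging a tensor across all workers. A short usage sketch:

import torch
import horovod.torch as hvd

hvd.init()
grad = torch.ones(4)
# No compression argument anymore; the tensor is simply averaged across workers.
avg = hvd.allreduce(grad, name="example.grad")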