diff --git a/tf_shell_ml/dpsgd_sequential_model.py b/tf_shell_ml/dpsgd_sequential_model.py
index 1458120..558ce59 100644
--- a/tf_shell_ml/dpsgd_sequential_model.py
+++ b/tf_shell_ml/dpsgd_sequential_model.py
@@ -76,6 +76,13 @@ def compute_grads(self, features, enc_labels):
                 max_two_norms_list.append(self.jacobian_max_two_norm(jacobians))
 
         with tf.device(self.features_party_dev):
+            # For some reason, when running the jacobian on an accelerator,
+            # the weights must be touched, otherwise the training loss goes to
+            # NaN. Perhaps this ensures the weights are assigned to the
+            # features_party device for later, when the final gradient is
+            # added to the weights (on CPU)?
+            tf.print(self.trainable_variables, output_stream="file:///dev/null")
+
             predictions = tf.concat(predictions_list, axis=0)
             max_two_norm = tf.reduce_max(max_two_norms_list)
diff --git a/tf_shell_ml/postscale_sequential_model.py b/tf_shell_ml/postscale_sequential_model.py
index b36a823..1b5c0d0 100644
--- a/tf_shell_ml/postscale_sequential_model.py
+++ b/tf_shell_ml/postscale_sequential_model.py
@@ -67,6 +67,13 @@ def compute_grads(self, features, enc_labels):
                 max_two_norms_list.append(self.jacobian_max_two_norm(jacobians))
 
         with tf.device(self.features_party_dev):
+            # For some reason, when running the jacobian on an accelerator,
+            # the weights must be touched, otherwise the training loss goes to
+            # NaN. Perhaps this ensures the weights are assigned to the
+            # features_party device for later, when the final gradient is
+            # added to the weights (on CPU)?
+            tf.print(self.trainable_variables, output_stream="file:///dev/null")
+
             predictions = tf.concat(predictions_list, axis=0)
             max_two_norm = tf.reduce_max(max_two_norms_list)
             jacobians = [tf.concat(j, axis=0) for j in zip(*jacobians_list)]
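
Note on the workaround itself: the added lines are a no-op read of the trainable variables on the features party's device, routed to a null output stream so nothing is actually printed. Below is a minimal standalone sketch of the same pattern; the device string and toy model are illustrative assumptions, not taken from the repository.

import tensorflow as tf

# Illustrative device placement; the real models use self.features_party_dev.
features_party_dev = "/device:CPU:0"

# Toy model standing in for the sequential models touched by this change.
model = tf.keras.Sequential(
    [tf.keras.layers.Dense(4, activation="relu"), tf.keras.layers.Dense(2)]
)
model.build(input_shape=(None, 8))

with tf.device(features_party_dev):
    # "Touch" every trainable variable by reading it and sending the result to
    # a null output stream. Nothing visible is printed, but the variables are
    # read on this device before the later gradient update.
    tf.print(model.trainable_variables, output_stream="file:///dev/null")

    # ... gradient aggregation and the final weight update would follow here.

Presumably tf.print is used here rather than a plain unused read because it is a stateful op, so the graph optimizer should not prune it away.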