keras-team · mattdangerw · Jul 31, 2023 · Jul 27, 2023 · Jul 27, 2023 · Jul 27, 2023
diff --git a/keras_nlp/samplers/beam_sampler.py b/keras_nlp/samplers/beam_sampler.py
@@ -18,10 +18,10 @@
 
 from keras_nlp.api_export import keras_nlp_export
 from keras_nlp.backend import keras
+from keras_nlp.backend import ops
 from keras_nlp.samplers.sampler import Sampler
 from keras_nlp.samplers.sampler import call_args_docstring
 from keras_nlp.utils.python_utils import format_docstring
-from keras_nlp.utils.tensor_utils import assert_tf_backend
 
 
 @format_docstring(call_args=call_args_docstring)
@@ -52,7 +52,7 @@ class BeamSampler(Sampler):
     batch_size, length, vocab_size = 1, 12, len(int_lookup)
 
     def next(prompt, cache, index):
-        prompt_batch_size = tf.shape(prompt)[0]
+        prompt_batch_size = ops.shape(prompt)[0]
         hidden_states = np.ones((prompt_batch_size, 10))
         # A uniform distribution over our alphabet.
         logits = np.ones((prompt_batch_size, vocab_size))
@@ -75,7 +75,7 @@ def next(prompt, cache, index):
     batch_size, length, vocab_size = 1, 8, len(int_lookup)
 
     def next(prompt, cache, index):
-        prompt_batch_size = tf.shape(prompt)[0]
+        prompt_batch_size = ops.shape(prompt)[0]
         hidden_states = np.ones((prompt_batch_size, 10))
         # A uniform distribution over our alphabet.
         logits = np.ones((batch_size, vocab_size))
@@ -102,10 +102,6 @@ def __init__(
         return_all_beams=False,
         **kwargs,
     ):
-        # Temporarily turn off beam search in other backends.
-        # No technical blockers here, just need tf -> ops rewrite.
-        assert_tf_backend(self.__class__.__name__)
-
         super().__init__(**kwargs)
         self.num_beams = num_beams
         self.return_all_beams = return_all_beams
@@ -120,89 +116,91 @@ def __call__(
         end_token_id=None,
         hidden_states=None,
     ):
-        batch_size, max_length = tf.shape(prompt)[0], tf.shape(prompt)[1]
+        batch_size, max_length = ops.shape(prompt)[0], ops.shape(prompt)[1]
         # Make sure max length and start index are the same dtype.
-        index = tf.cast(index, max_length.dtype)
+        index = ops.cast(index, max_length.dtype)
 
         def create_beams(x):
             """Add initial beam state."""
-            return tf.repeat(x, self.num_beams, axis=0)
+            return ops.repeat(x, self.num_beams, axis=0)
 
         def flatten_beams(x):
             """Combine the beam dim and batch dim."""
             flat_shape = [batch_size * self.num_beams] + x.shape.as_list()[2:]
-            return tf.reshape(x, shape=flat_shape)
+            return ops.reshape(x, new_shape=flat_shape)
 
         def unflatten_beams(x):
             """Separate the beam dim and batch dim."""
             unflat_shape = [batch_size, self.num_beams] + x.shape.as_list()[1:]
-            return tf.reshape(x, shape=unflat_shape)
+            return ops.reshape(x, new_shape=unflat_shape)
 
         if mask is None:
-            mask = tf.zeros_like(prompt, dtype="bool")
+            mask = ops.zeros_like(prompt, dtype="bool")
         else:
-            mask = tf.cast(mask, dtype="bool")
-        # `tf.while_loop` will not accept `None` as a value for `loop_vars`.
+            mask = ops.cast(mask, dtype="bool")
+        # `ops.while_loop` will not accept `None` as a value for `loop_vars`.
         cache = () if cache is None else cache
         # Add extra sequences for each beam.
         prompt, mask = create_beams(prompt), create_beams(mask)
         cache = tf.nest.map_structure(create_beams, cache)
         # Setup the initial beam log-likelihoods.
         # On the first loop, make sure only the original beam is considered.
-        log_probs = tf.constant([[0.0] + [-1e9] * (self.num_beams - 1)])
-        log_probs = flatten_beams(tf.repeat(log_probs, batch_size, axis=0))
+        log_probs = [[0.0] + [-1e9] * (self.num_beams - 1)]
+        log_probs = flatten_beams(ops.repeat(log_probs, batch_size, axis=0))
 
         def cond(prompt, cache, index, log_probs):
             if end_token_id is None:
                 return True
             # Stop if all sequences have produced a *new* end_token_id.
             end_tokens = (prompt == end_token_id) & (~mask)
-            prompt_done = tf.reduce_any(end_tokens, axis=-1)
-            return not tf.reduce_all(prompt_done)
+            prompt_done = ops.any(end_tokens, axis=-1)
+            return not ops.all(prompt_done)
 
         def body(prompt, cache, index, log_probs):
             # Compute the softmax distribution for the next token.
             logits, _, cache = next(prompt, cache, index)
-            vocab_size = tf.shape(logits)[-1]
+            vocab_size = ops.shape(logits)[-1]
             probs = keras.activations.softmax(logits / self.temperature)
 
             # Compute the running log-likelihood of each new candidate.
-            next_log_probs = tf.math.log(probs) + log_probs[..., tf.newaxis]
+            next_log_probs = ops.log(probs) + log_probs[..., tf.newaxis]
             # Reshape `preds` to shape `(batch_size, num_beams * vocab_size)`.
-            next_log_probs = tf.reshape(next_log_probs, shape=[batch_size, -1])
+            next_log_probs = ops.reshape(
+                next_log_probs, new_shape=[batch_size, -1]
+            )
 
             # Compute the top beam indices and next tokens.
-            next_log_probs, indices = tf.math.top_k(
+            next_log_probs, indices = ops.top_k(
                 next_log_probs, k=self.num_beams, sorted=False
             )
             beam_indices = indices // vocab_size
             next_token = flatten_beams(indices % vocab_size)
             # Ensure shape is `[None]`, otherwise it causes issues after
             # converting to TFLite.
-            next_token = tf.ensure_shape(next_token, [None])
+            # next_token = tf.ensure_shape(next_token, [None])
             # We need `ensure_shape` as `top_k` will change the static shape.
             next_log_probs = flatten_beams(next_log_probs)
-            log_probs = tf.ensure_shape(next_log_probs, log_probs.shape)
+            # log_probs = tf.ensure_shape(next_log_probs, log_probs.shape)
 
             def gather_beams(x):
                 x = unflatten_beams(x)
-                x = tf.gather(x, beam_indices, axis=1, batch_dims=1)
+                x = ops.take_along_axis(x, beam_indices, axis=1)
                 return flatten_beams(x)
 
             prompt = gather_beams(prompt)
             cache = tf.nest.map_structure(gather_beams, cache)
 
             # Update each beam with the next token.
-            next_token = tf.cast(next_token, prompt.dtype)
+            next_token = ops.cast(next_token, prompt.dtype)
             # Don't overwrite anywhere mask is True.
-            next_token = tf.where(mask[:, index], prompt[:, index], next_token)
+            next_token = ops.where(mask[:, index], prompt[:, index], next_token)
             # Update the prompt with the next token.
             next_token = next_token[:, tf.newaxis]
             prompt = dynamic_update_slice(prompt, next_token, [0, index])
             # Return the iteration of the loop state.
             return (prompt, cache, index + 1, log_probs)
 
-        prompt, _, _, log_probs = tf.while_loop(
+        prompt, _, _, log_probs = ops.while_loop(
             cond=cond,
             body=body,
             loop_vars=(prompt, cache, index, log_probs),
@@ -213,21 +211,19 @@ def gather_beams(x):
         all_log_probs = unflatten_beams(log_probs)
 
         if self.return_all_beams:
-            sorted_indices = tf.argsort(
-                all_log_probs, axis=-1, direction="DESCENDING"
-            )
-            sorted_log_probs = tf.gather(
-                all_log_probs, sorted_indices, axis=-1, batch_dims=1
+            sorted_indices = ops.argsort(all_log_probs, axis=-1)
+            sorted_log_probs = ops.take_along_axis(
+                all_log_probs, sorted_indices, axis=-1
             )
-            sorted_prompts = tf.gather(
-                all_prompts, sorted_indices, axis=1, batch_dims=1
+            sorted_prompts = ops.take_along_axis(
+                all_prompts, sorted_indices, axis=1
             )
             return sorted_prompts, sorted_log_probs
         else:
             # Gather the top beam at each batch index.
-            top_beams = tf.math.argmax(all_log_probs, axis=-1)[:, tf.newaxis]
-            prompt = tf.gather(all_prompts, top_beams, axis=1, batch_dims=1)
-            return tf.squeeze(prompt, axis=1)
+            top_beams = ops.argmax(all_log_probs, axis=-1)[:, tf.newaxis]
+            prompt = ops.take_along_axis(all_prompts, top_beams, axis=1)
+            return ops.squeeze(prompt, axis=1)
 
     def get_config(self):
         config = super().get_config()

diff --git a/keras_nlp/samplers/beam_sampler_test.py b/keras_nlp/samplers/beam_sampler_test.py
@@ -22,7 +22,6 @@
 from keras_nlp.tests.test_case import TestCase
 
 
-@pytest.mark.tf_only
 class BeamSamplerTest(TestCase):
     def setUp(self):
         super().setUp()