Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port BeamSampler to core #1181

Merged
merged 7 commits into from
Jul 31, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 35 additions & 39 deletions keras_nlp/samplers/beam_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@

from keras_nlp.api_export import keras_nlp_export
from keras_nlp.backend import keras
from keras_nlp.backend import ops
from keras_nlp.samplers.sampler import Sampler
from keras_nlp.samplers.sampler import call_args_docstring
from keras_nlp.utils.python_utils import format_docstring
from keras_nlp.utils.tensor_utils import assert_tf_backend


@format_docstring(call_args=call_args_docstring)
Expand Down Expand Up @@ -52,7 +52,7 @@ class BeamSampler(Sampler):
batch_size, length, vocab_size = 1, 12, len(int_lookup)

def next(prompt, cache, index):
prompt_batch_size = tf.shape(prompt)[0]
prompt_batch_size = ops.shape(prompt)[0]
hidden_states = np.ones((prompt_batch_size, 10))
# A uniform distribution over our alphabet.
logits = np.ones((prompt_batch_size, vocab_size))
Expand All @@ -75,7 +75,7 @@ def next(prompt, cache, index):
batch_size, length, vocab_size = 1, 8, len(int_lookup)

def next(prompt, cache, index):
prompt_batch_size = tf.shape(prompt)[0]
prompt_batch_size = ops.shape(prompt)[0]
hidden_states = np.ones((prompt_batch_size, 10))
# A uniform distribution over our alphabet.
logits = np.ones((batch_size, vocab_size))
Expand All @@ -102,10 +102,6 @@ def __init__(
return_all_beams=False,
**kwargs,
):
# Temporarily turn off beam search in other backends.
# No technical blockers here, just need tf -> ops rewrite.
assert_tf_backend(self.__class__.__name__)

super().__init__(**kwargs)
self.num_beams = num_beams
self.return_all_beams = return_all_beams
Expand All @@ -120,89 +116,91 @@ def __call__(
end_token_id=None,
hidden_states=None,
):
batch_size, max_length = tf.shape(prompt)[0], tf.shape(prompt)[1]
batch_size, max_length = ops.shape(prompt)[0], ops.shape(prompt)[1]
# Make sure max length and start index are the same dtype.
index = tf.cast(index, max_length.dtype)
index = ops.cast(index, max_length.dtype)

def create_beams(x):
"""Add initial beam state."""
return tf.repeat(x, self.num_beams, axis=0)
return ops.repeat(x, self.num_beams, axis=0)

def flatten_beams(x):
"""Combine the beam dim and batch dim."""
flat_shape = [batch_size * self.num_beams] + x.shape.as_list()[2:]
return tf.reshape(x, shape=flat_shape)
return ops.reshape(x, new_shape=flat_shape)

def unflatten_beams(x):
"""Separate the beam dim and batch dim."""
unflat_shape = [batch_size, self.num_beams] + x.shape.as_list()[1:]
return tf.reshape(x, shape=unflat_shape)
return ops.reshape(x, new_shape=unflat_shape)

if mask is None:
mask = tf.zeros_like(prompt, dtype="bool")
mask = ops.zeros_like(prompt, dtype="bool")
else:
mask = tf.cast(mask, dtype="bool")
# `tf.while_loop` will not accept `None` as a value for `loop_vars`.
mask = ops.cast(mask, dtype="bool")
# `ops.while_loop` will not accept `None` as a value for `loop_vars`.
cache = () if cache is None else cache
# Add extra sequences for each beam.
prompt, mask = create_beams(prompt), create_beams(mask)
cache = tf.nest.map_structure(create_beams, cache)
# Setup the initial beam log-likelihoods.
# On the first loop, make sure only the original beam is considered.
log_probs = tf.constant([[0.0] + [-1e9] * (self.num_beams - 1)])
log_probs = flatten_beams(tf.repeat(log_probs, batch_size, axis=0))
log_probs = [[0.0] + [-1e9] * (self.num_beams - 1)]
log_probs = flatten_beams(ops.repeat(log_probs, batch_size, axis=0))

def cond(prompt, cache, index, log_probs):
if end_token_id is None:
return True
# Stop if all sequences have produced a *new* end_token_id.
end_tokens = (prompt == end_token_id) & (~mask)
prompt_done = tf.reduce_any(end_tokens, axis=-1)
return not tf.reduce_all(prompt_done)
prompt_done = ops.any(end_tokens, axis=-1)
return not ops.all(prompt_done)

def body(prompt, cache, index, log_probs):
# Compute the softmax distribution for the next token.
logits, _, cache = next(prompt, cache, index)
vocab_size = tf.shape(logits)[-1]
vocab_size = ops.shape(logits)[-1]
probs = keras.activations.softmax(logits / self.temperature)

# Compute the running log-likelihood of each new candidate.
next_log_probs = tf.math.log(probs) + log_probs[..., tf.newaxis]
next_log_probs = ops.log(probs) + log_probs[..., tf.newaxis]
# Reshape `preds` to shape `(batch_size, num_beams * vocab_size)`.
next_log_probs = tf.reshape(next_log_probs, shape=[batch_size, -1])
next_log_probs = ops.reshape(
next_log_probs, new_shape=[batch_size, -1]
)

# Compute the top beam indices and next tokens.
next_log_probs, indices = tf.math.top_k(
next_log_probs, indices = ops.top_k(
next_log_probs, k=self.num_beams, sorted=False
)
beam_indices = indices // vocab_size
next_token = flatten_beams(indices % vocab_size)
# Ensure shape is `[None]`, otherwise it causes issues after
# converting to TFLite.
next_token = tf.ensure_shape(next_token, [None])
# next_token = tf.ensure_shape(next_token, [None])
# We need `ensure_shape` as `top_k` will change the static shape.
next_log_probs = flatten_beams(next_log_probs)
log_probs = tf.ensure_shape(next_log_probs, log_probs.shape)
# log_probs = tf.ensure_shape(next_log_probs, log_probs.shape)

def gather_beams(x):
x = unflatten_beams(x)
x = tf.gather(x, beam_indices, axis=1, batch_dims=1)
x = ops.take_along_axis(x, beam_indices, axis=1)
return flatten_beams(x)

prompt = gather_beams(prompt)
cache = tf.nest.map_structure(gather_beams, cache)

# Update each beam with the next token.
next_token = tf.cast(next_token, prompt.dtype)
next_token = ops.cast(next_token, prompt.dtype)
# Don't overwrite anywhere mask is True.
next_token = tf.where(mask[:, index], prompt[:, index], next_token)
next_token = ops.where(mask[:, index], prompt[:, index], next_token)
# Update the prompt with the next token.
next_token = next_token[:, tf.newaxis]
prompt = dynamic_update_slice(prompt, next_token, [0, index])
# Return the iteration of the loop state.
return (prompt, cache, index + 1, log_probs)

prompt, _, _, log_probs = tf.while_loop(
prompt, _, _, log_probs = ops.while_loop(
cond=cond,
body=body,
loop_vars=(prompt, cache, index, log_probs),
Expand All @@ -213,21 +211,19 @@ def gather_beams(x):
all_log_probs = unflatten_beams(log_probs)

if self.return_all_beams:
sorted_indices = tf.argsort(
all_log_probs, axis=-1, direction="DESCENDING"
)
sorted_log_probs = tf.gather(
all_log_probs, sorted_indices, axis=-1, batch_dims=1
sorted_indices = ops.argsort(all_log_probs, axis=-1)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't have option to set direction in argsort. Because of this couple of tests fail in local with assertion error.

The possible solution could be to reverse the array sorted_indices along axis=-1?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ops.argsort(-all_log_probs, axis=-1) should do the trick

Copy link
Collaborator Author

@shivance shivance Jul 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great! This fixes the failing test for tf backend

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need the minus sign in front so that it is sorted in descending order

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Flip does the job here.

Great! This fixes the failing test for tf backend

I was referring to ops.flip here 😂. Probably you commented at the same time.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:o. Missed the ops.flip line

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think slipping the sign of the log probs would be more efficient, probably worth doing that and avoiding flip.

sorted_log_probs = ops.take_along_axis(
all_log_probs, sorted_indices, axis=-1
)
sorted_prompts = tf.gather(
all_prompts, sorted_indices, axis=1, batch_dims=1
sorted_prompts = ops.take_along_axis(
all_prompts, sorted_indices, axis=1
)
return sorted_prompts, sorted_log_probs
else:
# Gather the top beam at each batch index.
top_beams = tf.math.argmax(all_log_probs, axis=-1)[:, tf.newaxis]
prompt = tf.gather(all_prompts, top_beams, axis=1, batch_dims=1)
return tf.squeeze(prompt, axis=1)
top_beams = ops.argmax(all_log_probs, axis=-1)[:, tf.newaxis]
prompt = ops.take_along_axis(all_prompts, top_beams, axis=1)
return ops.squeeze(prompt, axis=1)

def get_config(self):
config = super().get_config()
Expand Down
1 change: 0 additions & 1 deletion keras_nlp/samplers/beam_sampler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
from keras_nlp.tests.test_case import TestCase


@pytest.mark.tf_only
class BeamSamplerTest(TestCase):
def setUp(self):
super().setUp()
Expand Down
Loading