Add detokenize testing for model tokenizers
mattdangerw committed Oct 28, 2023
1 parent 6b66ad8 commit 08a96e2
Showing 11 changed files with 33 additions and 18 deletions.
4 changes: 2 additions & 2 deletions keras_nlp/models/albert/albert_tokenizer_test.py
@@ -28,14 +28,14 @@ def setUp(self):
                 self.get_test_data_dir(), "albert_test_vocab.spm"
             )
         }
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]
 
     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=AlbertTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 10, 6, 1], [5, 7, 9, 1]],
+            expected_output=[[5, 10, 6, 8], [5, 7, 9, 11]],
         )
 
     def test_errors_missing_special_tokens(self):
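The input tweak above (dropping the trailing period) is what lets the new detokenize check pass: the tiny test vocabulary has no entry for ".", so it tokenized to a catch-all id (the trailing 1 in the old expected output, presumably the unknown token), which cannot be detokenized back to the original text. The same edit repeats in the BERT, DeBERTa, DistilBERT, FNet, T5, and XLM-RoBERTa tests below.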
5 changes: 5 additions & 0 deletions keras_nlp/models/bart/bart_tokenizer_test.py
@@ -37,7 +37,12 @@ def test_tokenizer_basics(self):
             cls=BartTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
+            # TODO: </s> should not get tokenized as <s>
             expected_output=[[0, 4, 5, 6, 4, 7, 0, 1], [4, 5, 4, 7]],
+            expected_detokenize_output=[
+                "<s> airplane at airport<s><pad>",
+                " airplane airport",
+            ],
         )
 
     def test_errors_missing_special_tokens(self):
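The new expected_detokenize_output records a known round-trip gap: in this toy setup </s> tokenizes to the same id as <s> (both appear as 0 in expected_output, hence the TODO), so detokenize emits <s> in its place. A hypothetical round-trip inside this test, with the input strings inferred from the expected values rather than shown in the diff:

    tokenizer = BartTokenizer(**self.init_kwargs)
    ids = tokenizer(["<s> airplane at airport</s><pad>", " airplane airport"])
    print(tokenizer.detokenize(ids))
    # -> ["<s> airplane at airport<s><pad>", " airplane airport"]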
4 changes: 2 additions & 2 deletions keras_nlp/models/bert/bert_tokenizer_test.py
@@ -24,14 +24,14 @@ def setUp(self):
         self.vocab += ["THE", "QUICK", "BROWN", "FOX"]
         self.vocab += ["the", "quick", "brown", "fox"]
         self.init_kwargs = {"vocabulary": self.vocab}
-        self.input_data = ["THE QUICK BROWN FOX.", "THE FOX."]
+        self.input_data = ["THE QUICK BROWN FOX", "THE FOX"]
 
     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=BertTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 6, 7, 8, 1], [5, 8, 1]],
+            expected_output=[[5, 6, 7, 8], [5, 8]],
         )
 
     def test_lowercase(self):
4 changes: 2 additions & 2 deletions keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py
@@ -28,14 +28,14 @@ def setUp(self):
         )
         self.tokenizer = DebertaV3Tokenizer(proto=proto)
         self.init_kwargs = {"proto": proto}
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]
 
     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=DebertaV3Tokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 10, 6, 3], [5, 7, 9, 3]],
+            expected_output=[[5, 10, 6, 8], [5, 7, 9, 11]],
         )
 
     def test_errors_missing_special_tokens(self):
4 changes: 2 additions & 2 deletions keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py
@@ -26,14 +26,14 @@ def setUp(self):
         self.vocab += ["THE", "QUICK", "BROWN", "FOX"]
         self.vocab += ["the", "quick", "brown", "fox"]
         self.init_kwargs = {"vocabulary": self.vocab}
-        self.input_data = ["THE QUICK BROWN FOX.", "THE FOX."]
+        self.input_data = ["THE QUICK BROWN FOX", "THE FOX"]
 
     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=DistilBertTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 6, 7, 8, 1], [5, 8, 1]],
+            expected_output=[[5, 6, 7, 8], [5, 8]],
         )
 
     def test_lowercase(self):
4 changes: 2 additions & 2 deletions keras_nlp/models/f_net/f_net_tokenizer_test.py
@@ -28,14 +28,14 @@ def setUp(self):
                 self.get_test_data_dir(), "f_net_test_vocab.spm"
             )
         }
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]
 
     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=FNetTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 10, 6, 1], [5, 7, 9, 1]],
+            expected_output=[[5, 10, 6, 8], [5, 7, 9, 11]],
         )
 
     def test_errors_missing_special_tokens(self):
5 changes: 5 additions & 0 deletions keras_nlp/models/roberta/roberta_tokenizer_test.py
@@ -37,7 +37,12 @@ def test_tokenizer_basics(self):
             cls=RobertaTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
+            # TODO: </s> should not get tokenized as <s>
             expected_output=[[0, 4, 5, 6, 4, 7, 0, 1], [4, 5, 4, 7]],
+            expected_detokenize_output=[
+                "<s> airplane at airport<s><pad>",
+                " airplane airport",
+            ],
         )
 
     def test_errors_missing_special_tokens(self):
4 changes: 2 additions & 2 deletions keras_nlp/models/t5/t5_tokenizer_test.py
@@ -26,14 +26,14 @@ def setUp(self):
             # Generated using create_t5_test_proto.py
             "proto": os.path.join(self.get_test_data_dir(), "t5_test_vocab.spm")
         }
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]
 
     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=T5Tokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[4, 9, 5, 2], [4, 6, 8, 2]],
+            expected_output=[[4, 9, 5, 7], [4, 6, 8, 10]],
         )
 
     def test_errors_missing_special_tokens(self):
3 changes: 0 additions & 3 deletions keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
@@ -157,9 +157,6 @@ def tokenize(self, inputs):
         return tf.add(tokens, 1)
 
     def detokenize(self, inputs):
-        if inputs.dtype == tf.string:
-            return super().detokenize(inputs)
-
         tokens = tf.ragged.boolean_mask(
             inputs, tf.not_equal(inputs, self.mask_token_id)
         )
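The deleted branch made detokenize a pass-through for string inputs; after this change every input goes through the mask-token filter. A minimal sketch of the resulting method, where the boolean_mask call comes from the diff's own context lines and the id un-shift plus super() delegation are assumptions (mirroring the tf.add(tokens, 1) in tokenize above):

    def detokenize(self, inputs):  # method of XLMRobertaTokenizer (sketch)
        # Drop mask tokens, which have no SentencePiece equivalent.
        tokens = tf.ragged.boolean_mask(
            inputs, tf.not_equal(inputs, self.mask_token_id)
        )
        # Assumed: undo the +1 id shift applied in tokenize() before
        # handing the ids to the underlying SentencePiece detokenizer.
        tokens = tf.subtract(tokens, 1)
        return super().detokenize(tokens)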
4 changes: 2 additions & 2 deletions keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py
@@ -30,14 +30,14 @@ def setUp(self):
                 self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
             )
         }
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]
 
     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=XLMRobertaTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[6, 11, 7, 2], [6, 8, 10, 2]],
+            expected_output=[[6, 11, 7, 9], [6, 8, 10, 12]],
         )
 
     @pytest.mark.large
10 changes: 9 additions & 1 deletion keras_nlp/tests/test_case.py
@@ -24,6 +24,7 @@
 from keras_nlp.backend import config
 from keras_nlp.backend import keras
 from keras_nlp.backend import ops
+from keras_nlp.tokenizers.tokenizer import Tokenizer
 from keras_nlp.utils.tensor_utils import is_float_dtype
 from keras_nlp.utils.tensor_utils import standardize_dtype
 
@@ -203,7 +204,7 @@ def run_preprocessing_layer_test(
         init_kwargs,
         input_data,
         expected_output=None,
-        batch_size=2,
+        expected_detokenize_output=None,
     ):
         """Run basic tests for a preprocessing layer."""
         layer = cls(**init_kwargs)
@@ -219,6 +220,13 @@
         else:
             output = layer(input_data)
 
+        # For tokenizers only, also check detokenize.
+        if isinstance(layer, Tokenizer):
+            if not expected_detokenize_output:
+                expected_detokenize_output = input_data
+            detokenize_output = layer.detokenize(output)
+            self.assertAllEqual(detokenize_output, expected_detokenize_output)
+
         # Run with an unbatched dataset.
         output_ds = ds.map(layer).ragged_batch(1_000)
         self.assertAllClose(output, output_ds.get_single_element())
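Taken together, every tokenizer test now gets a detokenize round-trip check for free: when expected_detokenize_output is omitted, the harness asserts that detokenize(tokenize(input_data)) equals input_data. A hypothetical subclass test using the new argument (MyTokenizer and its init kwargs are placeholders, not part of this commit):

    from keras_nlp.tests.test_case import TestCase

    class MyTokenizerTest(TestCase):
        def test_tokenizer_basics(self):
            self.run_preprocessing_layer_test(
                cls=MyTokenizer,  # placeholder tokenizer class
                init_kwargs={"vocabulary": vocab},  # placeholder kwargs
                input_data=["the quick brown fox"],
                expected_output=[[5, 10, 6, 8]],
                # Only needed when the round trip is lossy; defaults to
                # input_data otherwise.
                expected_detokenize_output=["the quick brown fox"],
            )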
