diff --git a/keras_nlp/conftest.py b/keras_nlp/conftest.py
index 04daf5cd37..3a23ee475d 100644
--- a/keras_nlp/conftest.py
+++ b/keras_nlp/conftest.py
@@ -86,8 +86,3 @@ def pytest_collection_modifyitems(config, items):
     tf.debugging.disable_traceback_filtering()
     if backend_config.multi_backend():
         keras.config.disable_traceback_filtering()
-
-# One off setup for dtensor tests.
-if not backend_config.multi_backend():
-    keras.backend.experimental.enable_tf_random_generator()
-    keras.utils.set_random_seed(1337)
diff --git a/keras_nlp/models/gpt2/gpt2_backbone.py b/keras_nlp/models/gpt2/gpt2_backbone.py
index caf73c0606..3f357d6408 100644
--- a/keras_nlp/models/gpt2/gpt2_backbone.py
+++ b/keras_nlp/models/gpt2/gpt2_backbone.py
@@ -14,10 +14,6 @@
 
 import copy
 
-from tensorflow.experimental import dtensor
-from tensorflow.experimental.dtensor import Layout
-from tensorflow.keras.dtensor.experimental import LayoutMap
-
 from keras_nlp.api_export import keras_nlp_export
 from keras_nlp.backend import keras
 from keras_nlp.layers.modeling.position_embedding import PositionEmbedding
@@ -190,71 +186,3 @@ def get_config(self):
     @classproperty
     def presets(cls):
         return copy.deepcopy(backbone_presets)
-
-    @classmethod
-    def create_layout_map(cls, mesh):
-        """Create a DTensor layout map for a GPT2Backbone.
-
-        Given a DTensor mesh describing a list of devices, this method returns a
-        DTensor layout map for creating a `keras_nlp.models.GPT2Backbone`
-        instance. This mapping describes how to distribute all model weights
-        across multiple devices. For an overview of DTensor concepts, see
-        [this guide](https://www.tensorflow.org/guide/dtensor_overview).
-
-        Args:
-            mesh: A 2D `tf.experimental.dtensor.Mesh` describing the arrangement
-                of devices for running distributed computation. The
-                first dimension in the mesh is expected to be for data parallel
-                distribution, and the second for model parallel distribution.
-
-        Returns:
-            A `tf.keras.dtensor.experimental.LayoutMap` which contains the
-            proper layout to weights mapping for the model parallel setting.
-
-        Examples:
-        ```python
-        keras.backend.experimental.enable_tf_random_generator()
-        keras.utils.set_random_seed(1337)
-
-        # Update both dimensions below for a multi-device setting.
-        mesh = dtensor.create_mesh([("batch", 1), ("model", 1)])
-        layout_map = keras_nlp.models.GPT2Backbone.create_layout_map(mesh)
-
-        with layout_map.scope():
-            model = keras_nlp.models.GPT2Backbone.from_preset("gpt2_base_en")
-        ```
-        """
-        # We assert the mesh is 2D, and assume the first mesh dim is for data
-        # parallel and the second dim is for model parallel.
-        mesh_shape = mesh.shape()
-        if len(mesh_shape) != 2:
-            raise ValueError(
-                f"Expect to create layout based on 2D mesh, received {mesh}"
-            )
-        _, model_dim = mesh.dim_names
-        unshard_dim = dtensor.UNSHARDED
-
-        layout_map = LayoutMap(mesh=mesh)
-        # Embedding sharding
-        layout_map[r".*embeddings"] = Layout([unshard_dim, model_dim], mesh)
-
-        # Transformer block sharding
-        layout_map[r".*_(query|key|value)_dense.kernel"] = Layout(
-            [unshard_dim, unshard_dim, model_dim], mesh
-        )
-        layout_map[r".*_(query|key|value)_dense.bias"] = Layout(
-            [model_dim, unshard_dim], mesh
-        )
-        layout_map[r".*_feedforward_intermediate_dense.kernel"] = Layout(
-            [unshard_dim, model_dim], mesh
-        )
-        layout_map[r".*_feedforward_intermediate_dense.bias"] = Layout(
-            [model_dim], mesh
-        )
-        layout_map[r".*_feedforward_output_dense.kernel"] = Layout(
-            [model_dim, unshard_dim], mesh
-        )
-        layout_map[r".*_feedforward_output_dense.bias"] = Layout(
-            [unshard_dim], mesh
-        )
-        return layout_map
diff --git a/keras_nlp/models/gpt2/gpt2_backbone_test.py b/keras_nlp/models/gpt2/gpt2_backbone_test.py
index 8fc779c634..c82ec1b06d 100644
--- a/keras_nlp/models/gpt2/gpt2_backbone_test.py
+++ b/keras_nlp/models/gpt2/gpt2_backbone_test.py
@@ -84,20 +84,3 @@ def test_saved_model(self):
         # Check that output matches.
         restored_output = restored_model(self.input_batch)
         self.assertAllClose(model_output, restored_output)
-
-    def test_create_layout_map(self):
-        mesh = tf.experimental.dtensor.create_mesh([("batch", 1), ("model", 1)])
-        with GPT2Backbone.create_layout_map(mesh).scope():
-            GPT2Backbone(
-                vocabulary_size=10,
-                num_layers=2,
-                num_heads=2,
-                hidden_dim=2,
-                intermediate_dim=4,
-                max_sequence_length=5,
-            )
-        # Using DTensor enables the mlir bridge as a side effect. Eventually
-        # this will be default, but for now we have compile errors with the
-        # bridge elsewhere and must disable. See
-        # https://github.com/keras-team/keras-nlp/issues/1001
-        tf.config.experimental.disable_mlir_bridge()
diff --git a/keras_nlp/models/gpt2/gpt2_causal_lm.py b/keras_nlp/models/gpt2/gpt2_causal_lm.py
index 23dcc41664..44eebd0a20 100644
--- a/keras_nlp/models/gpt2/gpt2_causal_lm.py
+++ b/keras_nlp/models/gpt2/gpt2_causal_lm.py
@@ -325,39 +325,3 @@ def next(prompt, cache, index):
             "token_ids": token_ids,
             "padding_mask": padding_mask,
         }
-
-    @classmethod
-    def create_layout_map(cls, mesh):
-        """Create a DTensor layout map for a GPT2CausalLM.
-
-        Given a DTensor mesh describing a list of devices, this method returns a
-        DTensor layout map for creating a `keras_nlp.models.GPT2CausalLM`
-        instance. This mapping describes how to distribute all model weights
-        across multiple devices. For an overview of DTensor concepts, see
-        [this guide](https://www.tensorflow.org/guide/dtensor_overview).
-
-        Args:
-            mesh: A 2D `tf.experimental.dtensor.Mesh` describing the arrangement
-                of devices for running distributed computation. The
-                first dimension in the mesh is expected to be for data parallel
-                distribution, and the second for model parallel distribution.
-
-        Returns:
-            A `keras.dtensor.experimental.LayoutMap` which contains the
-            proper layout to weights mapping for the model parallel setting.
-
-        Examples:
-        ```python
-        keras.backend.experimental.enable_tf_random_generator()
-        keras.utils.set_random_seed(1337)
-
-        # Update both dimensions below for a multi-device setting.
-        mesh = tf.experimental.dtensor.create_mesh([("batch", 1), ("model", 1)])
-        layout_map = keras_nlp.models.GPT2CausalLM.create_layout_map(mesh)
-
-        with layout_map.scope():
-            gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset("gpt2_base_en")
-        ```
-        """
-        # As this task has no new variables, we just re-use the backbone method.
-        return cls.backbone_cls.create_layout_map(mesh)
diff --git a/keras_nlp/models/gpt2/gpt2_causal_lm_test.py b/keras_nlp/models/gpt2/gpt2_causal_lm_test.py
index c50b6c5cf4..412083b275 100644
--- a/keras_nlp/models/gpt2/gpt2_causal_lm_test.py
+++ b/keras_nlp/models/gpt2/gpt2_causal_lm_test.py
@@ -165,13 +165,3 @@ def test_saved_model(self):
         keras.utils.set_random_seed(42)
         restored_output = restored_model.predict(self.raw_batch)
         self.assertAllClose(model_output, restored_output)
-
-    def test_create_layout_map(self):
-        mesh = tf.experimental.dtensor.create_mesh([("batch", 1), ("model", 1)])
-        with GPT2CausalLM.create_layout_map(mesh).scope():
-            GPT2CausalLM(backbone=self.backbone)
-        # Using DTensor enables the mlir bridge as a side effect. Eventually
-        # this will be default, but for now we have compile errors with the
-        # bridge elsewhere and must disable. See
-        # https://github.com/keras-team/keras-nlp/issues/1001
-        tf.config.experimental.disable_mlir_bridge()
diff --git a/keras_nlp/models/opt/opt_backbone.py b/keras_nlp/models/opt/opt_backbone.py
index 8fe37472a0..ff1495ba9f 100644
--- a/keras_nlp/models/opt/opt_backbone.py
+++ b/keras_nlp/models/opt/opt_backbone.py
@@ -14,10 +14,6 @@
 
 import copy
 
-from tensorflow.experimental import dtensor
-from tensorflow.experimental.dtensor import Layout
-from tensorflow.keras.dtensor.experimental import LayoutMap
-
 from keras_nlp.api_export import keras_nlp_export
 from keras_nlp.backend import keras
 from keras_nlp.layers.modeling.token_and_position_embedding import (
@@ -168,71 +164,3 @@ def get_config(self):
     @classproperty
     def presets(cls):
         return copy.deepcopy(backbone_presets)
-
-    @classmethod
-    def create_layout_map(cls, mesh):
-        """Create a DTensor layout map for an OPTBackbone.
-
-        Given a DTensor mesh describing a list of devices, this method returns a
-        DTensor layout map for creating a `keras_nlp.models.OPTBackbone`
-        instance. This mapping describes how to distribute all model weights
-        across multiple devices. For an overview of DTensor concepts, see
-        [this guide](https://www.tensorflow.org/guide/dtensor_overview).
-
-        Args:
-            mesh: A 2D `tf.experimental.dtensor.Mesh` describing the arrangement
-                of devices for running distributed computation. The
-                first dimension in the mesh is expected to be for data parallel
-                distribution, and the second for model parallel distribution.
-
-        Returns:
-            A `tf.keras.dtensor.experimental.LayoutMap` which contains the
-            proper layout to weights mapping for the model parallel setting.
-
-        Examples:
-        ```python
-        keras.backend.experimental.enable_tf_random_generator()
-        keras.utils.set_random_seed(1337)
-
-        # Update both dimensions below for a multi-device setting.
-        mesh = dtensor.create_mesh([("batch", 1), ("model", 1)])
-        layout_map = keras_nlp.models.OPTBackbone.create_layout_map(mesh)
-
-        with layout_map.scope():
-            model = keras_nlp.models.OPTBackbone.from_preset("opt_125m_en")
-        ```
-        """
-        # We assert the mesh is 2D, and assume the first mesh dim is for data
-        # parallel and the second dim is for model parallel.
-        mesh_shape = mesh.shape()
-        if len(mesh_shape) != 2:
-            raise ValueError(
-                f"Expect to create layout based on 2D mesh, received {mesh}"
-            )
-        _, model_dim = mesh.dim_names
-        unshard_dim = dtensor.UNSHARDED
-
-        layout_map = LayoutMap(mesh=mesh)
-        # Embedding sharding
-        layout_map[r".*embeddings"] = Layout([unshard_dim, model_dim], mesh)
-
-        # Transformer block sharding
-        layout_map[r".*_(query|key|value)_dense.kernel"] = Layout(
-            [unshard_dim, unshard_dim, model_dim], mesh
-        )
-        layout_map[r".*_(query|key|value)_dense.bias"] = Layout(
-            [model_dim, unshard_dim], mesh
-        )
-        layout_map[r".*_feedforward_intermediate_dense.kernel"] = Layout(
-            [unshard_dim, model_dim], mesh
-        )
-        layout_map[r".*_feedforward_intermediate_dense.bias"] = Layout(
-            [model_dim], mesh
-        )
-        layout_map[r".*_feedforward_output_dense.kernel"] = Layout(
-            [model_dim, unshard_dim], mesh
-        )
-        layout_map[r".*_feedforward_output_dense.bias"] = Layout(
-            [unshard_dim], mesh
-        )
-        return layout_map
diff --git a/keras_nlp/models/opt/opt_backbone_test.py b/keras_nlp/models/opt/opt_backbone_test.py
index 1d7e54889c..c887001040 100644
--- a/keras_nlp/models/opt/opt_backbone_test.py
+++ b/keras_nlp/models/opt/opt_backbone_test.py
@@ -84,20 +84,3 @@ def test_saved_model(self):
         # Check that output matches.
         restored_output = restored_model(self.input_batch)
         self.assertAllClose(model_output, restored_output)
-
-    def test_create_layout_map(self):
-        mesh = tf.experimental.dtensor.create_mesh([("batch", 1), ("model", 1)])
-        with OPTBackbone.create_layout_map(mesh).scope():
-            OPTBackbone(
-                vocabulary_size=10,
-                num_layers=2,
-                num_heads=2,
-                hidden_dim=2,
-                intermediate_dim=4,
-                max_sequence_length=5,
-            )
-        # Using DTensor enables the mlir bridge as a side effect. Eventually
-        # this will be default, but for now we have compile errors with the
-        # bridge elsewhere and must disable. See
-        # https://github.com/keras-team/keras-nlp/issues/1001
-        tf.config.experimental.disable_mlir_bridge()
diff --git a/keras_nlp/models/opt/opt_causal_lm.py b/keras_nlp/models/opt/opt_causal_lm.py
index f0b0682749..6197a87ffd 100644
--- a/keras_nlp/models/opt/opt_causal_lm.py
+++ b/keras_nlp/models/opt/opt_causal_lm.py
@@ -321,39 +321,3 @@ def next(prompt, cache, index):
             "token_ids": token_ids,
             "padding_mask": padding_mask,
         }
-
-    @classmethod
-    def create_layout_map(cls, mesh):
-        """Create a DTensor layout map for an OPTCausalLM.
-
-        Given a DTensor mesh describing a list of devices, this method returns a
-        DTensor layout map for creating a `keras_nlp.models.OPTCausalLM`
-        instance. This mapping describes how to distribute all model weights
-        across multiple devices. For an overview of DTensor concepts, see
-        [this guide](https://www.tensorflow.org/guide/dtensor_overview).
-
-        Args:
-            mesh: A 2D `tf.experimental.dtensor.Mesh` describing the arrangement
-                of devices for running distributed computation. The
-                first dimension in the mesh is expected to be for data parallel
-                distribution, and the second for model parallel distribution.
-
-        Returns:
-            A `tf.keras.dtensor.experimental.LayoutMap` which contains the
-            proper layout to weights mapping for the model parallel setting.
-
-        Examples:
-        ```python
-        keras.backend.experimental.enable_tf_random_generator()
-        keras.utils.set_random_seed(1337)
-
-        # Update both dimensions below for a multi-device setting.
-        mesh = tf.experimental.dtensor.create_mesh([("batch", 1), ("model", 1)])
-        layout_map = keras_nlp.models.OPTCausalLM.create_layout_map(mesh)
-
-        with layout_map.scope():
-            opt_lm = keras_nlp.models.OPTCausalLM.from_preset("opt_125m_en")
-        ```
-        """
-        # As this task has no new variables, we just re-use the backbone method.
-        return cls.backbone_cls.create_layout_map(mesh)
diff --git a/keras_nlp/models/opt/opt_causal_lm_test.py b/keras_nlp/models/opt/opt_causal_lm_test.py
index 19325c7b0a..1e8fcb8785 100644
--- a/keras_nlp/models/opt/opt_causal_lm_test.py
+++ b/keras_nlp/models/opt/opt_causal_lm_test.py
@@ -171,13 +171,3 @@ def test_saved_model(self):
         keras.utils.set_random_seed(42)
         restored_output = restored_model.predict(self.raw_batch)
         self.assertAllClose(model_output, restored_output)
-
-    def test_create_layout_map(self):
-        mesh = tf.experimental.dtensor.create_mesh([("batch", 1), ("model", 1)])
-        with OPTCausalLM.create_layout_map(mesh).scope():
-            OPTCausalLM(backbone=self.backbone)
-        # Using DTensor enables the mlir bridge as a side effect. Eventually
-        # this will be default, but for now we have compile errors with the
-        # bridge elsewhere and must disable. See
-        # https://github.com/keras-team/keras-nlp/issues/1001
-        tf.config.experimental.disable_mlir_bridge()