Use gelu_approximate directly in t5 presets (#1284)
We should keep `activation="gelu"` meaning the same canonical thing it
means across Keras. Let's use our string identifier for approximate
gelu directly in the presets.
mattdangerw authored Oct 24, 2023
1 parent c5a531a commit 4c43428
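For reference, Keras exposes both forms of GELU through the same function; the following is a minimal sketch (not part of this commit) of the distinction the commit message is drawing, assuming the standard `keras.activations.gelu` signature:

```python
import numpy as np
import keras

x = np.array([-1.0, 0.0, 1.0, 2.0], dtype="float32")

# Canonical GELU: what the string "gelu" means everywhere else in Keras.
exact = keras.activations.gelu(x)

# Tanh-approximate GELU: the variant the T5 presets actually need.
approx = keras.activations.gelu(x, approximate=True)

# The two curves differ slightly, which is why the presets now name the
# approximate variant explicitly instead of silently remapping "gelu".
```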
Showing 3 changed files with 5 additions and 11 deletions.
5 changes: 2 additions & 3 deletions keras_nlp/models/t5/t5_backbone.py
@@ -58,8 +58,7 @@ class T5Backbone(Backbone):
         dropout: float. Dropout probability for the Transformer layers.
         activation: activation function (or activation string name). The
             activation to be used in the inner dense blocks of the
-            Transformer layers. The original T5 architecture used `"relu"`,
-            but more recent versions use `"gelu"`. Defaults to `"gelu"`.
+            Transformer layers. Defaults to `"relu"`.
         use_gated_activation: boolean. Whether to use activation gating in
             the inner dense blocks of the Transformer layers.
             The original T5 architecture didn't use gating, but more
@@ -80,7 +79,7 @@ def __init__(
         intermediate_dim,
         key_value_dim=None,
         dropout=0.1,
-        activation="gelu",
+        activation="relu",
         use_gated_activation=True,
         layer_norm_epsilon=1e-06,
         tie_embedding_weights=False,
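With the backbone default back to `"relu"`, callers who want the approximate GELU now pass the registered identifier explicitly. A hedged sketch of that usage; the hyperparameter values below are placeholders, not real preset values:

```python
from keras_nlp.models import T5Backbone

# Placeholder hyperparameters for illustration only; the activation
# argument is the point here.
backbone = T5Backbone(
    vocabulary_size=32128,
    num_layers=2,
    num_heads=4,
    hidden_dim=64,
    intermediate_dim=128,
    activation="keras_nlp>gelu_approximate",
)
```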
6 changes: 3 additions & 3 deletions keras_nlp/models/t5/t5_presets.py
@@ -106,7 +106,7 @@
             "intermediate_dim": 1024,
             "key_value_dim": 64,
             "dropout": 0.1,
-            "activation": "gelu",
+            "activation": "keras_nlp>gelu_approximate",
             "use_gated_activation": True,
             "layer_norm_epsilon": 1e-06,
         },
@@ -130,7 +130,7 @@
             "hidden_dim": 768,
             "intermediate_dim": 2048,
             "dropout": 0.1,
-            "activation": "gelu",
+            "activation": "keras_nlp>gelu_approximate",
             "use_gated_activation": True,
             "layer_norm_epsilon": 1e-06,
         },
@@ -154,7 +154,7 @@
             "hidden_dim": 1024,
             "intermediate_dim": 2816,
             "dropout": 0.1,
-            "activation": "gelu",
+            "activation": "keras_nlp>gelu_approximate",
             "use_gated_activation": True,
             "layer_norm_epsilon": 1e-06,
         },
5 changes: 0 additions & 5 deletions keras_nlp/models/t5/t5_transformer_layer.py
@@ -65,11 +65,6 @@ def __init__(
             self.cross_attention_layer_norm = T5LayerNorm(layer_norm_epsilon)
             self.cross_attention_dropout = keras.layers.Dropout(dropout)
 
-        if activation == "gelu":
-            activation = keras.activations.get("keras_nlp>gelu_approximate")
-        else:
-            activation = keras.activations.get(activation)
-
         self.input_projector = keras.layers.Dense(
             intermediate_dim,
             use_bias=False,
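With the special-casing removed, the layer depends on the identifier resolving through Keras's custom-object registry. Below is a rough sketch of how a package-scoped activation such as `keras_nlp>gelu_approximate` can be registered; keras_nlp's actual registration may differ in detail:

```python
import keras

# Hypothetical registration sketch, not keras_nlp's exact code.
@keras.saving.register_keras_serializable(
    package="keras_nlp", name="gelu_approximate"
)
def gelu_approximate(x):
    return keras.activations.gelu(x, approximate=True)

# register_keras_serializable stores the object under "package>name",
# which is why the preset string round-trips back to this function.
fn = keras.saving.get_custom_objects()["keras_nlp>gelu_approximate"]
assert fn is gelu_approximate
```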
