Add CLIP model (#1955)

* Add `CLIPVisionEmbedding` * Add `CLIPBackbone` and `CLIPVisionEncoder` and `CLIPImageConverter` * Fix typo
keras-team · Oct 28, 2024 · 9238b06 · 9238b06
1 parent a45110e
commit 9238b06
Show file tree

Hide file tree

Showing 8 changed files with 511 additions and 1 deletion.
diff --git a/keras_hub/api/layers/__init__.py b/keras_hub/api/layers/__init__.py
@@ -34,6 +34,7 @@
 from keras_hub.src.layers.preprocessing.random_deletion import RandomDeletion
 from keras_hub.src.layers.preprocessing.random_swap import RandomSwap
 from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker
+from keras_hub.src.models.clip.clip_image_converter import CLIPImageConverter
 from keras_hub.src.models.deeplab_v3.deeplab_v3_image_converter import (
     DeepLabV3ImageConverter,
 )

diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py
@@ -53,8 +53,11 @@
 from keras_hub.src.models.bloom.bloom_tokenizer import BloomTokenizer
 from keras_hub.src.models.causal_lm import CausalLM
 from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor
+from keras_hub.src.models.clip.clip_backbone import CLIPBackbone
 from keras_hub.src.models.clip.clip_preprocessor import CLIPPreprocessor
+from keras_hub.src.models.clip.clip_text_encoder import CLIPTextEncoder
 from keras_hub.src.models.clip.clip_tokenizer import CLIPTokenizer
+from keras_hub.src.models.clip.clip_vision_encoder import CLIPVisionEncoder
 from keras_hub.src.models.csp_darknet.csp_darknet_backbone import (
     CSPDarkNetBackbone,
 )

diff --git a/keras_hub/src/models/clip/clip_backbone.py b/keras_hub/src/models/clip/clip_backbone.py
@@ -0,0 +1,242 @@
+import math
+
+from keras import layers
+from keras import ops
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.backbone import Backbone
+
+
+class CLIPVisionPooler(layers.Layer):
+    """Pooling layer for the CLIP vision tower.
+
+    `CLIPVisionPooler` selects the embedding of the first token (index `0`)
+    of every sequence and returns it as the pooled output.
+
+    Call arguments:
+        vision_embeddings: A tensor of shape
+            `(batch_size, sequence_length, hidden_dim)`.
+
+    Returns:
+        A tensor of shape `(batch_size, hidden_dim)`.
+    """
+
+    def call(self, vision_embeddings):
+        # Integer indexing on the sequence axis drops that axis.
+        return vision_embeddings[:, 0, :]
+
+
+class CLIPTextPooler(layers.Layer):
+    """Pooling layer for the CLIP text tower.
+
+    `CLIPTextPooler` returns, for every sequence, the text embedding located
+    at the EOS token. The EOS position is taken to be the position of the
+    largest value in `token_ids` (via `argmax`), so the EOS token is expected
+    to have the highest id present in each row.
+
+    Call arguments:
+        text_embeddings: A tensor of shape
+            `(batch_size, sequence_length, hidden_dim)`.
+        token_ids: A tensor of shape `(batch_size, max_tokens)`, used to
+            identify the positions of EOS tokens.
+
+    Returns:
+        A tensor of shape `(batch_size, hidden_dim)`.
+    """
+
+    def call(self, text_embeddings, token_ids):
+        # Shape `(batch_size, 1, 1)` so that the index broadcasts over the
+        # hidden axis when gathering along the sequence axis.
+        eos_positions = ops.expand_dims(
+            ops.argmax(token_ids, axis=-1, keepdims=True), axis=-1
+        )
+        eos_embeddings = ops.take_along_axis(
+            text_embeddings, eos_positions, axis=1
+        )
+        # Drop the singleton sequence axis left over from the gather.
+        return ops.squeeze(eos_embeddings, axis=1)
+
+
+class CLIPHead(layers.Layer):
+    """The similarity head of CLIP.
+
+    `CLIPHead` receives a batch of vision embeddings and a batch of text
+    embeddings, divides each embedding by its L2 norm, and computes the
+    pairwise cosine similarity between the two batches. The similarities are
+    multiplied by `exp(logit_scale)`, where `logit_scale` is a learnable
+    scalar.
+
+    Call arguments:
+        vision_embedding: A tensor of shape `(batch_size, hidden_dim)`.
+        text_embedding: A tensor of shape `(batch_size, hidden_dim)`.
+
+    Returns:
+        A tuple `(vision_logits, text_logits)`. `text_logits` holds one row
+        per text and one column per image; `vision_logits` is its transpose.
+    """
+
+    def build(self, input_shape):
+        # The scale is stored in log space and exponentiated in `call`.
+        initial_log_scale = math.log(1 / 0.07)
+        self.logit_scale = self.add_weight(
+            shape=(),
+            initializer=lambda *a, **kw: initial_log_scale,
+            trainable=True,
+            dtype=self.variable_dtype,
+            name="logit_scale",
+        )
+
+    def call(self, vision_embedding, text_embedding):
+        # L2 norm of every embedding, kept as a trailing axis of size 1 so
+        # that the division below broadcasts.
+        vision_norm = ops.sqrt(
+            ops.sum(ops.power(vision_embedding, 2), axis=-1, keepdims=True)
+        )
+        text_norm = ops.sqrt(
+            ops.sum(ops.power(text_embedding, 2), axis=-1, keepdims=True)
+        )
+        unit_vision = vision_embedding / vision_norm
+        unit_text = text_embedding / text_norm
+
+        scale = ops.exp(self.logit_scale)
+        cosine_similarity = ops.matmul(unit_text, ops.transpose(unit_vision))
+        text_logits = cosine_similarity * scale
+        vision_logits = ops.transpose(text_logits)
+        return vision_logits, text_logits
+
+
+@keras_hub_export("keras_hub.models.CLIPBackbone")
+class CLIPBackbone(Backbone):
+    """CLIP core network with hyperparameters.
+
+    This backbone implements the base architecture for Contrastive
+    Language-Image Pretraining (CLIP) model. It includes vision and text
+    encoders and the corresponding projection layers. This backbone will output
+    the final logit scores corresponding to each image and token input. These
+    values are cosine similarities between the corresponding image and text
+    features, scaled by a learnable factor.
+
+    The default constructor gives a fully customizable, randomly initialized
+    CLIP model with any number of layers, heads, and embedding dimensions. To
+    load preset architectures and weights, use the `from_preset` constructor.
+
+    Args:
+        vision_encoder: The CLIP vision encoder for encoding the input images.
+        text_encoder: The CLIP text encoder for encoding the input tokens.
+        projection_dim: int. The size of the projection layer.
+        dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
+            for the model's computations and weights. Note that some
+            computations, such as softmax and layer normalization will always
+            be done at float32 precision regardless of dtype.
+
+    Example:
+    ```python
+    input_data = {
+        "images": np.ones(shape=(1, 224, 224, 3), dtype="float32"),
+        "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+    }
+
+    # Pretrained CLIP model.
+    model = keras_hub.models.CLIPBackbone.from_preset("clip-vit-base-patch32")
+    model(input_data)
+
+    # Randomly initialized CLIP model with custom config.
+    vision_encoder = keras_hub.models.CLIPVisionEncoder(
+        patch_size=32,
+        hidden_dim=768,
+        num_layers=8,
+        num_heads=8,
+        intermediate_dim=2048,
+        image_shape=(224, 224, 3),
+    )
+    text_encoder = keras_hub.models.CLIPTextEncoder(
+        vocabulary_size=49408,
+        embedding_dim=768,
+        hidden_dim=768,
+        num_layers=8,
+        num_heads=8,
+        intermediate_dim=2048,
+    )
+    model = keras_hub.models.CLIPBackbone(
+        vision_encoder=vision_encoder,
+        text_encoder=text_encoder,
+        projection_dim=256,
+    )
+    model(input_data)
+    ```
+    """
+
+    def __init__(
+        self,
+        vision_encoder,
+        text_encoder,
+        projection_dim,
+        dtype=None,
+        name=None,
+        **kwargs,
+    ):
+        # === Layers ===
+        self.vision_encoder = vision_encoder
+        self.text_encoder = text_encoder
+        self.vision_pooler = CLIPVisionPooler(dtype=dtype, name="vision_pooler")
+        self.text_pooler = CLIPTextPooler(dtype=dtype, name="text_pooler")
+        self.vision_projection = layers.Dense(
+            projection_dim,
+            use_bias=False,
+            dtype=dtype,
+            name="vision_projection",
+        )
+        self.text_projection = layers.Dense(
+            projection_dim,
+            use_bias=False,
+            dtype=dtype,
+            name="text_projection",
+        )
+        self.clip_head = CLIPHead(dtype=dtype, name="clip_head")
+
+        # === Functional Model ===
+        image_input = layers.Input(
+            shape=self.vision_encoder.image_shape, name="images"
+        )
+        token_id_input = layers.Input(
+            shape=(None,), dtype="int32", name="token_ids"
+        )
+        vision_outputs = self.vision_encoder({"images": image_input})
+        text_outputs = self.text_encoder({"token_ids": token_id_input})
+        vision_outputs = self.vision_pooler(vision_outputs)
+        # The token ids are needed to locate the EOS position to pool at.
+        text_outputs = self.text_pooler(text_outputs, token_id_input)
+        vision_embeddings = self.vision_projection(vision_outputs)
+        text_embeddings = self.text_projection(text_outputs)
+        vision_logits, text_logits = self.clip_head(
+            vision_embeddings, text_embeddings
+        )
+
+        # `dtype` is forwarded so that the model's own dtype policy (and
+        # therefore the `dtype` entry read back in `from_config`) matches the
+        # one used for the sublayers above.
+        super().__init__(
+            inputs={
+                "images": image_input,
+                "token_ids": token_id_input,
+            },
+            outputs={
+                "vision_logits": vision_logits,
+                "text_logits": text_logits,
+            },
+            dtype=dtype,
+            name=name,
+            **kwargs,
+        )
+
+        # === Config ===
+        self.projection_dim = projection_dim
+
+    def get_config(self):
+        """Return the config, with both encoders stored in serialized form."""
+        config = super().get_config()
+        config.update(
+            {
+                "vision_encoder": layers.serialize(self.vision_encoder),
+                "text_encoder": layers.serialize(self.text_encoder),
+                "projection_dim": self.projection_dim,
+            }
+        )
+        return config
+
+    @classmethod
+    def from_config(cls, config, custom_objects=None):
+        """Build a `CLIPBackbone` from the output of `get_config`.
+
+        Args:
+            config: dict. Must hold serialized `vision_encoder` and
+                `text_encoder` entries. It is not modified.
+            custom_objects: Optional mapping forwarded to
+                `layers.deserialize` when rebuilding the encoders.
+
+        Returns:
+            A new `CLIPBackbone` instance.
+        """
+        config = config.copy()
+
+        # Propagate `dtype` to submodels if needed. `config.copy()` is
+        # shallow, so the nested dicts are rebuilt rather than edited in
+        # place; otherwise the caller's config would be mutated.
+        dtype_config = config.get("dtype")
+        if dtype_config is not None:
+            for key in ("vision_encoder", "text_encoder"):
+                encoder_config = config[key]["config"]
+                if "dtype" not in encoder_config:
+                    config[key] = {
+                        **config[key],
+                        "config": {**encoder_config, "dtype": dtype_config},
+                    }
+
+        # We expect submodels to be instantiated.
+        config["vision_encoder"] = layers.deserialize(
+            config["vision_encoder"], custom_objects=custom_objects
+        )
+        config["text_encoder"] = layers.deserialize(
+            config["text_encoder"], custom_objects=custom_objects
+        )
+        return cls(**config)
diff --git a/keras_hub/src/models/clip/clip_encoder_block.py b/keras_hub/src/models/clip/clip_encoder_block.py
@@ -14,6 +14,7 @@ def __init__(
         num_heads,
         intermediate_dim,
         intermediate_activation="quick_gelu",
+        use_causal_mask=True,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -26,6 +27,7 @@ def __init__(
         self.num_heads = num_heads
         self.intermediate_dim = intermediate_dim
         self.intermediate_activation = intermediate_activation
+        self.use_causal_mask = use_causal_mask
 
         if intermediate_activation == "quick_gelu":
             intermediate_activation = quick_gelu
@@ -73,7 +75,9 @@ def compute_output_shape(self, inputs_shape):
     def call(self, x, training=None):
         residual = x
         x = self.layer_norm_1(x)
-        x = self.attention(x, x, x, training=training, use_causal_mask=True)
+        x = self.attention(
+            x, x, x, training=training, use_causal_mask=self.use_causal_mask
+        )
         x = ops.add(residual, x)
 
         residual = x
@@ -91,6 +95,7 @@ def get_config(self):
                 "num_heads": self.num_heads,
                 "intermediate_dim": self.intermediate_dim,
                 "intermediate_activation": self.intermediate_activation,
+                "use_causal_mask": self.use_causal_mask,
             }
         )
         return config
diff --git a/keras_hub/src/models/clip/clip_image_converter.py b/keras_hub/src/models/clip/clip_image_converter.py
@@ -0,0 +1,8 @@
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.layers.preprocessing.image_converter import ImageConverter
+from keras_hub.src.models.clip.clip_backbone import CLIPBackbone
+
+
+@keras_hub_export("keras_hub.layers.CLIPImageConverter")
+class CLIPImageConverter(ImageConverter):
+    """`ImageConverter` subclass associated with `CLIPBackbone`."""
+
+    # Associates this converter with the CLIP backbone class.
+    # NOTE(review): presumably used by the base class to look up presets;
+    # confirm against `ImageConverter`.
+    backbone_cls = CLIPBackbone
diff --git a/keras_hub/src/models/clip/clip_text_encoder.py b/keras_hub/src/models/clip/clip_text_encoder.py
@@ -1,12 +1,14 @@
 from keras import layers
 
+from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.modeling.token_and_position_embedding import (
     TokenAndPositionEmbedding,
 )
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.clip.clip_encoder_block import CLIPEncoderBlock
 
 
+@keras_hub_export("keras_hub.models.CLIPTextEncoder")
 class CLIPTextEncoder(Backbone):
     """CLIP text core network with hyperparameters.