diff --git a/tensorflow/core/framework/embedding/bloom_filter_policy.h b/tensorflow/core/framework/embedding/bloom_filter_policy.h index 29b85e5bb4e..781511578af 100644 --- a/tensorflow/core/framework/embedding/bloom_filter_policy.h +++ b/tensorflow/core/framework/embedding/bloom_filter_policy.h @@ -35,9 +35,10 @@ class BloomFilterPolicy : public FilterPolicy { using FilterPolicy::config_; public: - BloomFilterPolicy(const EmbeddingConfig& config, EV* ev) : - FilterPolicy(config, ev) { - + BloomFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), + FilterPolicy(config, ev) { switch (config_.counter_type){ case DT_UINT64: VLOG(2) << "The type of bloom counter is uint64"; @@ -64,10 +65,10 @@ class BloomFilterPolicy : public FilterPolicy { Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); if (s.ok()) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); @@ -81,17 +82,17 @@ class BloomFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; if (value_ptr != nullptr) { embedding_ptr[i] = - 
ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission; } @@ -109,13 +110,13 @@ class BloomFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> lookup_or_create_ids(num_worker_threads); std::vector> lookup_or_create_cursor(num_worker_threads); - std::vector*>> + std::vector> lookup_or_create_ptrs(num_worker_threads); IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); std::vector> @@ -147,7 +148,7 @@ class BloomFilterPolicy : public FilterPolicy { 1000, do_work); std::vector total_ids(num_of_keys); - std::vector*> total_ptrs(num_of_keys); + std::vector total_ptrs(num_of_keys); std::vector total_cursors(num_of_keys); int num_of_admit_id = 0; for (int i = 0; i < num_worker_threads; i++) { @@ -157,7 +158,7 @@ class BloomFilterPolicy : public FilterPolicy { sizeof(K) * lookup_or_create_ids[i].size()); memcpy(total_ptrs.data() + num_of_admit_id, lookup_or_create_ptrs[i].data(), - sizeof(ValuePtr*) * lookup_or_create_ptrs[i].size()); + sizeof(void*) * lookup_or_create_ptrs[i].size()); memcpy(total_cursors.data() + num_of_admit_id, lookup_or_create_cursor[i].data(), sizeof(int) * lookup_or_create_cursor[i].size()); @@ -174,11 +175,12 @@ class BloomFilterPolicy : public FilterPolicy { #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { if (GetBloomFreq(key) >= config_.filter_freq) { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, 
&is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { AddFreq(key, count); @@ -186,19 +188,27 @@ class BloomFilterPolicy : public FilterPolicy { } } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { - *val = nullptr; - if ((GetFreq(key, *val) + count) >= config_.filter_freq) { + *value_ptr = nullptr; + if ((GetFreq(key, *value_ptr) + count) >= config_.filter_freq) { + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + ev_->storage()->Insert(key, value_ptr); + s = Status::OK(); + } *is_filter = true; - return ev_->LookupOrCreateKey(key, val); + feat_desc_->AddFreq(*value_ptr, count); + } else { + *is_filter = false; + AddFreq(key, count); } - *is_filter = false; - AddFreq(key, count); return Status::OK(); } - int64 GetFreq(K key, ValuePtr*) override { + int64 GetFreq(K key, void* val) override { return GetBloomFreq(key); } @@ -210,7 +220,7 @@ class BloomFilterPolicy : public FilterPolicy { return bloom_counter_; } - bool is_admit(K key, ValuePtr* value_ptr) override { + bool is_admit(K key, void* value_ptr) override { if (value_ptr == nullptr) { return false; } else { @@ -326,8 +336,12 @@ class BloomFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; int64 new_freq = freq_buff[i]; + int64 import_version = -1; + if (config_.steps_to_live != 0 || config_.record_version) { + import_version = version_buff[i]; + } if (!is_filter) { if (freq_buff[i] >= config_.filter_freq) { SetBloomFreq(key_buff[i], freq_buff[i]); @@ -339,17 +353,9 @@ class BloomFilterPolicy : public FilterPolicy { SetBloomFreq(key_buff[i], freq_buff[i]); } if (new_freq >= config_.filter_freq){ - 
ev_->CreateKey(key_buff[i], &value_ptr, to_dram); - if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); - } - if (!is_filter){ - ev_->LookupOrCreateEmb(value_ptr, - value_buff + i * ev_->ValueLen()); - } else { - ev_->LookupOrCreateEmb(value_ptr, - ev_->GetDefaultValue(key_buff[i])); - } + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + new_freq, import_version, config_.emb_index); } } return Status::OK(); @@ -449,6 +455,7 @@ class BloomFilterPolicy : public FilterPolicy { } private: void* bloom_counter_; + embedding::FeatureDescriptor* feat_desc_; std::vector seeds_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/config.proto b/tensorflow/core/framework/embedding/config.proto index a8535347020..424fc5e1a38 100644 --- a/tensorflow/core/framework/embedding/config.proto +++ b/tensorflow/core/framework/embedding/config.proto @@ -50,11 +50,7 @@ enum EmbeddingVariableType { enum ValuePtrStatus { OK = 0; IS_DELETED = 1; -} - -enum ValuePosition { - IN_DRAM = 0; - NOT_IN_DRAM = 1; + NOT_IN_DRAM = 2; } enum IsSetInitialized { diff --git a/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h b/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h new file mode 100644 index 00000000000..e51166a2895 --- /dev/null +++ b/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h @@ -0,0 +1,272 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl: public FeatureDescriptorImpl { + public: + CounterFilterDescriptorImpl( + Allocator* alloc, + int64 slot_num, + bool need_record_freq, + bool need_record_version, + int64 filter_freq, + StorageType storage_type) + : FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version), + filter_freq_(filter_freq), + is_record_freq_(need_record_freq) { + if (filter_freq >= (1L << version_offset_bits_)) { + LOG(FATAL)<<"Filter frequency threshold shouldn't be bigger than 2^16."; + } + + if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { +#if GOOGLE_CUDA + feat_desc_impl_.reset( + new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); +#endif //GOOGLE_CUDA + } else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } + } + + CounterFilterDescriptorImpl(CounterFilterDescriptorImpl* feat_desc_impl) + : FeatureDescriptorImpl(feat_desc_impl), + filter_freq_(feat_desc_impl->filter_freq_) { +#if GOOGLE_CUDA + if (typeid(*(feat_desc_impl->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl)){ + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); + } else { +#endif //GOOGLE_CUDA + feat_desc_impl_.reset( + new 
NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); +#if GOOGLE_CUDA + } +#endif //GOOGLE_CUDA + } + + ~CounterFilterDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return feat_desc_impl_->InitSlotInfo( + emb_index, embedding_dim, default_value); + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + return feat_desc_impl_->InitSlotInfo(feat_desc_impl); + } + + V* GetEmbedding(void* val, int emb_index) override { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + bool IsAdmit(void* val) override { + return (GetFlag(val) == 0); + } + + void* Admit(void* val) override { + if (!IsAdmit(val)) { + return feat_desc_impl_->Allocate(); + } else { + LOG(FATAL)<<"Only unadmited feature could be admited."; + return nullptr; + } + } + + void* Allocate() override { + uint64* val = (uint64*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + uint64 flag = 1L << flag_offset_bits_; + uint64 version = (0xffffffffffffffff << version_offset_bits_); + uint64 freq = 0; + *val = version + freq; + val = (uint64*)((uint64)val | flag); + return (void*)val; + } + + void* Allocate(int64 freq) override { + if (freq < filter_freq_) { + return Allocate(); + } else { + return feat_desc_impl_->Allocate(); + } + } + + void Deallocate(void* val) override { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + + void Deallocate(const std::vector& vals) override { + for (auto val: vals) { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + } + + void AddFreq(void* val, int64 count) override { + uint64* tmp = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + __sync_fetch_and_add(tmp, count); + } else { + feat_desc_impl_->AddFreq(val, count); + } + } + + void 
SetAllocator(Allocator* alloc) override { + feat_desc_impl_->SetAllocator(alloc); + } + + void SetValue(void* val, int64 emb_index, V* value) { + if (IsAdmit(val)) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + } + + void SetDefaultValue(void* val, int64 key) override { + feat_desc_impl_->SetDefaultValue(val, key); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + feat_desc_impl_->SetDefaultValues( + keys, init_cursor, + value_ptrs, compute_stream, + event_mgr, gpu_device); + } +#endif + + int64 GetFreq(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + return *((uint64*)tmp) & + ((1L << version_offset_bits_) - 1); + } else { + if (is_record_freq_) { + return feat_desc_impl_->GetFreq(val); + } else { + return filter_freq_; + } + } + } + + int64 GetVersion(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + int64 version = *(uint64*)tmp >> version_offset_bits_; + if (version == 0xffffffffffff) { + version = -1; + } + return version; + } else { + return feat_desc_impl_->GetVersion(val); + } + } + + void UpdateVersion(void* val, int64 version) override { + if (!IsAdmit(val)) { + void* tmp_ptr = GetPtr(val); + uint64 shifted_version = (uint64)version << version_offset_bits_; + uint64 tmp_val = 0; + uint64 result = 0; + do { + tmp_val = *(uint64*)tmp_ptr; + uint64 freq = tmp_val & ((1L << version_offset_bits_) - 1); + result = shifted_version + freq; + } while(!__sync_bool_compare_and_swap((uint64*)tmp_ptr, tmp_val, result)); + } else { + feat_desc_impl_->UpdateVersion(val, version); + } + } + + void SetFreq(void* val, int64 freq) override { + uint64* tmp_ptr = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + uint64 tmp = *tmp_ptr; + uint64 new_val = ~((1L << version_offset_bits_) - 1) & tmp; + new_val += freq; + __sync_bool_compare_and_swap(tmp_ptr, tmp, new_val); + } else { + 
feat_desc_impl_->SetFreq(val, freq); + } + } + + int data_bytes() override { + return alloc_bytes_; + } + private: + uint64 GetFlag(void* val) { + return (uint64)val >> flag_offset_bits_; + } + + void* GetPtr(void* val) { + return (void*)((uint64)val & ((1L << flag_offset_bits_) - 1)); + } + + int64 filter_freq_; + int alloc_bytes_ = 8; + Allocator* alloc_ = ev_allocator(); + const int freq_offset_bits_ = 0; + const int version_offset_bits_ = 16; + const int flag_offset_bits_ = 48; + std::unique_ptr> feat_desc_impl_; + bool is_record_freq_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/counter_filter_policy.h b/tensorflow/core/framework/embedding/counter_filter_policy.h index c9f19f34cd2..19cd90ad01c 100644 --- a/tensorflow/core/framework/embedding/counter_filter_policy.h +++ b/tensorflow/core/framework/embedding/counter_filter_policy.h @@ -25,18 +25,19 @@ template class CounterFilterPolicy : public FilterPolicy { using FilterPolicy::ev_; using FilterPolicy::config_; - using FilterPolicy::LookupOrCreateEmbInternal; public: - CounterFilterPolicy(const EmbeddingConfig& config, EV* ev) : - FilterPolicy(config, ev) {} + CounterFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), + FilterPolicy(config, ev) {} Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); - if (s.ok() && GetFreq(key, value_ptr) >= config_.filter_freq) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + if (s.ok() && feat_desc_->IsAdmit(value_ptr)) { + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, 
sizeof(V) * ev_->ValueLen()); @@ -50,18 +51,18 @@ class CounterFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, keys, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; int64 freq = GetFreq(keys[i], value_ptr); - if (value_ptr != nullptr && freq >= config_.filter_freq) { + if (value_ptr != nullptr && feat_desc_->IsAdmit(value_ptr)) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission; } @@ -79,7 +80,7 @@ class CounterFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> @@ -90,36 +91,61 @@ class CounterFilterPolicy : public FilterPolicy { #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - if (GetFreq(key, *value_ptr) >= config_.filter_freq) { - V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + if (is_filter) { + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, 
sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); } } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { - Status s = ev_->LookupOrCreateKey(key, val); - *is_filter = (GetFreq(key, *val) + count) >= config_.filter_freq; + *is_filter = false; + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + if (count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + feat_desc_->Deallocate(*value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + ev_->storage()->Insert(key, value_ptr); + s = Status::OK(); + } else if (!feat_desc_->IsAdmit(*value_ptr)) { + int64 freq = feat_desc_->GetFreq(*value_ptr); + if (freq + count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetFreq(admit_value_ptr, freq); + feat_desc_->UpdateVersion( + admit_value_ptr, feat_desc_->GetVersion(*value_ptr)); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + ev_->storage()->UpdateValuePtr(key, admit_value_ptr, *value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + } else { + *is_filter = true; + } + feat_desc_->AddFreq(*value_ptr, count); return s; } - int64 GetFreq(K key, ValuePtr* value_ptr) override { - return value_ptr->GetFreq(); + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); } int64 GetFreq(K key) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetFreq(); - } - - bool is_admit(K key, ValuePtr* value_ptr) override { - return (GetFreq(key, value_ptr) >= config_.filter_freq); + return feat_desc_->GetFreq(value_ptr); } Status Restore(int64 key_num, int bucket_num, int64 partition_id, @@ -136,27 
+162,33 @@ class CounterFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); + int64 import_freq = 0; + int64 import_version = -1; if (!is_filter) { if (freq_buff[i] >= config_.filter_freq) { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } else { - value_ptr->SetFreq(config_.filter_freq); + import_freq = config_.filter_freq; } } else { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); - } - if (value_ptr->GetFreq() >= config_.filter_freq) { - LookupOrCreateEmbInternal(is_filter, to_dram, i, value_len, - value_ptr, value_buff, key_buff); + import_version = version_buff[i]; } + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); } return Status::OK(); } + + bool is_admit(K key, void* value_ptr) override { + return feat_desc_->IsAdmit(value_ptr); + } + + private: + embedding::FeatureDescriptor* feat_desc_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h index 600f6c20e44..8476c399c40 100644 --- a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h @@ -21,25 +21,25 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class LocklessHashMap : public KVInterface { public: - LocklessHashMap() { + LocklessHashMap(FeatureDescriptor* feat_desc): feat_desc_(feat_desc) { hash_map_.max_load_factor(0.8); hash_map_.set_empty_key_and_value( LocklessHashMap::EMPTY_KEY_, nullptr); hash_map_.set_counternum(16); hash_map_.set_deleted_key(LocklessHashMap::DELETED_KEY_); + pthread_key_create(&key_, NULL); } - ~LocklessHashMap() override {} + ~LocklessHashMap() override { + pthread_key_delete(key_); + } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { auto iter = hash_map_.find_wait_free(key); if (iter.first == LocklessHashMap::EMPTY_KEY_) { return errors::NotFound( @@ -60,10 +60,10 @@ class LocklessHashMap : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { auto iter = hash_map_.insert_lockless( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); + std::move(std::pair(key, + const_cast(value_ptr)))); // insert fail, exist key if ((*(iter.first)).second != value_ptr){ return errors::AlreadyExists( @@ -88,14 +88,40 @@ class LocklessHashMap : public KVInterface { } } + Status Commit(K key, const void* value_ptr) override { + auto iter = hash_map_.insert_lockless(std::move( + std::pair(key, + const_cast(value_ptr)))); + if ((*(iter.first)).second != value_ptr) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap( + &((*(iter.first)).second), + (*(iter.first)).second, + value_ptr); + } + return Status::OK(); + } + Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { + for(int i = 0; i < keys.size(); ++i) { + auto iter = hash_map_.insert_lockless(std::move( + std::pair(keys[i], + const_cast(value_ptrs[i])))); + 
if ((*(iter.first)).second != value_ptrs[i]) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap( + &((*(iter.first)).second), + (*(iter.first)).second, + value_ptrs[i]); + } + } return Status::OK(); } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { - std::pair*> *hash_map_dump; + std::vector* value_ptr_list) override { + std::pair *hash_map_dump; int64 bucket_count; auto it = hash_map_.GetSnapshot(); hash_map_dump = it.first; @@ -120,11 +146,50 @@ class LocklessHashMap : public KVInterface { return ""; } + void UpdateValuePtr( + K key, void* new_value_ptr, + void* old_value_ptr) override { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(key, old_value_ptr))); + bool flag = __sync_bool_compare_and_swap( + &((*(iter.first)).second), old_value_ptr, new_value_ptr); + if (flag) { + AppendToValuePtrQueue(old_value_ptr); + } else { + feat_desc_->Deallocate(new_value_ptr); + } + } + + private: + void AppendToValuePtrQueue(void* old_value_ptr) { + //A parameter that can be adjusted in the future + std::deque* value_ptr_queue = GetOutOfDateValuePtrQueue(); + if (value_ptr_queue->size() > CAP_INVALID_VALUEPTR) { + void* value_ptr = value_ptr_queue->front(); + feat_desc_->Deallocate(value_ptr); + value_ptr_queue->pop_front(); + } + value_ptr_queue->emplace_back(old_value_ptr); + } + + std::deque* GetOutOfDateValuePtrQueue() { + std::deque* value_ptr_queue = + static_cast*>(pthread_getspecific(key_)); + if (value_ptr_queue == nullptr) { + value_ptr_queue = new std::deque(); + pthread_setspecific(key_, value_ptr_queue); + } + return value_ptr_queue; + } + private: - typedef google::dense_hash_map_lockless*> LockLessHashMap; + typedef google::dense_hash_map_lockless LockLessHashMap; static const int EMPTY_KEY_; static const int DELETED_KEY_; LockLessHashMap hash_map_; + const int CAP_INVALID_VALUEPTR = 20000; + FeatureDescriptor* feat_desc_; + pthread_key_t key_; }; template const int 
LocklessHashMap::EMPTY_KEY_ = -1; diff --git a/tensorflow/core/framework/embedding/dense_hash_map_kv.h b/tensorflow/core/framework/embedding/dense_hash_map_kv.h index 92baf037721..ffaf2e335dc 100644 --- a/tensorflow/core/framework/embedding/dense_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/dense_hash_map_kv.h @@ -23,9 +23,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/kv_interface.h" namespace tensorflow { -template -class ValuePtr; - namespace embedding { template @@ -45,7 +42,7 @@ class DenseHashMap : public KVInterface { delete []hash_map_; } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { int64 l_id = std::abs(key)%partition_num_; spin_rd_lock l(hash_map_[l_id].mu); auto iter = hash_map_[l_id].hash_map.find(key); @@ -70,7 +67,7 @@ class DenseHashMap : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { int64 l_id = std::abs(key)%partition_num_; spin_wr_lock l(hash_map_[l_id].mu); auto iter = hash_map_[l_id].hash_map.find(key); @@ -80,8 +77,8 @@ class DenseHashMap : public KVInterface { "already exists Key: ", key, " in DenseHashMap."); } else { auto iter = hash_map_[l_id].hash_map.insert( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); + std::move(std::pair(key, + const_cast(value_ptr)))); return Status::OK(); } } @@ -109,7 +106,7 @@ class DenseHashMap : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector* >* value_ptr_list) override { + std::vector* value_ptr_list) override { dense_hash_map hash_map_dump[partition_num_]; for (int i = 0; i< partition_num_; i++) { spin_rd_lock l(hash_map_[i].mu); @@ -132,7 +129,7 @@ class DenseHashMap : public KVInterface { const int partition_num_ = 1000; struct dense_hash_map { mutable easy_spinrwlock_t mu = EASY_SPINRWLOCK_INITIALIZER; - google::dense_hash_map* > hash_map; + 
google::dense_hash_map hash_map; }; dense_hash_map* hash_map_; }; diff --git a/tensorflow/core/framework/embedding/dram_leveldb_storage.h b/tensorflow/core/framework/embedding/dram_leveldb_storage.h index fdb6697d541..2f9fbade6c5 100644 --- a/tensorflow/core/framework/embedding/dram_leveldb_storage.h +++ b/tensorflow/core/framework/embedding/dram_leveldb_storage.h @@ -21,9 +21,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/single_tier_storage.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,11 +28,12 @@ namespace embedding { template class DramLevelDBStore : public MultiTierStorage { public: - DramLevelDBStore(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_ = new DramStorage(sc, alloc, lc, new LocklessHashMap()); - leveldb_ = new LevelDBStore(sc, alloc, lc); + DramLevelDBStore(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + leveldb_ = new LevelDBStore(sc, feat_desc); } ~DramLevelDBStore() override { @@ -46,7 +44,7 @@ class DramLevelDBStore : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(DramLevelDBStore); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -63,23 +61,22 @@ class DramLevelDBStore : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramLevelDBStore."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) 
override { + dram_->CreateAndInsert(key, value_ptr); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramLevelDBStore can not be called."; + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -93,7 +90,7 @@ class DramLevelDBStore : public MultiTierStorage { leveldb_->DestroyValuePtr(*value_ptr); return dram_->Get(key, value_ptr); } - dram_->Insert(key, value_ptr, size); + dram_->CreateAndInsert(key, value_ptr); return Status::OK(); } @@ -146,15 +143,15 @@ class DramLevelDBStore : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_leveldb_key_list; - std::vector*> value_ptr_list, tmp_leveldb_value_list; + std::vector value_ptr_list, tmp_leveldb_value_list; TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); TF_CHECK_OK(leveldb_->GetSnapshot( &tmp_leveldb_key_list, &tmp_leveldb_value_list)); for (int64 i = 0; i < tmp_leveldb_value_list.size(); i++) { - tmp_leveldb_value_list[i]->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - tmp_leveldb_value_list[i]->SetInitialized(emb_config.primary_emb_index); + tmp_leveldb_value_list[i] = + (void*)((int64)tmp_leveldb_value_list[i] | (1L << kDramFlagOffset)); } std::vector leveldb_key_list; @@ -173,26 +170,34 @@ class DramLevelDBStore : public MultiTierStorage { { mutex_lock l(*(leveldb_->get_mutex())); + std::vector*> feat_desc_list(2); + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, + true, true, + {false, 0}); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = 
&hbm_feat_desc; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, value_iter))); } for (auto it: tmp_leveldb_value_list) { - delete it; + cpu_allocator()->DeallocateRaw((void*)((int64)it & 0xffffffffffff)); } - delete value_iter; return Status::OK(); } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); @@ -206,8 +211,8 @@ class DramLevelDBStore : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(leveldb_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); @@ -218,14 +223,20 @@ class DramLevelDBStore : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + protected: - void SetTotalDims(int64 total_dims) override { - leveldb_->SetTotalDims(total_dims); + int total_dim() override { + return dram_feat_desc_->total_dim(); } private: DramStorage* dram_; LevelDBStore* leveldb_; + FeatureDescriptor* dram_feat_desc_ = nullptr; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/dram_pmem_storage.h b/tensorflow/core/framework/embedding/dram_pmem_storage.h index fd19f75ab4c..e58d9450d96 100644 --- a/tensorflow/core/framework/embedding/dram_pmem_storage.h +++ b/tensorflow/core/framework/embedding/dram_pmem_storage.h @@ 
-15,14 +15,12 @@ limitations under the License. #ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ +#include "tensorflow/core/framework/embedding/cpu_hash_map_kv.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" -#include "tensorflow/core/framework/embedding/cpu_hash_map_kv.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,36 +29,36 @@ namespace embedding { template class DramPmemStorage : public MultiTierStorage { public: - DramPmemStorage(const StorageConfig& sc, Allocator* dram_alloc, - Allocator* pmem_alloc, LayoutCreator* lc, + DramPmemStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_ = new DramStorage(sc, dram_alloc, lc, new LocklessHashMap()); - pmem_ = new PmemLibpmemStorage(sc, pmem_alloc, lc); - value_ptr_size_ = - const_cast(sc.embedding_config).total_num( - Storage::GetAllocLen()); + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + pmem_feat_desc_ = new FeatureDescriptor(feat_desc); + pmem_feat_desc_->SetAllocator(experimental_pmem_allocator(sc.path, sc.size[0])); + + pmem_ = new PmemLibpmemStorage(sc, pmem_feat_desc_); } ~DramPmemStorage() override { MultiTierStorage::DeleteFromEvictionManager(); delete dram_; delete pmem_; + delete pmem_feat_desc_; } TF_DISALLOW_COPY_AND_ASSIGN(DramPmemStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; } s = pmem_->Get(key, value_ptr); + void* new_value_ptr = dram_->CreateValuePtr(); if (s.ok()) { - ValuePtr* new_value_ptr = dram_->CreateValuePtr(value_ptr_size_); - 
memcpy(new_value_ptr->GetPtr(), (*value_ptr)->GetPtr(), - sizeof(FixedLengthHeader) + sizeof(V) * value_ptr_size_); - *value_ptr = new_value_ptr; + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); s = dram_->TryInsert(key, *value_ptr); if (s.ok()) { return s; @@ -71,19 +69,19 @@ class DramPmemStorage : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramPmemStorage."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramPmemStorage can not be called."; + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } bool IsUseHbm() override { @@ -94,18 +92,16 @@ class DramPmemStorage : public MultiTierStorage { return false; } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; } s = pmem_->Get(key, value_ptr); - ValuePtr* new_value_ptr = dram_->CreateValuePtr(size); + void* new_value_ptr = dram_->CreateValuePtr(); if (s.ok()) { - memcpy(new_value_ptr->GetPtr(), (*value_ptr)->GetPtr(), - sizeof(FixedLengthHeader) + sizeof(V) * size); + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); } *value_ptr = new_value_ptr; @@ -159,7 +155,7 @@ class DramPmemStorage : public MultiTierStorage { int64 value_len, V* 
default_value) override { std::vector key_list, tmp_pmem_key_list; - std::vector*> value_ptr_list, tmp_pmem_value_list; + std::vector value_ptr_list, tmp_pmem_value_list; TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); dram_->Shrink(key_list, value_ptr_list, shrink_args, value_len); @@ -182,13 +178,14 @@ class DramPmemStorage : public MultiTierStorage { emb_config, value_len, default_value, key_list, - value_ptr_list))); + value_ptr_list, + pmem_feat_desc_))); return Status::OK(); } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); @@ -202,8 +199,8 @@ class DramPmemStorage : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(pmem_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); @@ -214,13 +211,26 @@ class DramPmemStorage : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + pmem_feat_desc_->InitSlotInfo(dram_feat_desc_); + MultiTierStorage::Init(); + } + protected: - void SetTotalDims(int64 total_dims) override {} + int total_dim() override { + return pmem_feat_desc_->total_dim(); + } private: DramStorage* dram_; PmemLibpmemStorage* pmem_; - int64 value_ptr_size_; + FeatureDescriptor* dram_feat_desc_ = nullptr; + FeatureDescriptor* pmem_feat_desc_ = nullptr; }; } // embedding } // 
tensorflow diff --git a/tensorflow/core/framework/embedding/dram_ssd_storage.h b/tensorflow/core/framework/embedding/dram_ssd_storage.h index 356a61d865f..ddd2d782e03 100644 --- a/tensorflow/core/framework/embedding/dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/dram_ssd_storage.h @@ -21,9 +21,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/single_tier_storage.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,11 +28,12 @@ namespace embedding { template class DramSsdHashStorage : public MultiTierStorage { public: - DramSsdHashStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_= new DramStorage(sc, alloc, lc, new LocklessHashMap()); - ssd_hash_ = new SsdHashStorage(sc, alloc, lc); + DramSsdHashStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_= new DramStorage(sc, feat_desc); + ssd_hash_ = new SsdHashStorage(sc, feat_desc); } ~DramSsdHashStorage() override { @@ -46,7 +44,7 @@ class DramSsdHashStorage : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(DramSsdHashStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -64,24 +62,22 @@ class DramSsdHashStorage : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramSsdHashStorage."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); } 
- Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramSsdStorage can not be called."; + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -96,7 +92,7 @@ class DramSsdHashStorage : public MultiTierStorage { ssd_hash_->DestroyValuePtr(*value_ptr); return dram_->Get(key, value_ptr); } - dram_->Insert(key, value_ptr, size); + dram_->CreateAndInsert(key, value_ptr); return Status::OK(); } @@ -164,7 +160,6 @@ class DramSsdHashStorage : public MultiTierStorage { Status RestoreSSD(int64 emb_index, int64 emb_slot_num, int64 value_len, const std::string& ssd_emb_file_name, EmbeddingVar* ev, RestoreSSDBuffer& restore_buff) override { - int64 alloc_len = Storage::ComputeAllocLen(value_len); std::map file_id_map; for (int64 i = 0; i < restore_buff.num_of_files; i++) { file_id_map[restore_buff.file_list_buf[i]] = i; @@ -185,7 +180,7 @@ class DramSsdHashStorage : public MultiTierStorage { } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); @@ -199,8 +194,8 @@ class DramSsdHashStorage : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(ssd_hash_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + 
MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); @@ -211,14 +206,25 @@ class DramSsdHashStorage : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + ssd_hash_->Init(); + MultiTierStorage::Init(); + } + protected: - void SetTotalDims(int64 total_dims) override { - ssd_hash_->SetTotalDims(total_dims); + int total_dim() override { + return dram_feat_desc_->total_dim(); } private: DramStorage* dram_ = nullptr; SsdHashStorage* ssd_hash_ = nullptr; + FeatureDescriptor* dram_feat_desc_; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h b/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h new file mode 100644 index 00000000000..c1fa878788b --- /dev/null +++ b/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h @@ -0,0 +1,214 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#include +#include +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +constexpr int COLUMN_BITSET_BYTES = 5; +constexpr int COLUMN_BITSET_SIZE = COLUMN_BITSET_BYTES * 8; + +struct MetaHeader { + volatile unsigned char embed_num; + unsigned char value_type; + unsigned char header_size; + unsigned char column_bitset[COLUMN_BITSET_BYTES]; + + static const int kEmbeddingNumStartIndex = 0; + static const int kValueTypeStartIndex = + kEmbeddingNumStartIndex + sizeof(char); + static const int kHeaderSizeStartIndex = + kValueTypeStartIndex + sizeof(char); + static const int kColumnBitsetIndex = + kHeaderSizeStartIndex + sizeof(char); + + inline unsigned int GetEmbeddingNum() { + return (unsigned int) embed_num; + } + + inline void SetEmbeddingNum(size_t s) { + embed_num = (unsigned char)s; + } + + inline std::bitset GetColumnBitset() { + unsigned long meta = ((unsigned long*)this)[0]; + std::bitset bs(meta >> (8 * kColumnBitsetIndex)); + return bs; + } + + inline void SetColumnBitset(const std::bitset& bs, + unsigned int embnum) { + ((unsigned long*)(this))[0] = + (bs.to_ulong() << (8 * kColumnBitsetIndex)) | + (header_size << (8 * kHeaderSizeStartIndex)) | + (value_type << (8 * kValueTypeStartIndex)) | + (embnum << (8 * kEmbeddingNumStartIndex)); + } + + inline unsigned int GetHeaderSize() { + return (unsigned int) header_size; + } + + inline void SetHeaderSize(size_t size) { + header_size = (unsigned char)size; + } +}; + +template +class DynmaicDimDescriptorImpl: public FeatureDescriptorImpl { +using FeatureDescriptorImpl::slot_infos_; + public: + DynmaicDimDescriptorImpl( + Allocator* alloc, + int64 slot_num) + : alloc_bytes_(sizeof(std::atomic_flag) + + sizeof(MetaHeader) + 
+ sizeof(V*) * slot_num), + header_offset_bytes_(sizeof(V*) * slot_num), + flag_offset_bytes_(sizeof(MetaHeader) + + sizeof(V*) * slot_num), + FeatureDescriptorImpl(slot_num, + false, + false) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + ~DynmaicDimDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + } + + V* GetEmbedding(void* val, int emb_index) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->embed_num; + auto metadata = meta->GetColumnBitset(); + + if (!metadata.test(emb_index)) { + std::atomic_flag* flag= (std::atomic_flag*)(val + flag_offset_bytes_); + while(flag->test_and_set(std::memory_order_acquire)); + metadata = meta->GetColumnBitset(); + if (metadata.test(emb_index)) { + flag->clear(std::memory_order_release); + return ((V**)val)[emb_index]; + } + embnum++ ; + int64 alloc_value_len = slot_infos_[emb_index].embedding_dim; + V* tensor_val = (V*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * alloc_value_len); + V* default_v = (V*)slot_infos_[emb_index].default_value; + memcpy(tensor_val, default_v, + sizeof(V) * slot_infos_[emb_index].default_value_len); + ((V**)val)[emb_index] = tensor_val; + + metadata.set(emb_index); + // NOTE:if we use ((unsigned long*)((char*)ptr_ + 1))[0] = metadata.to_ulong(); + // the ptr_ will be occaionally modified from 0x7f18700912a0 to 0x700912a0 + // must use ((V**)ptr_ + 1 + 1)[emb_index] = tensor_val; to avoid + //LOG(INFO)<<"emb_num: "<SetColumnBitset(metadata, embnum); + flag->clear(std::memory_order_release); + return tensor_val; + } else { + return ((V**)val)[emb_index]; + } + } + + bool IsAdmit(void* val) override { + return true; + } + + void* Admit(void* val) override {} + + void* Allocate() override { + void* val = 
alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + memset(val, 0, alloc_bytes_); + new ((char*)val + header_offset_bytes_) MetaHeader(); + return val; + } + + void Deallocate(void* val) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->GetEmbeddingNum(); + //LOG(INFO)<<"emb_num in deallocate: "<GetColumnBitset(); + for (int i = 0; i< embnum; i++) { + if (metadata.test(i)) { + V* val_ptr = ((V**)((int64*)val + meta->GetHeaderSize()))[i]; + if (val_ptr != nullptr) { + alloc_->DeallocateRaw(val_ptr); + } + } + } + } + + void Deallocate(const std::vector& vals) override { + for (auto val: vals) { + Deallocate(val); + } + } + + void AddFreq(void* val, int64 count) override {} + + void SetAllocator(Allocator* alloc) override { + alloc_ = alloc; + } + + void SetDefaultValue(void* val, int64 key) override {} + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy(val_ptr, value, + sizeof(V) * FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + + int64 GetFreq(void* val) override {} + + int64 GetVersion(void* val) override {} + + void UpdateVersion(void* val, int64 version) override {} + + void SetFreq(void* val, int64 freq) override {} + + int data_bytes() override { + return alloc_bytes_; + } + private: + int alloc_bytes_ = 0; + int header_offset_bytes_ = 0; + int flag_offset_bytes_ = 0; + Allocator* alloc_ = ev_allocator(); +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/embedding_config.h b/tensorflow/core/framework/embedding/embedding_config.h 
index d47d07d4205..a39d2dca303 100644 --- a/tensorflow/core/framework/embedding/embedding_config.h +++ b/tensorflow/core/framework/embedding/embedding_config.h @@ -23,7 +23,6 @@ struct EmbeddingConfig { DataType counter_type; int64 default_value_dim; float default_value_no_permission; - int normal_fix_flag; bool record_freq; bool record_version; bool is_inference; @@ -37,7 +36,6 @@ struct EmbeddingConfig { int64 filter_freq = 0, int64 max_freq = 999999, float l2_weight_threshold = -1.0, - const std::string& layout = "normal", int64 max_element_size = 0, float false_positive_probability = -1.0, DataType counter_type = DT_UINT64, @@ -58,7 +56,6 @@ struct EmbeddingConfig { counter_type(counter_type), default_value_dim(default_value_dim), default_value_no_permission(default_value_no_permission), - normal_fix_flag(0), record_freq(record_freq), record_version(record_version), is_inference(is_inference) { @@ -70,10 +67,6 @@ struct EmbeddingConfig { kHashFunc = 0; num_counter = 0; } - if (layout == "normal_contiguous" || - layout == "normal_contiguous_gpu") { - normal_fix_flag = 1; - } } int64 calc_num_counter(int64 max_element_size, @@ -105,21 +98,13 @@ struct EmbeddingConfig { } bool is_save_freq() const { - return filter_freq != 0 || - record_freq || - normal_fix_flag == 1; + return filter_freq != 0 || record_freq; } bool is_save_version() const { return steps_to_live != 0 || record_version; } - int64 total_num(int alloc_len) { - return block_num * - (1 + (1 - normal_fix_flag) * slot_num) * - (1 + normal_fix_flag * (alloc_len * (slot_num + 1) - 1)); - } - int64 get_filter_freq() { return filter_freq; } diff --git a/tensorflow/core/framework/embedding/embedding_memory_pool.h b/tensorflow/core/framework/embedding/embedding_memory_pool.h index 27b31ce1ed7..ef175151b00 100644 --- a/tensorflow/core/framework/embedding/embedding_memory_pool.h +++ b/tensorflow/core/framework/embedding/embedding_memory_pool.h @@ -18,9 +18,6 @@ limitations under the License. 
#include namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class EmbeddingMemoryPool { @@ -50,7 +47,7 @@ class EmbeddingMemoryPool { return ptr; } - void Deallocate(std::vector*> value_ptrs) { + void Deallocate(std::vector value_ptrs) { int64 prev_size = value_ptrs_queue_.size(); for (auto it : value_ptrs) { value_ptrs_queue_.emplace_back(it); @@ -59,9 +56,8 @@ class EmbeddingMemoryPool { int64 n = value_ptrs_queue_.size() - embs_per_block_; n = std::min(prev_size, n); for (int64 i = 0; i < n; i++) { - ValuePtr* val = value_ptrs_queue_.front(); - free_ptr_queue_.emplace_back(val->GetValue(0, 0)); - delete val; + void* val = value_ptrs_queue_.front(); + free_ptr_queue_.emplace_back((V*)val); value_ptrs_queue_.pop_front(); } } @@ -88,7 +84,7 @@ class EmbeddingMemoryPool { int64 embs_per_block_; Allocator* alloc_; std::deque free_ptr_queue_; - std::deque*> value_ptrs_queue_; + std::deque value_ptrs_queue_; std::vector block_list_; }; } //embedding diff --git a/tensorflow/core/framework/embedding/embedding_var.cu.cc b/tensorflow/core/framework/embedding/embedding_var.cu.cc index 0c0be83ec1d..f7162fd2c22 100644 --- a/tensorflow/core/framework/embedding/embedding_var.cu.cc +++ b/tensorflow/core/framework/embedding/embedding_var.cu.cc @@ -42,71 +42,6 @@ void SyncWithEventMgr(se::Stream* stream, while(!is_kernel_finish) {} } -template -void EmbeddingVar::SetDefaultValueOfNewFeatures( - const K* keys, int64 size, const std::list& init_cursor, - V** memcpy_address, se::Stream* compute_stream, EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device) { - if (init_cursor.size() > 0) { - int64 total = init_cursor.size(); - V** value_address = nullptr; - value_address = TypedAllocator::Allocate(cpu_allocator(), total * 2, - AllocationAttributes()); - V** default_value_address = value_address + total; - V** dev_value_address = nullptr; - dev_value_address = - TypedAllocator::Allocate(alloc_, total * 2, AllocationAttributes()); - V** 
dev_default_value_address = dev_value_address + total; - int64 i = 0; - auto it = init_cursor.cbegin(); - for (; it != init_cursor.cend(); ++it, ++i) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_address[i] = - *((V**)((char*)(value_ptr->GetPtr()) + sizeof(FixedLengthHeader))) + - storage_->GetOffset(emb_config_.emb_index); - default_value_address[i] = - default_value_ + - (keys[i] % emb_config_.default_value_dim) % value_len_; - } - DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * 2 * sizeof(V*)); - compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, - total * 2 * sizeof(V*)); - int block_dim = 128; - TF_CHECK_OK(GpuLaunchKernel( - embedding::CopyEmbedding, - (total * value_len_ + block_dim - 1) / block_dim, - block_dim, 0, gpu_device.stream(), dev_default_value_address, - dev_value_address, value_len_, total)); - SyncWithEventMgr(compute_stream, event_mgr); - // Set init meta of ValuePtrs - for (auto it = init_cursor.cbegin(); it != init_cursor.cend(); ++it) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_ptr->SetInitialized(emb_config_.emb_index); - memcpy_address[*it] = value_ptr->GetValue( - emb_config_.emb_index, - storage_->GetOffset(emb_config_.emb_index)); - } - TypedAllocator::Deallocate(alloc_, dev_value_address, total * 2); - TypedAllocator::Deallocate(cpu_allocator(), value_address, total * 2); - } -} - -#define REGISTER_KERNELS(ktype, vtype) \ - template void EmbeddingVar::SetDefaultValueOfNewFeatures( \ - const ktype*, int64, const std::list&, vtype**, \ - se::Stream*, EventMgr*, const Eigen::GpuDevice& gpu_device); -#define REGISTER_KERNELS_ALL(type) \ - REGISTER_KERNELS(int32, type); \ - REGISTER_KERNELS(int64, type) -#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) -TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) -#undef REGISTER_KERNELS_CPU - -#undef REGISTER_KERNELS_ALL -#undef REGISTER_KERNELS - template void EmbeddingVar::CopyEmbeddingsToBuffer( V* val_base, int64 
size, V** memcpy_address, @@ -136,85 +71,6 @@ void EmbeddingVar::CopyEmbeddingsToBuffer( TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS_CPU -#undef REGISTER_KERNELS_ALL -#undef REGISTER_KERNELS - -template -void EmbeddingVar::CopyEmbeddingsFromCPUToGPU( - const K* keys, const std::list& copyback_cursor, V** memcpy_address, - se::Stream* compute_stream, EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device, - const DeviceBase::CpuWorkerThreads* worker_threads, - int64* output_value_ptrs) { - if (copyback_cursor.size() > 0) { - int64 total = copyback_cursor.size(); - size_t value_len = emb_config_.total_num(storage_->GetAllocLen()); - V* memcpy_buffer_gpu = nullptr; - ValuePtr** gpu_value_ptrs = new ValuePtr*[total]; - memcpy_buffer_gpu = (V*)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, - total * value_len * sizeof(V)); - storage_->CopyEmbeddingsFromCPUToGPU( - total, keys, copyback_cursor, memcpy_address, value_len, gpu_value_ptrs, - memcpy_buffer_gpu, compute_stream, event_mgr, worker_threads); - - V** value_address = (V**)cpu_allocator()->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V*) * total); - V** dev_value_address = (V**)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, - sizeof(V*) * total); - std::vector copyback_keys(total); - int64 i = 0; - auto it = copyback_cursor.cbegin(); - for (; it != copyback_cursor.cend(); ++it, ++i) { - bool init; - // Get the curosr - int64 cursor = *it & 0x0fffffffffffffff; - gpu_value_ptrs[i]->SetInitialized(emb_config_.emb_index); - memcpy_address[cursor] = LookupOrCreateEmb(gpu_value_ptrs[i], init); - value_address[i] = memcpy_address[cursor]; - copyback_keys[i] = keys[cursor]; - } - DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * sizeof(V*)); - compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, total * sizeof(V*)); - - int block_dim = 128; - TF_CHECK_OK(GpuLaunchKernel( - embedding::BatchUnpack, (total + block_dim - 1) / block_dim * value_len, - block_dim, 0, 
gpu_device.stream(), dev_value_address, memcpy_buffer_gpu, - value_len, total)); - - auto do_insert = [this, copyback_keys, gpu_value_ptrs, value_len]( - int64 start, int64 limit) { - for (int64 i = start; i < limit; i++) - storage_->Insert(copyback_keys[i], gpu_value_ptrs[i]); - }; - Shard(worker_threads->num_threads, worker_threads->workers, - copyback_keys.size(), 100000, do_insert); - if (output_value_ptrs != nullptr) { - auto it = copyback_cursor.cbegin(); - for (int64 i = 0; it != copyback_cursor.cend(); ++it, ++i) { - int64 cursor = *it & 0x0fffffffffffffff; - output_value_ptrs[cursor] = (int64)gpu_value_ptrs[i]; - } - } - SyncWithEventMgr(compute_stream, event_mgr); - - alloc_->DeallocateRaw(dev_value_address); - alloc_->DeallocateRaw(memcpy_buffer_gpu); - cpu_allocator()->DeallocateRaw(value_address); - delete[] gpu_value_ptrs; - } -} -#define REGISTER_KERNELS(ktype, vtype) \ - template void EmbeddingVar::CopyEmbeddingsFromCPUToGPU( \ - const ktype*, const std::list&, vtype**, se::Stream*, EventMgr*, \ - const Eigen::GpuDevice&, const DeviceBase::CpuWorkerThreads*, int64*); -#define REGISTER_KERNELS_ALL(type) \ - REGISTER_KERNELS(int32, type); \ - REGISTER_KERNELS(int64, type) -#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) -TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) -#undef REGISTER_KERNELS_CPU - #undef REGISTER_KERNELS_ALL #undef REGISTER_KERNELS } // namespace tensorflow diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 28ce5094d87..487f595bf31 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -30,7 +30,6 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/embedding_var_context.h" #include "tensorflow/core/framework/embedding/embedding_var_restore.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/framework/embedding/filter_factory.h" #include "tensorflow/core/framework/embedding/gpu_hash_map_kv.h" #include "tensorflow/core/framework/embedding/embedding_config.h" @@ -57,7 +56,8 @@ class EmbeddingVar : public ResourceBase { EmbeddingVar(const string& name, embedding::Storage* storage, EmbeddingConfig emb_cfg, - Allocator* alloc): + Allocator* alloc, + embedding::FeatureDescriptor* feat_desc): name_(name), storage_(storage), default_value_(nullptr), @@ -65,27 +65,8 @@ class EmbeddingVar : public ResourceBase { value_len_(0), alloc_(alloc), default_value_alloc_(alloc), - emb_config_(emb_cfg) { - if (IsMultiLevel() || emb_config_.record_freq) { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) { - value_ptr->AddFreq(freq); - }; - } else if (emb_config_.is_counter_filter()) { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) { - if (value_ptr->GetFreq() < filter_freq) - value_ptr->AddFreq(freq); - }; - } else { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) {}; - } - if (emb_config_.steps_to_live != 0 || emb_config_.record_version) { - update_version_fn_ = [](ValuePtr* value_ptr, int64 gs) { - value_ptr->SetStep(gs); - }; - } else { - update_version_fn_ = [](ValuePtr* value_ptr, int64 gs) {}; - } - } + emb_config_(emb_cfg), + feat_desc_(feat_desc) {} Status Init(const Tensor& default_tensor, int64 default_value_dim) { if (storage_ == nullptr) { @@ -95,17 +76,11 @@ class EmbeddingVar : public ResourceBase { storage_type_ = storage_->GetStorageType(); filter_ = FilterFactory::CreateFilter>( - emb_config_, this, storage_); + emb_config_, this, storage_, feat_desc_); emb_config_.default_value_dim = default_value_dim; 
value_len_ = default_tensor.NumElements() / emb_config_.default_value_dim; - if (LayoutType::NORMAL_CONTIGUOUS == storage_->GetLayoutType() || - LayoutType::NORMAL_CONTIGUOUS_GPU == storage_->GetLayoutType() || - LayoutType::COMPACT == storage_->GetLayoutType()) { - storage_->SetAllocLen(value_len_, emb_config_.slot_num + 1); - } - if (storage_->IsUseHbm()) { #if GOOGLE_CUDA default_value_ = TypedAllocator::Allocate(alloc_, @@ -115,12 +90,6 @@ class EmbeddingVar : public ResourceBase { dev_addr_buffer_size_ = 0; cudaMemcpy(default_value_, &default_tensor_flat(0), default_tensor.TotalBytes(), cudaMemcpyDeviceToDevice); - storage_-> - CreateEmbeddingMemoryPool( - alloc_, - emb_config_.total_num( - storage_->GetAllocLen()), - 1024 * 1024 * 64); #endif // GOOGLE_CUDA } else if (storage_->IsSingleHbm()) { #if GOOGLE_CUDA @@ -147,6 +116,14 @@ class EmbeddingVar : public ResourceBase { emb_config_.default_value_no_permission); } } + bool is_all_slots_initialized = + feat_desc_->InitSlotInfo( + emb_config_.emb_index, value_len_, + std::pair( + default_value_, emb_config_.default_value_dim)); + if (is_all_slots_initialized) { + storage_->Init(); + } return Status::OK(); } @@ -159,57 +136,92 @@ class EmbeddingVar : public ResourceBase { return is_initialized_; } - Status LookupKey(K key, ValuePtr** value_ptr) { + Status LookupKey(K key, void** value_ptr) { return storage_->Get(key, value_ptr); } void BatchLookupKey(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys) { - storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys, - emb_config_.total_num(storage_->GetAllocLen())); + storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys); } - Status LookupOrCreateKey(K key, ValuePtr** value_ptr, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, bool indices_as_pointer, int64 count = 1) { if (indices_as_pointer) { - *value_ptr = (ValuePtr*)key; - *is_filter = (*value_ptr != nullptr); + 
*value_ptr = (void*)key; + *is_filter = filter_->is_admit(key, *value_ptr); return Status::OK(); } else { Status s = filter_->LookupOrCreateKey(key, value_ptr, is_filter, count); - add_freq_fn_(*value_ptr, count, emb_config_.filter_freq); return s; } } Status Insert(K key, V* value) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; CreateKey(key, &value_ptr, true); - LookupOrCreateEmb(value_ptr, value); + feat_desc_->SetValue(value_ptr, emb_config_.emb_index, value); return Status::OK(); } - Status LookupOrCreateKey(K key, ValuePtr** value_ptr) { - Status s = storage_->GetOrCreate(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen())); + Status LookupOrCreateKey(const EmbeddingVarContext& context, + const K* keys, + void** value_ptrs, + int64 num_of_keys, + int64* indices_counts, + bool indices_as_pointer = false) { + if (indices_as_pointer) { + auto lookup_key_and_set_version_fn = [keys, value_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + value_ptrs[i] = (void*)keys[i]; + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + lookup_key_and_set_version_fn); + } else { + filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); + } + + if (indices_counts != nullptr) { + auto add_freq_fn = [this, value_ptrs, indices_counts] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]); + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. 
+ auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + add_freq_fn); + } + return Status::OK(); + } + + + Status LookupOrCreateKey(K key, void** value_ptr) { + Status s = storage_->GetOrCreate(key, value_ptr); TF_CHECK_OK(s); return s; } - void CreateKey(K key, ValuePtr** value_ptr, bool to_dram) { - storage_->Insert(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen()), to_dram); + void CreateKey(K key, void** value_ptr, bool to_dram) { + storage_->CreateAndInsert(key, value_ptr, to_dram); } - void UpdateVersion(ValuePtr* value_ptr, int64 gs) { - update_version_fn_(value_ptr, gs); + void UpdateVersion(void* value_ptr, int64 gs) { + feat_desc_->UpdateVersion(value_ptr, gs); } void BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { TF_CHECK_OK(storage_->BatchCommit(keys, value_ptrs)); } @@ -218,9 +230,9 @@ class EmbeddingVar : public ResourceBase { } int64 GetVersion(K key) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; TF_CHECK_OK(LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetStep(); + return feat_desc_->GetVersion(value_ptr); } int64 GetFreq(K key) { @@ -261,11 +273,11 @@ class EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { V* default_v = default_value + i * value_len_; - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; filter_->LookupOrCreate( keys[i], output + i * value_len_, default_v, &value_ptr, 1, default_value_no_permission_); - add_freq_fn_(value_ptr, 1, emb_config_.filter_freq); + feat_desc_->AddFreq(value_ptr, 1); } }; auto worker_threads = context.worker_threads; @@ -276,7 +288,7 @@ class EmbeddingVar : public ResourceBase { void GetOrCreateKey(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, int64 num_of_keys) { const K* keys = 
(K*)keys_tensor.data(); auto do_work = [this, keys, value_ptrs] (int64 start, int64 limit) { @@ -295,7 +307,7 @@ class EmbeddingVar : public ResourceBase { void GatherEmbeddings(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, V* output, int64 num_of_keys) { const K* keys = (K*)keys_tensor.data(); @@ -303,13 +315,10 @@ class EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); - add_freq_fn_(value_ptrs[i], 1, emb_config_.filter_freq); V* value = nullptr; if (is_admit) { - V* default_v = - default_value_ + - (keys[i] % emb_config_.default_value_dim) * value_len_; - value = LookupOrCreateEmb(value_ptrs[i], default_v); + value = feat_desc_->GetEmbedding( + value_ptrs[i], emb_config_.emb_index); } else { value = default_value_no_permission_; } @@ -341,8 +350,9 @@ class EmbeddingVar : public ResourceBase { void GetOrCreateKey(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, - int64 num_of_keys) { + void** value_ptrs, + int64 num_of_keys, + bool indices_as_pointer = false) { const K* keys = (K*)keys_tensor.data(); filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); storage_->AddToCachePrefetchList(keys_tensor); @@ -351,17 +361,17 @@ class EmbeddingVar : public ResourceBase { void BatchLookupOrCreateKey( const EmbeddingVarContext& context, const K* keys, - ValuePtr** value_ptrs, + void** value_ptrs, int64 num_of_keys, std::vector>& not_found_cursor_list) { storage_->BatchGetOrCreate(context, keys, value_ptrs, num_of_keys, - emb_config_.total_num(storage_->GetAllocLen()), + value_len_, not_found_cursor_list); } void GatherEmbeddings(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, V* output, int64 num_of_keys) { std::vector embedding_ptr(num_of_keys); @@ -370,12 +380,10 @@ class 
EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); - add_freq_fn_(value_ptrs[i], 1, emb_config_.filter_freq); + feat_desc_->AddFreq(value_ptrs[i], 1); if (is_admit) { - V* default_v = - default_value_ + - (keys[i] % emb_config_.default_value_dim) * value_len_; - embedding_ptr[i] = LookupOrCreateEmb(value_ptrs[i], default_v); + embedding_ptr[i] = feat_desc_->GetEmbedding( + value_ptrs[i], emb_config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission_; } @@ -394,72 +402,8 @@ class EmbeddingVar : public ResourceBase { storage_->AddToCache(keys_tensor); } - - void BatchLookupOrCreateEmb( - const EmbeddingVarContext& ctx, - V** var_ptr, - ValuePtr** value_ptrs, - const K* indices, - int64 num_of_keys, - IntraThreadCopyIdAllocator* thread_copy_id_alloc) { - int num_worker_threads = ctx.worker_threads->num_threads; - std::vector> init_cursor_list( - num_worker_threads + 1); - uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); - - auto do_work_get_ptrs = [this, value_ptrs, &init_cursor_list, - &thread_copy_id_alloc, main_thread_id, var_ptr] (int64 start, int64 limit) { - int copy_id = - thread_copy_id_alloc->GetCopyIdOfThread(main_thread_id); - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - var_ptr[i] = LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - if (is_need_set_default_value) { - init_cursor_list[copy_id].emplace_back(i); - } - } - }; - const int64 unit_cost = 1000; - auto worker_threads = ctx.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, - num_of_keys, unit_cost, do_work_get_ptrs); - - // Merge copies of init_cursor_list - for (int i = 1; i < (worker_threads->num_threads + 1); i++) { - if (init_cursor_list[i].size() > 0) { - init_cursor_list[0].splice(init_cursor_list[0].end(), - init_cursor_list[i]); - } - } - - auto stream = 
ctx.compute_stream; - auto event_mgr = ctx.event_mgr; - - SetDefaultValueOfNewFeatures( - indices, num_of_keys, - init_cursor_list[0], - var_ptr, stream, event_mgr, - ctx.gpu_device); - } #endif - void LookupOrCreate(K key, V* val, V* default_v, int count = 1) { - const V* default_value_ptr = - (default_v == nullptr) ? default_value_ : default_v; - ValuePtr* value_ptr = nullptr; - filter_->LookupOrCreate(key, val, default_value_ptr, &value_ptr, count, - default_value_no_permission_); - add_freq_fn_(value_ptr, count, emb_config_.filter_freq); - } - - void BatchInitEmb(int64 size, V** memcpy_address, V* default_value, - bool* init_flags, int64 value_len) { - filter_->BatchInitEmb(size, memcpy_address, default_value, - init_flags, value_len); - } - #if GOOGLE_CUDA void CopyEmbeddingsToBuffer( V* val_base, int64 size, @@ -467,73 +411,18 @@ class EmbeddingVar : public ResourceBase { se::Stream* compute_stream, EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device); - - void SetDefaultValueOfNewFeatures( - const K* keys, int64 size, - const std::list& init_cursor, - V** memcpy_address, - se::Stream* compute_stream, - EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device); - - void CopyEmbeddingsFromCPUToGPU( - const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, - se::Stream* compute_stream, - EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device, - const DeviceBase::CpuWorkerThreads* worker_threads, - int64* output_value_ptrs = nullptr); - - void AllocateMemoryForNewFeatures( - V** memcpy_address, - const std::list& init_cursor) { - std::vector*> value_ptr_list; - for (auto it = init_cursor.cbegin(); - it != init_cursor.cend(); ++it) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_ptr_list.emplace_back(value_ptr); - } - storage_->AllocateMemoryForNewFeatures(value_ptr_list); - } #endif // GOOGLE_CUDA - V* LookupOrCreateEmb(ValuePtr* value_ptr, const V* default_v) { - return value_ptr->GetOrAllocate(alloc_, 
value_len_, default_v, - emb_config_.emb_index, storage_->GetOffset( - emb_config_.emb_index)); - } - - V* LookupOrCreateEmb(ValuePtr* value_ptr, const V* default_v, - Allocator* alloc) { - return value_ptr->GetOrAllocate(alloc, value_len_, default_v, - emb_config_.emb_index, storage_->GetOffset( - emb_config_.emb_index)); - } - - V* LookupOrCreateEmb(ValuePtr* value_ptr, bool &need_initialize) { - return value_ptr->GetOrAllocate(alloc_, value_len_, nullptr, - emb_config_.emb_index, - storage_->GetOffset(emb_config_.emb_index), - need_initialize); - } - - V* LookupPrimaryEmb(ValuePtr* value_ptr) { - V* primary_val = value_ptr->GetValue(emb_config_.primary_emb_index, - storage_->GetOffset(emb_config_.primary_emb_index)); - return primary_val; - } - - typename TTypes::Flat flat(ValuePtr* value_ptr, int64 index) { - V* default_v = - default_value_ + (index % emb_config_.default_value_dim) * value_len_; - V* val = LookupOrCreateEmb(value_ptr, default_v); + typename TTypes::Flat flat(void* value_ptr) { + V* val = feat_desc_->GetEmbedding(value_ptr, emb_config_.emb_index); Eigen::array dims({value_len_}); return typename TTypes::Flat(val, dims); } + V* GetValuePtr(void* ptr) { + return feat_desc_->GetEmbedding(ptr, emb_config_.emb_index); + } + int64 ValueLen() const { return value_len_; } @@ -602,25 +491,26 @@ class EmbeddingVar : public ResourceBase { std::vector* value_list, std::vector* version_list, std::vector* freq_list) { - std::vector*> value_ptr_list; + std::vector value_ptr_list; storage_->GetSnapshot(key_list, &value_ptr_list); bool is_save_freq = emb_config_.is_save_freq(); bool is_save_version = emb_config_.is_save_version(); for (int64 i = 0; i < key_list->size(); i++) { - V* val = value_ptr_list[i]->GetValue(emb_config_.emb_index, 0); - if (val != nullptr) { + if (feat_desc_->IsAdmit(value_ptr_list[i])) { + V* val = feat_desc_->GetEmbedding( + value_ptr_list[i], emb_config_.emb_index); value_list->emplace_back(val); } else { 
value_list->emplace_back(default_value_); } if(is_save_version) { - int64 dump_version = value_ptr_list[i]->GetStep(); + int64 dump_version = feat_desc_->GetVersion(value_ptr_list[i]); version_list->emplace_back(dump_version); } if(is_save_freq) { - int64 dump_freq = value_ptr_list[i]->GetFreq(); + int64 dump_freq = feat_desc_->GetFreq(value_ptr_list[i]); freq_list->emplace_back(dump_freq); } } @@ -634,6 +524,10 @@ class EmbeddingVar : public ResourceBase { return storage_; } + embedding::FeatureDescriptor* feature_descriptor() { + return feat_desc_; + } + Status Shrink(embedding::ShrinkArgs& shrink_args) { if (emb_config_.is_primary()) { shrink_args.value_len = value_len_; @@ -671,10 +565,6 @@ class EmbeddingVar : public ResourceBase { return alloc_; } - int64 GetAllocLen() { - return emb_config_.total_num(storage_->GetAllocLen()); - } - V** GetBuffer(int64 size) { if (dev_addr_buffer_size_ >= size) { return dev_addr_buffer_; @@ -756,16 +646,17 @@ class EmbeddingVar : public ResourceBase { return storage_->HashTable(); } - protected: FilterPolicy>* GetFilter() const { return filter_; } + protected: ~EmbeddingVar() override { // When dynamic dimension embedding is used, // there will be more than one primary slot if (emb_config_.is_primary() && emb_config_.primary_emb_index == 0) { delete storage_; + delete feat_desc_; } if (embedding::StorageType::HBM_DRAM == storage_type_) { alloc_->DeallocateRaw(dev_addr_buffer_); @@ -804,35 +695,6 @@ class EmbeddingVar : public ResourceBase { value_len_ * sizeof(V), do_work); } - V* GetAddressOfGpuValuePtr(ValuePtr* value_ptr, - int64 index, - bool copyback_flag, - std::list& init_cursor, - std::list& copyback_cursor) { - V* mem_addr = nullptr; - bool init_flag = false; - if (!copyback_flag) { - mem_addr = LookupOrCreateEmb(value_ptr, init_flag); - } else { - mem_addr = value_ptr->GetValue(0,0); - if (copyback_flag == - embedding::CopyBackFlag::COPYBACK_AND_DESTROY) { - delete value_ptr; - // If the 64th bit of cursor is set to 
1, - // the corresponding valueptr need to be deleted later. - int64 tmp = 1; - tmp = tmp << 63; - copyback_cursor.emplace_back(index | tmp); - } else { - copyback_cursor.emplace_back(index); - } - } - if (init_flag) { - init_cursor.emplace_back(index); - } - return mem_addr; - } - std::string name_; bool is_initialized_ = false; @@ -849,8 +711,7 @@ class EmbeddingVar : public ResourceBase { embedding::StorageType storage_type_; EmbeddingConfig emb_config_; FilterPolicy>* filter_; - std::function*, int64, int64)> add_freq_fn_; - std::function*, int64)> update_version_fn_; + embedding::FeatureDescriptor* feat_desc_; TF_DISALLOW_COPY_AND_ASSIGN(EmbeddingVar); }; diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc index c1b43a608b5..7dddf714b6b 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc @@ -21,42 +21,38 @@ namespace tensorflow { namespace embedding { template void EmbeddingVarCkptData::Emplace( - K key, ValuePtr* value_ptr, + K key, void* value_ptr, const EmbeddingConfig& emb_config, - V* default_value, int64 value_offset, + V* default_value, + FeatureDescriptor* feat_desc, bool is_save_freq, bool is_save_version, bool save_unfiltered_features) { if((int64)value_ptr == ValuePtrStatus::IS_DELETED) return; - V* primary_val = value_ptr->GetValue(0, 0); - bool is_not_admit = - primary_val == nullptr - && emb_config.filter_freq != 0; + bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); + bool is_admit = feat_desc->IsAdmit(value_ptr); - if (!is_not_admit) { + if (is_admit) { key_vec_.emplace_back(key); - if (primary_val == nullptr) { + if (!is_in_dram) { + value_ptr_vec_.emplace_back((V*)ValuePtrStatus::NOT_IN_DRAM); + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + } else if (feat_desc->GetEmbedding(value_ptr, 0) == nullptr) { 
value_ptr_vec_.emplace_back(default_value); - } else if ( - (int64)primary_val == ValuePosition::NOT_IN_DRAM) { - value_ptr_vec_.emplace_back((V*)ValuePosition::NOT_IN_DRAM); } else { - V* val = value_ptr->GetValue(emb_config.emb_index, - value_offset); + V* val = feat_desc->GetEmbedding(value_ptr, emb_config.emb_index); value_ptr_vec_.emplace_back(val); } - - if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); + int64 dump_version = feat_desc->GetVersion(value_ptr); version_vec_.emplace_back(dump_version); } if(is_save_freq) { - int64 dump_freq = value_ptr->GetFreq(); + int64 dump_freq = feat_desc->GetFreq(value_ptr); freq_vec_.emplace_back(dump_freq); } } else { @@ -66,18 +62,18 @@ void EmbeddingVarCkptData::Emplace( key_filter_vec_.emplace_back(key); if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); + int64 dump_version = feat_desc->GetVersion(value_ptr); version_filter_vec_.emplace_back(dump_version); } - int64 dump_freq = value_ptr->GetFreq(); + int64 dump_freq = feat_desc->GetFreq(value_ptr); freq_filter_vec_.emplace_back(dump_freq); } } #define REGISTER_KERNELS(ktype, vtype) \ template void EmbeddingVarCkptData::Emplace( \ - ktype, ValuePtr*, const EmbeddingConfig&, \ - vtype*, int64, bool, bool, bool); + ktype, void*, const EmbeddingConfig&, \ + vtype*, FeatureDescriptor*, bool, bool, bool); #define REGISTER_KERNELS_ALL_INDEX(type) \ REGISTER_KERNELS(int32, type) \ REGISTER_KERNELS(int64, type) diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h index 6d7b09e70b0..10bf0d0e43b 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h @@ -19,15 +19,19 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" namespace tensorflow { class BundleWriter; +namespace { + const int kSavedPartitionNum = 1000; + const int kDramFlagOffset = 49; +} namespace embedding { - template class EmbeddingVarCkptData { public: - void Emplace(K key, ValuePtr* value_ptr, + void Emplace(K key, void* value_ptr, const EmbeddingConfig& emb_config, - V* default_value, int64 value_offset, + V* default_value, + FeatureDescriptor* feat_desc, bool is_save_freq, bool is_save_version, bool save_unfiltered_features); diff --git a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h index 84c823a90dc..4c052b43c7e 100644 --- a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h +++ b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h @@ -57,7 +57,7 @@ class EV2dVectorDataDumpIterator: public DumpIterator { value_len_(value_len), col_idx_(0) { if (!valueptr_list.empty()) { - if ((int64)*curr_iter_ == ValuePosition::NOT_IN_DRAM) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { curr_ptr_ = val_iter_->Next(); } else { curr_ptr_ = *curr_iter_; @@ -75,7 +75,7 @@ class EV2dVectorDataDumpIterator: public DumpIterator { curr_iter_++; col_idx_ = 0; if (curr_iter_ != end_iter_) { - if ((int64)*curr_iter_ == ValuePosition::NOT_IN_DRAM) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { curr_ptr_ = val_iter_->Next(); } else { curr_ptr_ = *curr_iter_; diff --git a/tensorflow/core/framework/embedding/feature_descriptor.h b/tensorflow/core/framework/embedding/feature_descriptor.h new file mode 100644 index 00000000000..8808da353f4 --- /dev/null +++ b/tensorflow/core/framework/embedding/feature_descriptor.h @@ -0,0 +1,200 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/framework/embedding/config.pb.h" +#include "tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h" +#include "tensorflow/core/framework/embedding/normal_feature_descriptor.h" +#include + +namespace tensorflow { +namespace embedding { + +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl; + +template +class FeatureDescriptor { + public: + FeatureDescriptor( + int64 block_num, + int64 slot_num, + Allocator* alloc, + StorageType storage_type, + bool need_record_freq, + bool need_record_version, + const std::pair& filter_info) { + if (block_num > 1) { + feat_desc_impl_.reset( + new DynmaicDimDescriptorImpl( + alloc, block_num * slot_num)); + } else if (filter_info.first) { + feat_desc_impl_.reset( + new CounterFilterDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version, + filter_info.second, + storage_type)); + } else if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { + 
feat_desc_impl_.reset( + new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } + } + + FeatureDescriptor(FeatureDescriptor* feat_desc) { + if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(CounterFilterDescriptorImpl*)) { + feat_desc_impl_.reset( + new CounterFilterDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + else if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl)) { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + } + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + return feat_desc_impl_->InitSlotInfo( + emb_index, embedding_dim, default_value); + } + + bool InitSlotInfo(FeatureDescriptor* feat_desc) { + return feat_desc_impl_->InitSlotInfo(feat_desc->feat_desc_impl_.get()); + } + + V* GetEmbedding(void *val, int emb_index) { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + void* Allocate() { + return feat_desc_impl_->Allocate(); + } + + void* Allocate(int64 freq) { + return feat_desc_impl_->Allocate(freq); + } + + void Deallocate(void* val) { + feat_desc_impl_->Deallocate(val); + } + + void Deallocate(const std::vector& value_ptrs) { + feat_desc_impl_->Deallocate(value_ptrs); + } + + void SetDefaultValue(void* val, int64 index) { + feat_desc_impl_->SetDefaultValue(val, index); + } + + void SetValue(void* val, int64 emb_index, V* value) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* 
compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + reinterpret_cast*>(feat_desc_impl_.get())->SetDefaultValues( + keys, init_cursor, value_ptrs, + compute_stream, event_mgr, gpu_device); + } +#endif + + void SetAllocator(Allocator* alloc) { + feat_desc_impl_->SetAllocator(alloc); + } + + int data_bytes() { + return feat_desc_impl_->data_bytes(); + } + + int64 GetFreq(void* val) { + return feat_desc_impl_->GetFreq(val); + } + + int64 GetVersion(void* val) { + return feat_desc_impl_->GetVersion(val); + } + + void SetFreq(void* val, int64 freq) { + feat_desc_impl_->SetFreq(val, freq); + } + + void UpdateVersion(void* val, int64 version) { + feat_desc_impl_->UpdateVersion(val, version); + } + + void AddFreq(void* val, int64 freq) { + feat_desc_impl_->AddFreq(val, freq); + } + + int total_dim() { + return feat_desc_impl_->total_dim(); + } + + bool IsAdmit(void* val) { + return feat_desc_impl_->IsAdmit(val); + } + + void* Admit(void* val) { + return feat_desc_impl_->Admit(val); + } + + + protected: + std::unique_ptr> feat_desc_impl_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/feature_descriptor_impl.h b/tensorflow/core/framework/embedding/feature_descriptor_impl.h new file mode 100644 index 00000000000..6996d22f447 --- /dev/null +++ b/tensorflow/core/framework/embedding/feature_descriptor_impl.h @@ -0,0 +1,317 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#include "tensorflow/core/util/env_var.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { +namespace embedding { +struct SlotInfo { + int embedding_dim; + int embedding_offset; + void* default_value; + int64 default_value_dim; + int default_value_len; +}; + +class BaseFreqDescriptor { + public: + virtual int64 GetFreq(void* value_ptr) = 0; + virtual void AddFreq(void* value_ptr, int64 freq) {} + virtual void SetFreq(void* value_ptr, int64 freq) {} + virtual BaseFreqDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class FreqDescriptor: public BaseFreqDescriptor { + public: + explicit FreqDescriptor(int offset_byte) + : offset_byte_(offset_byte) {} + + int64 GetFreq(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void AddFreq(void* value_ptr, int64 freq) override { + __sync_fetch_and_add((int64*)(value_ptr + offset_byte_), freq); + } + + void SetFreq(void* value_ptr, int64 freq) override { + *(int64*)(value_ptr + offset_byte_) = freq; + } + + BaseFreqDescriptor* Clone() override { + return new FreqDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonFreqDescriptor: public BaseFreqDescriptor { + public: + int64 GetFreq(void* value_ptr) override { + LOG(FATAL)<<"Can not get freq from NonFreqCounter."; + } + + BaseFreqDescriptor* Clone() override { + return new NonFreqDescriptor(); + } +}; + +class BaseVersionDescriptor { + public: + virtual int64 
GetVersion(void* value_ptr) = 0; + virtual void UpdateVersion(void* value_ptr, int64 version) {} + virtual BaseVersionDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class VersionDescriptor: public BaseVersionDescriptor { + public: + explicit VersionDescriptor(int offset_byte) + : offset_byte_(offset_byte) {} + + int64 GetVersion(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void UpdateVersion(void* value_ptr, int64 version) override { + *(int64*)(value_ptr + offset_byte_) = version; + } + + BaseVersionDescriptor* Clone() override { + return new VersionDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonVersionDescriptor: public BaseVersionDescriptor { + public: + int64 GetVersion(void* value_ptr) override { + LOG(FATAL)<<"Can not get version from NonFreqCounter."; + } + + BaseVersionDescriptor* Clone() override { + return new NonVersionDescriptor(); + } +}; + +template +class FeatureDescriptorImpl { + public: + FeatureDescriptorImpl(int64 slot_num, + bool need_record_freq, + bool need_record_version) { + slot_infos_.resize(slot_num); + for (int i = 0; i < slot_infos_.size(); i++) { + slot_infos_[i].embedding_offset = EMPTY_OFFSET_VALUE; + } + + if (!need_record_freq) { + freq_desc_.reset(new NonFreqDescriptor()); + } + if (!need_record_version) { + version_desc_.reset(new NonVersionDescriptor()); + } + } + + FeatureDescriptorImpl(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + freq_desc_.reset( + feat_desc_impl->freq_desc_->Clone()); + version_desc_.reset( + feat_desc_impl->version_desc_->Clone()); + } + + virtual ~FeatureDescriptorImpl() {} + + virtual bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) = 0; + virtual bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + 
LOG(FATAL)<<"InitSlotInfo(feat_desc_impl) is not implemented."; + } + virtual V* GetEmbedding(void* val, int emb_index) = 0; + virtual void* Allocate() = 0; + virtual void* Allocate(int64 freq) {return Allocate();} + virtual void Deallocate(void* val) = 0; + virtual void Deallocate(const std::vector& val) = 0; + virtual void SetAllocator(Allocator* alloc) = 0; + virtual void SetDefaultValue(void* val, int64 key) = 0; + virtual void SetValue(void* val, int64 emb_index, V* value) {} + virtual bool IsAdmit(void* val) {return true;} + virtual void* Admit(void* val) {} +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + virtual int data_bytes() = 0; + + virtual int64 GetFreq(void* val) { + return freq_desc_->GetFreq(val); + } + + virtual int64 GetVersion(void* val) { + return version_desc_->GetVersion(val); + } + + virtual void SetFreq(void* val, int64 freq) { + freq_desc_->SetFreq(val, freq); + } + + virtual void UpdateVersion(void* val, int64 version) { + version_desc_->UpdateVersion(val, version); + } + + virtual void AddFreq(void* val, int64 freq) { + freq_desc_->AddFreq(val, freq); + } + + inline int total_dim() { + int64 slot_num = slot_infos_.size(); + return slot_infos_[slot_num - 1].embedding_offset + + slot_infos_[slot_num - 1].embedding_dim; + } + + protected: + bool SetEmbeddingInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + slot_infos_[emb_index].default_value = default_value.first; + slot_infos_[emb_index].default_value_dim = default_value.second; + slot_infos_[emb_index].default_value_len = embedding_dim; + + bool is_aligned = true; + TF_CHECK_OK(ReadBoolFromEnvVar("EV_DATA_ALIGNED", true, + &is_aligned)); + if (is_aligned) { + embedding_dim = ComputeAlignedDim(embedding_dim); + } + + //Avoid parallel consitency issue + __sync_bool_compare_and_swap( + 
&slot_infos_[emb_index].embedding_offset, + EMPTY_OFFSET_VALUE, embedding_dim); + slot_infos_[emb_index].embedding_dim = embedding_dim; + //Check whether all offsets are set + for (int i = 0; i < slot_infos_.size(); i++) { + if (slot_infos_[i].embedding_offset == EMPTY_OFFSET_VALUE) { + return false; + } + } + + ComputeEmbeddingOffsets(); + return true; + } + + void SetSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + } + + void ComputeAllocBytes(int* alloc_bytes) { + for(auto slot_info: slot_infos_) { + *alloc_bytes += slot_info.embedding_dim * sizeof(V); + } + } + + void CreateFreqAndVersionDescriptor(int* alloc_bytes) { + if (!freq_desc_) { + freq_desc_.reset(new FreqDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + if (!version_desc_) { + version_desc_.reset(new VersionDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + } + + void InitFreqAndVersion(void* val) { + freq_desc_->SetFreq(val, 0); + version_desc_->UpdateVersion(val, -1); + } + + void SetFreqAndVersionOffset(int* alloc_bytes) { + freq_desc_->SetOffset(alloc_bytes); + version_desc_->SetOffset(alloc_bytes); + } + + V* GetDefaultValuePtr(int64 emb_index, int64 key) { + V* default_value_base = (V*)slot_infos_[emb_index].default_value; + int64 default_value_offset = + (key % slot_infos_[emb_index].default_value_dim) * + slot_infos_[emb_index].default_value_len; + return default_value_base + default_value_offset; + } + + void SetDefaultValue(void* val, int64 emb_index, int64 key) { + memcpy(val, + GetDefaultValuePtr(emb_index, key), + slot_infos_[emb_index].default_value_len * sizeof(V)); + } + + private: + int64 ComputeAlignedDim(int64 embedding_dim) { + int padding_bytes = + ALIGN_BYTES - embedding_dim * sizeof(V) % ALIGN_BYTES; + if (padding_bytes == ALIGN_BYTES) { + return embedding_dim; + } else { + return embedding_dim + padding_bytes / sizeof(V); + } + } + + void ComputeEmbeddingOffsets() { + for (int i = slot_infos_.size() 
- 1 ; i >= 0; i--) { + slot_infos_[i].embedding_offset = 0; + for (int j = 0; j < i; j++) { + slot_infos_[i].embedding_offset += slot_infos_[j].embedding_offset; + } + } + } + + protected: + const int EMPTY_OFFSET_VALUE= -1; + const int ALIGN_BYTES = 16; + std::vector slot_infos_; + std::unique_ptr freq_desc_; + std::unique_ptr version_desc_; +}; + +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ diff --git a/tensorflow/core/framework/embedding/filter_factory.h b/tensorflow/core/framework/embedding/filter_factory.h index 5bb92467a51..0127e2c882a 100644 --- a/tensorflow/core/framework/embedding/filter_factory.h +++ b/tensorflow/core/framework/embedding/filter_factory.h @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/filter_policy.h" #include "tensorflow/core/framework/embedding/nullable_filter_policy.h" - namespace tensorflow { namespace embedding{ template @@ -34,22 +33,23 @@ class FilterFactory { template static FilterPolicy* CreateFilter( const EmbeddingConfig& config, EV* ev, - embedding::Storage* storage) { + embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) { if (config.filter_freq > 0) { if (config.kHashFunc != 0) { return new BloomFilterPolicy( - config, ev); + config, ev, feat_desc); } else { return new CounterFilterPolicy( - config, ev); + config, ev, feat_desc); } } else { return new NullableFilterPolicy( - config, ev, storage); + config, ev, storage, feat_desc); } } }; -} // tensorflow +} //namespace tensorflow #endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_FACTORY_H_ diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 559a6796246..256d3b044d4 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/emb_file.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" namespace tensorflow { @@ -45,9 +46,6 @@ struct RestoreBuffer { template class RestoreSSDBuffer; -template -class ValuePtr; - template class FilterPolicy { public: @@ -55,7 +53,7 @@ class FilterPolicy { config_(config), ev_(ev) {} virtual void LookupOrCreate(K key, V* val, - const V* default_value_ptr, ValuePtr** value_ptr, + const V* default_value_ptr, void** value_ptr, int count, const V* default_value_no_permission) = 0; virtual Status Lookup(K key, V* val, const V* default_value_ptr, @@ -70,53 +68,25 @@ class FilterPolicy { virtual void BatchLookupOrCreateKey( const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) = 0; #endif //GOOGLE_CUDA - virtual Status LookupOrCreateKey(K key, ValuePtr** val, + virtual Status LookupOrCreateKey(K key, void** val, bool* is_filter, int64 count) = 0; + + virtual Status LookupKey(K key, void** val, + bool* is_filter, int64 count) {} - virtual int64 GetFreq(K key, ValuePtr* value_ptr) = 0; - + virtual int64 GetFreq(K key, void* value_ptr) = 0; virtual int64 GetFreq(K key) = 0; - virtual bool is_admit(K key, ValuePtr* value_ptr) = 0; + virtual bool is_admit(K key, void* value_ptr) = 0; virtual Status Restore(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool to_dram, bool is_incr, RestoreBuffer& restore_buff) = 0; - protected: - void LookupOrCreateEmbInternal(bool is_filter, bool to_dram, - int i, int value_len, - ValuePtr* value_ptr, - V* value_src, K* key_src) { - - if (!is_filter) { - ev_->LookupOrCreateEmb(value_ptr, value_src + i * ev_->ValueLen()); - return; - } else { - if (to_dram) { -#if GOOGLE_CUDA - std::vector default_value_host; - default_value_host.resize(config_.default_value_dim * 
value_len); - cudaMemcpy(default_value_host.data(), ev_->GetDefaultValuePtr(), - sizeof(V) * config_.default_value_dim * value_len, - cudaMemcpyDeviceToHost); - ev_->LookupOrCreateEmb(value_ptr, - default_value_host.data() + - (key_src[i] % config_.default_value_dim) - * ev_->ValueLen()); -#endif - return; - } else { - ev_->LookupOrCreateEmb(value_ptr, ev_->GetDefaultValue(key_src[i])); - return; - } - } - } - protected: EmbeddingConfig config_; EV* ev_; diff --git a/tensorflow/core/framework/embedding/globalstep_shrink_policy.h b/tensorflow/core/framework/embedding/globalstep_shrink_policy.h index a2af6a2430a..b0950eff22d 100644 --- a/tensorflow/core/framework/embedding/globalstep_shrink_policy.h +++ b/tensorflow/core/framework/embedding/globalstep_shrink_policy.h @@ -18,25 +18,21 @@ limitations under the License. #include "tensorflow/core/framework/embedding/shrink_policy.h" namespace tensorflow { - -template -class ValuePtr; - namespace embedding { template class GlobalStepShrinkPolicy : public ShrinkPolicy { public: GlobalStepShrinkPolicy(int64 steps_to_live, - Allocator* alloc, + FeatureDescriptor* feat_desc, KVInterface* kv) : steps_to_live_(steps_to_live), kv_(kv), - ShrinkPolicy(alloc) {} + ShrinkPolicy(feat_desc) {} TF_DISALLOW_COPY_AND_ASSIGN(GlobalStepShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override { ShrinkPolicy::ReleaseValuePtrs(); FilterToDelete(shrink_args.global_step, @@ -46,16 +42,16 @@ class GlobalStepShrinkPolicy : public ShrinkPolicy { private: void FilterToDelete(int64 global_step, std::vector& key_list, - std::vector*>& value_list) { + std::vector& value_list) { for (int64 i = 0; i < key_list.size(); ++i) { - int64 version = value_list[i]->GetStep(); + int64 version = ShrinkPolicy::feat_desc_->GetVersion(value_list[i]); if (version == -1) { - value_list[i]->SetStep(global_step); + ShrinkPolicy::feat_desc_->UpdateVersion(value_list[i], 
global_step); } else { if (global_step - version > steps_to_live_) { kv_->Remove(key_list[i]); ShrinkPolicy::EmplacePointer(value_list[i]); - value_list[i] = (ValuePtr*)ValuePtrStatus::IS_DELETED; + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; } } } diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h index 1dd90d63a6e..fc4a2506313 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -204,29 +204,29 @@ class GPUHashMapKV : public KVInterface { } Status BatchLookupOrCreate(const K* keys, size_t n, - ValuePtr** value_ptrs) override { + void** value_ptrs) override { return Status::OK(); } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { return Status::OK(); } Status Contains(K key) override { return Status::OK(); } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { return Status::OK(); } Status Remove(K key) override { return Status::OK(); } Status BatchLookup(const K* keys, size_t size, - ValuePtr** value_ptrs) override { + void** value_ptrs) override { return Status::OK(); } Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return Status::OK(); } @@ -235,22 +235,20 @@ class GPUHashMapKV : public KVInterface { } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return Status::OK(); } int64 Size() const override { return 0; } - void SetTotalDims(int total_dims) override {} + void FreeValuePtr(void* value_ptr) override {} - void FreeValuePtr(ValuePtr* value_ptr) override {} - - Status Commit(K key, const ValuePtr* value_ptr) override { + Status Commit(K key, const void* value_ptr) override { return Status::OK(); } Status 
GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { return Status::OK(); } diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index 581f1f1cfaf..1056f4bbd78 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -3,7 +3,6 @@ #if GOOGLE_CUDA #define EIGEN_USE_GPU -#include "tensorflow/core/framework/embedding/lockless_hash_map_cpu.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" @@ -14,9 +13,6 @@ namespace tensorflow { using se::DeviceMemoryBase; using se::Stream; -template -class ValuePtr; - template class CheckpointLoader; @@ -26,15 +22,17 @@ namespace embedding { template class HbmDramSsdStorage : public MultiTierStorage { public: - HbmDramSsdStorage(const StorageConfig& sc, Allocator* gpu_alloc, - Allocator* cpu_alloc, LayoutCreator* lc, const std::string& name) - : cpu_alloc_(cpu_alloc), gpu_alloc_(gpu_alloc), + HbmDramSsdStorage(const StorageConfig& sc, + Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name), dram_capacity_(-1) { - hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc_, lc); - dram_ = new DramStorage(sc, cpu_alloc_, lc, - new LocklessHashMapCPU(gpu_alloc_)); - ssd_ = new SsdHashStorage(sc, cpu_alloc_, lc); + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); + ssd_ = new SsdHashStorage(sc, dram_feat_desc_); } ~HbmDramSsdStorage() override { @@ -46,29 +44,20 @@ class HbmDramSsdStorage : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(HbmDramSsdStorage); - void 
SetAllocLen(int64 value_len, int slot_num) override { - while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. - Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + ssd_->Init(); - MultiTierStorage::cache_capacity_ = - Storage::storage_config_.size[0] - / (Storage::total_dims_ * sizeof(V)); + MultiTierStorage::cache_capacity_ = + Storage::storage_config_.size[0] + / (total_dim() * sizeof(V)); - dram_capacity_ = Storage::storage_config_.size[1] - / (Storage::total_dims_ * sizeof(V)); - MultiTierStorage::ready_eviction_ = true; - } - Storage::flag_.clear(std::memory_order_release); + dram_capacity_ = Storage::storage_config_.size[1] + / (total_dim() * sizeof(V)); + MultiTierStorage::ready_eviction_ = true; } - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = hbm_->Get(key, value_ptr); if (s.ok()) { return s; @@ -88,13 +77,12 @@ class HbmDramSsdStorage : public MultiTierStorage { void BatchGet(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) override { + void** value_ptr_list, + int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); - std::vector*>> + std::vector> ssd_value_ptr_list(num_worker_threads + 1); BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, @@ -102,20 +90,20 @@ class HbmDramSsdStorage : public MultiTierStorage { CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursor_list[0], - ssd_value_ptr_list[0], value_len); + ssd_value_ptr_list[0]); } void 
BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_fountd_cursor_list) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); - std::vector*>> + std::vector> ssd_value_ptr_list(num_worker_threads + 1); BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, @@ -124,70 +112,27 @@ class HbmDramSsdStorage : public MultiTierStorage { CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursor_list[0], - ssd_value_ptr_list[0], value_len); + ssd_value_ptr_list[0]); CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], value_len); } - void Insert(K key, ValuePtr* value_ptr) override { + void Insert(K key, void** value_ptr) override { hbm_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { if (to_dram) { - dram_->Insert(key, value_ptr, alloc_len); + dram_->Insert(key, value_ptr); } else { - hbm_->Insert(key, value_ptr, alloc_len); + hbm_->Insert(key, value_ptr); } } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(size); - { - mutex_lock l(memory_pool_mu_); - gpu_value_ptr->SetPtr(embedding_mem_pool_->Allocate()); - *value_ptr = gpu_value_ptr; - } - s = hbm_->TryInsert(key, *value_ptr); - // Insert Failed - if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate((*value_ptr)->GetValue(0, 0)); - } - delete *value_ptr; - return hbm_->Get(key, value_ptr); - } else { - return s; - } - } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - 
Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - s = dram_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK; - return s; - } - s = ssd_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK_AND_DESTROY; - return s; - } - hbm_->Insert(key, value_ptr, size); - return Status::OK(); + Status GetOrCreate(K key, void** value_ptr) override { + LOG(FATAL)<<"Stroage with HBM only suppotrs batch APIs."; } void InitCache(embedding::CacheStrategy cache_strategy) override { @@ -195,66 +140,6 @@ class HbmDramSsdStorage : public MultiTierStorage { dram_cache_ = new LRUCache(); } - void CopyEmbeddingsFromCPUToGPU( - int total, const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, V* memcpy_buffer_gpu, - se::Stream* compute_stream, - EventMgr* event_mgr, - const DeviceBase::CpuWorkerThreads* worker_threads) override { - auto memcpy_buffer_cpu = TypedAllocator::Allocate(cpu_allocator(), - total * value_len, AllocationAttributes()); - int64* memory_index = new int64[total]; - int64 i = 0; - auto it = copyback_cursor.cbegin(); - { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursor.cend(); ++it, ++i) { - int64 j = *it & 0x0fffffffffffffff; - memory_index[i] = *it; - ValuePtr* gpu_value_ptr = - hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)memcpy_address[j] - sizeof(FixedLengthHeader), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - } - } - - auto do_work = [memory_index, memcpy_address, - memcpy_buffer_cpu, gpu_value_ptrs, - value_len, this] (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - int64 j = memory_index[i] & 0x0fffffffffffffff; - bool destroy_flag = (memory_index[i] >> 63) & 0x1; - 
memcpy(memcpy_buffer_cpu + i * value_len, - memcpy_address[j], value_len * sizeof(V)); - if (destroy_flag) { - ssd_->DestroyValuePtr(reinterpret_cast*>( - (char *)memcpy_address[j] - sizeof(FixedLengthHeader))); - } - } - }; - Shard(worker_threads->num_threads, worker_threads->workers, total, - 1000, do_work); - - DeviceMemoryBase gpu_dst_ptr( - memcpy_buffer_gpu, total * value_len * sizeof(V)); - compute_stream->ThenMemcpy( - &gpu_dst_ptr, memcpy_buffer_cpu, total * value_len * sizeof(V)); - SyncWithEventMgr(compute_stream, event_mgr); - TypedAllocator::Deallocate( - cpu_allocator(), memcpy_buffer_cpu, total * value_len); - delete[] memory_index; - } - Status Remove(K key) override { hbm_->Remove(key); dram_->Remove(key); @@ -311,25 +196,23 @@ class HbmDramSsdStorage : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_dram_key_list; - std::vector*> value_ptr_list, tmp_dram_value_list; + std::vector value_ptr_list, tmp_dram_value_list; TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); HbmValueIterator hbm_value_iter( key_list, value_ptr_list, - emb_config.emb_index, Storage::alloc_len_, - gpu_alloc_); + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); - std::vector*> tmp_hbm_value_ptrs(value_ptr_list.size()); for (int64 i = 0; i < value_ptr_list.size(); i++) { - ValuePtr* value_ptr = hbm_->CreateValuePtr(value_len); - memcpy((char *)value_ptr->GetPtr(), - (char *)value_ptr_list[i]->GetPtr(), - sizeof(FixedLengthHeader)); - value_ptr->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - value_ptr->SetInitialized(emb_config.primary_emb_index); - tmp_hbm_value_ptrs[i] = value_ptr; - value_ptr_list[i] = value_ptr; + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq( + value_ptr, hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + 
value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); } TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, @@ -347,17 +230,24 @@ class HbmDramSsdStorage : public MultiTierStorage { { mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, &hbm_value_iter))); } - for (auto it: tmp_hbm_value_ptrs) { - delete it; + for (auto value_ptr: value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); + } } ssd_->Save(tensor_name, prefix, writer, emb_config, @@ -368,7 +258,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status DramToSsdBatchCommit(std::shared_ptr> keys) { MultiTierStorage::ReleaseValuePtrs(dram_value_ptr_out_of_date_, - dram_->alloc_); + dram_feat_desc_); mutex_lock l(*(ssd_->get_mutex())); mutex_lock l1(*(dram_->get_mutex())); @@ -380,7 +270,7 @@ class HbmDramSsdStorage : public MultiTierStorage { k_size = std::min(k_size, DramEvictionSize); K dram_evic_ids[DramEvictionSize]; size_t true_size = dram_cache_->get_evic_ids(dram_evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < true_size; ++i) { if (dram_->Get(dram_evic_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_->Commit(dram_evic_ids[i], value_ptr)); @@ -408,22 +298,31 @@ class HbmDramSsdStorage : public MultiTierStorage { k_size = std::min(k_size, EvictionSize); size_t true_size = MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; std::shared_ptr> keys(new std::vector()); - std::vector*> value_ptrs; + std::vector hbm_value_ptrs; + std::vector dram_value_ptrs; for (int64 i = 0; i < 
true_size; ++i) { if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { keys->emplace_back(evic_ids[i]); - value_ptrs.emplace_back(value_ptr); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); } } - dram_->BatchCommit(*keys, value_ptrs); - { - //Mutex with main thread - mutex_lock l_mem(memory_pool_mu_); - embedding_mem_pool_->Deallocate(value_ptrs); - } + + CopyEmbeddingFromHbmToDram( + hbm_value_ptrs, + dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(*keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); for (auto it : *keys) { TF_CHECK_OK(hbm_->Remove(it)); } @@ -435,58 +334,14 @@ class HbmDramSsdStorage : public MultiTierStorage { } } - void CreateEmbeddingMemoryPool( - Allocator* alloc, - int64 value_len, - int64 block_size) override { - embedding_mem_pool_ = - new EmbeddingMemoryPool(alloc, value_len, block_size); - } - - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for (auto it : value_ptr_list) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = it->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - //Mutex with other ImportOps - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < num_of_value_ptrs; i++) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = value_ptr_list[i]->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, 
new_value_ptr, old_value_ptr); } protected: - void SetTotalDims(int64 total_dims) override { - dram_->SetTotalDims(total_dims); - ssd_->SetTotalDims(total_dims); - } - - void CopyToGpuValuePtr( - ValuePtr* gpu_ptr, - ValuePtr* cpu_ptr, - int64 size) { - V* cpu_data_address = cpu_ptr->GetValue(0, 0); - V* gpu_data_address = gpu_ptr->GetValue(0, 0); - cudaMemcpy(gpu_data_address, cpu_data_address, - size * sizeof(V), cudaMemcpyHostToDevice); - memcpy(gpu_ptr->GetPtr(), - cpu_ptr->GetPtr(), - sizeof(FixedLengthHeader)); + int total_dim() override { + return hbm_feat_desc_->total_dim(); } void Restore(const std::string& name_string, @@ -539,6 +394,10 @@ class HbmDramSsdStorage : public MultiTierStorage { (int64*)restore_buff.freq_buffer); return s; } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override {} private: void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { V* memcpy_buffer_cpu = new V[size * value_len]; @@ -551,46 +410,30 @@ class HbmDramSsdStorage : public MultiTierStorage { (V*)gpu_alloc_->AllocateRaw( Allocator::kAllocatorAlignment, size * sizeof(V*)); - ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; - ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; - { - //Mutex with other Import Ops - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < size; i++) { - dram_->Get(ids[i], &cpu_value_ptrs[i]); - gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - gpu_value_ptrs[i]->SetPtr(val_ptr); - memcpy((char *)gpu_value_ptrs[i]->GetPtr(), - (char *)cpu_value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], 
&gpu_value_ptrs[i]); } } //Split from above for loop for minize the cost of mutex lock //TODO: Speed up with intra parallelism - std::vector*> invalid_value_ptrs; + for (int64 i = 0; i < size; i++) { memcpy(memcpy_buffer_cpu + i * value_len, - cpu_value_ptrs[i]->GetValue(emb_index, - Storage::GetOffset(emb_index)), - value_len * sizeof(V)); - Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); - if (!s.ok()) { - invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); - hbm_->Get(ids[i], &gpu_value_ptrs[i]); - } - gpu_value_ptrs[i]->SetInitialized(emb_index); - value_address[i] = gpu_value_ptrs[i]->GetValue( - emb_index, Storage::GetOffset(emb_index)); + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); } cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, size * value_len * sizeof(V), cudaMemcpyHostToDevice); cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), cudaMemcpyHostToDevice); - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate(invalid_value_ptrs); - } int block_dim = 128; void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, (void*)&value_len, (void*)&size}; @@ -611,10 +454,10 @@ class HbmDramSsdStorage : public MultiTierStorage { void BatchGetValuePtrs( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, std::vector>& copyback_cursor_list, - std::vector*>>& ssd_value_ptr_list, + std::vector>& ssd_value_ptr_list, std::vector>* not_found_cursor_list = nullptr) { int num_worker_threads = ctx.worker_threads->num_threads; IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); @@ -688,39 +531,32 @@ class HbmDramSsdStorage : public MultiTierStorage { void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursors, - 
std::list*>& ssd_value_ptrs, - int64 value_len) { + std::list& ssd_value_ptrs) { int64 total = copyback_cursors.size(); - std::vector*> gpu_value_ptrs(total); + std::vector gpu_value_ptrs(total); std::vector copyback_keys(total); std::vector memory_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = copyback_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursors.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)value_ptr_list[j]->GetPtr(), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - copyback_keys[i] = keys[*it]; - } + int64 i = 0; + auto it = copyback_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion(gpu_value_ptr, + dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; } MultiTierStorage::CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursors, - memory_index, gpu_value_ptrs, value_len); + memory_index, gpu_value_ptrs, hbm_feat_desc_->total_dim(), + hbm_feat_desc_, dram_feat_desc_); //Insert copyback ids to hbm hash table. 
auto do_insert = [this, copyback_keys, gpu_value_ptrs, @@ -730,12 +566,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status s = hbm_->TryInsert( copyback_keys[i], gpu_value_ptrs[i]); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - gpu_value_ptrs[i]->GetValue(0, 0)); - } - delete gpu_value_ptrs[i]; + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); } } @@ -752,34 +583,31 @@ class HbmDramSsdStorage : public MultiTierStorage { void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& not_found_cursors, int64 value_len) { int64 total = not_found_cursors.size(); if (total > 0) { - std::vector*>> insert_pairs(total); + std::vector> insert_pairs(total); std::vector cursor_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = not_found_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != not_found_cursors.cend(); ++it, ++i) { - int64 j = *it; - cursor_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - value_ptr_list[j] = gpu_value_ptr; - insert_pairs[i].first = keys[j]; - insert_pairs[i].second = value_ptr_list[j]; - } + + int64 i = 0; + auto it = not_found_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; } + hbm_feat_desc_->SetDefaultValues( + keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, ctx.gpu_device); + //Insert copyback ids to hbm hash table. 
auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index] (int64 start, int64 limit) { @@ -787,12 +615,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status s = hbm_->TryInsert( insert_pairs[i].first, insert_pairs[i].second); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - insert_pairs[i].second->GetValue(0, 0)); - } - delete insert_pairs[i].second; + hbm_->DestroyValuePtr(insert_pairs[i].second); hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); } } @@ -804,29 +627,28 @@ class HbmDramSsdStorage : public MultiTierStorage { } void AddCopyBackFlagToValuePtr( - ValuePtr** value_ptr, CopyBackFlag copyback_flag) { + void** value_ptr, CopyBackFlag copyback_flag) { int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; tmp = ((int64)*value_ptr) | tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } - void RemoveCopyBackFlagInValuePtr(ValuePtr** value_ptr) { + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; tmp = ((int64)*value_ptr) & tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } private: HbmStorageWithCpuKv* hbm_ = nullptr; DramStorage* dram_ = nullptr; SsdHashStorage* ssd_ = nullptr; - EmbeddingMemoryPool* embedding_mem_pool_; Allocator* gpu_alloc_; - Allocator* cpu_alloc_; BatchCache* dram_cache_; int64 dram_capacity_; - std::deque*> dram_value_ptr_out_of_date_; - mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ + std::deque dram_value_ptr_out_of_date_; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; const int copyback_flag_offset_bits_ = 60; }; } // embedding diff --git a/tensorflow/core/framework/embedding/hbm_dram_storage.h b/tensorflow/core/framework/embedding/hbm_dram_storage.h index 518c39287e0..d058d95f05b 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_storage.h +++ 
b/tensorflow/core/framework/embedding/hbm_dram_storage.h @@ -17,7 +17,6 @@ limitations under the License. #if GOOGLE_CUDA #define EIGEN_USE_GPU -#include "tensorflow/core/framework/embedding/lockless_hash_map_cpu.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/framework/embedding/hbm_storage_iterator.h" @@ -29,9 +28,6 @@ namespace tensorflow { using se::DeviceMemoryBase; using se::Stream; -template -class ValuePtr; - template class CheckpointLoader; @@ -41,27 +37,27 @@ namespace embedding { template class HbmDramStorage : public MultiTierStorage { public: - HbmDramStorage(const StorageConfig& sc, Allocator* gpu_alloc, - Allocator* cpu_alloc, LayoutCreator* lc, - const std::string& name) - : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name) { - hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc, lc); - StorageConfig storage_config = StorageConfig(); - storage_config.layout_type = LayoutType::NORMAL_CONTIGUOUS; - dram_ = new DramStorage(sc, cpu_alloc, - LayoutCreatorFactory::Create(storage_config), - new LocklessHashMapCPU(gpu_alloc)); + HbmDramStorage(const StorageConfig& sc, + Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), + MultiTierStorage(sc, name) { + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); } ~HbmDramStorage() override { MultiTierStorage::DeleteFromEvictionManager(); delete hbm_; delete dram_; + delete dram_feat_desc_; } TF_DISALLOW_COPY_AND_ASSIGN(HbmDramStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = hbm_->Get(key, value_ptr); if (s.ok()) { return s; @@ -76,9 +72,8 @@ class HbmDramStorage : public MultiTierStorage { void BatchGet(const EmbeddingVarContext& ctx, const K* keys, - 
ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) override { + void** value_ptr_list, + int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); @@ -87,18 +82,17 @@ class HbmDramStorage : public MultiTierStorage { copyback_cursor_list); CopyEmbeddingsFromDramToHbm( - ctx, keys, value_ptr_list, copyback_cursor_list[0], - value_len); + ctx, keys, value_ptr_list, copyback_cursor_list[0]); } - void Insert(K key, ValuePtr* value_ptr) override { + void Insert(K key, void** value_ptr) override { hbm_->Insert(key, value_ptr); } void BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_fountd_cursor_list) override { @@ -110,115 +104,22 @@ class HbmDramStorage : public MultiTierStorage { copyback_cursor_list, ¬_fountd_cursor_list); CopyEmbeddingsFromDramToHbm( - ctx, keys, value_ptr_list, copyback_cursor_list[0], - value_len); - + ctx, keys, value_ptr_list, copyback_cursor_list[0]); CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], value_len); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) override { if (to_dram) { - dram_->Insert(key, value_ptr, alloc_len); + dram_->CreateAndInsert(key, value_ptr); } else { - hbm_->Insert(key, value_ptr, alloc_len); + hbm_->CreateAndInsert(key, value_ptr); } } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(size); - { - mutex_lock l(memory_pool_mu_); - gpu_value_ptr->SetPtr(embedding_mem_pool_->Allocate()); - *value_ptr = gpu_value_ptr; - } - s = hbm_->TryInsert(key, *value_ptr); - if (s.ok()) { - return s; - } - // Insert Failed, 
key already exist - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate((*value_ptr)->GetValue(0, 0)); - } - delete *value_ptr; - return hbm_->Get(key, value_ptr); - } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - s = dram_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK; - return s; - } - - hbm_->Insert(key, value_ptr, size); - return Status::OK(); - } - - void CopyEmbeddingsFromCPUToGPU( - int total, const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, V* memcpy_buffer_gpu, - se::Stream* compute_stream, - EventMgr* event_mgr, - const DeviceBase::CpuWorkerThreads* worker_threads) override { - auto memcpy_buffer_cpu = TypedAllocator::Allocate(cpu_allocator(), - total * value_len, AllocationAttributes()); - int64* memory_index = new int64[total]; - int64 i = 0; - auto it = copyback_cursor.cbegin(); - { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursor.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)memcpy_address[j] - sizeof(FixedLengthHeader), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - } - } - //Split from above for loop for minize the cost of mutex lock - auto do_work = [memory_index, memcpy_address, - memcpy_buffer_cpu, gpu_value_ptrs, - value_len, this] (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - int j = memory_index[i]; - memcpy(memcpy_buffer_cpu + i * value_len, - memcpy_address[j], value_len * sizeof(V)); - } - }; - 
Shard(worker_threads->num_threads, worker_threads->workers, total, - 1000, do_work); - DeviceMemoryBase gpu_dst_ptr( - memcpy_buffer_gpu, total * value_len * sizeof(V)); - compute_stream->ThenMemcpy( - &gpu_dst_ptr, memcpy_buffer_cpu, total * value_len * sizeof(V)); - SyncWithEventMgr(compute_stream, event_mgr); - TypedAllocator::Deallocate( - cpu_allocator(), memcpy_buffer_cpu, total * value_len); - delete[] memory_index; + Status GetOrCreate(K key, void** value_ptr) override { + LOG(FATAL)<<"Stroage with HBM only suppotrs batch APIs."; } Status Remove(K key) override { @@ -270,25 +171,23 @@ class HbmDramStorage : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_dram_key_list; - std::vector*> value_ptr_list, tmp_dram_value_list; + std::vector value_ptr_list, tmp_dram_value_list; TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); HbmValueIterator hbm_value_iter( key_list, value_ptr_list, - emb_config.emb_index, Storage::alloc_len_, - gpu_alloc_); - - std::vector*> tmp_hbm_value_ptrs(value_ptr_list.size()); + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); + for (int64 i = 0; i < value_ptr_list.size(); i++) { - ValuePtr* value_ptr = hbm_->CreateValuePtr(value_len); - memcpy((char *)value_ptr->GetPtr(), - (char *)value_ptr_list[i]->GetPtr(), - sizeof(FixedLengthHeader)); - value_ptr->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - value_ptr->SetInitialized(emb_config.primary_emb_index); - tmp_hbm_value_ptrs[i] = value_ptr; - value_ptr_list[i] = value_ptr; + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq( + value_ptr, hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); } 
TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, @@ -306,54 +205,26 @@ class HbmDramStorage : public MultiTierStorage { { mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, &hbm_value_iter))); } - for (auto it: tmp_hbm_value_ptrs) { - delete it; - } - return Status::OK(); - } - - void CreateEmbeddingMemoryPool( - Allocator* alloc, - int64 value_len, - int64 block_size) override { - embedding_mem_pool_ = - new EmbeddingMemoryPool(alloc, value_len, block_size); - } - - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for (auto it : value_ptr_list) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = it->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - //Mutex with other ImportOps - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < num_of_value_ptrs; i++) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = value_ptr_list[i]->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); + for (auto value_ptr: value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); } } + return Status::OK(); } void BatchEviction() override { @@ -372,22 +243,31 @@ class HbmDramStorage : public MultiTierStorage { k_size = std::min(k_size, EvictionSize); size_t true_size = MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; std::vector keys; - std::vector*> value_ptrs; + std::vector 
hbm_value_ptrs; + std::vector dram_value_ptrs; for (int64 i = 0; i < true_size; ++i) { if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { keys.emplace_back(evic_ids[i]); - value_ptrs.emplace_back(value_ptr); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); } } - dram_->BatchCommit(keys, value_ptrs); - { - //Mutex with main thread - mutex_lock l_mem(memory_pool_mu_); - embedding_mem_pool_->Deallocate(value_ptrs); - } + + CopyEmbeddingFromHbmToDram( + hbm_value_ptrs, + dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); for (auto it : keys) { TF_CHECK_OK(hbm_->Remove(it)); } @@ -430,6 +310,16 @@ class HbmDramStorage : public MultiTierStorage { } } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + MultiTierStorage::Init(); + } + protected: Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, @@ -447,14 +337,14 @@ class HbmDramStorage : public MultiTierStorage { return s; } - void SetTotalDims(int64 total_dims) override { - dram_->SetTotalDims(total_dims); + int total_dim() override { + return hbm_feat_desc_->total_dim(); } private: void BatchGetValuePtrs( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, std::vector>& copyback_cursor_list, std::vector>* not_found_cursor_list = nullptr) { @@ -522,38 +412,31 @@ class HbmDramStorage : public MultiTierStorage { void CopyEmbeddingsFromDramToHbm(const 
EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, - std::list& copyback_cursors, - int64 value_len) { + void** value_ptr_list, + std::list& copyback_cursors) { int64 total = copyback_cursors.size(); - std::vector*> gpu_value_ptrs(total); + std::vector gpu_value_ptrs(total); std::vector copyback_keys(total); std::vector memory_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = copyback_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursors.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)value_ptr_list[j]->GetPtr(), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - copyback_keys[i] = keys[*it]; - } + int64 i = 0; + auto it = copyback_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion(gpu_value_ptr, + dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; } MultiTierStorage::CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursors, - memory_index, gpu_value_ptrs, value_len); + memory_index, gpu_value_ptrs, hbm_feat_desc_->total_dim(), + hbm_feat_desc_, dram_feat_desc_); //Insert copyback ids to hbm hash table. 
auto do_insert = [this, copyback_keys, gpu_value_ptrs, @@ -563,12 +446,7 @@ class HbmDramStorage : public MultiTierStorage { Status s = hbm_->TryInsert( copyback_keys[i], gpu_value_ptrs[i]); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - gpu_value_ptrs[i]->GetValue(0, 0)); - } - delete gpu_value_ptrs[i]; + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); } } @@ -580,34 +458,29 @@ class HbmDramStorage : public MultiTierStorage { void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& not_found_cursors, int64 value_len) { int64 total = not_found_cursors.size(); if (total > 0) { - std::vector*>> insert_pairs(total); + std::vector> insert_pairs(total); std::vector cursor_index(total); - //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = not_found_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != not_found_cursors.cend(); ++it, ++i) { - int64 j = *it; - cursor_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - value_ptr_list[j] = gpu_value_ptr; - insert_pairs[i].first = keys[j]; - insert_pairs[i].second = value_ptr_list[j]; - } + //Create Hbm ValuePtrs. + int64 i = 0; + auto it = not_found_cursors.cbegin(); + for ( ; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; } + hbm_feat_desc_->SetDefaultValues( + keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, ctx.gpu_device); + //Insert copyback ids to hbm hash table. 
auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index] (int64 start, int64 limit) { @@ -615,12 +488,7 @@ class HbmDramStorage : public MultiTierStorage { Status s = hbm_->TryInsert( insert_pairs[i].first, insert_pairs[i].second); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - insert_pairs[i].second->GetValue(0, 0)); - } - delete insert_pairs[i].second; + hbm_->DestroyValuePtr(insert_pairs[i].second); hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); } } @@ -632,16 +500,22 @@ class HbmDramStorage : public MultiTierStorage { } void AddCopyBackFlagToValuePtr( - ValuePtr** value_ptr, CopyBackFlag copyback_flag) { + void** value_ptr, CopyBackFlag copyback_flag) { int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; tmp = ((int64)*value_ptr) | tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } - void RemoveCopyBackFlagInValuePtr(ValuePtr** value_ptr) { + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; tmp = ((int64)*value_ptr) & tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); + } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { @@ -655,45 +529,30 @@ class HbmDramStorage : public MultiTierStorage { (V*)gpu_alloc_->AllocateRaw( Allocator::kAllocatorAlignment, size * sizeof(V*)); - ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; - ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; - { - //Mutex with other Import Ops - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < size; i++) { - dram_->Get(ids[i], &cpu_value_ptrs[i]); - gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - gpu_value_ptrs[i]->SetPtr(val_ptr); - memcpy((char 
*)gpu_value_ptrs[i]->GetPtr(), - (char *)cpu_value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], &gpu_value_ptrs[i]); } } //Split from above for loop for minize the cost of mutex lock //TODO: Speed up with intra parallelism - std::vector*> invalid_value_ptrs; + for (int64 i = 0; i < size; i++) { memcpy(memcpy_buffer_cpu + i * value_len, - cpu_value_ptrs[i]->GetValue(emb_index, - Storage::GetOffset(emb_index)), value_len * sizeof(V)); - Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); - if (!s.ok()) { - invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); - hbm_->Get(ids[i], &gpu_value_ptrs[i]); - } - gpu_value_ptrs[i]->SetInitialized(emb_index); - value_address[i] = gpu_value_ptrs[i]->GetValue( - emb_index, Storage::GetOffset(emb_index)); + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); } cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, size * value_len * sizeof(V), cudaMemcpyHostToDevice); cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), cudaMemcpyHostToDevice); - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate(invalid_value_ptrs); - } int block_dim = 128; void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, (void*)&value_len, (void*)&size}; @@ -714,9 +573,9 @@ class HbmDramStorage : public MultiTierStorage { private: HbmStorageWithCpuKv* hbm_ = nullptr; DramStorage* dram_ = nullptr; - EmbeddingMemoryPool* embedding_mem_pool_ = nullptr; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; 
Allocator* gpu_alloc_; - mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ const int copyback_flag_offset_bits_ = 60; }; } // embedding diff --git a/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h b/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h new file mode 100644 index 00000000000..a3603a61550 --- /dev/null +++ b/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h @@ -0,0 +1,122 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/embedding_memory_pool.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +namespace embedding { +template +class NormalFeatureDescriptorImpl; + +template +class HbmMultiTierFeatureDescriptorImpl + : public FeatureDescriptorImpl { + public: + HbmMultiTierFeatureDescriptorImpl( + Allocator* alloc, int64 slot_num, + bool need_record_freq, + bool need_record_version) + : dram_alloc_bytes_(sizeof(V*)), + hbm_alloc_(alloc), + dram_alloc_(ev_allocator()), + FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&dram_alloc_bytes_); + } + + ~HbmMultiTierFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = + FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&hbm_alloc_bytes_); + embedding_mem_pool_.reset( + new EmbeddingMemoryPool(hbm_alloc_, + hbm_alloc_bytes_ / sizeof(V), + 1024 * 1024 * 64)); + } + return is_compute_alloc_bytes; + } + + V* GetEmbedding(void *val, int emb_index) override { + return *((V**)val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + void* Allocate() override { + void* val = dram_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, dram_alloc_bytes_); + mutex_lock l(memory_pool_mu_); + *((V**)val) = embedding_mem_pool_->Allocate(); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void 
Deallocate(void* val) override { + mutex_lock l(memory_pool_mu_); + embedding_mem_pool_->Deallocate(*((V**)val)); + dram_alloc_->DeallocateRaw(val); + } + + void Deallocate(const std::vector& value_ptrs) override { + mutex_lock l(memory_pool_mu_); + for (auto ptr: value_ptrs) { + embedding_mem_pool_->Deallocate(*((V**)ptr)); + dram_alloc_->DeallocateRaw(ptr); + } + } + void SetDefaultValue(void* val, int64 key) override { + LOG(FATAL)<<"Can't call SetDefaultValue(void* val, int64 key," + <<"int default_value_len) in HbmMultiTierFeatureDescriptor."; + } + + void SetAllocator(Allocator* alloc) override { + hbm_alloc_ = alloc; + } + + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device); + + int data_bytes() override { + return dram_alloc_bytes_; + } + public: + friend class NormalFeatureDescriptorImpl; + protected: + int dram_alloc_bytes_; + int hbm_alloc_bytes_ = 0; + mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ + Allocator* hbm_alloc_; + Allocator* dram_alloc_; + std::unique_ptr> embedding_mem_pool_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/hbm_storage_iterator.h b/tensorflow/core/framework/embedding/hbm_storage_iterator.h index 36d331e74aa..31dc4459a13 100644 --- a/tensorflow/core/framework/embedding/hbm_storage_iterator.h +++ b/tensorflow/core/framework/embedding/hbm_storage_iterator.h @@ -28,10 +28,11 @@ class HbmValueIterator: public ValueIterator { public: HbmValueIterator( const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, int64 emb_index, int64 value_len, - Allocator* alloc) + Allocator* alloc, + FeatureDescriptor* feat_desc) : value_len_(value_len), alloc_(alloc) { int64 emb_offset = value_len_ * 
emb_index; @@ -40,7 +41,7 @@ class HbmValueIterator: public ValueIterator { for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { if (key_list[i] % kSavedPartitionNum == part_id) { value_parts_vec[part_id].emplace_back( - value_ptr_list[i]->GetValue(emb_index, emb_offset)); + feat_desc->GetEmbedding(value_ptr_list[i], emb_index)); break; } } diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 5d1f20b581a..3659187c825 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ #include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { @@ -24,9 +25,6 @@ namespace { const char* kInferenceMode = "INFERENCE_MODE"; } -template -class ValuePtr; - template class GPUHashTable; @@ -43,19 +41,19 @@ template class KVInterface { public: virtual ~KVInterface() {} - virtual Status Lookup(K key, ValuePtr** value_ptr) = 0; + virtual Status Lookup(K key, void** value_ptr) = 0; virtual Status Contains(K key) = 0; - virtual Status Insert(K key, const ValuePtr* value_ptr) = 0; + virtual Status Insert(K key, const void* value_ptr) = 0; virtual Status Remove(K key) = 0; virtual Status BatchLookup(const K* keys, size_t size, - ValuePtr** value_ptrs) { + void** value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookup in KVInterface."); } // KV Batch Insert virtual Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchInsert in KVInterface."); } @@ -66,27 +64,30 @@ class KVInterface { } virtual Status BatchLookupOrCreate(const K* keys, size_t size, - ValuePtr** 
value_ptrs) { + void** value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookupOrInsert in KVInterface."); } + virtual void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) { + LOG(FATAL)<<"Unimplemented for UpdateValuePtr in KVInterface."; + } + virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) = 0; + const std::vector& value_ptrs) = 0; // KV Size virtual int64 Size() const = 0; - virtual void SetTotalDims(int total_dims) {} - - virtual void FreeValuePtr(ValuePtr* value_ptr) {} + virtual void FreeValuePtr(void* value_ptr) {} - virtual Status Commit(K key, const ValuePtr* value_ptr) { + virtual Status Commit(K key, const void* value_ptr) { return Status::OK(); } virtual Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) = 0; + std::vector* value_ptr_list) = 0; virtual std::string DebugString() const = 0; diff --git a/tensorflow/core/framework/embedding/l2weight_shrink_policy.h b/tensorflow/core/framework/embedding/l2weight_shrink_policy.h index 2af6b58f94b..9b0ea8aba3f 100644 --- a/tensorflow/core/framework/embedding/l2weight_shrink_policy.h +++ b/tensorflow/core/framework/embedding/l2weight_shrink_policy.h @@ -19,28 +19,23 @@ limitations under the License. 
namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class L2WeightShrinkPolicy : public ShrinkPolicy { public: L2WeightShrinkPolicy(float l2_weight_threshold, int64 index, - int64 offset, - Allocator* alloc, + FeatureDescriptor* feat_desc, KVInterface* kv) : index_(index), - offset_(offset), kv_(kv), l2_weight_threshold_(l2_weight_threshold), - ShrinkPolicy(alloc) {} + ShrinkPolicy(feat_desc) {} TF_DISALLOW_COPY_AND_ASSIGN(L2WeightShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override { ShrinkPolicy::ReleaseValuePtrs(); FilterToDelete(shrink_args.value_len, @@ -50,9 +45,9 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { private: void FilterToDelete(int64 value_len, std::vector& key_list, - std::vector*>& value_list) { + std::vector& value_list) { for (int64 i = 0; i < key_list.size(); ++i) { - V* val = value_list[i]->GetValue(index_, offset_); + V* val = ShrinkPolicy::feat_desc_->GetEmbedding(value_list[i], index_); if (val != nullptr) { V l2_weight = (V)0.0; for (int64 j = 0; j < value_len; j++) { @@ -61,7 +56,7 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { l2_weight *= (V)0.5; if (l2_weight < (V)l2_weight_threshold_) { kv_->Remove(key_list[i]); - value_list[i] = (ValuePtr*)ValuePtrStatus::IS_DELETED; + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; ShrinkPolicy::EmplacePointer(value_list[i]); } } @@ -70,7 +65,7 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { private: int64 index_; - int64 offset_; + //int64 offset_; KVInterface* kv_; float l2_weight_threshold_; }; diff --git a/tensorflow/core/framework/embedding/layout_creator.h b/tensorflow/core/framework/embedding/layout_creator.h deleted file mode 100644 index 07d50451bf0..00000000000 --- a/tensorflow/core/framework/embedding/layout_creator.h +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright 2022 The DeepRec Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -======================================================================*/ -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ - -#include "tensorflow/core/framework/embedding/cache.h" -#include "tensorflow/core/framework/embedding/config.pb.h" -#include "tensorflow/core/framework/embedding/storage_config.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { -template -class ValuePtr; - -namespace embedding { -template -class LayoutCreator { - public: - virtual ValuePtr* Create(Allocator* alloc, size_t size) = 0; -}; - -template -class NormalLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalValuePtr(alloc, size); - } -}; - -template -class LightLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new LightValuePtr(alloc, size); - } -}; - -template -class NormalContiguousLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalContiguousValuePtr(alloc, size); - } -}; - -template -class NormalContiguousGPULayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalGPUValuePtr(alloc, size); - } -}; - -template -class CompactLayoutCreator : public LayoutCreator { - public: 
- ValuePtr* Create(Allocator* alloc, size_t size) override { - return new CompactValuePtr(alloc, size); - } -}; - -class LayoutCreatorFactory { - public: - template - static LayoutCreator* Create(const StorageConfig& sc) { - switch (sc.layout_type) { - case LayoutType::NORMAL: - static NormalLayoutCreator normal_creator; - return &normal_creator; - case LayoutType::LIGHT: - static LightLayoutCreator light_creator; - return &light_creator; - case LayoutType::NORMAL_CONTIGUOUS: - static NormalContiguousLayoutCreator normal_contiguous_creator; - return &normal_contiguous_creator; - case LayoutType::NORMAL_CONTIGUOUS_GPU: - static NormalContiguousGPULayoutCreator - normal_contiguous_gpu_creator; - return &normal_contiguous_gpu_creator; - case LayoutType::COMPACT: - static CompactLayoutCreator compact_creator; - return &compact_creator; - default: - static NormalLayoutCreator default_creator; - return &default_creator; - } - } -}; -} // embedding -} // tensorflow - -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ diff --git a/tensorflow/core/framework/embedding/leveldb_kv.h b/tensorflow/core/framework/embedding/leveldb_kv.h index 8ea1fa63fc2..e488ab3776d 100644 --- a/tensorflow/core/framework/embedding/leveldb_kv.h +++ b/tensorflow/core/framework/embedding/leveldb_kv.h @@ -17,9 +17,7 @@ limitations under the License. 
#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LEVELDB_KV_H_ #include "tensorflow/core/lib/io/path.h" - #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/lib/core/status.h" #include "leveldb/db.h" @@ -35,9 +33,6 @@ using leveldb::WriteBatch; using leveldb::WriteOptions; namespace tensorflow { -template -class ValuePtr; - namespace embedding { template @@ -76,28 +71,21 @@ class SizeCounter { template class LevelDBKV : public KVInterface { public: - LevelDBKV(std::string path) { + LevelDBKV(std::string path, FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { path_ = io::JoinPath(path, "level_db_" + std::to_string(Env::Default()->NowMicros()));; options_.create_if_missing = true; leveldb::Status s = leveldb::DB::Open(options_, path_, &db_); CHECK(s.ok()); counter_ = new SizeCounter(8); - new_value_ptr_fn_ = [] (size_t size) { - return new NormalContiguousValuePtr(ev_allocator(), size); - }; - total_dims_ = 0; - } - - void SetTotalDims(int total_dims) { - total_dims_ = total_dims; } ~LevelDBKV() override { delete db_; } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { std::string val_str; leveldb::Slice db_key((char*)(&key), sizeof(void*)); leveldb::ReadOptions options; @@ -106,8 +94,8 @@ class LevelDBKV : public KVInterface { return errors::NotFound( "Unable to find Key: ", key, " in LevelDB."); } else { - ValuePtr* val = new_value_ptr_fn_(total_dims_); - memcpy((int64 *)(val->GetPtr()), &val_str[0], val_str.length()); + void* val = feat_desc_->Allocate(); + memcpy((int64 *)val, &val_str[0], val_str.length()); *value_ptr = val; return Status::OK(); } @@ -126,22 +114,22 @@ class LevelDBKV : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { counter_->add(key, 1); return Status::OK(); } Status BatchInsert(const 
std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return BatchCommit(keys, value_ptrs); } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { WriteBatch batch; for (int i = 0; i < keys.size(); i++) { - std::string value_res((char*)value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader) + total_dims_ * sizeof(V)); + std::string value_res((char*)value_ptrs[i], + feat_desc_->data_bytes()); leveldb::Slice db_key((char*)(&keys[i]), sizeof(void*)); batch.Put(db_key, value_res); delete value_ptrs[i]; @@ -150,9 +138,9 @@ class LevelDBKV : public KVInterface { return Status::OK(); } - Status Commit(K key, const ValuePtr* value_ptr) override { - std::string value_res((char*)value_ptr->GetPtr(), - sizeof(FixedLengthHeader) + total_dims_ * sizeof(V)); + Status Commit(K key, const void* value_ptr) override { + std::string value_res((char*)value_ptr, + feat_desc_->data_bytes()); leveldb::Slice db_key((char*)(&key), sizeof(void*)); leveldb::Status s = db_->Put(WriteOptions(), db_key, value_res); if (!s.ok()){ @@ -176,22 +164,32 @@ class LevelDBKV : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { ReadOptions options; options.snapshot = db_->GetSnapshot(); leveldb::Iterator* it = db_->NewIterator(options); + void* dram_value_ptr = feat_desc_->Allocate(); for (it->SeekToFirst(); it->Valid(); it->Next()) { K key; memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); key_list->emplace_back(key); - ValuePtr* value_ptr = - new NormalGPUValuePtr(ev_allocator(), 1); - memcpy((char *)value_ptr->GetPtr(), + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc.data_bytes()); + 
memcpy(dram_value_ptr, it->value().ToString().data(), - sizeof(FixedLengthHeader)); + feat_desc_->data_bytes()); + hbm_feat_desc.SetFreq( + value_ptr, feat_desc_->GetFreq(dram_value_ptr)); + hbm_feat_desc.UpdateVersion( + value_ptr, feat_desc_->GetVersion(dram_value_ptr)); value_ptr_list->emplace_back(value_ptr); } delete it; + feat_desc_->Deallocate(dram_value_ptr); return Status::OK(); } @@ -199,8 +197,8 @@ class LevelDBKV : public KVInterface { return counter_->size(); } - void FreeValuePtr(ValuePtr* value_ptr) override { - delete value_ptr; + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); } std::string DebugString() const override{ @@ -212,8 +210,7 @@ class LevelDBKV : public KVInterface { SizeCounter* counter_; Options options_; std::string path_; - std::function*(size_t)> new_value_ptr_fn_; - int total_dims_; + FeatureDescriptor* feat_desc_; }; template @@ -223,10 +220,12 @@ class DBValueIterator: public ValueIterator { const std::vector& key_list, int64 emb_index, int64 value_len, - LevelDBKV* leveldb_kv) + LevelDBKV* leveldb_kv, + FeatureDescriptor* feat_desc) : value_len_(value_len), emb_index_(emb_index), - leveldb_kv_(leveldb_kv) { + leveldb_kv_(leveldb_kv), + feat_desc_(feat_desc) { int64 emb_offset = value_len_ * emb_index; std::vector> keys_parts_vec(kSavedPartitionNum); for (int64 i = 0; i < key_list.size(); i++) { @@ -251,8 +250,7 @@ class DBValueIterator: public ValueIterator { V* Next() { if (value_ptr_ != nullptr) { - value_ptr_->Destroy(ev_allocator()); - delete value_ptr_; + feat_desc_->Deallocate(value_ptr_); } K key = *(keys_iter_++); @@ -260,16 +258,17 @@ class DBValueIterator: public ValueIterator { if (!s.ok()) { LOG(FATAL)<<"Not found value in LevelDB when Save."; } - return value_ptr_->GetValue(emb_index_, value_len_ * emb_index_); + return feat_desc_->GetEmbedding(value_ptr_, emb_index_); } private: int64 value_len_; int64 emb_index_; LevelDBKV* leveldb_kv_; + FeatureDescriptor* feat_desc_; std::list 
keys_; typename std::list::const_iterator keys_iter_; - ValuePtr* value_ptr_ = nullptr; + void* value_ptr_ = nullptr; int64 key_cursor_ = 0; }; diff --git a/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h b/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h deleted file mode 100644 index 8dcea81d4a1..00000000000 --- a/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright 2022 The DeepRec Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-=======================================================================*/ - -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ -#if GOOGLE_CUDA -#define EIGEN_USE_GPU - -#include "sparsehash/dense_hash_map_lockless" -#include "tensorflow/core/framework/embedding/batch.h" -#include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/stream_executor.h" - -namespace tensorflow { -using se::DeviceMemoryBase; -using se::Stream; - -namespace embedding { - -template -class LocklessHashMapCPU : public KVInterface { - public: - LocklessHashMapCPU(Allocator* gpu_alloc): gpu_alloc_(gpu_alloc) { - hash_map_.max_load_factor(0.8); - hash_map_.set_empty_key_and_value(EMPTY_KEY_, nullptr); - hash_map_.set_counternum(16); - hash_map_.set_deleted_key(DELETED_KEY_); - cudaEventCreate(&is_finish_); - } - - ~LocklessHashMapCPU() override { - cudaEventDestroy(is_finish_); - } - - Status Lookup(K key, ValuePtr** value_ptr) override { - auto iter = hash_map_.find_wait_free(key); - if (iter.first == EMPTY_KEY_) { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } else { - *value_ptr = iter.second; - return Status::OK(); - } - } - - Status Contains(K key) override { - auto iter = hash_map_.find_wait_free(key); - if (iter.first == EMPTY_KEY_) { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } else { - return Status::OK(); - } - } - - Status Insert(K key, const ValuePtr* value_ptr) override { - auto iter = hash_map_.insert_lockless( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); - // insert fail, exist key - if ((*(iter.first)).second != value_ptr){ - return errors::AlreadyExists( - "already exists Key: ", key, " in LocklessHashMap."); - } else { - return Status::OK(); - } - } - 
- // Other Method - int64 Size() const override { - return hash_map_.size_lockless(); - } - - // Remove KV - Status Remove(K key) override { - if (hash_map_.erase_lockless(key)) { - return Status::OK(); - } else { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } - } - - void SetTotalDims(int total_dims) override { - total_dims_ = total_dims; - } - - void AppendToValuePtrQueue(ValuePtr* old_value_ptr) { - //A parameter that can be adjusted in the future - if (value_ptr_out_of_date_.size() > CAP_INVALID_VALUEPTR) { - ValuePtr* value_ptr = value_ptr_out_of_date_.front(); - delete value_ptr; - value_ptr_out_of_date_.pop_front(); - } - value_ptr_out_of_date_.emplace_back(old_value_ptr); - } - - Status Commit(K key, const ValuePtr* value_ptr) override { - ValuePtr* cpu_value_ptr = - new NormalContiguousValuePtr(ev_allocator(), total_dims_); - cudaMemcpy((char *)cpu_value_ptr->GetPtr() + sizeof(FixedLengthHeader), - *(char **)((char*)value_ptr->GetPtr() + sizeof(FixedLengthHeader)), - total_dims_ * sizeof(V), - cudaMemcpyDeviceToHost); - memcpy((char *)cpu_value_ptr->GetPtr(), - (char*)value_ptr->GetPtr(), sizeof(FixedLengthHeader)); - auto iter = hash_map_.insert_lockless(std::move( - std::pair*>(key, - const_cast*>(cpu_value_ptr)))); - if ((*(iter.first)).second != cpu_value_ptr) { - AppendToValuePtrQueue((*(iter.first)).second); - (*(iter.first)).second = cpu_value_ptr; - } - return Status::OK(); - } - - Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { - int batch_size = keys.size(); - Allocator* cpu_alloc = cpu_allocator(); - V** value_address = (V **)cpu_alloc->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V*) * batch_size); - V** dev_value_address; - V* batch_data_place; - V* dev_batch_data_place; - dev_value_address = (V**)gpu_alloc_->AllocateRaw( - Allocator::kAllocatorAlignment, batch_size * sizeof(V*)); - dev_batch_data_place = (V*)gpu_alloc_->AllocateRaw( - 
Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dims_); - batch_data_place = (V *)cpu_alloc->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dims_); - - // Copy GPU addresses V* - for(int i = 0;i < batch_size;++i) { - value_address[i] = - *(V **)((char*)value_ptrs[i]->GetPtr() + sizeof(FixedLengthHeader)); - } - - cudaMemcpyAsync(dev_value_address, value_address, - sizeof(V*) * batch_size, - cudaMemcpyHostToDevice); - - // Launch Kernel,Copy data to continuous place - int block_dim = 128; - void* args[] = { (void*)&dev_value_address, - (void*)&dev_batch_data_place, (void*)&total_dims_, - (void*)&batch_size}; - - cudaLaunchKernel((void *)BatchCopy, - (batch_size * total_dims_ + block_dim - 1) / block_dim, - block_dim, args, 0, NULL); - - cudaMemcpyAsync(batch_data_place, dev_batch_data_place, - sizeof(V) * batch_size * total_dims_, - cudaMemcpyDeviceToHost); - - cudaEventRecord(is_finish_); - cudaEventSynchronize(is_finish_); - - // Copy data to ValuePtrs in memory;Insert it into hashmap - for(int i = 0; i < batch_size; ++i) { - ValuePtr* cpu_value_ptr = - new NormalContiguousValuePtr(ev_allocator(), total_dims_); - memcpy((char *)cpu_value_ptr->GetPtr() + sizeof(FixedLengthHeader), - &batch_data_place[i * total_dims_], total_dims_ * sizeof(V)); - memcpy((char *)cpu_value_ptr->GetPtr(), - (char *)value_ptrs[i]->GetPtr(), sizeof(FixedLengthHeader)); - auto iter = hash_map_.insert_lockless(std::move( - std::pair*>(keys[i], - const_cast*>(cpu_value_ptr)))); - if ((*(iter.first)).second != cpu_value_ptr) { - AppendToValuePtrQueue((*(iter.first)).second); - (*(iter.first)).second = cpu_value_ptr; - } - } - - gpu_alloc_->DeallocateRaw(dev_value_address); - gpu_alloc_->DeallocateRaw(dev_batch_data_place); - - cpu_alloc->DeallocateRaw(batch_data_place); - cpu_alloc->DeallocateRaw(value_address); - - return Status::OK(); - } - - Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { - std::pair*> 
*hash_map_dump; - int64 bucket_count; - auto it = hash_map_.GetSnapshot(); - hash_map_dump = it.first; - bucket_count = it.second; - for (int64 j = 0; j < bucket_count; j++) { - if (hash_map_dump[j].first != EMPTY_KEY_ && - hash_map_dump[j].first != DELETED_KEY_) { - key_list->emplace_back(hash_map_dump[j].first); - value_ptr_list->emplace_back(hash_map_dump[j].second); - } - } - free(hash_map_dump); - return Status::OK(); - } - - std::string DebugString() const override { - LOG(INFO) << "map info size:" << Size() - << "map info bucket_count:" << hash_map_.bucket_count() - << "map info load_factor:" << hash_map_.load_factor() - << "map info max_load_factor:" << hash_map_.max_load_factor() - << "map info min_load_factor:" << hash_map_.min_load_factor(); - return ""; - } - - private: - typedef google::dense_hash_map_lockless* > - LockLessHashMap; - static const int EMPTY_KEY_ = -1; - static const int DELETED_KEY_ = -2; - static constexpr int CAP_INVALID_VALUEPTR = 200000; - LockLessHashMap hash_map_; - std::deque*> value_ptr_out_of_date_; - int total_dims_; - Allocator* gpu_alloc_; - cudaEvent_t is_finish_; -}; -} // namespace embedding -} // namespace tensorflow - -#endif //GOOGLE_CUDA -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc b/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc index de275183d22..9745ab5fcc3 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc +++ b/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc @@ -15,8 +15,7 @@ limitations under the License. 
#if GOOGLE_CUDA #define EIGEN_USE_GPU #include "tensorflow/core/framework/embedding/multi_tier_storage.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" -#include "tensorflow/core/framework/embedding/batch.h" +#include "tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/op_kernel.h" @@ -44,11 +43,13 @@ template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursor, const std::vector& memory_index, - const std::vector*>& gpu_value_ptrs, - int value_len) { + const std::vector& gpu_value_ptrs, + int value_len, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc) { if (copyback_cursor.size() > 0) { int total = copyback_cursor.size(); //Alocate memcpy buffer on CPU and GPU. @@ -64,11 +65,13 @@ void MultiTierStorage::CopyEmbeddingsFromDramToHbm( auto do_work = [memory_index, memcpy_buffer_cpu, value_ptr_list, gpu_value_ptrs, + dram_feat_desc, value_len, this] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { int j = memory_index[i]; memcpy(memcpy_buffer_cpu + i * value_len, - value_ptr_list[j]->GetValue(0, 0), value_len * sizeof(V)); + dram_feat_desc->GetEmbedding(value_ptr_list[j], 0), + value_len * sizeof(V)); value_ptr_list[j] = gpu_value_ptrs[i]; } }; @@ -96,8 +99,7 @@ void MultiTierStorage::CopyEmbeddingsFromDramToHbm( for (; it != copyback_cursor.cend(); ++it, ++i) { // Get the cursor int64 cursor = *it; - gpu_value_ptrs[i]->SetInitialized(0); - value_address[i] = gpu_value_ptrs[i]->GetValue(0, 0); + value_address[i] = hbm_feat_desc->GetEmbedding(gpu_value_ptrs[i], 0); } DeviceMemoryBase gpu_addr_dst_ptr(dev_value_address, total * sizeof(V*)); compute_stream->ThenMemcpy(&gpu_addr_dst_ptr, value_address, total * sizeof(V*)); @@ -119,16 +121,71 @@ void 
MultiTierStorage::CopyEmbeddingsFromDramToHbm( } #define REGISTER_KERNELS(ktype, vtype) \ template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( \ - const EmbeddingVarContext&, const ktype*, ValuePtr**,\ + const EmbeddingVarContext&, const ktype*, void**,\ std::list&, const std::vector&,\ - const std::vector*>&, int); + const std::vector&, int, FeatureDescriptor*,\ + FeatureDescriptor*); #define REGISTER_KERNELS_ALL(type) \ REGISTER_KERNELS(int32, type); \ REGISTER_KERNELS(int64, type) #define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +template +void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( + const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + if (init_cursor.size() > 0) { + int64 total = init_cursor.size(); + TValue** value_address = nullptr; + value_address = TypedAllocator::Allocate(cpu_allocator(), total * 2, + AllocationAttributes()); + TValue** default_value_address = value_address + total; + TValue** dev_value_address = nullptr; + dev_value_address = + TypedAllocator::Allocate(hbm_alloc_, total * 2, AllocationAttributes()); + TValue** dev_default_value_address = dev_value_address + total; + for (int emb_index = 0; emb_index < FeatureDescriptorImpl::slot_infos_.size(); emb_index++) { + int64 i = 0; + auto it = init_cursor.cbegin(); + for (; it != init_cursor.cend(); ++it, ++i) { + value_address[i] = GetEmbedding(value_ptrs[*it], emb_index); + default_value_address[i] = + FeatureDescriptorImpl::GetDefaultValuePtr(emb_index, keys[i]); + } + DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * 2 * sizeof(TValue*)); + compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, + total * 2 * sizeof(TValue*)); + int block_dim = 128; + int value_len = 
FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len; + TF_CHECK_OK(GpuLaunchKernel( + embedding::CopyEmbedding, + (total * value_len + block_dim - 1) / block_dim, + block_dim, 0, gpu_device.stream(), dev_default_value_address, + dev_value_address, value_len, total)); + SyncWithEventMgr(compute_stream, event_mgr); + } + + TypedAllocator::Deallocate(hbm_alloc_, dev_value_address, total * 2); + TypedAllocator::Deallocate(cpu_allocator(), value_address, total * 2); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( \ + const ktype*, const std::list&, void**,\ + se::Stream*, EventMgr*, const Eigen::GpuDevice& gpu_device); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU #undef REGISTER_KERNELS_ALL #undef REGISTER_KERNELS } // namespace embedding diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index 8239d109e64..7955322aca6 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -31,10 +31,11 @@ limitations under the License. 
#include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/core/status.h" -namespace tensorflow { -template -class ValuePtr; +#if GOOGLE_CUDA +#include "tensorflow/core/framework/embedding/batch.h" +#endif +namespace tensorflow { template class EmbeddingVar; @@ -54,22 +55,10 @@ class MultiTierStorage : public Storage { TF_DISALLOW_COPY_AND_ASSIGN(MultiTierStorage); - void SetAllocLen(int64 value_len, int slot_num) override { - while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. - Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); - - cache_capacity_ = Storage::storage_config_.size[0] - / (Storage::total_dims_ * sizeof(V)); - ready_eviction_ = true; - } - Storage::flag_.clear(std::memory_order_release); + virtual void Init() override { + cache_capacity_ = Storage::storage_config_.size[0] + / (total_dim() * sizeof(V)); + ready_eviction_ = true; } int64 CacheSize() const override { @@ -90,13 +79,13 @@ class MultiTierStorage : public Storage { } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { LOG(FATAL)<<"BatchCommit isn't supported by MultiTierStorage."; return Status::OK(); } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { LOG(FATAL)<<"Can't get snapshot of MultiTierStorage."; } @@ -104,7 +93,7 @@ class MultiTierStorage : public Storage { int total, const K* keys, const std::list& copyback_cursor, V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -128,17 +117,6 @@ class 
MultiTierStorage : public Storage { return; } - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - return; - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - return; - } - void Schedule(std::function fn) override { cache_thread_pool_->Schedule(std::move(fn)); } @@ -223,50 +201,50 @@ class MultiTierStorage : public Storage { } return s; } - - virtual void SetTotalDims(int64 total_dims) = 0; + virtual int total_dim() = 0; void DeleteFromEvictionManager() { eviction_manager_->DeleteStorage(this); } - void ReleaseValuePtrs(std::deque*>& value_ptrs, - Allocator* allocator) { + void ReleaseValuePtrs(std::deque& value_ptrs, + FeatureDescriptor* feat_desc) { constexpr int CAP_INVALID_VALUEPTR = 64 * 1024; if (value_ptrs.size() > CAP_INVALID_VALUEPTR) { int64 num_of_deleted_value_ptrs = value_ptrs.size() - CAP_INVALID_VALUEPTR; for (int i = 0; i < num_of_deleted_value_ptrs; i++) { - ValuePtr* value_ptr = value_ptrs.front(); - value_ptr->Destroy(allocator); - delete value_ptr; + void* value_ptr = value_ptrs.front(); + feat_desc->Deallocate(value_ptr); value_ptrs.pop_front(); } } } - void ReleaseInvalidValuePtr(Allocator* allocator) { - ReleaseValuePtrs(value_ptr_out_of_date_, allocator); + void ReleaseInvalidValuePtr(FeatureDescriptor* feat_desc) { + ReleaseValuePtrs(value_ptr_out_of_date_, feat_desc); } - void KeepInvalidValuePtr(ValuePtr* value_ptr) { + void KeepInvalidValuePtr(void* value_ptr) { value_ptr_out_of_date_.emplace_back(value_ptr); } #if GOOGLE_CUDA void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& context, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursors, const std::vector& memory_index, - const std::vector*>& gpu_value_ptrs, - int value_len); + const std::vector& gpu_value_ptrs, + int value_len, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc); #endif //GOOGL_CUDA private: virtual 
Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) {} protected: - std::deque*> value_ptr_out_of_date_; + std::deque value_ptr_out_of_date_; BatchCache* cache_ = nullptr; EvictionManager* eviction_manager_; @@ -281,6 +259,70 @@ class MultiTierStorage : public Storage { std::string name_; std::vector mu_list_; }; + +#if GOOGLE_CUDA +template +void CopyEmbeddingFromHbmToDram( + const std::vector& hbm_value_ptrs, + const std::vector& dram_value_ptrs, + Allocator* gpu_alloc, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc) { + int batch_size = hbm_value_ptrs.size(); + V** dev_value_address; + + dev_value_address = (V**)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, batch_size * sizeof(V*)); + Allocator* cpu_alloc = ev_allocator(); + V** value_address = (V**)cpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V*) * batch_size); + + V* batch_data_place; + V* dev_batch_data_place; + int total_dim = dram_feat_desc->total_dim(); + dev_batch_data_place = (V*)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim); + batch_data_place = (V *)cpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim); + // Copy GPU addresses V* + for(int i = 0; i < batch_size; ++i) { + value_address[i] = hbm_feat_desc->GetEmbedding(hbm_value_ptrs[i], 0); + } + cudaMemcpyAsync(dev_value_address, value_address, + sizeof(V*) * batch_size, + cudaMemcpyHostToDevice); + + // Launch Kernel,Copy data to continuous place + int block_dim = 128; + void* args[] = { (void*)&dev_value_address, + (void*)&dev_batch_data_place, (void*)&total_dim, + (void*)&batch_size}; + + cudaLaunchKernel((void *)BatchCopy, + (batch_size * total_dim + block_dim - 1) / block_dim, + block_dim, args, 0, NULL); + + cudaMemcpyAsync(batch_data_place, dev_batch_data_place, + sizeof(V) * batch_size * total_dim, + cudaMemcpyDeviceToHost); + + cudaEvent_t is_finish_; + cudaEventCreate(&is_finish_); + 
cudaEventRecord(is_finish_); + cudaEventSynchronize(is_finish_); + cudaEventDestroy(is_finish_); + + for(int i = 0; i < batch_size; ++i) { + memcpy(dram_feat_desc->GetEmbedding(dram_value_ptrs[i], 0), + &batch_data_place[i * total_dim], total_dim * sizeof(V)); + } + + cpu_alloc->DeallocateRaw(value_address); + cpu_alloc->DeallocateRaw(batch_data_place); + gpu_alloc->DeallocateRaw(dev_value_address); + gpu_alloc->DeallocateRaw(dev_batch_data_place); +} +#endif //GOOGL_CUDA } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/normal_feature_descriptor.h b/tensorflow/core/framework/embedding/normal_feature_descriptor.h new file mode 100644 index 00000000000..817b33d058b --- /dev/null +++ b/tensorflow/core/framework/embedding/normal_feature_descriptor.h @@ -0,0 +1,134 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +#if GOOGLE_CUDA +template +class HbmMultiTierFeatureDescriptorImpl; +#endif + +template +class NormalFeatureDescriptorImpl: public FeatureDescriptorImpl { + public: + NormalFeatureDescriptorImpl(Allocator* alloc, int64 slot_num, + bool need_record_freq, + bool need_record_version) + : alloc_bytes_(0), + alloc_(alloc), + FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version) {} + + NormalFeatureDescriptorImpl(NormalFeatureDescriptorImpl* feat_desc_impl) + : alloc_(feat_desc_impl->alloc_), + FeatureDescriptorImpl(feat_desc_impl) {} + + NormalFeatureDescriptorImpl( + HbmMultiTierFeatureDescriptorImpl* feat_desc_impl) + : alloc_bytes_(0), + alloc_(feat_desc_impl->dram_alloc_), + FeatureDescriptorImpl(feat_desc_impl) {} + + ~NormalFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&alloc_bytes_); + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + return is_compute_alloc_bytes; + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + FeatureDescriptorImpl::SetSlotInfo(feat_desc_impl); + FeatureDescriptorImpl::ComputeAllocBytes(&alloc_bytes_); + FeatureDescriptorImpl::SetFreqAndVersionOffset(&alloc_bytes_); + return true; + } + + V* GetEmbedding(void *val, int emb_index) override { + return reinterpret_cast(val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + 
void* Allocate() override { + void* val = alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void Deallocate(void* val) override { + alloc_->DeallocateRaw(val); + } + + void Deallocate(const std::vector& value_ptrs) override { + for (auto val: value_ptrs) { + Deallocate(val); + } + } + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy(val_ptr, value, + sizeof(V) * FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + + void SetDefaultValue(void* val, int64 index) override { + for (int i = 0; i < FeatureDescriptorImpl::slot_infos_.size(); i++) { + V* val_ptr = GetEmbedding(val, i); + FeatureDescriptorImpl::SetDefaultValue((void*)val_ptr, i, index); + } + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + LOG(FATAL)<<"Can't call SetDefaultValue(const K*, const std::list&," + <<"void**, se::Stream*, EventMgr*, const Eigen::GpuDevice&)" + <<" in HbmMultiTierFeatureDescriptor."; + } +#endif + + void SetAllocator(Allocator* alloc) override { + alloc_ = alloc; + } + + int data_bytes() override { + return alloc_bytes_; + } + + private: + int alloc_bytes_; + Allocator* alloc_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/nullable_filter_policy.h b/tensorflow/core/framework/embedding/nullable_filter_policy.h index 0c5ce80886a..7e3ace0063d 100644 --- a/tensorflow/core/framework/embedding/nullable_filter_policy.h +++ b/tensorflow/core/framework/embedding/nullable_filter_policy.h @@ -30,19 +30,21 @@ template class NullableFilterPolicy : public FilterPolicy { using FilterPolicy::ev_; using 
FilterPolicy::config_; - using FilterPolicy::LookupOrCreateEmbInternal; public: NullableFilterPolicy(const EmbeddingConfig& config, - EV* ev, embedding::Storage* storage) : - FilterPolicy(config, ev), storage_(storage) {} + EV* ev, embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) + : storage_(storage), feat_desc_(feat_desc), + FilterPolicy(config, ev) {} Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); if (s.ok()) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + V* mem_val = feat_desc_->GetEmbedding( + value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_ptr, @@ -57,17 +59,17 @@ class NullableFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, keys, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; if (value_ptr != nullptr) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_ptr; } @@ -85,65 +87,55 @@ class NullableFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs, + const K* keys, void** value_ptrs, int64 num_of_keys) { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> 
not_found_cursor_list(num_worker_threads + 1); ev_->BatchLookupOrCreateKey(ctx, keys, value_ptrs, num_of_keys, not_found_cursor_list); - std::vector var_ptrs(num_of_keys); - auto do_work = [this, value_ptrs, &var_ptrs] - (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - var_ptrs[i] = ev_->LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - } - }; - auto worker_threads = ctx.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, num_of_keys, - 1000, do_work); - - ev_->SetDefaultValueOfNewFeatures( - keys, num_of_keys, - not_found_cursor_list[0], - var_ptrs.data(), ctx.compute_stream, - ctx.event_mgr, ctx.gpu_device); } #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { *is_filter = true; - return ev_->LookupOrCreateKey(key, val); + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + storage_->Insert(key, value_ptr); + s = Status::OK(); + } + feat_desc_->AddFreq(*value_ptr, count); + return s; } - int64 GetFreq(K key, ValuePtr* value_ptr) override { - if (storage_->GetLayoutType() != LayoutType::LIGHT) { - return value_ptr->GetFreq(); - }else { - return 0; - } + Status LookupKey(K key, void** val, + bool* is_filter, int64 count) override { + 
*is_filter = true; + return ev_->LookupKey(key, val); + } + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); } int64 GetFreq(K key) override { - if (storage_->GetLayoutType() != LayoutType::LIGHT) { - ValuePtr* value_ptr = nullptr; - TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetFreq(); - }else { + if (!config_.is_save_freq()) return 0; - } + void* value_ptr = nullptr; + TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); + return feat_desc_->GetFreq(value_ptr); } Status Restore(int64 key_num, int bucket_num, int64 partition_id, @@ -161,27 +153,30 @@ class NullableFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); + int64 import_freq = 0; + int64 import_version = -1; + if (config_.filter_freq !=0 || ev_->IsMultiLevel() || config_.record_freq) { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); + import_version = version_buff[i]; } - LookupOrCreateEmbInternal(is_filter, to_dram, i, value_len, - value_ptr, value_buff, key_buff); + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); } return Status::OK(); } - bool is_admit(K key, ValuePtr* value_ptr) override { + bool is_admit(K key, void* value_ptr) override { return true; } private: embedding::Storage* storage_; + embedding::FeatureDescriptor* feat_desc_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/shrink_policy.h b/tensorflow/core/framework/embedding/shrink_policy.h index ea063a113a3..a8d0d9ada75 100644 --- a/tensorflow/core/framework/embedding/shrink_policy.h +++ b/tensorflow/core/framework/embedding/shrink_policy.h @@ -15,14 +15,11 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { - -template -class ValuePtr; - class Allocator; namespace embedding { @@ -40,31 +37,29 @@ struct ShrinkArgs { template class ShrinkPolicy { public: - ShrinkPolicy(Allocator* alloc): alloc_(alloc) {} + ShrinkPolicy(FeatureDescriptor* feat_desc): feat_desc_(feat_desc) {} virtual ~ShrinkPolicy() {} TF_DISALLOW_COPY_AND_ASSIGN(ShrinkPolicy); virtual void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) = 0; protected: - void EmplacePointer(ValuePtr* value_ptr) { + void EmplacePointer(void* value_ptr) { to_delete_.emplace_back(value_ptr); } void ReleaseValuePtrs() { for (auto it : to_delete_) { - it->Destroy(alloc_); - delete it; + feat_desc_->Deallocate(it); } to_delete_.clear(); } protected: - std::vector*> to_delete_; - private: - Allocator* alloc_; + std::vector to_delete_; + FeatureDescriptor* feat_desc_; }; template @@ -74,7 +69,7 @@ class NonShrinkPolicy: public ShrinkPolicy { TF_DISALLOW_COPY_AND_ASSIGN(NonShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override {} }; } // embedding diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index f9de65df588..be08afd7f50 100644 --- a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -24,7 +24,6 @@ limitations under the License. 
#endif // GOOGLE_CUDA #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/framework/embedding/l2weight_shrink_policy.h" -#include "tensorflow/core/framework/embedding/layout_creator.h" #include "tensorflow/core/framework/embedding/leveldb_kv.h" #include "tensorflow/core/framework/embedding/ssd_hash_kv.h" #include "tensorflow/core/framework/embedding/storage_config.h" @@ -32,9 +31,6 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -62,24 +58,22 @@ class HbmDramSsdStorage; template class SingleTierStorage : public Storage { public: - SingleTierStorage(const StorageConfig& sc, Allocator* alloc, - KVInterface* kv, LayoutCreator* lc) - : kv_(kv), alloc_(alloc), layout_creator_(lc), + SingleTierStorage(const StorageConfig& sc, + KVInterface* kv, FeatureDescriptor* feat_desc) + : kv_(kv), feat_desc_(feat_desc), Storage(sc) { if (sc.embedding_config.steps_to_live != 0) { shrink_policy_ = new GlobalStepShrinkPolicy( sc.embedding_config.steps_to_live, - alloc_, + feat_desc_, kv_); } else if (sc.embedding_config.l2_weight_threshold != -1.0) { shrink_policy_ = new L2WeightShrinkPolicy( sc.embedding_config.l2_weight_threshold, sc.embedding_config.primary_emb_index, - Storage::GetOffset( - sc.embedding_config.primary_emb_index), - alloc_, + feat_desc_, kv_); } else { shrink_policy_ = new NonShrinkPolicy(); @@ -89,11 +83,10 @@ class SingleTierStorage : public Storage { ~SingleTierStorage() override { mutex_lock l(Storage::mu_); std::vector key_list; - std::vector*> value_ptr_list; + std::vector value_ptr_list; kv_->GetSnapshot(&key_list, &value_ptr_list); for (auto value_ptr : value_ptr_list) { - value_ptr->Destroy(alloc_); - delete value_ptr; + feat_desc_->Deallocate(value_ptr); } delete kv_; delete shrink_policy_; @@ -101,7 +94,7 @@ class SingleTierStorage : public Storage { TF_DISALLOW_COPY_AND_ASSIGN(SingleTierStorage); - Status 
Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { return kv_->Lookup(key, value_ptr); } @@ -109,47 +102,45 @@ class SingleTierStorage : public Storage { return kv_->Contains(key); } - virtual void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) override { do { - *value_ptr = layout_creator_->Create(alloc_, alloc_len); + *value_ptr = feat_desc_->Allocate(); Status s = kv_->Insert(key, *value_ptr); if (s.ok()) { break; } else { - (*value_ptr)->Destroy(alloc_); - delete *value_ptr; + feat_desc_->Deallocate(*value_ptr); } } while (!(kv_->Lookup(key, value_ptr)).ok()); } - virtual void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in SingleTireStorage."; + virtual void Insert(K key, void** value_ptr) override { + do { + Status s = kv_->Insert(key, *value_ptr); + if (s.ok()) { + break; + } else { + feat_desc_->Deallocate(*value_ptr); + } + } while (!(kv_->Lookup(key, value_ptr)).ok()); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = kv_->Lookup(key, value_ptr); if (s.ok()) { return s; } - *value_ptr = layout_creator_->Create(alloc_, size); + *value_ptr = feat_desc_->Allocate(); s = kv_->Insert(key, *value_ptr); if (s.ok()) { return s; } // Insert Failed, key already exist - (*value_ptr)->Destroy(alloc_); - delete *value_ptr; + feat_desc_->Deallocate(*value_ptr); return kv_->Lookup(key, value_ptr); } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - return GetOrCreate(key, value_ptr, size); - } Status Remove(K key) override { return kv_->Remove(key); @@ -180,7 +171,7 @@ class SingleTierStorage : public Storage { int total, const K* keys, const std::list& copyback_cursor, V** 
memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -198,13 +189,13 @@ class SingleTierStorage : public Storage { } virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { LOG(FATAL) << "Unsupport BatchCommit in Storage: " << typeid(this).name(); return Status::OK(); } - virtual Status Commit(K keys, const ValuePtr* value_ptr) { + virtual Status Commit(K keys, const void* value_ptr) { LOG(FATAL) << "Unsupport Commit in Storage: " << typeid(this).name(); return Status::OK(); @@ -222,19 +213,12 @@ class SingleTierStorage : public Storage { return; } - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - return; - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - return; - } + virtual void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override {} Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { mutex_lock l(Storage::mu_); return kv_->GetSnapshot(key_list, value_ptr_list); } @@ -247,7 +231,7 @@ class SingleTierStorage : public Storage { ShrinkArgs& shrink_args, int64 value_len, V* default_value) override { - std::vector*> value_ptr_list; + std::vector value_ptr_list; std::vector key_list_tmp; TF_CHECK_OK(kv_->GetSnapshot( &key_list_tmp, &value_ptr_list)); @@ -255,30 +239,16 @@ class SingleTierStorage : public Storage { if (emb_config.is_primary()) { Shrink(key_list_tmp, value_ptr_list, shrink_args, value_len); } - TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list_tmp, - value_ptr_list))); + value_ptr_list, + SingleTierStorage::feat_desc_))); return Status::OK(); } - void SetAllocLen(int64 value_len, int slot_num) override { - 
while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. - Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); - } - Storage::flag_.clear(std::memory_order_release); - } - bool IsMultiLevel() override { return false; } @@ -299,16 +269,22 @@ class SingleTierStorage : public Storage { LOG(FATAL) << "Unsupport Schedule in SingleTierStorage."; } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + kv_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + protected: - virtual void SetTotalDims(int64 total_dims) = 0; + virtual void* CreateValuePtr() { + return feat_desc_->Allocate(); + } - virtual ValuePtr* CreateValuePtr(int64 size) { - return layout_creator_->Create(alloc_, size); + virtual void DestroyValuePtr(void* value_ptr) { + feat_desc_->Deallocate(value_ptr); } - virtual void DestroyValuePtr(ValuePtr* value_ptr) { - value_ptr->Destroy(alloc_); - delete value_ptr; + FeatureDescriptor* feature_descriptor() { + return feat_desc_; } protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, @@ -324,7 +300,7 @@ class SingleTierStorage : public Storage { } virtual void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) { mutex_lock l(Storage::mu_); @@ -339,31 +315,40 @@ class SingleTierStorage : public Storage { KVInterface* kv_; ShrinkPolicy* shrink_policy_; Allocator* alloc_; - LayoutCreator* layout_creator_; + FeatureDescriptor* feat_desc_; }; template class DramStorage : public SingleTierStorage { public: - DramStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, - KVInterface* kv) - : SingleTierStorage(sc, 
alloc, kv, lc) {} + DramStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), feat_desc) {} ~DramStorage() override {} Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { return SingleTierStorage::kv_->BatchCommit(keys, value_ptrs); } - Status TryInsert(K key, ValuePtr* value_ptr) { + Status TryInsert(K key, void* value_ptr) { return SingleTierStorage::kv_->Insert(key, value_ptr); } - Status Commit(K keys, const ValuePtr* value_ptr) override{ + Status Commit(K keys, const void* value_ptr) override{ return SingleTierStorage::kv_->Commit(keys, value_ptr); } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + void* value_ptr = SingleTierStorage::feat_desc_->Allocate(freq); + SingleTierStorage::Insert(key, &value_ptr); + SingleTierStorage::feat_desc_->SetValue(value_ptr, emb_index, value); + SingleTierStorage::feat_desc_->SetFreq(value_ptr, freq); + SingleTierStorage::feat_desc_->UpdateVersion(value_ptr, version); + } TF_DISALLOW_COPY_AND_ASSIGN(DramStorage); public: @@ -375,12 +360,8 @@ class DramStorage : public SingleTierStorage { friend class HbmDramSsdStorage; #endif protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); - } - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -395,9 +376,10 @@ class DramStorage : public SingleTierStorage { template class HbmStorage : public SingleTierStorage { public: - HbmStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new GPUHashMapKV(sc.embedding_config, alloc), lc) { + HbmStorage(const StorageConfig& sc, Allocator* gpu_allocator, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new GPUHashMapKV( + 
sc.embedding_config, gpu_allocator), feat_desc) { } ~HbmStorage() override {} @@ -488,48 +470,27 @@ class HbmStorage : public SingleTierStorage { gpu_kv->Import(key_import, value_import, device, emb_config); return Status::OK(); } - - void SetTotalDims(int64 total_dims) override {} }; template class HbmStorageWithCpuKv: public SingleTierStorage { public: - HbmStorageWithCpuKv(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + HbmStorageWithCpuKv(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~HbmStorageWithCpuKv() override {} - void Insert(K key, ValuePtr* value_ptr) override { - do { - Status s = SingleTierStorage::kv_->Insert(key, value_ptr); - if (s.ok()) { - break; - } else { - value_ptr->Destroy(SingleTierStorage::alloc_); - delete value_ptr; - } - } while (!(SingleTierStorage::kv_->Lookup(key, &value_ptr)).ok()); - } - - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - SingleTierStorage::Insert(key, value_ptr, alloc_len, to_dram); - } - - Status TryInsert(K key, ValuePtr* value_ptr) { + Status TryInsert(K key, void* value_ptr) { return SingleTierStorage::kv_->Insert(key, value_ptr); } public: friend class HbmDramStorage; friend class HbmDramSsdStorage; protected: - void SetTotalDims(int64 total_dims) override {} - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -544,28 +505,25 @@ class HbmStorageWithCpuKv: public SingleTierStorage { template class PmemMemkindStorage : public SingleTierStorage { public: - PmemMemkindStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + PmemMemkindStorage(const StorageConfig& sc, + FeatureDescriptor* 
feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~PmemMemkindStorage() override {} TF_DISALLOW_COPY_AND_ASSIGN(PmemMemkindStorage); - - protected: - void SetTotalDims(int64 total_dims) override {} }; template class PmemLibpmemStorage : public SingleTierStorage { public: - PmemLibpmemStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + PmemLibpmemStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~PmemLibpmemStorage() override {} - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -573,10 +531,8 @@ class PmemLibpmemStorage : public SingleTierStorage { protected: friend class DramPmemStorage; - void SetTotalDims(int64 total_dims) override {} - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -590,15 +546,15 @@ class PmemLibpmemStorage : public SingleTierStorage { template class LevelDBStore : public SingleTierStorage { public: - LevelDBStore(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LevelDBKV(sc.path), lc) { + LevelDBStore(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LevelDBKV(sc.path, feat_desc), feat_desc) { } ~LevelDBStore() override {} TF_DISALLOW_COPY_AND_ASSIGN(LevelDBStore); - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -608,29 +564,25 @@ class LevelDBStore : public SingleTierStorage { LevelDBKV* leveldb_kv = reinterpret_cast*>(SingleTierStorage::kv_); return new DBValueIterator( - key_list, 
emb_index, value_len, leveldb_kv); + key_list, emb_index, value_len, + leveldb_kv, SingleTierStorage::feat_desc_); } public: friend class DramLevelDBStore; - - protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); - } }; template class SsdHashStorage : public SingleTierStorage { public: - SsdHashStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new SSDHashKV(sc.path, alloc), lc) { + SsdHashStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new SSDHashKV(sc.path, feat_desc), feat_desc) { } ~SsdHashStorage() override {} TF_DISALLOW_COPY_AND_ASSIGN(SsdHashStorage); - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -691,8 +643,9 @@ class SsdHashStorage : public SingleTierStorage { #endif protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); + void Init() override { + dynamic_cast*>( + SingleTierStorage::kv_)->Init(); } }; } // embedding diff --git a/tensorflow/core/framework/embedding/ssd_hash_kv.h b/tensorflow/core/framework/embedding/ssd_hash_kv.h index 8040421233e..f51c6904a50 100644 --- a/tensorflow/core/framework/embedding/ssd_hash_kv.h +++ b/tensorflow/core/framework/embedding/ssd_hash_kv.h @@ -25,17 +25,12 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/ssd_record_descriptor.h" #include "tensorflow/core/framework/embedding/emb_file_creator.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/util/env_var.h" namespace tensorflow { - -template -class ValuePtr; - namespace embedding { class EmbPosition { public: @@ -115,55 +110,6 @@ class SSDIterator { } } - virtual void Key(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - memcpy((char*)val, &((file_map_[f_id])[curr_vec_].first), dim); - } - - virtual void Value(char* val, int64 dim, int64 value_offset) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, dim, - posi->offset_ + value_offset + sizeof(FixedLengthHeader)); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_ + - value_offset + sizeof(FixedLengthHeader), dim); - } - } - - virtual void Freq(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, sizeof(FixedLengthHeader), - posi->offset_); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_, - sizeof(FixedLengthHeader)); - } - *((int64*)val) = - reinterpret_cast(val)->GetFreqCounter(); - } - - virtual void Version(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, sizeof(FixedLengthHeader), - posi->offset_); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_, - sizeof(FixedLengthHeader)); - } - *((int64*)val) = - 
reinterpret_cast(val)->GetGlobalStep(); - } - virtual K Key() { int64 f_id = file_id_vec_[curr_file_]; return (file_map_[f_id])[curr_vec_].first; @@ -192,8 +138,9 @@ class SSDIterator { template class SSDHashKV : public KVInterface { public: - explicit SSDHashKV(const std::string& path, Allocator* alloc) - : alloc_(alloc) { + explicit SSDHashKV(const std::string& path, + FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { path_ = io::JoinPath( path, "ssd_kv_" + std::to_string(Env::Default()->NowMicros()) + "_"); hash_map_.max_load_factor(0.8); @@ -205,9 +152,6 @@ class SSDHashKV : public KVInterface { evict_file_set_.set_counternum(16); evict_file_set_.set_deleted_key(DELETED_KEY); - new_value_ptr_fn_ = [this](size_t size) { - return new NormalContiguousValuePtr(alloc_, size); - }; is_async_compaction_ = true; TF_CHECK_OK(ReadBoolFromEnvVar("TF_SSDHASH_ASYNC_COMPACTION", true, &is_async_compaction_)); @@ -224,7 +168,7 @@ class SSDHashKV : public KVInterface { "Use Sync Compactor in SSDHashKV of Multi-tier Embedding Storage!"; compaction_fn_ = [this](){Compaction();}; check_buffer_fn_ = [this](){CheckBuffer();}; - save_kv_fn_ = [this](K key, const ValuePtr* value_ptr, + save_kv_fn_ = [this](K key, const void* value_ptr, bool is_compaction=false) { SaveKV(key, value_ptr, is_compaction); }; @@ -233,7 +177,7 @@ class SSDHashKV : public KVInterface { "Use Async Compactor in SSDHashKV of Multi-tier Embedding Storage!"; compaction_fn_ = [](){}; check_buffer_fn_ = [this](){CheckBufferAsync();}; - save_kv_fn_ = [this](K key, const ValuePtr* value_ptr, + save_kv_fn_ = [this](K key, const void* value_ptr, bool is_compaction=false) { SaveKVAsync(key, value_ptr, is_compaction); }; @@ -244,9 +188,8 @@ class SSDHashKV : public KVInterface { } } - void SetTotalDims(int total_dims) override { - total_dims_ = total_dims; - val_len_ = sizeof(FixedLengthHeader) + total_dims_ * sizeof(V); + void Init() { + val_len_ = feat_desc_->data_bytes(); max_app_count_ = BUFFER_SIZE / 
val_len_; write_buffer_ = new char[BUFFER_SIZE]; unsigned int max_key_count = 1 + int(BUFFER_SIZE / val_len_); @@ -334,18 +277,18 @@ class SSDHashKV : public KVInterface { return Status::OK(); } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { auto iter = hash_map_.find_wait_free(key); if (iter.first == EMPTY_KEY) { return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV."); } else { - ValuePtr* val = new_value_ptr_fn_(total_dims_); + void* val = feat_desc_->Allocate(); EmbPosition* posi = iter.second; if (posi->flushed_) { - emb_files_[posi->version_]->Read((char*)(val->GetPtr()), + emb_files_[posi->version_]->Read((char*)val, val_len_, posi->offset_); } else { - memcpy((char*)val->GetPtr(), + memcpy((char*)val, write_buffer_ + posi->buffer_offset_, val_len_); } *value_ptr = val; @@ -363,17 +306,17 @@ class SSDHashKV : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { return Status::OK(); } Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return BatchCommit(keys, value_ptrs); } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { compaction_fn_(); __sync_fetch_and_add(&total_app_count_, keys.size()); for (int i = 0; i < keys.size(); i++) { @@ -384,7 +327,7 @@ class SSDHashKV : public KVInterface { return Status::OK(); } - Status Commit(K key, const ValuePtr* value_ptr) override { + Status Commit(K key, const void* value_ptr) override { compaction_fn_(); __sync_fetch_and_add(&total_app_count_, 1); check_buffer_fn_(); @@ -402,7 +345,7 @@ class SSDHashKV : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { return Status::OK(); } @@ -467,8 +410,8 
@@ class SSDHashKV : public KVInterface { int64 Size() const override { return hash_map_.size_lockless(); } - void FreeValuePtr(ValuePtr* value_ptr) override { - delete value_ptr; + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); } private: @@ -555,10 +498,10 @@ class SSDHashKV : public KVInterface { } void AppendToWriteBuffer(size_t curr_buffer_offset, K key, - const ValuePtr* value_ptr) { + const void* value_ptr) { current_offset_ += val_len_; memcpy(write_buffer_ + curr_buffer_offset, - (char*)value_ptr->GetPtr(), val_len_); + (char*)value_ptr, val_len_); key_buffer_[buffer_cur_] = key; ++buffer_cur_; } @@ -582,7 +525,7 @@ class SSDHashKV : public KVInterface { return flag; } - void SaveKV(K key, const ValuePtr* value_ptr, + void SaveKV(K key, const void* value_ptr, bool is_compaction = false) { size_t curr_buffer_offset = buffer_cur_ * val_len_; EmbPosition* ep = new EmbPosition(current_offset_, current_version_, @@ -608,7 +551,7 @@ class SSDHashKV : public KVInterface { } } - void SaveKVAsync(K key, const ValuePtr* value_ptr, + void SaveKVAsync(K key, const void* value_ptr, bool is_compaction = false) { size_t curr_buffer_offset = buffer_cur_ * val_len_; EmbPosition* ep = new EmbPosition(current_offset_, evict_version_, @@ -681,21 +624,21 @@ class SSDHashKV : public KVInterface { } void MoveToNewFile() { - ValuePtr* val = new_value_ptr_fn_(total_dims_); + void* val = feat_desc_->Allocate(); for (auto it : evict_file_map_) { EmbFile* file = emb_files_[it.first]; total_app_count_ -= file->InvalidCount(); file->MapForRead(); for (auto it_vec : it.second) { EmbPosition* posi = it_vec.second; - file->ReadWithMemcpy((char*)(val->GetPtr()), val_len_, + file->ReadWithMemcpy((char*)val, val_len_, posi->offset_); CheckBuffer(); SaveKV(it_vec.first, val, true); } file->UnmapForRead(); } - delete val; + feat_desc_->Deallocate(val); } void MoveToNewFileAsync() { @@ -825,11 +768,10 @@ class SSDHashKV : public KVInterface { char* 
write_buffer_ = nullptr; K* key_buffer_ = nullptr; bool is_async_compaction_; - Allocator* alloc_ = nullptr; + FeatureDescriptor* feat_desc_; int total_dims_; std::string path_; - std::function*(size_t)> new_value_ptr_fn_; typedef google::dense_hash_map_lockless LockLessHashMap; LockLessHashMap hash_map_; @@ -857,7 +799,7 @@ class SSDHashKV : public KVInterface { std::function compaction_fn_; std::function check_buffer_fn_; - std::function*, bool)> save_kv_fn_; + std::function save_kv_fn_; EmbFileCreator* emb_file_creator_ = nullptr; }; template diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index bb949183492..1ffb435054b 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -40,9 +40,6 @@ using GPUDevice = Eigen::GpuDevice; template class CheckpointLoader; -template -class ValuePtr; - template class EmbeddingVar; @@ -57,9 +54,6 @@ class BundleReader; template struct EmbeddingVarContext; -namespace { - const int kSavedPartitionNum = 1000; -} namespace embedding { template @@ -67,42 +61,40 @@ class Storage { friend class CheckpointLoader; public: explicit Storage(const StorageConfig& storage_config) - : storage_config_(storage_config) {} + : storage_config_(storage_config) { + initialize_value_.resize(storage_config.embedding_config.slot_num + 1); + } virtual ~Storage() {} TF_DISALLOW_COPY_AND_ASSIGN(Storage); - virtual Status Get(K key, ValuePtr** value_ptr) = 0; + virtual Status Get(K key, void** value_ptr) = 0; #if GOOGLE_CUDA virtual void BatchGet(const EmbeddingVarContext& ctx, const K* key, - ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) {} + void** value_ptr_list, + int64 num_of_keys) {} virtual void BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* key, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_found_cursor_list) {} #endif //GOOGLE_CUDA virtual 
Status Contains(K key) = 0; - virtual void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) = 0; - virtual void Insert(K key, ValuePtr* value_ptr) = 0; - virtual void SetAllocLen(int64 value_len, int slot_num) = 0; + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) = 0; + virtual void Insert(K key, void** value_ptr) = 0; + virtual void Init() {} virtual void SetValueLen(int64 value_len) {} - virtual Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) = 0; - virtual Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) = 0; + virtual Status GetOrCreate(K key, void** value_ptr) = 0; virtual int LookupTier(K key) const = 0; virtual Status Remove(K key) = 0; virtual int64 Size() const = 0; virtual int64 Size(int level) const = 0; virtual Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) = 0; + std::vector* value_ptr_list) = 0; virtual Status Save( const string& tensor_name, const string& prefix, @@ -113,7 +105,7 @@ class Storage { V* default_value) = 0; virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) = 0; + const std::vector& value_ptrs) = 0; virtual Status Eviction(K* evict_ids, int64 evict_size) = 0; @@ -121,7 +113,7 @@ class Storage { int total, const K* keys, const std::list& copyback_cursor, V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -149,25 +141,11 @@ class Storage { Allocator* alloc, int64 value_len, int64 block_size) = 0; - virtual void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) = 0; - virtual void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, int64 num_of_value_ptrs) = 0; inline mutex* get_mutex() { return &mu_; } inline int64 GetAllocLen() { return alloc_len_; } inline int64 GetOffset(int64 index) { return alloc_len_ * index; } inline 
int64 GetTotalDims() { return total_dims_; } - inline int64 ComputeAllocLen(int64 value_len) { - if (LayoutType::COMPACT == storage_config_.layout_type) { - return value_len; - } else { - return (value_len * sizeof(V) % 16 == 0) - ? value_len - : value_len + (16 - (sizeof(V) * value_len) % 16) / sizeof(V); - } - } - inline LayoutType GetLayoutType() { return storage_config_.layout_type; } inline embedding::StorageType GetStorageType() { return storage_config_.type; } inline std::string GetStoragePath() { return storage_config_.path; } inline embedding::CacheStrategy @@ -183,7 +161,7 @@ class Storage { } inline void Insert(const std::vector& keys, - ValuePtr** value_ptrs) { + void** value_ptrs) { for (size_t i = 0; i < keys.size(); i++) { Insert(keys[i], value_ptrs[i]); } @@ -211,6 +189,13 @@ class Storage { reset_version, reader); restorer.RestoreCkpt(emb_config, device); }; + + virtual void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) = 0; + + virtual void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) = 0; protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, @@ -227,12 +212,7 @@ class Storage { const std::string& ssd_emb_file_name, EmbeddingVar* ev, RestoreSSDBuffer& restore_buff) { - int64 alloc_len = Storage::ComputeAllocLen(value_len); - auto* alloc = ev->GetAllocator(); for (int64 i = 0; i < restore_buff.num_of_keys; i++) { - ValuePtr* value_ptr = nullptr; - ev->LookupOrCreateKey(restore_buff.key_list_buf[i], &value_ptr); - value_ptr->SetInitialized(emb_index); int64 file_id = restore_buff.key_file_id_list_buf[i]; int64 key_offset = restore_buff.key_offset_list_buf[i]; // Read data from embedding files on SSD. 
Data are stored in @@ -240,32 +220,29 @@ class Storage { std::stringstream ss; ss << ssd_emb_file_name << "/" << file_id << ".emb"; int fd = open(ss.str().data(), O_RDONLY); + EmbeddingConfig& emb_config = storage_config_.embedding_config; + FeatureDescriptor normal_feat_desc( + emb_config.block_num, emb_config.slot_num + 1, + ev_allocator(), StorageType::DRAM, true, + true, {false, 0}); + void* value_ptr = normal_feat_desc.Allocate(); char* file_addr = (char*)mmap(nullptr, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1) + + normal_feat_desc.data_bytes() + key_offset, PROT_READ, MAP_PRIVATE, fd, 0); - - NormalContiguousValuePtr tmp_value_ptr(alloc, - alloc_len * (emb_slot_num + 1)); - void* ptr = tmp_value_ptr.GetPtr(); - memcpy(ptr, file_addr + key_offset, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1)); + memcpy(value_ptr, file_addr + key_offset, + normal_feat_desc.data_bytes()); munmap(file_addr, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1) + + normal_feat_desc.data_bytes() + key_offset); close(fd); // Copy Data to ValuePtr, data of slots are set by primary here. 
- for (int j = 0; j < emb_slot_num + 1; j++) { - V* value = tmp_value_ptr.GetValue(j, alloc_len * j); - if (value != nullptr) { - value_ptr->GetOrAllocate(alloc, value_len, value, j, alloc_len * j); - } - } - value_ptr->SetFreq(tmp_value_ptr.GetFreq()); - value_ptr->SetStep(tmp_value_ptr.GetStep()); + int64 import_freq = normal_feat_desc.GetFreq(value_ptr); + int64 import_version = normal_feat_desc.GetVersion(value_ptr); + V* value = normal_feat_desc.GetEmbedding(value_ptr, emb_index); + Import(restore_buff.key_list_buf[i], value, + import_freq, import_version, emb_index); + normal_feat_desc.Deallocate(value_ptr); } return Status::OK(); } @@ -273,10 +250,11 @@ class Storage { private: void GeneratePartitionedCkptData( const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, EmbeddingVarCkptData* partitioned_ckpt_data, const EmbeddingConfig& emb_config, - V* default_value) { + V* default_value, + FeatureDescriptor* feat_desc) { std::vector> ev_ckpt_data_parts(kSavedPartitionNum); @@ -293,7 +271,43 @@ class Storage { ev_ckpt_data_parts[part_id].Emplace( key_list[i], value_ptr_list[i], emb_config, default_value, - GetOffset(emb_config.emb_index), + feat_desc, + is_save_freq, + is_save_version, + save_unfiltered_features); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + void GeneratePartitionedCkptData( + const std::vector& key_list, + const std::vector& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data, + const EmbeddingConfig& emb_config, + V* default_value, + const std::vector*>& feat_desc) { + std::vector> + ev_ckpt_data_parts(kSavedPartitionNum); + + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar( + "TF_EV_SAVE_FILTERED_FEATURES", true, &save_unfiltered_features)); + + bool is_save_freq = emb_config.is_save_freq(); + bool is_save_version = emb_config.is_save_version(); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id 
= 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + int feat_desc_type = (int64)value_ptr_list[i] >> kDramFlagOffset; + ev_ckpt_data_parts[part_id].Emplace( + key_list[i], value_ptr_list[i], + emb_config, default_value, + feat_desc[feat_desc_type], is_save_freq, is_save_version, save_unfiltered_features); @@ -333,12 +347,33 @@ class Storage { int64 value_len, V* default_value, const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, + FeatureDescriptor* feat_desc, + ValueIterator* value_iter = nullptr) { + EmbeddingVarCkptData partitioned_ckpt_data; + GeneratePartitionedCkptData(key_list, value_ptr_list, + &partitioned_ckpt_data, emb_config, + default_value, feat_desc); + Status s = + partitioned_ckpt_data.ExportToCkpt( + tensor_name, writer, value_len, value_iter); + return Status::OK(); + } + + Status SaveToCheckpoint( + const string& tensor_name, + BundleWriter* writer, + const EmbeddingConfig& emb_config, + int64 value_len, + V* default_value, + const std::vector& key_list, + const std::vector& value_ptr_list, + const std::vector*>& feat_desc, ValueIterator* value_iter = nullptr) { EmbeddingVarCkptData partitioned_ckpt_data; GeneratePartitionedCkptData(key_list, value_ptr_list, &partitioned_ckpt_data, emb_config, - default_value); + default_value, feat_desc); Status s = partitioned_ckpt_data.ExportToCkpt( tensor_name, writer, value_len, value_iter); @@ -366,6 +401,7 @@ class Storage { mutex mu_; std::atomic_flag flag_ = ATOMIC_FLAG_INIT; + std::vector initialize_value_; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/storage_config.h b/tensorflow/core/framework/embedding/storage_config.h index 85e44879dcb..23babc9ef08 100644 --- a/tensorflow/core/framework/embedding/storage_config.h +++ b/tensorflow/core/framework/embedding/storage_config.h @@ -17,13 +17,11 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/embedding_config.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" namespace tensorflow { namespace embedding { struct StorageConfig { StorageConfig() : type(StorageType::DEFAULT), path(""), - layout_type(LayoutType::NORMAL), cache_strategy(CacheStrategy::LFU) { size = {1<<30,1<<30,1<<30,1<<30}; } @@ -31,32 +29,14 @@ struct StorageConfig { StorageConfig(StorageType t, const std::string& p, const std::vector& s, - const std::string& layout, const EmbeddingConfig& ec, const CacheStrategy cache_strategy_ = CacheStrategy::LFU) - : type(t), - path(p), - embedding_config(ec), - cache_strategy(cache_strategy_) { - if ("normal" == layout) { - layout_type = LayoutType::NORMAL; - } else if ("light" == layout) { - layout_type = LayoutType::LIGHT; - } else if ("normal_contiguous" == layout){ - layout_type = LayoutType::NORMAL_CONTIGUOUS; - } else if ("normal_contiguous_gpu" == layout){ - layout_type = LayoutType::NORMAL_CONTIGUOUS_GPU; - } else if ("compact" == layout){ - layout_type = LayoutType::COMPACT; - } else { - LOG(WARNING) << "Unknown layout: " - << layout << ", use LayoutType::NORMAL by default."; - layout_type = LayoutType::NORMAL; - } - size = s; - } + : type(t), + path(p), + size(s), + embedding_config(ec), + cache_strategy(cache_strategy_) {} StorageType type; - LayoutType layout_type; std::string path; std::vector size; CacheStrategy cache_strategy; diff --git a/tensorflow/core/framework/embedding/storage_factory.h b/tensorflow/core/framework/embedding/storage_factory.h index 10d2d52b83f..c585b058470 100644 --- a/tensorflow/core/framework/embedding/storage_factory.h +++ b/tensorflow/core/framework/embedding/storage_factory.h @@ -16,7 +16,6 @@ limitations under the License. 
#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ #include "tensorflow/core/framework/embedding/config.pb.h" -#include "tensorflow/core/framework/embedding/layout_creator.h" #include "tensorflow/core/framework/embedding/dram_leveldb_storage.h" #include "tensorflow/core/framework/embedding/dram_pmem_storage.h" #include "tensorflow/core/framework/embedding/dram_ssd_storage.h" @@ -34,50 +33,41 @@ class StorageFactory { public: template static Storage* Create(const StorageConfig& sc, - Allocator* gpu_allocator, const string& name) { - auto layout_creator = LayoutCreatorFactory::Create(sc); - + Allocator* gpu_allocator, FeatureDescriptor* feat_desc, + const string& name) { switch (sc.type) { case StorageType::DRAM: - return new DramStorage(sc, ev_allocator(), - layout_creator, new LocklessHashMap()); + return new DramStorage(sc, feat_desc); case StorageType::PMEM_MEMKIND: - return new PmemMemkindStorage(sc, pmem_allocator(), - layout_creator); + feat_desc->SetAllocator(pmem_allocator()); + return new PmemMemkindStorage(sc, feat_desc); case StorageType::PMEM_LIBPMEM: - return new PmemLibpmemStorage(sc, - experimental_pmem_allocator(sc.path, sc.size[0]), - layout_creator); + feat_desc->SetAllocator( + experimental_pmem_allocator(sc.path, sc.size[0])); + return new PmemLibpmemStorage(sc, feat_desc); case StorageType::DRAM_PMEM: - return new DramPmemStorage(sc, ev_allocator(), - experimental_pmem_allocator(sc.path, sc.size[0]), - layout_creator, name); + return new DramPmemStorage(sc, + feat_desc, name); case StorageType::LEVELDB: case StorageType::DRAM_LEVELDB: - return new DramLevelDBStore(sc, ev_allocator(), - layout_creator, name); + return new DramLevelDBStore(sc, feat_desc, name); case StorageType::SSDHASH: case StorageType::DRAM_SSDHASH: - return new DramSsdHashStorage(sc, ev_allocator(), - layout_creator, name); + return new DramSsdHashStorage(sc, feat_desc, name); case StorageType::HBM: #if GOOGLE_CUDA - return new HbmStorage(sc, gpu_allocator, - 
layout_creator); + return new HbmStorage(sc, gpu_allocator, feat_desc); #endif // GOOGLE_CUDA case StorageType::HBM_DRAM: #if GOOGLE_CUDA - return new HbmDramStorage(sc, gpu_allocator, - ev_allocator(), layout_creator, name); + return new HbmDramStorage(sc, gpu_allocator, feat_desc, name); #endif // GOOGLE_CUDA case StorageType::HBM_DRAM_SSDHASH: #if GOOGLE_CUDA - return new HbmDramSsdStorage(sc, gpu_allocator, - ev_allocator(), layout_creator, name); + return new HbmDramSsdStorage(sc, gpu_allocator, feat_desc, name); #endif // GOOGLE_CUDA default: - return new DramStorage(sc, ev_allocator(), - layout_creator, new LocklessHashMap()); + return new DramStorage(sc, feat_desc); } } }; diff --git a/tensorflow/core/framework/embedding/value_ptr.h b/tensorflow/core/framework/embedding/value_ptr.h deleted file mode 100644 index ca7d234ed61..00000000000 --- a/tensorflow/core/framework/embedding/value_ptr.h +++ /dev/null @@ -1,647 +0,0 @@ -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ - -#include -#include -#include -#include - -#include "tensorflow/core/framework/typed_allocator.h" -#if GOOGLE_CUDA -#include -#endif // GOOGLE_CUDA - -namespace tensorflow { - -enum class LayoutType { - LIGHT, - NORMAL, - LEVELDB, - NORMAL_CONTIGUOUS, - NORMAL_CONTIGUOUS_GPU, - COMPACT, -}; - -namespace { -constexpr int COLUMN_BITSET_BYTES = 5; -constexpr int COLUMN_BITSET_SIZE = COLUMN_BITSET_BYTES * 8; - -struct MetaHeader { - unsigned char embed_num; - unsigned char value_type; - unsigned char header_size; - unsigned char column_bitset[COLUMN_BITSET_BYTES]; - - static const int kEmbeddingNumStartIndex = 0; - static const int kValueTypeStartIndex = - kEmbeddingNumStartIndex + sizeof(char); - static const int kHeaderSizeStartIndex = - kValueTypeStartIndex + sizeof(char); - static const int kColumnBitsetIndex = - kHeaderSizeStartIndex + sizeof(char); - - inline unsigned int GetEmbeddingNum() { - return (unsigned int) 
embed_num; - } - - inline void SetEmbeddingNum(size_t s) { - embed_num = (unsigned char)s; - } - - inline std::bitset GetColumnBitset() { - unsigned long meta = ((unsigned long*)this)[0]; - std::bitset bs(meta >> (8 * kColumnBitsetIndex)); - return bs; - } - - inline void SetColumnBitset(const std::bitset& bs, - unsigned int embnum) { - ((unsigned long*)(this))[0] = - (bs.to_ulong() << (8 * kColumnBitsetIndex)) | - (header_size << (8 * kHeaderSizeStartIndex)) | - (value_type << (8 * kValueTypeStartIndex)) | - (embnum << (8 * kEmbeddingNumStartIndex)); - } - - inline unsigned int GetHeaderSize() { - return (unsigned int) header_size; - } - - inline void SetHeaderSize(size_t size) { - header_size = (unsigned char)size; - } - - inline void SetLayoutType(LayoutType vt) { - value_type = (unsigned char)vt; - } - - inline LayoutType GetLayoutType() { - return (LayoutType)value_type; - } -}; - -struct LightHeader { -/*__________________________________________________________________________________________ - | | | | | embedding | slot | - | number of | valueptr | header | each bit a V* | V* | V* | - | embedding | type | size | 1 valid | actually pointer | actually pointer |... - | columns | | | 0 no-valid | by alloctor | by alloctor | - | (8 bits) | (8 bits) | (8 bits) | (40 bits) | (8 bytes) | (8 bytes) | - -------------------------------------------------------------------------------------------- -*/ - MetaHeader meta; - LightHeader() { - memset(this, 0, sizeof(LightHeader)); - meta.SetLayoutType(LayoutType::LIGHT); - meta.SetHeaderSize(sizeof(LightHeader) / sizeof(int64)); - } -}; - -struct NormalHeader { -/*_________________________________________________________________________________________________________________________ - | | | | | | | embedding | slot | - | number of | valueptr | header | each bit a V* | global step | freq counter | V* | V* | - | embedding | type | size | 1 valid | | | actually pointer | actually pointer |... 
- | columns | | | 0 no-valid | int64 | int64 | by alloctor | by alloctor | - | (8 bits) | (8 bits) | (8 bits) | (40 bits) | (8 bytes) | (8 bytes) | (8 bytes) | (8 bytes) | - -------------------------------------------------------------------------------------------------------------------------- - */ - MetaHeader meta; - int64 global_step; - int64 freq_counter; - - NormalHeader() { - memset(this, 0, sizeof(NormalHeader)); - meta.SetLayoutType(LayoutType::NORMAL); - meta.SetHeaderSize(sizeof(NormalHeader) / sizeof(int64)); - SetGlobalStep(-1); - } - - inline int64 GetGlobalStep() { - return global_step; - } - - inline void SetGlobalStep(int64 gs) { - global_step = gs; - } - - inline int64 GetFreqCounter() { - return freq_counter; - } - - inline void SetFreqCounter(int64 fc) { - freq_counter = fc; - } - - inline void AddFreq() { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + 1); - } - - inline void AddFreq(int64 count) { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + count); - } -}; - -struct FixedLengthHeader { -/*_________________________________________________________________________________ - | | | embeddings | - | slotflag + global step | freq counter | V | - | | | actually value | - | int64 | int64 | by alloctor | - | (8 bytes) | (8 bytes) | (4 * slot_num * emb_dim bytes) | - --------------------------------------------------------------------------------- -*/ - int64 global_step; - int64 freq_counter; - - FixedLengthHeader() { - memset(this, 0, sizeof(FixedLengthHeader)); - SetGlobalStep(-1); - } - - inline int64 GetGlobalStep() { - return global_step & 0x0000ffffffffffff; - } - - inline void SetGlobalStep(int64 gs) { - int64 temp = global_step; - temp &= 0xffff000000000000; - gs &= 0x0000ffffffffffff; - temp |= gs; - global_step = temp; - } - - inline void SetInitialized(int64 emb_index) { - int64 temp = 1; - temp = temp << (48 + emb_index); - global_step |= temp; - } - - inline int64 
GetFreqCounter() { - return freq_counter; - } - - inline void SetFreqCounter(int64 fc) { - freq_counter = fc; - } - - inline void AddFreq() { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + 1); - } - - inline void AddFreq(int64 count) { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + count); - } -}; -} // namespace - -template -class ValuePtr { - public: - virtual ~ValuePtr() {} - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) = 0; - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) = 0; - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) = 0; - - virtual void Destroy(Allocator* allocator) = 0; - - virtual void* GetPtr() const = 0; - - // Global Step - virtual int64 GetStep() { - LOG(FATAL) << "Unsupport GlobalStep in subclass of ValuePtrBase"; - return 0; - } - - virtual void SetStep(int64 gs) {} - - // Frequency Counter - virtual int64 GetFreq() { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - return 0; - } - - virtual void SetFreq(int64 freq) {} - - virtual void AddFreq() { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - } - - virtual void AddFreq(int64 count) { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - } - - virtual void SetValue(V val, size_t size) { - LOG(FATAL) << "Unsupport SetValue in subclass of ValuePtrBase"; - } - - virtual void SetInitialized(int64 emb_index) { - LOG(FATAL) << "Unsupport SetInitialized in subclass of ValuePtrBase"; - } - - virtual bool SetPtr(V* ptr) { - LOG(FATAL) << "Unsupport SetInitialized in subclass of ValuePtrBase"; - return false; - } - -}; - -template -class LooseValuePtr : public ValuePtr { - public: - virtual ~LooseValuePtr() {} - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const 
V* default_v, int emb_index, int offset) { - MetaHeader* meta = (MetaHeader*)ptr_; - unsigned int embnum = (unsigned int)meta->embed_num; - auto metadata = meta->GetColumnBitset(); - - if (!metadata.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - metadata = meta->GetColumnBitset(); - if (metadata.test(emb_index)) { - this->flag_.clear(std::memory_order_release); - return ((V**)((int64*)ptr_ + - (unsigned int)meta->header_size))[emb_index]; - } - embnum++ ; - int64 alloc_value_len = value_len; - V* tensor_val = (V*)allocator->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V) * alloc_value_len); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index] = tensor_val; - - metadata.set(emb_index); - // NOTE:if we use ((unsigned long*)((char*)ptr_ + 1))[0] = metadata.to_ulong(); - // the ptr_ will be occaionally modified from 0x7f18700912a0 to 0x700912a0 - // must use ((V**)ptr_ + 1 + 1)[emb_index] = tensor_val; to avoid - meta->SetColumnBitset(metadata, embnum); - this->flag_.clear(std::memory_order_release); - return tensor_val; - } else { - return ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index]; - } - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) { - return nullptr; - } - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) { - MetaHeader* meta = (MetaHeader*)ptr_; - auto metadata = meta->GetColumnBitset(); - if (metadata.test(emb_index)) { - return ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index]; - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - MetaHeader* meta = (MetaHeader*)ptr_; - unsigned int embnum = (unsigned int)meta->embed_num; - auto metadata = meta->GetColumnBitset(); - for (int i = 0; i< embnum; i++) { - if (metadata.test(i)) { - V* val = ((V**)((int64*)ptr_ + 
meta->GetHeaderSize()))[i]; - if (val != nullptr) { - allocator->DeallocateRaw(val); - } - } - } - } - - virtual void* GetPtr() const { - return ptr_; - } - - protected: - void* ptr_; - std::atomic_flag flag_ = ATOMIC_FLAG_INIT; -}; - -template -class LightValuePtr : public LooseValuePtr { - public: - LightValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*)malloc( - sizeof(LightHeader) + sizeof(int64) * size); - memset(static_cast(this->ptr_) + sizeof(LightHeader), 0, sizeof(int64) * size); - new ((char*)this->ptr_) LightHeader(); - } - - ~LightValuePtr() { - free(this->ptr_); - } -}; - -template -class NormalValuePtr : public LooseValuePtr { - public: - NormalValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*) malloc(sizeof(NormalHeader) + sizeof(int64) * size); - memset(static_cast(this->ptr_) + sizeof(NormalHeader), 0, sizeof(int64) * size); - new ((char*)this->ptr_) NormalHeader(); - } - - ~NormalValuePtr() { - free(this->ptr_); - } - - int64 GetStep() { - return ((NormalHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((NormalHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((NormalHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((NormalHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - return ((NormalHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - return ((NormalHeader*)this->ptr_)->AddFreq(count); - } -}; - -template -class NormalContiguousValuePtr : public LooseValuePtr { - public: - NormalContiguousValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = allocator->AllocateRaw(Allocator::kAllocatorAlignment, - sizeof(FixedLengthHeader) + sizeof(V) * size); - memset(static_cast(this->ptr_) + sizeof(FixedLengthHeader), 0, sizeof(V) * size); - new ((char*)this->ptr_) FixedLengthHeader(); - } - - ~NormalContiguousValuePtr() { - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, 
- const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset); - } - V* tensor_val = - ((V*)this->ptr_ + sizeof(FixedLengthHeader) / sizeof(V) + offset); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - return tensor_val; - } else { - return (V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset; - } - } - - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset); - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - allocator->DeallocateRaw(this->ptr_); - } - - int64 GetStep() { - return ((FixedLengthHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((FixedLengthHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((FixedLengthHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((FixedLengthHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - ((FixedLengthHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - ((FixedLengthHeader*)this->ptr_)->AddFreq(count); - } - - void SetValue(V val, size_t size) { - for (int i = 0; i < size; ++i) { - *((V*)this->ptr_ + sizeof(FixedLengthHeader) / sizeof(V) + i) = val; - } - } -}; - -template -class NormalGPUValuePtr : public LooseValuePtr { - public: - NormalGPUValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*) malloc(sizeof(FixedLengthHeader) + sizeof(V *)); - *(V**)((char *)this->ptr_ + 
sizeof(FixedLengthHeader)) = nullptr; - new ((char*)this->ptr_) FixedLengthHeader(); - } - - ~NormalGPUValuePtr() { - free(this->ptr_); - } - -#if GOOGLE_CUDA - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - V* tensor_val = - *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - cudaMemcpy(tensor_val, default_v, value_len * sizeof(V), - cudaMemcpyDeviceToDevice); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - } - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } -#endif // GOOGLE_CUDA - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, - bool &need_initialize) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - need_initialize = 1; - this->flag_.clear(std::memory_order_release); - return reinterpret_cast(this); - } - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - return; - } - - int64 GetStep() { - return 
((FixedLengthHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((FixedLengthHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((FixedLengthHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((FixedLengthHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - ((FixedLengthHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - ((FixedLengthHeader*)this->ptr_)->AddFreq(count); - } - - bool SetPtr(V* ptr) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - V* value_ptr = *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)); - if (value_ptr == nullptr) { - *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) = ptr; - this->flag_.clear(std::memory_order_release); - return true; - } else { - this->flag_.clear(std::memory_order_release); - return false; - } - } - - void SetInitialized(int64 emb_index) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - ((FixedLengthHeader*)this->ptr_)->SetInitialized(emb_index); - this->flag_.clear(std::memory_order_release); - } - -}; - -template -class CompactValuePtr : public ValuePtr { - public: - CompactValuePtr(Allocator* allocator, size_t size) { - memset(static_cast(this->ptr_), 0, sizeof(V) * size + sizeof(int64)); - } - - ~CompactValuePtr() { - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset); - } - V* tensor_val = - ((V*)this->ptr_ + sizeof(int64) / sizeof(V) + offset); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - return 
tensor_val; - } else { - return (V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset; - } - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) { - return nullptr; - } - - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset); - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - allocator->DeallocateRaw(this->ptr_); - } - - virtual void* GetPtr() const { - return (void*)ptr_; - } - - private: - char ptr_[23]; - std::atomic_flag flag_ = ATOMIC_FLAG_INIT; -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 115e3c4bae6..0c08c30c30a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -439,7 +439,8 @@ tf_cc_test( tf_cuda_cc_test( name = "embedding_variable_ops_test", - srcs = ["embedding_variable_ops_test.cc"], + srcs = ["embedding_variable_ops_test.cc", + "embedding_variable_test.h"], extra_copts = ["-fexceptions", "-g"], deps = [ ":io", @@ -6497,7 +6498,7 @@ tf_kernel_library( "training_ali_ops_gpu.h", "training_ali_ops.h" ], - copts = tf_copts(), + copts = tf_copts() + ["-g"], deps = [ ":bounds_check", ":training_op_helpers", diff --git a/tensorflow/core/kernels/embedding_variable_memory_test.cc b/tensorflow/core/kernels/embedding_variable_memory_test.cc index 7ec6b1cf109..393e9a9754b 100644 --- a/tensorflow/core/kernels/embedding_variable_memory_test.cc +++ b/tensorflow/core/kernels/embedding_variable_memory_test.cc @@ -19,17 +19,22 @@ namespace embedding { float PerfMemory(Tensor& default_value, const std::vector& id_list, int value_size, int64 default_value_dim, - int64 filter_freq = 0) { + int64 filter_freq = 0, int64 
steps_to_live = 0, + int64 record_freq = false) { auto ev = CreateEmbeddingVar(value_size, default_value, - default_value_dim, filter_freq); - ValuePtr* value_ptr = nullptr; + default_value_dim, filter_freq, + steps_to_live, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + record_freq); + void* value_ptr = nullptr; bool is_filter = false; double start_mem, end_mem; start_mem = getResident() * getpagesize(); for (int i = 0; i < id_list.size(); i++) { ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); if (is_filter) - ev->flat(value_ptr, id_list[i]); + ev->flat(value_ptr); } end_mem = getResident() * getpagesize(); double used_mb = (end_mem - start_mem)/1000000; @@ -58,7 +63,7 @@ TEST(EmbeddingVariabelMemoryTest, TestMemory) { float used_mb = PerfMemory(default_value, id_list, value_size, default_value_dim); float theoritical_mb = - 50 + num_of_ids * (32 + 32 + value_size * sizeof(float))/ 1000000; + 50 + num_of_ids * (value_size * sizeof(float)) / 1000000; EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && (used_mb < theoritical_mb * 1.01)); @@ -68,9 +73,10 @@ TEST(EmbeddingVariabelMemoryTest, TestMemory) { used_mb = PerfMemory(default_value, id_list, value_size, default_value_dim, filter_freq); theoritical_mb = - 50 + num_of_ids * (32 + 32 + 16 + value_size * sizeof(float)/2)/ 1000000; + 50 + num_of_ids * (8 + value_size * sizeof(float) / 2 + + 4/*memory for ids_list*/) / 1000000; EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && - (used_mb < theoritical_mb * 1.01)); + (used_mb < theoritical_mb * 1.02)); } } //namespace embedding } //namespace tensorflow diff --git a/tensorflow/core/kernels/embedding_variable_ops_test.cc b/tensorflow/core/kernels/embedding_variable_ops_test.cc index 4839c171708..e30381fef07 100644 --- a/tensorflow/core/kernels/embedding_variable_ops_test.cc +++ b/tensorflow/core/kernels/embedding_variable_ops_test.cc @@ -21,6 +21,7 @@ #include "tensorflow/core/framework/tensor.h" #include 
"tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/embedding_variable_test.h" #include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/io/path.h" @@ -48,18 +49,6 @@ namespace { const int THREADNUM = 16; const int64 max = 2147483647; -template -class TestableEmbeddingVar : public EmbeddingVar { - public: - TestableEmbeddingVar(const string& name, - embedding::Storage* storage, - EmbeddingConfig emb_cfg = EmbeddingConfig(), - Allocator* alloc = nullptr) : EmbeddingVar( - name, storage, emb_cfg, alloc) {} - - using EmbeddingVar::GetFilter; -}; - struct ProcMemory { long size; // total program size long resident; // resident set size @@ -123,11 +112,7 @@ TEST(EmbeddingVariableTest, TestEmptyEV) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); { - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); LOG(INFO) << "size:" << variable->Size(); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); @@ -191,19 +176,14 @@ TEST(EmbeddingVariableTest, TestEVExportSmallLockless) { int64 value_size = 8; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddigVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(0, 0, 1, 1, "", 5), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); for (int64 i = 0; i 
< 5; i++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); vflat(i) = 5.0; } @@ -269,20 +249,15 @@ TEST(EmbeddingVariableTest, TestEVExportLargeLockless) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(0, 0, 1, 1, "", 5), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); int64 ev_size = 10048576; for (int64 i = 0; i < ev_size; i++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); } LOG(INFO) << "size:" << variable->Size(); @@ -344,9 +319,9 @@ TEST(EmbeddingVariableTest, TestEVExportLargeLockless) { void multi_insertion(EmbeddingVar* variable, int64 value_size){ for (long j = 0; j < 5; j++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(j, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, j); + typename TTypes::Flat vflat = variable->flat(value_ptr); } } @@ -355,12 +330,7 @@ TEST(EmbeddingVariableTest, TestMultiInsertion) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); 
- auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); std::vector insert_threads(THREADNUM); for (size_t i = 0 ; i < THREADNUM; i++) { @@ -375,54 +345,45 @@ TEST(EmbeddingVariableTest, TestMultiInsertion) { void InsertAndLookup(EmbeddingVar* variable, int64 *keys, long ReadLoops, int value_size){ - float *default_value_fake = (float *)malloc((value_size)*sizeof(float)); - for (int j = 0; j < value_size; j++) { - default_value_fake[j] = -1.0; - } for (long j = 0; j < ReadLoops; j++) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - float *default_value = (float *)malloc((value_size)*sizeof(float)); - for (int k = 0; k < value_size; k++) { - default_value[k] = (float)keys[j]; - } - variable->LookupOrCreate(keys[j], val, default_value); - variable->LookupOrCreate(keys[j], val, default_value_fake); - ASSERT_EQ(default_value[0] , val[0]); - free(val); - free(default_value); + void* val = nullptr; + void* val_1 = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(keys[j], &val, &is_filter, false); + variable->LookupOrCreateKey(keys[j], &val_1, &is_filter, false); + ASSERT_EQ(val, val_1); } - free(default_value_fake); } void MultiBloomFilter(EmbeddingVar* var, int value_size, int64 i) { for (long j = 0; j < 1; j++) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - var->LookupOrCreate(i+1, val, nullptr); + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(i+1, &val, &is_filter, false); } } TEST(EmbeddingVariableTest, TestBloomFilter) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); - test::FillValues(&value, std::vector(value_size, 10.0)); - float* fill_v = (float*)malloc(value_size * sizeof(float)); - - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new 
EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, "normal", 10, 0.01), - cpu_allocator()); - - var->Init(value, 1); - - float *val = (float *)malloc((value_size+1)*sizeof(float)); - float *default_value = (float *)malloc((value_size+1)*sizeof(float)); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(2, val, default_value); + std::vector default_value = + {0.0 ,1.0 ,2.0 ,3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; + test::FillValues(&value, default_value); + + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01); + + //float *val = (float *)malloc((value_size+1)*sizeof(float)); + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(2, &val, &is_filter, false); std::vector keylist; std::vector valuelist; @@ -437,14 +398,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt64) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal", 10, 0.01, DT_UINT64), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT64); float *val = (float 
*)malloc((value_size+1)*sizeof(float)); @@ -509,14 +467,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt32) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal", 10, 0.01, DT_UINT32), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT32); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -581,14 +536,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt16) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal_contiguous", 10, 0.01, DT_UINT16), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT16); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -654,14 +606,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt8) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal_contiguous", 10, 0.01, DT_UINT8), cpu_allocator()); - - 
var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT8); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -725,12 +674,7 @@ TEST(EmbeddingVariableTest, TestInsertAndLookup) { int64 value_size = 128; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); int64 InsertLoops = 1000; bool* flag = (bool *)malloc(sizeof(bool)*max); @@ -765,8 +709,9 @@ TEST(EmbeddingVariableTest, TestInsertAndLookup) { } void MultiFilter(EmbeddingVar* variable, int value_size) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - variable->LookupOrCreate(20, val, nullptr); + bool is_filter = true; + void* val; + variable->LookupOrCreateKey(20, &val, &is_filter, false); } TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { @@ -774,14 +719,8 @@ TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 7), - cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1, 7, 5); + float *val = (float *)malloc((value_size+1)*sizeof(float)); int thread_num = 5; std::vector insert_threads(thread_num); @@ -792,20 +731,16 @@ TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { t.join(); } - 
ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; var->LookupOrCreateKey(20, &value_ptr); - ASSERT_EQ(value_ptr->GetFreq(), thread_num); + ASSERT_EQ(var->GetFreq(20), thread_num); } EmbeddingVar* InitEV_Lockless(int64 value_size) { Tensor value(DT_INT64, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); + auto variable = CreateEmbeddingVar(value_size, value, 1); - variable->Init(value, 1); return variable; } @@ -813,7 +748,7 @@ void MultiLookup(EmbeddingVar* variable, int64 InsertLoop, int thread_num, int i) { for (int64 j = i * InsertLoop/thread_num; j < (i+1)*InsertLoop/thread_num; j++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(j, &value_ptr); } } @@ -829,9 +764,9 @@ void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) { float* fill_v = (float*)malloc(value_size * sizeof(float)); for (int64 i = 0; i < InsertLoop; i++){ - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); } testing::StartTiming(); @@ -848,58 +783,6 @@ void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) { } -void hybrid_process(EmbeddingVar* variable, - int64* keys, int64 InsertLoop, int thread_num, - int64 i, int64 value_size) { - float *val = (float *)malloc(sizeof(float)*(value_size + 1)); - for (int64 j = i * InsertLoop/thread_num; - j < (i+1) * InsertLoop/thread_num; j++) { - variable->LookupOrCreate(keys[j], val, nullptr); - } -} - -void BM_HYBRID_LOCKLESS(int iters, int thread_num) { - testing::StopTiming(); - testing::UseRealTime(); - - int64 value_size = 128; - auto variable = InitEV_Lockless(value_size); - int64 
InsertLoop = 1000000; - - srand((unsigned)time(NULL)); - int64 *keys = (int64 *)malloc(sizeof(int64)*InsertLoop); - - for (int64 i = 0; i < InsertLoop; i++) { - keys[i] = rand() % 1000; - } - - testing::StartTiming(); - while (iters--) { - std::vector insert_threads(thread_num); - for (size_t i = 0 ; i < thread_num; i++) { - insert_threads[i] = std::thread(hybrid_process, - variable, keys, InsertLoop, thread_num, i, value_size); - } - for (auto &t : insert_threads) { - t.join(); - } - } -} - -BENCHMARK(BM_MULTIREAD_LOCKLESS) - ->Arg(1) - ->Arg(2) - ->Arg(4) - ->Arg(8) - ->Arg(16); - -BENCHMARK(BM_HYBRID_LOCKLESS) - ->Arg(1) - ->Arg(2) - ->Arg(4) - ->Arg(8) - ->Arg(16); - TEST(EmbeddingVariableTest, TestAllocate) { int value_len = 8; @@ -923,23 +806,13 @@ TEST(EmbeddingVariableTest, TestEVStorageType_DRAM) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(/*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */1, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); int64 ev_size = 100; for (int64 i = 0; i < ev_size; i++) { - variable->LookupOrCreate(i, fill_v, nullptr); + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(i, &val, &is_filter, false); } LOG(INFO) << "size:" << variable->Size(); @@ -947,59 +820,20 @@ TEST(EmbeddingVariableTest, TestEVStorageType_DRAM) { void t1(KVInterface* hashmap) { for (int i = 0; 
i< 100; ++i) { - hashmap->Insert(i, new NormalValuePtr(ev_allocator(), 100)); + hashmap->Insert(i, nullptr); } } TEST(EmbeddingVariableTest, TestRemoveLockless) { - KVInterface* hashmap = new LocklessHashMap(); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(t1, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} - -TEST(EmbeddingVariableTest, TestBatchCommitofDBKV) { - int64 value_size = 4; + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM, + false, false, {false, 0}); KVInterface* hashmap = - new LevelDBKV(testing::TmpDir()); - hashmap->SetTotalDims(value_size); - - for (int64 i = 0; i < 6; ++i) { - const ValuePtr* tmp = - new NormalContiguousValuePtr(ev_allocator(), value_size); - hashmap->Commit(i, tmp); - } - - for(int64 i = 0; i < 6; i++) { - ValuePtr* tmp = nullptr; - Status s = hashmap->Lookup(i, &tmp); - ASSERT_EQ(s.ok(), true); - } -} - -void InsertAndCommit(KVInterface* hashmap) { - for (int64 i = 0; i< 100; ++i) { - const ValuePtr* tmp = - new NormalContiguousValuePtr(ev_allocator(), 100); - hashmap->Insert(i, tmp); - hashmap->Commit(i, tmp); - } -} - -TEST(EmbeddingVariableTest, TestSizeDBKV) { - KVInterface* hashmap = - new LevelDBKV(testing::TmpDir()); - hashmap->SetTotalDims(100); + new LocklessHashMap(feat_desc); + feat_desc->InitSlotInfo(0, 100, {nullptr, 1}); ASSERT_EQ(hashmap->Size(), 0); LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(InsertAndCommit, hashmap); + auto t = std::thread(t1, hashmap); t.join(); LOG(INFO) << "hashmap size: " << hashmap->Size(); ASSERT_EQ(hashmap->Size(), 100); @@ -1190,213 +1024,6 @@ TEST(EmbeddingVariableTest, TestLFUCache) { } } -TEST(EmbeddingVariableTest, 
TestCacheRestore) { - setenv("TF_SSDHASH_ASYNC_COMPACTION", "false", 1); - int64 value_size = 4; - Tensor value(DT_FLOAT, TensorShape({value_size})); - test::FillValues(&value, std::vector(value_size, 9.0)); - float* fill_v = (float*)malloc(value_size * sizeof(float)); - std::vector size; - size.emplace_back(64); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal_contiguous", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage= embedding::StorageFactory::Create( - embedding::StorageConfig(embedding::DRAM_SSDHASH, - testing::TmpDir(), - size, "normal_contiguous", - emb_config), - cpu_allocator(), - "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, emb_config, cpu_allocator()); - variable->Init(value, 1); - variable->InitCache(CacheStrategy::LFU); - - Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); - - int64 ev_size = 7; - int64 cache_size = 3; - for (int64 i = 1; i < cache_size; i++) { - ValuePtr* value_ptr = nullptr; - variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); - value_ptr->AddFreq(2); - } - for (int64 i = cache_size; i < ev_size; i++) { - ValuePtr* value_ptr = nullptr; - variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); - value_ptr->AddFreq(1); - } - - LOG(INFO) << "size:" << variable->Size(); - - BundleWriter writer(Env::Default(), Prefix("foo")); - embedding::ShrinkArgs shrink_args; - shrink_args.global_step = 1; - variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); - TF_ASSERT_OK(writer.Finish()); - variable->Unref(); - - auto imported_storage= embedding::StorageFactory::Create( - 
embedding::StorageConfig(embedding::DRAM_SSDHASH, - testing::TmpDir(), - size, "normal_contiguous", - emb_config), - cpu_allocator(), - "EmbeddingVar1"); - auto imported_variable = new EmbeddingVar("EmbeddingVar1", - imported_storage, emb_config, cpu_allocator()); - imported_variable->Init(value, 1); - imported_variable->InitCache(CacheStrategy::LFU); - - BundleReader reader(Env::Default(), Prefix("foo")); - std::string name_string("var"); - imported_variable->Restore(name_string, Prefix("foo"), 0, 1, false, &reader, false); - - ASSERT_EQ(imported_storage->Size(0), ev_size - cache_size); - ASSERT_EQ(imported_storage->Size(1), 2); - delete imported_storage; -} - -void t1_gpu(KVInterface* hashmap) { - for (int i = 0; i< 100; ++i) { - hashmap->Insert(i, new NormalGPUValuePtr(ev_allocator(), 100)); - } -} - -#if GOOGLE_CUDA -TEST(EmbeddingVariableTest,TestRemoveLocklessCPU) { - SessionOptions sops; - std::unique_ptr device = - DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0"); - Allocator* gpu_allocator = GPUProcessState::singleton()->GetGPUAllocator( - GPUOptions(), TfGpuId(0), 1 << 26); - KVInterface* hashmap = - new LocklessHashMapCPU(gpu_allocator); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(t1, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} -#endif // GOOGLE_CUDA - -/*void CommitGPU(KVInterface* hashmap) { - for (int64 i = 0; i< 100; ++i) { - ValuePtr* tmp= new NormalGPUValuePtr(ev_allocator(), 100); - hashmap->Commit(i, tmp); - } -} - -TEST(EmbeddingVariableTest, TestCommitHashMapCPU) { - KVInterface* hashmap = new LocklessHashMapCPU(); - hashmap->SetTotalDims(100); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = 
std::thread(CommitGPU, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} - -TEST(EmbeddingVariableTest, TestGPUValuePtr) { - int ev_list_size = 32; - ValuePtr* ptr_ = new NormalGPUValuePtr(ev_allocator(), ev_list_size); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float host_data[ev_list_size]; - float initial_data[ev_list_size]; - for(int i = 0;i < ev_list_size;++i){ - initial_data[i] = 10; - } - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << initial_data[i]; - } - cudaMemcpy(address, initial_data, ev_list_size * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(host_data, address, ev_list_size * sizeof(float), cudaMemcpyDeviceToHost); - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << host_data[i]; - } -}//Forbidden, due to no gpu allocator at that time - -TEST(EmbeddingVariableTest, TestCommitValue) { - int ev_list_size = 32; - ValuePtr* ptr_ = new NormalGPUValuePtr(ev_allocator(),ev_list_size); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float initial_data[ev_list_size]; - for(int i = 0;i < ev_list_size;++i){ - initial_data[i] = 10; - } - cudaMemcpy(address, initial_data, ev_list_size * sizeof(float), cudaMemcpyHostToDevice); - KVInterface* hashmap = new LocklessHashMapCPU(); - hashmap->SetTotalDims(ev_list_size); - hashmap->Commit(1, ptr_); - ValuePtr* check; - hashmap->Lookup(1,&check); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - float* tmp = (float *)((char *)check->GetPtr() + sizeof(FixedLengthHeader)); - - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << tmp[i]; - //ASSERT_EQ(tmp[i], 10); - }// -} - -TEST(EmbeddingVariableTest, TestBatchCommitofLocklessHashMapCPU) { - KVInterface* hashmap = new 
LocklessHashMapCPU(); - const int EmbeddingSize = 16; - const int BatchSize = 16; - - hashmap->SetTotalDims(EmbeddingSize); - std::vector*> value_ptr_list; - std::vector key_list; - - for(int64 i = 0; i < BatchSize; i++) { - key_list.emplace_back(i); - ValuePtr* ptr_ = new NormalGPUValuePtr(EmbeddingSize); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float initial_data[EmbeddingSize]; - for(int j = 0;j < EmbeddingSize;++j){ - initial_data[j] = i; - //LOG(INFO) << "initial[" << i << "][" << j << "]=" << initial_data[j]; - } - cudaMemcpy(address, initial_data, EmbeddingSize * sizeof(float), cudaMemcpyHostToDevice); - value_ptr_list.emplace_back(ptr_); - }//initialize V on GPU - - timespec start,end; - clock_gettime(CLOCK_MONOTONIC, &start); - hashmap->BatchCommit(key_list, value_ptr_list); - clock_gettime(CLOCK_MONOTONIC, &end); - std::cout << "time: " << ((double)(end.tv_sec - start.tv_sec)*1000000000 + end.tv_nsec - start.tv_nsec)/1000000 << "ms" << std::endl; - - for(int64 i = 0; i < BatchSize; i++) { - ValuePtr* check; - hashmap->Lookup(i,&check); - float* tmp = (float *)((char *)check->GetPtr() + sizeof(FixedLengthHeader)); - for(int j = 0;j < EmbeddingSize;++j){ - LOG(INFO) << "batch[" << i << "][" << j << "]=" << tmp[j]; - //ASSERT_EQ(tmp[j], i); - } - }//compare value after BatchCommit -} -*/ - const int total_size = 1024 * 8; const int th_num = 1; const int malloc_size = total_size / th_num; @@ -1466,17 +1093,11 @@ TEST(EmbeddingVariableTest, TestCPUGPUMalloc) { auto mem_pool = new EmbeddingMemoryPool(gpu_allocator, 256, 1024); float* ptr_1 = mem_pool->Allocate(); float* ptr_2 = mem_pool->Allocate(); - ValuePtr* value_ptr1 = new NormalGPUValuePtr(gpu_allocator, 256); - ValuePtr* value_ptr2 = new NormalGPUValuePtr(gpu_allocator, 256); - value_ptr1->SetPtr(ptr_1); - value_ptr2->SetPtr(ptr_2); - value_ptr1->SetInitialized(0); - value_ptr2->SetInitialized(0); - std::vector*> value_ptrs; - 
value_ptrs.emplace_back(value_ptr1); + std::vector value_ptrs; + value_ptrs.emplace_back(ptr_1); mem_pool->Deallocate(value_ptrs); value_ptrs.clear(); - value_ptrs.emplace_back(value_ptr2); + value_ptrs.emplace_back(ptr_2); mem_pool->Deallocate(value_ptrs); float* ptr_3 = mem_pool->Allocate(); ASSERT_EQ(ptr_1, ptr_3); @@ -1539,16 +1160,16 @@ TEST(EmbeddingVariableTest, TestEVMallocFree) { void SingleCommit(KVInterface* hashmap, std::vector keys, int bias) { - std::vector*> value_ptrs; + std::vector value_ptrs; for (int64 i = 0; i < keys.size(); ++i) { - ValuePtr* tmp = - new NormalContiguousValuePtr(cpu_allocator(), 124); - tmp->SetValue(float(keys[i] + bias), 124); + void* tmp = cpu_allocator()->AllocateRaw(0, 124 * sizeof(float) + 16); + for (int j = 0; j < 124; j++) { + ((float*)tmp)[j] = keys[i] + bias; + } value_ptrs.push_back(tmp); } ASSERT_EQ(keys.size(), value_ptrs.size()); uint64 start = Env::Default()->NowNanos(); - for (int64 i = 0; i < keys.size(); i++) { hashmap->Commit(keys[i], value_ptrs[i]); } @@ -1558,9 +1179,13 @@ void SingleCommit(KVInterface* hashmap, void TestCompaction() { std::string temp_dir = testing::TmpDir(); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, + true, true, {false, 0}); auto hashmap = new SSDHashKV( - temp_dir, cpu_allocator()); - hashmap->SetTotalDims(124); + temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); ASSERT_EQ(hashmap->Size(), 0); std::vector ids; for (int i = 0; i < 262144; i++) { @@ -1576,12 +1201,12 @@ void TestCompaction() { t1.join(); ids.clear(); sleep(1); - ValuePtr* val = nullptr; + void* val = nullptr; for (int i = 131073; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i+3); + ASSERT_EQ(v[j], i+3); } } for (int i = 131073; i < 262144; i++) { @@ -1596,16 +1221,16 @@ void TestCompaction() { 
sleep(1); for (int i = 0; i < 131073; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i + 1); + ASSERT_EQ(v[j], i + 1); } } for (int i = 131073; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i + 2); + ASSERT_EQ(v[j], i + 2); } } delete hashmap; @@ -1622,10 +1247,14 @@ TEST(KVInterfaceTest, TestSSDKVSyncCompaction) { } void TestReadEmbFile() { + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, + true, true, {false, 0}); std::string temp_dir = testing::TmpDir(); auto hashmap = new SSDHashKV( - temp_dir, cpu_allocator()); - hashmap->SetTotalDims(124); + temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); ASSERT_EQ(hashmap->Size(), 0); std::vector ids; for (int i = 0; i < 262145; i++) { @@ -1634,12 +1263,12 @@ void TestReadEmbFile() { SingleCommit(hashmap, ids, 3); sleep(1); ids.clear(); - ValuePtr* val = nullptr; + void* val = nullptr; for (int i = 0; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i+3); + ASSERT_EQ(v[j], i+3); } } delete hashmap; @@ -1666,9 +1295,10 @@ TEST(KVInterfaceTest, TestDirectIoFile) { void InsertKey(EmbeddingVar* variable, int value_size) { float *val = (float *)malloc((value_size+1)*sizeof(float)); for (int64 i = 0; i < 100000000; i++) { - variable->LookupOrCreate(20, val, nullptr); + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(20, &val, &is_filter, false); } - LOG(INFO)<<"Finish Insert"; } void RemoveKey(EmbeddingVar* variable) { @@ -1676,29 +1306,13 @@ void RemoveKey(EmbeddingVar* variable) { sleep(1); variable->storage()->Remove(20); } - LOG(INFO)<<"Remove thread finish"; } 
TEST(EmbeddingVariableTest, TestLookupRemoveConcurrency) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */2, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - emb_config, - cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1); int thread_num = 5; std::vector insert_threads(thread_num); for (size_t i = 0 ; i < thread_num - 1; i++) { @@ -1714,21 +1328,7 @@ TEST(EmbeddingVariableTest, TestInsertAndGetSnapshot) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - emb_config, - cpu_allocator()); - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1); float* set_value = (float*)malloc(value_size * sizeof(float)); //Insertion for (int i = 0; i < 100; i++) { diff --git a/tensorflow/core/kernels/embedding_variable_performance_test.cc 
b/tensorflow/core/kernels/embedding_variable_performance_test.cc index 9b01e35840b..16f4a894858 100644 --- a/tensorflow/core/kernels/embedding_variable_performance_test.cc +++ b/tensorflow/core/kernels/embedding_variable_performance_test.cc @@ -90,14 +90,21 @@ void GenerateSkewInput(int num_of_ids, float skew_factor, void thread_lookup_or_create( EmbeddingVar* ev, const int64* input_batch, + float* default_value, + int default_value_dim, float** outputs, int value_size, int start, int end) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = start; i < end; i++) { ev->LookupOrCreateKey(input_batch[i], &value_ptr, &is_filter, false); - auto val = ev->flat(value_ptr, input_batch[i]); - memcpy(outputs[i], &val(0), sizeof(float) * value_size); + if (is_filter) { + auto val = ev->flat(value_ptr); + memcpy(outputs[i], &val(0), sizeof(float) * value_size); + } else { + int default_value_index = input_batch[i] % default_value_dim; + memcpy(outputs[i], default_value + default_value_index * value_size, sizeof(float) * value_size); + } } } @@ -138,6 +145,8 @@ double PerfLookupOrCreate( for (int i = 0; i < num_thread; i++) { worker_threads[i] = std::thread(thread_lookup_or_create, ev, input_batches[k].data(), + default_value_matrix.data(), + default_value_dim, outputs.data(), value_size, thread_task_range[i].first, thread_task_range[i].second); @@ -201,11 +210,11 @@ void thread_lookup( const int64* input_batch, float** outputs, int value_size, int start, int end) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = start; i < end; i++) { ev->LookupKey(input_batch[i], &value_ptr); - auto val = ev->flat(value_ptr, input_batch[i]); + auto val = ev->flat(value_ptr); memcpy(outputs[i], &val(0), sizeof(float) * value_size); } } @@ -293,7 +302,7 @@ TEST(EmbeddingVariablePerformanceTest, TestLookup) { } } auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim); - 
ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = 0; i < hot_ids_list.size(); i++) { ev->LookupOrCreateKey(hot_ids_list[i], &value_ptr, &is_filter, false); @@ -339,13 +348,13 @@ void PerfSave(Tensor& default_value, value_size, default_value, default_value_dim, 0, steps_to_live, l2_weight_threshold); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; srand((unsigned)time(NULL)); for (int i = 0; i < id_list.size(); i++) { ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); - ev->flat(value_ptr, id_list[i]); + ev->flat(value_ptr); int64 global_step = rand() % 100; ev->UpdateVersion(value_ptr, global_step); } diff --git a/tensorflow/core/kernels/embedding_variable_test.h b/tensorflow/core/kernels/embedding_variable_test.h index d06304fb78a..07c34764fb0 100644 --- a/tensorflow/core/kernels/embedding_variable_test.h +++ b/tensorflow/core/kernels/embedding_variable_test.h @@ -107,35 +107,42 @@ EmbeddingVar* CreateEmbeddingVar( int value_size, Tensor& default_value, int64 default_value_dim, int64 filter_freq = 0, int64 steps_to_live = 0, - float l2_weight_threshold=-1.0) { - std::string layout_type = "light"; - if (filter_freq != 0) { - layout_type = "normal"; - } - - if (steps_to_live != 0) { - if (layout_type == "light") { - layout_type = "normal_contiguous"; - } - } + float l2_weight_threshold=-1.0, + embedding::StorageType storage_type = embedding::StorageType::DRAM, + std::vector storage_size = {1024*1024*1024, + 1024*1024*1024, + 1024*1024*1024, + 1024*1024*1024}, + bool record_freq = false, + int64 max_element_size = 0, + float false_positive_probability = -1.0, + DataType counter_type = DT_UINT64) { auto embedding_config = EmbeddingConfig( - 0, 0, 1, 0, "emb_var", steps_to_live, - filter_freq, 999999, l2_weight_threshold, layout_type, - 0, -1.0, DT_UINT64, default_value_dim, - 0.0, false, false, false); + 0, 0, 1, 0, "emb_var", steps_to_live, + filter_freq, 999999, 
l2_weight_threshold, + max_element_size, false_positive_probability, + counter_type, default_value_dim, + 0.0, record_freq, false, false); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), storage_type, + record_freq, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( - embedding::StorageType::DRAM, "", - {1024, 1024, 1024, 1024}, layout_type, + storage_type, "", + storage_size, embedding_config), cpu_allocator(), + feat_desc, "emb_var"); auto ev = new EmbeddingVar( "emb_var", storage, embedding_config, - cpu_allocator()); + cpu_allocator(), + feat_desc); ev->Init(default_value, default_value_dim); return ev; } diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc index 55dd40176a8..2f07e2ef537 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc @@ -774,7 +774,7 @@ class GroupEmbeddingVariableForWardOpTest : public OpsTestBase { embedding_var->Init(value, 1); for (int64 j = 0; j < nnz; ++j) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); typename TTypes::Flat vflat = embedding_var->flat(value_ptr); @@ -958,7 +958,7 @@ class GroupEmbeddingVariableBackWardOpTest : public OpsTestBase { embedding_var->Init(value, 1); for (int64 j = 0; j < nnz; ++j) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); typename TTypes::Flat vflat = embedding_var->flat(value_ptr); diff --git a/tensorflow/core/kernels/incr_save_restore_ops.h b/tensorflow/core/kernels/incr_save_restore_ops.h index 0582697ad16..d84838ae413 100644 --- 
a/tensorflow/core/kernels/incr_save_restore_ops.h +++ b/tensorflow/core/kernels/incr_save_restore_ops.h @@ -225,9 +225,9 @@ class IncrEVValueDumpIterator : public DumpIterator { keys_idx_++; col_idx_ = 0; } - ValuePtr* value_ptr = NULL; + void* value_ptr = NULL; TF_CHECK_OK(emb_var_->LookupOrCreateKey(*keys_iter_, &value_ptr)); - return emb_var_->flat(value_ptr, *keys_iter_)(col_idx_++); + return emb_var_->flat(value_ptr)(col_idx_++); } private: diff --git a/tensorflow/core/kernels/kv_variable_lookup_ops.cc b/tensorflow/core/kernels/kv_variable_lookup_ops.cc index c69aec8ebb9..7e40dfff7ac 100644 --- a/tensorflow/core/kernels/kv_variable_lookup_ops.cc +++ b/tensorflow/core/kernels/kv_variable_lookup_ops.cc @@ -121,7 +121,7 @@ class KvResourceLookupIDOp : public OpKernel { const int64 indices_size = static_cast(indices_flat.dimension(0)); EmbeddingVarContext ev_ctx(c); ev->GetOrCreateKey(ev_ctx, indices, - reinterpret_cast**>(out_base), + reinterpret_cast(out_base), indices_size); } } @@ -203,7 +203,7 @@ class KvResourceCollectEmbeddingOp : public OpKernel { const size_t slice_bytes = slice_elems * sizeof(TValue); EmbeddingVarContext ev_ctx(c); ev->GatherEmbeddings(ev_ctx, indices, - (ValuePtr**)pointer.data(), + (void**)pointer.data(), out_base, N); } } diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc index 8a01a7bf2cd..5cd0ef140bd 100644 --- a/tensorflow/core/kernels/kv_variable_ops.cc +++ b/tensorflow/core/kernels/kv_variable_ops.cc @@ -214,16 +214,16 @@ class InitializeKvVariableOp : public OpKernel { int64 storage_type = 0; OP_REQUIRES_OK(c, c->GetAttr("storage_type", &storage_type)); storage_type_ = static_cast(storage_type); - auto device_type_str = c->device_type().type_string(); + device_type_str_ = c->device_type().type_string(); if (storage_type_ == embedding::DEFAULT) { - if (device_type_str == "CPU") { + if (device_type_str_ == "CPU") { storage_type_ = embedding::DRAM; } else { storage_type_ = 
embedding::HBM; } } - bool if_op_on_gpu = (device_type_str == "GPU"); + bool if_op_on_gpu = (device_type_str_ == "GPU"); bool if_embedding_on_hbm = (storage_type_ == embedding::HBM || storage_type_ == embedding::HBM_DRAM || storage_type_ == embedding::HBM_DRAM_SSDHASH); @@ -238,57 +238,14 @@ class InitializeKvVariableOp : public OpKernel { filter_freq_ = 0; } - OP_REQUIRES_OK(c, c->GetAttr("layout", &layout_)); - if (!layout_.empty()) { - // use layout by user configuration - } else if ((filter_freq_ != 0 && max_element_size_ == 0) - || steps_to_live_ != 0 || record_freq_ - || record_version_ || storage_type > 5) { - if (block_num_ > 1 || (filter_freq_ != 0 && storage_type <= 5)) { - layout_ = "normal"; - } else { - if (storage_type == embedding::HBM_DRAM || - storage_type == embedding::HBM_DRAM_SSDHASH) { - layout_ = "normal_contiguous_gpu"; - } else { - layout_ = "normal_contiguous"; - } - } - } else { - layout_ = "light"; - } - - CHECK(block_num_ == 1 || layout_ != "normal_contiguous"); - - if ("compact" == layout_) { - OP_REQUIRES(c, shape_.dim_size(0) == 1 && - storage_type_ == embedding::StorageType::DRAM, - errors::InvalidArgument("embedding_dim must be 1 and storage type" - " should be DRAM when layout is 'compact'.")); - } + record_freq_ |= (storage_type > 5); + record_version_ |= (storage_type > 5); OP_REQUIRES(c, steps_to_live_ >= 0, errors::InvalidArgument( "steps_to_live must >= 0, ", std::to_string(steps_to_live_))); OP_REQUIRES_OK(c, c->GetAttr("ht_type", &ht_type_)); - if (embedding::StorageType::LEVELDB == storage_type_) { - ht_type_ = "leveldb_kv"; - if (layout_ != "normal_contiguous") - LOG(WARNING) - << "layout must be NORAML_CONTIGUOUS when storage type is LEVELDB"; - layout_ = "normal_contiguous"; - } - - if (embedding::StorageType::PMEM_LIBPMEM == storage_type_ || - embedding::StorageType::PMEM_MEMKIND == storage_type_){ - if (layout_ != "normal_contiguous"){ - LOG(WARNING) - << "layout must be NORAML_CONTIGUOUS" - << " when storage type is 
PMEM_LIBPMEM or PMEM_MEMKIND"; - } - layout_ = "normal_contiguous"; - } OP_REQUIRES_OK(c, c->GetAttr("ht_partition_num", &ht_partition_num_)); } @@ -314,35 +271,43 @@ class InitializeKvVariableOp : public OpKernel { context, handle_self, &ev, [this, default_values, opname, context, handle_self](EmbeddingVar** ptr) { - Allocator* gpu_allocator = + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); auto embedding_config = EmbeddingConfig( emb_index_ + block_num_ * slot_index_, emb_index_, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, + l2_weight_threshold_, max_element_size_, false_positive_probability_, counter_type_, default_value_dim_, default_value_no_permission_, record_freq_, record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - gpu_allocator, + alloc_for_ev, + feat_desc, handle_self.name()); *ptr = new EmbeddingVar( handle_self.name(), storage, embedding_config, - gpu_allocator); - return Status::OK(); - })); - ev->Init(default_values, default_value_dim_); + alloc_for_ev, + feat_desc); + return (*ptr)->Init(default_values, default_value_dim_); + })); } else { EmbeddingVar* primary_variable = nullptr; OP_REQUIRES_OK( @@ -352,30 +317,38 @@ class InitializeKvVariableOp : public OpKernel { [this, default_values, opname, handle_primary, context](EmbeddingVar** ptr) { int64 primary_slot_index(0), primary_emb_index(0); - Allocator* gpu_allocator = context->device()->GetAllocator(AllocatorAttributes()); - //Allocator* 
gpu_allocator = context->get_allocator(AllocatorAttributes()); + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); auto embedding_config = EmbeddingConfig( primary_emb_index + block_num_ * primary_slot_index, primary_emb_index, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, + l2_weight_threshold_, max_element_size_, false_positive_probability_, counter_type_, 0, record_freq_, record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - gpu_allocator, + alloc_for_ev, + feat_desc, handle_primary.name()); *ptr = new EmbeddingVar( handle_primary.name(), storage, embedding_config, - gpu_allocator); + alloc_for_ev, + feat_desc); // default_values is slot value, should not to initialize primary value return Status::OK(); })); @@ -386,20 +359,26 @@ class InitializeKvVariableOp : public OpKernel { context, handle_self, &ev, [this, default_values, opname, primary_variable, handle_self, context](EmbeddingVar** ptr) { + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, + emb_index_, + block_num_, slot_num_, opname, + steps_to_live_, filter_freq_, + max_freq_, l2_weight_threshold_, + max_element_size_, + false_positive_probability_, + counter_type_, default_value_dim_, + default_value_no_permission_, + record_freq_, record_version_, + is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; *ptr = new EmbeddingVar(handle_self.name(), primary_variable->storage(), - EmbeddingConfig(emb_index_ + block_num_ * slot_index_, - emb_index_, - block_num_, slot_num_, opname, - steps_to_live_, filter_freq_, - max_freq_, l2_weight_threshold_, - layout_, max_element_size_, - false_positive_probability_, - counter_type_, default_value_dim_, - default_value_no_permission_, - record_freq_, record_version_, - is_inference_), - primary_variable->GetAllocator()); + embedding_config, + alloc_for_ev, + primary_variable->feature_descriptor()); return (*ptr)->Init(default_values, default_value_dim_); })); core::ScopedUnref unref_me(primary_variable); @@ -424,7 +403,6 @@ class InitializeKvVariableOp : public OpKernel { int64 filter_freq_; int64 max_freq_; float l2_weight_threshold_; - std::string layout_; int64 max_element_size_; float false_positive_probability_; embedding::StorageType storage_type_; @@ -436,6 +414,7 @@ class InitializeKvVariableOp : public OpKernel { bool record_version_; bool is_inference_; bool is_set_initialized_; + std::string device_type_str_; }; #define REGISTER_KERNELS(ktype, vtype) \ diff --git a/tensorflow/core/kernels/kv_variable_ops.h b/tensorflow/core/kernels/kv_variable_ops.h index 8e3572443ba..3202e6d12bf 100644 --- a/tensorflow/core/kernels/kv_variable_ops.h +++ b/tensorflow/core/kernels/kv_variable_ops.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/embedding/cache_factory.h" #include "tensorflow/core/framework/embedding/embedding_var.h" #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index 23a504eea5d..3b10c2521b9 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -120,20 +120,6 @@ class KvResourceImportV2Op: public AsyncOpKernel { OP_REQUIRES_OK(c, c->GetAttr("record_version", &record_version_)); OP_REQUIRES_OK(c, c->GetAttr("reset_version", &reset_version_)); - if ((filter_freq_ != 0 && max_element_size_ == 0) - || steps_to_live_ != -1 || record_freq_ - || record_version_ || storage_type > 5) { - if (block_num_ > 1 || (filter_freq_ != 0 && storage_type <= 5)) { - layout_ = "normal"; - } else { - layout_ = "normal_contiguous"; - } - } else { - layout_ = "light"; - } - - CHECK(block_num_ == 1 || layout_ != "normal_contiguous"); - TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_EV_ASYNC_RESTORE", true, &ev_async_restore_)); } @@ -170,24 +156,33 @@ class KvResourceImportV2Op: public AsyncOpKernel { block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, l2_weight_threshold_, - layout_, max_element_size_, + max_element_size_, false_positive_probability_, counter_type_, default_value_dim_, default_value_no_permission_, record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - allocator, + alloc_for_ev, + feat_desc, handle_self.name()); *ptr = new EmbeddingVar( handle_self.name(), storage, embedding_config, - allocator); + alloc_for_ev, + feat_desc); return Status::OK(); })); ev->Init(default_values, default_value_dim_); @@ -207,19 +202,27 @@ class KvResourceImportV2Op: public AsyncOpKernel { primary_emb_index, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, l2_weight_threshold_, - layout_, max_element_size_, + max_element_size_, false_positive_probability_, counter_type_, 0, record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - allocator, + alloc_for_ev, + feat_desc, handle_primary.name()); *ptr = new EmbeddingVar(handle_primary.name(), - storage, embedding_config, allocator); + storage, embedding_config, alloc_for_ev, feat_desc); // default_values is slot value, should not to initialize primary value return Status::OK(); })); @@ -232,17 +235,22 @@ class KvResourceImportV2Op: public AsyncOpKernel { handle_self, context](EmbeddingVar** ptr) { Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, + emb_index_, block_num_, slot_num_, opname, + steps_to_live_, filter_freq_, max_freq_, + l2_weight_threshold_, max_element_size_, + false_positive_probability_, + counter_type_, default_value_dim_, + default_value_no_permission_, + record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; *ptr = new EmbeddingVar(handle_self.name(), primary_variable->storage(), - EmbeddingConfig(emb_index_ + block_num_ * slot_index_, - emb_index_, block_num_, slot_num_, opname, - steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, max_element_size_, - false_positive_probability_, - counter_type_, default_value_dim_, - default_value_no_permission_, - record_freq_, record_version_), - allocator); + embedding_config, + alloc_for_ev, + primary_variable->feature_descriptor()); return (*ptr)->Init(default_values, default_value_dim_); })); core::ScopedUnref unref_me(primary_variable); @@ -290,7 +298,6 @@ class KvResourceImportV2Op: public AsyncOpKernel { int64 slot_num_; int64 filter_freq_; float l2_weight_threshold_; - std::string layout_; int64 max_freq_; embedding::StorageType storage_type_; std::string storage_path_; @@ -301,6 +308,7 @@ class KvResourceImportV2Op: public AsyncOpKernel { bool record_version_; bool reset_version_; bool ev_async_restore_; + std::string device_type_str_; }; #define REGISTER_KERNELS(dev, ktype, vtype) \ diff --git a/tensorflow/core/kernels/save_restore_tensor.h b/tensorflow/core/kernels/save_restore_tensor.h index 4f69ebe3fb5..da58e17e1bb 100644 --- a/tensorflow/core/kernels/save_restore_tensor.h +++ b/tensorflow/core/kernels/save_restore_tensor.h @@ -23,7 +23,6 @@ limitations under the License. 
#include "tensorflow/core/framework/hash_table/hash_table.h" #include "tensorflow/core/framework/hash_table/bloom_filter_strategy.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/training_ali_op_helpers.h b/tensorflow/core/kernels/training_ali_op_helpers.h index e013a6a2bae..12948de24a4 100644 --- a/tensorflow/core/kernels/training_ali_op_helpers.h +++ b/tensorflow/core/kernels/training_ali_op_helpers.h @@ -121,55 +121,54 @@ EmbeddingVariableInputLockHolder MaybeLockEmbeddingVariableInputMutexesInO template void LookupKeyAndSetVersion( OpKernelContext* ctx, EmbeddingVar* var, - ValuePtr** value_ptrs, Tstep gs, const K* indices, + void** value_ptrs, Tstep gs, const K* indices, int64 task_size, bool indices_as_pointer, int counts_index) { + EmbeddingVarContext ev_ctx(ctx); int64* indices_counts = nullptr; std::function get_count_fn = 0; if (counts_index != -1) { const Tensor& counts_tensor = ctx->input(counts_index); indices_counts = (int64*)counts_tensor.data(); - get_count_fn = [](int64* counts, int64 index) { - return counts[index];}; - } else { - get_count_fn = [](int64* counts, int64 index) {return 1;}; } + var->LookupOrCreateKey(ev_ctx, indices, value_ptrs, + task_size, indices_counts, + indices_as_pointer); - auto lookup_key_and_set_version_fn = [var, value_ptrs, gs, - indices, indices_as_pointer, - indices_counts, get_count_fn] (int64 start, int64 limit) { - ValuePtr* value_ptr = nullptr; + auto update_version_fn = [var, value_ptrs, gs] + (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - bool is_filter = false; - int64 count = get_count_fn(indices_counts, i); - var->LookupOrCreateKey(indices[i], &value_ptr, - &is_filter, indices_as_pointer, count); - value_ptrs[i] = value_ptr; - var->UpdateVersion(value_ptr, gs); + var->UpdateVersion(value_ptrs[i], gs); } }; const int64 unit_cost = 1000; //very unreliable 
estimate for cost per step. auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); Shard(worker_threads->num_threads, worker_threads->workers, task_size, unit_cost, - lookup_key_and_set_version_fn); + update_version_fn); } template -void LookupOrCreateEmbedding( +void LookupEmbedding( OpKernelContext* ctx, std::vector*, V**>>& vars, - ValuePtr** value_ptrs, + void** value_ptrs, const K* indices, - int64 num_of_keys, - IntraThreadCopyIdAllocator* thread_copy_id_alloc) { + int64 num_of_keys) { for (auto it: vars) { EmbeddingVar* var = it.first; V** var_ptr = it.second; - EmbeddingVarContext ev_ctx(ctx); - var->BatchLookupOrCreateEmb( - ev_ctx, var_ptr, value_ptrs, - indices, num_of_keys, thread_copy_id_alloc); + auto lookup_emb_fn = [var, var_ptr, value_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + var_ptr[i] = var->GetValuePtr(value_ptrs[i]); + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + lookup_emb_fn); } } @@ -180,12 +179,12 @@ void GetEmbeddingPointers( const K* indices, Tstep gs, bool indices_as_pointer, int counts_index, int64 num_of_keys, IntraThreadCopyIdAllocator* thread_copy_id_alloc) { - std::vector*> value_ptrs(num_of_keys); + std::vector value_ptrs(num_of_keys); LookupKeyAndSetVersion(ctx, vars[0].first, value_ptrs.data(), gs, indices, num_of_keys, indices_as_pointer, counts_index); - LookupOrCreateEmbedding(ctx, vars, value_ptrs.data(), - indices, num_of_keys, thread_copy_id_alloc); + LookupEmbedding(ctx, vars, value_ptrs.data(), + indices, num_of_keys); } } // end namespace tensorflow diff --git a/tensorflow/core/kernels/training_ali_ops.cc b/tensorflow/core/kernels/training_ali_ops.cc index 839ce82feef..546b30e29dd 100644 --- a/tensorflow/core/kernels/training_ali_ops.cc +++ 
b/tensorflow/core/kernels/training_ali_ops.cc @@ -141,16 +141,16 @@ class KvSparseApplyAdagradOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const TKey index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto a = accum->flat(value_ptr, index); + auto a = accum->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); a += g.square(); v -= g.constant(lr_scalar) * g * a.rsqrt(); } @@ -542,15 +542,15 @@ class KvSparseApplyFtrlOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const TKey index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var_->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); if (is_filter) { - auto var = var_->flat(value_ptr, index); - auto accum = accum_->flat(value_ptr, index); - auto linear = linear_->flat(value_ptr, index); + auto var = var_->flat(value_ptr); + auto accum = accum_->flat(value_ptr); + auto linear = linear_->flat(value_ptr); auto grad = grad_flat.template chip<0>(i); // Use a macro to implement the computation here due to the templating of the @@ -1301,19 +1301,19 @@ class KvSparseApplyAdagradDecayOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, 
indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto a = accum->flat(value_ptr, index); + auto a = accum->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); - auto accum_decay_power = accum_decay_power_var->flat(value_ptr, index); + auto v = var->flat(value_ptr); + auto accum_decay_power = accum_decay_power_var->flat(value_ptr); if (gs / decay_step_scalar > accum_decay_power(0)) { a *= a.constant(decay_rate_scalar); @@ -1505,19 +1505,18 @@ class KvSparseApplyAdamOp : public OpKernel { auto indices_vec = indices.vec(); int64 gs = global_step.scalar()(); - for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter =false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto var_i = var->flat(value_ptr, index); - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); m_a += (g - m_a) * (static_cast(1) - beta1_scalar); @@ -2412,15 +2411,15 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { Tstep gs = global_step.scalar()(); for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto v_ = v->flat(value_ptr, index); - auto m_ = m->flat(value_ptr, index); + auto v_ = v->flat(value_ptr); + auto m_ = m->flat(value_ptr); auto grad_ = grad_flat.template chip<0>(i); v_ = 
v_ * v_.constant(beta2_scalar) + @@ -2429,7 +2428,7 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { (v_ + v_.constant(epsilon_scalar)).rsqrt() * v_.constant(lr_scalar) * grad_; - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); v -= m_; } } @@ -2461,17 +2460,17 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto var_i = var->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); m_a = m_a * beta1_scalar + g * (static_cast(1) - beta1_scalar); v_a = v_a * beta2_scalar + g.square() * (static_cast(1) - beta2_scalar); @@ -2939,7 +2938,7 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, @@ -2947,7 +2946,7 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { var->UpdateVersion(value_ptr, gs); if (is_filter) { auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); v -= g.constant(lr_scalar) * g; } } @@ -3136,16 +3135,16 @@ class KvSparseApplyAdamWOp : public OpKernel { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* 
value_ptr = nullptr; bool is_filter =false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto var_i = var->flat(value_ptr, index); - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); // m_a = beta1 * m + (1 - beta1) * g m_a += (g - m_a) * (static_cast(1) - beta1_scalar); diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 2a56634206c..e89b095aff1 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -6132,6 +6132,8 @@ class GraphKeys(object): TRAINABLE_VARIABLES = "trainable_variables" # Indicate EmbeddingVariable in CollectionDef EMBEDDING_VARIABLES = "embedding_variables" + # Collection for dependencies of EmbeddingVariable's restore op + EMBEDDING_VARIABLE_RESTORE_DEPENDENCY = "embedding_variable_restore_dependency" # Key to collect summaries. SUMMARIES = "summaries" # Key to collect QueueRunners. 
diff --git a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py index 240938e8675..d47d94d0d99 100644 --- a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py @@ -47,69 +47,6 @@ class EmbeddingVariableGpuTest(test_util.TensorFlowTestCase): - def testDynamicDimensionEmbeddingVariable(self): - print("testDynamicDimensionEmbeddingVariable") - with ops.device('/gpu:0'): - def runTestAdagrad(self, var, g): - if isinstance(var, kv_variable_ops.EmbeddingVariable): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - else: - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64), blocknums=[2,2,2,2,2,2]) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad.AdagradOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session(graph=g) as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.device('/gpu:0'), ops.Graph().as_default() as g: - emb_var = variable_scope.get_embedding_variable("var_1", - initializer=init_ops.ones_initializer(dtypes.float32), - embedding_dim = 8, - ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM)), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - emb1 = runTestAdagrad(self, emb_var, g) - with 
ops.device('/gpu:0'), ops.Graph().as_default() as g: - var = variable_scope.get_dynamic_dimension_embedding_variable("var_dist", - embedding_block_dimension=4, - embedding_block_num=2, - storage_type=config_pb2.StorageType.HBM, - initializer=init_ops.ones_initializer(dtypes.float32)) - emb2 = runTestAdagrad(self, var, g) - for i in range(0, 6): - for j in range(0, 8): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - def testDynamicEmbeddingVariableForInitFromProto(self): - print("testDynamicEmbeddingVariableForInitFromProto") - with ops.device('/gpu:0'): - embedding = variable_scope.get_dynamic_dimension_embedding_variable("var_dist", - embedding_block_dimension=4, - embedding_block_num=2, - storage_type=config_pb2.StorageType.HBM, - initializer=init_ops.ones_initializer(dtypes.float32)) - emb = embedding_ops.embedding_lookup(embedding, math_ops.cast([0,1,2,5,6,7], dtypes.int64), blocknums=[2,2,2,2,2,2]) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - graph = ops.get_default_graph() - meta_graph_def = saver_module.export_meta_graph() - ops.reset_default_graph() - with self.test_session() as sess: - res = saver_module.import_meta_graph(meta_graph_def) - def testEmbeddingVariableForInitFromProto(self): print("testEmbeddingVariableForInitFromProto") with ops.device('/gpu:0'): @@ -235,43 +172,6 @@ def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): print(sess.run([emb, train_op,loss])) print(sess.run([emb, train_op,loss])) - def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): - print("testEmbeddingVariableForFeatureFilterFromContribFeatureColumn") - columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, - ev_option = 
variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) - with ops.device("/gpu:0"): - W = feature_column.embedding_column(sparse_id_column=columns, - dimension=3, - initializer=init_ops.ones_initializer(dtypes.float32)) - ids={} - ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) - emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) - - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - - opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - emb1, top, l = sess.run([emb, train_op, loss]) - for val1 in emb1.tolist(): - for val in val1: - self.assertEqual(val, .0) - emb1, top, l = sess.run([emb, train_op, loss]) - for index, val1 in enumerate(emb1.tolist()): - if index < 7: - for val in val1: - self.assertNotEqual(val, 1.0) - else: - for val in val1: - self.assertEqual(val, .0) - def testEmbeddingVariableForSparseColumnEmbeddingCol(self): columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM))) @@ -870,6 +770,66 @@ def testSaveV3(self): result = sess.run([emb1]) print(result) + def testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm(self): + print("testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm") + checkpoint_directory = self.get_temp_dir() + 
with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + fun = math_ops.multiply(emb, 1.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver(sharded=True) + init = variables.global_variables_initializer() + graph = ops.get_default_graph() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + sess.run(train_op) + emb_ori = sess.run(emb) + save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + + with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + fun = math_ops.multiply(emb, 1.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + emb_val = sess.run(emb) + self.assertAllEqual(emb_ori, emb_val) + 
save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + if "Adagrad-values" in name: + value = checkpoint_utils.load_variable(checkpoint_directory, name) + for i in range(0, shape[0]): + for j in range(0, shape[1]): + self.assertAlmostEqual(1.1, value[i][j]) + def testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm(self): print("testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm") checkpoint_directory = self.get_temp_dir() @@ -894,8 +854,8 @@ def testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm(self): emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 0.0, name='multiply') - fun1 = math_ops.multiply(emb2, 0.0, name='multiply_1') + fun = math_ops.multiply(emb, 1.0, name='multiply') + fun1 = math_ops.multiply(emb2, 1.0, name='multiply_1') loss = math_ops.reduce_sum(fun + fun1, name='reduce_sum') gs = training_util.get_or_create_global_step() opt = adagrad.AdagradOptimizer(0.1) diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index c6cdf951a1e..81b315e2e43 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -120,7 +120,7 @@ def _CounterFilterTestTemplate(self, optimizer): initializer=init_ops.ones_initializer(dtypes.float32), ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) - emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1], dtypes.int64)) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64)) fun = math_ops.multiply(emb, 2.0, name='multiply') loss = math_ops.reduce_sum(fun, name='reduce_sum') gs = training_util.get_or_create_global_step() @@ 
-133,11 +133,18 @@ def _CounterFilterTestTemplate(self, optimizer): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertEqual(val, .0) + + for val1 in emb1.tolist(): + for val in val1: + self.assertEqual(val, .0) emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertNotEqual(val, 1.0) + for index, val1 in enumerate(emb1.tolist()): + if index < 7: + for val in val1: + self.assertNotEqual(val, 1.0) + else: + for val in val1: + self.assertEqual(val, .0) def _RecordFreqTestTemplate(self, optimizer): checkpoint_directory = self.get_temp_dir() @@ -720,20 +727,11 @@ def testEmbeddingVariableForL2FeatureEviction(self): sess.run([init]) emb_ori = sess.run([emb, train_op]) save_path = saver.save(sess, os.path.join(checkpoint_directory, "model1.ckpt"), global_step=12345) - #for name, shape in checkpoint_utils.list_variables(checkpoint_directory): - # print('loading... 
', name, shape) - with self.test_session() as sess: - saver.restore(sess, os.path.join(checkpoint_directory, "model1.ckpt-12345")) - emb_right = [[0.8282884, 0.8282884, 0.8282884], - [0.8282884, 0.8282884, 0.8282884], - [0.8282884, 0.8282884, 0.8282884], - [0.7927219, 0.7927219, 0.7927219], - [0.7927219, 0.7927219, 0.7927219], - [1.0, 1.0, 1.0]] - emb_ori = sess.run(emb) - for i in range(6): - for j in range(3): - self.assertAlmostEqual(emb_ori[i][j], emb_right[i][j]) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + if name == "var_1-keys": + self.assertEqual(shape[0], 2) + keys = checkpoint_utils.load_variable(checkpoint_directory, name) + self.assertAllEqual(keys, [0, 1]) def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): columns_list=[] @@ -764,14 +762,15 @@ def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): print("testEmbeddingVariableForFeatureFilterFromContribFeatureColumn") - columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) - W = feature_column.embedding_column(sparse_id_column=columns, - dimension=3, - initializer=init_ops.ones_initializer(dtypes.float32)) - ids={} - ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) - emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) + with ops.device("/cpu:0"): + columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) + W = feature_column.embedding_column(sparse_id_column=columns, + dimension=3, + 
initializer=init_ops.ones_initializer(dtypes.float32)) + ids={} + ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) + emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) fun = math_ops.multiply(emb, 2.0, name='multiply') loss = math_ops.reduce_sum(fun, name='reduce_sum') @@ -786,6 +785,7 @@ def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1, top, l = sess.run([emb, train_op, loss]) + for val1 in emb1.tolist(): for val in val1: self.assertEqual(val, .0) @@ -1328,66 +1328,6 @@ def testEmbeddingVariableForHTPartitionNum(self): print(sess.run([emb, train_op,loss])) print(sess.run([emb, train_op,loss])) - def testEmbeddingVariableForLayout(self): - print("testEmbeddingVariableForLayout") - def runTestAdagrad(self, var, g): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad.AdagradOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session(graph=g) as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - 
initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagrad(self, emb_var, g) - emb2 = runTestAdagrad(self, var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1), - steps_to_live=5) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagrad(self, emb_var, g) - emb2 = runTestAdagrad(self, var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1), - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=5))) - emb1 = runTestAdagrad(self, emb_var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], .0) - def testEVInitializerWithKeyFetch(self): print("testEVInitializerWithKeyFetch") with ops.Graph().as_default() as g, ops.device('/cpu:0'): @@ -2391,7 +2331,7 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): "model1.ckpt") with self.test_session() as sess: sess.run([init]) - sess.run([emb, train_op]) + sess.run([train_op]) save_path = saver.save(sess, model_path) for name, shape in checkpoint_utils.list_variables(model_path): if name == 
"var_1-keys": @@ -2403,6 +2343,37 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): name == "var_1-freqs_filtered": self.assertEqual(0, shape[0]) del os.environ["TF_EV_SAVE_FILTERED_FEATURES"] + + def testEmbeddingVariableForSaveUnfilterFeature(self): + checkpoint_directory = self.get_temp_dir() + with ops.device("/cpu:0"): + emb_var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) + emb = embedding_ops.embedding_lookup(emb_var, math_ops.cast([1, 1, 1, 2, 2, 3], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + model_path = os.path.join(checkpoint_directory, + "model1.ckpt") + with self.test_session() as sess: + sess.run([init]) + sess.run([train_op]) + save_path = saver.save(sess, model_path) + for name, shape in checkpoint_utils.list_variables(model_path): + if name == "var_1-keys": + keys = checkpoint_utils.load_variable(model_path, name) + self.assertEqual(1, len(keys)) + self.assertEqual(1, keys[0]) + if name == "var_1-keys_filtered" or \ + name == "var_1-freqs_filtered": + self.assertEqual(2, shape[0]) def testEmbeddingVariableForMultiTierInference(self): print("testEmbeddingVariableForMultiTierInference") @@ -2716,7 +2687,55 @@ def testCPUFbjOpt(self): def testCPUFbjOptWithCounterFilter(self): print("testCPUFbjOpt") os.environ["TF_EMBEDDING_FBJ_OPT"] = "True" - self._CounterFilterTestTemplate("Adagrad") + with ops.device("/cpu:0"): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + 
initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = self._CreateOptimizer("Adagrad") + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + emb1, top, l = sess.run([emb, train_op, loss]) + emb_list = emb1.tolist() + emb_right = [[.0, .0, .0], + [.0, .0, .0], + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + [.0, .0, .0], + [.0, .0, .0], + [1.0, 1.0, 1.0], + [.0, .0, .0], + [.0, .0, .0], + [.0, .0, .0]] + + for i in range(6): + for j in range(3): + self.assertAlmostEqual(emb_list[i][j], emb_right[i][j]) + + emb1= sess.run(emb) + emb_right = [[0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90122706, 0.90122706, 0.90122706], + [0.90122706, 0.90122706, 0.90122706], + [0.90122706, 0.90122706, 0.90122706], + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + [.0, .0, .0]] + for i in range(6): + for j in range(3): + self.assertAlmostEqual(emb1[i][j], emb_right[i][j]) del os.environ["TF_EMBEDDING_FBJ_OPT"] def testCPUFbjOptWithBloomFilter(self): diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 96329ca345b..1ef9550ef6d 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -373,6 +373,8 @@ def 
_init_from_args(self, self._slot_num = 0 else: self._slot_num = evconfig.slot_num + if self._is_primary: + self._import_dependency_ops = [] with ops.name_scope("IsInitialized"): self._is_initialized_op = ( gen_kv_variable_ops.kv_var_is_initialized_op(self._handle, @@ -488,6 +490,7 @@ def create_init_op_for_restore(self, name, initial_value, invalid_key, rank): set_attr_ops.append(set_cache_op) with ops.control_dependencies(set_attr_ops + [self._initializer_for_restore]): self._init_op_for_restore = control_flow_ops.no_op() + self.collect_restore_denpendencies() def need_counts(self): return (self._record_freq or (self._filter_freq > 0) or self._is_multi_tier) @@ -612,8 +615,19 @@ def _init_from_proto(self, variable_def, import_scope=None): else: self._is_primary = False + self.collect_restore_denpendencies() # LINT.ThenChange(//tensorflow/python/eager/graph_callable.py) + def collect_restore_denpendencies(self): + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY) + if len(restore_dependency) == 0: + ops.add_to_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY, {}) + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY) + dependency_dict = restore_dependency[0] + if not dependency_dict.__contains__(self._primary_handle): + dependency_dict[self._primary_handle] = [] + dependency_dict[self._primary_handle].append(self._init_op_for_restore) + def set_init_data_source_initializer(self, init_data_source): import pkgutil try: diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py index 0d8bfe87022..650b1a5e272 100644 --- a/tensorflow/python/training/saving/saveable_object_util.py +++ b/tensorflow/python/training/saving/saveable_object_util.py @@ -195,7 +195,8 @@ def restore(self, restored_tensors, unused_restored_shapes): if self.var._init_data_source is not None: return 
self.var.recover_from_init_data_source(self.var._init_data_source, self.partition_id, self.partition_num) else: - with ops.control_dependencies([self.var._init_op_for_restore]): + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY)[0] + with ops.control_dependencies(restore_dependency[self.var._primary_handle]): rank = self.op.initial_value.get_shape().rank - 1 restore_op = gen_kv_variable_ops.kv_resource_import_v3( restored_tensors[0],