diff --git a/tensorflow/core/framework/embedding/bloom_filter_policy.h b/tensorflow/core/framework/embedding/bloom_filter_policy.h index 29b85e5bb4e..781511578af 100644 --- a/tensorflow/core/framework/embedding/bloom_filter_policy.h +++ b/tensorflow/core/framework/embedding/bloom_filter_policy.h @@ -35,9 +35,10 @@ class BloomFilterPolicy : public FilterPolicy { using FilterPolicy::config_; public: - BloomFilterPolicy(const EmbeddingConfig& config, EV* ev) : - FilterPolicy(config, ev) { - + BloomFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), + FilterPolicy(config, ev) { switch (config_.counter_type){ case DT_UINT64: VLOG(2) << "The type of bloom counter is uint64"; @@ -64,10 +65,10 @@ class BloomFilterPolicy : public FilterPolicy { Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); if (s.ok()) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); @@ -81,17 +82,17 @@ class BloomFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; if (value_ptr != nullptr) { embedding_ptr[i] = - 
ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission; } @@ -109,13 +110,13 @@ class BloomFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> lookup_or_create_ids(num_worker_threads); std::vector> lookup_or_create_cursor(num_worker_threads); - std::vector*>> + std::vector> lookup_or_create_ptrs(num_worker_threads); IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); std::vector> @@ -147,7 +148,7 @@ class BloomFilterPolicy : public FilterPolicy { 1000, do_work); std::vector total_ids(num_of_keys); - std::vector*> total_ptrs(num_of_keys); + std::vector total_ptrs(num_of_keys); std::vector total_cursors(num_of_keys); int num_of_admit_id = 0; for (int i = 0; i < num_worker_threads; i++) { @@ -157,7 +158,7 @@ class BloomFilterPolicy : public FilterPolicy { sizeof(K) * lookup_or_create_ids[i].size()); memcpy(total_ptrs.data() + num_of_admit_id, lookup_or_create_ptrs[i].data(), - sizeof(ValuePtr*) * lookup_or_create_ptrs[i].size()); + sizeof(void*) * lookup_or_create_ptrs[i].size()); memcpy(total_cursors.data() + num_of_admit_id, lookup_or_create_cursor[i].data(), sizeof(int) * lookup_or_create_cursor[i].size()); @@ -174,11 +175,12 @@ class BloomFilterPolicy : public FilterPolicy { #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { if (GetBloomFreq(key) >= config_.filter_freq) { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, 
&is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { AddFreq(key, count); @@ -186,19 +188,27 @@ class BloomFilterPolicy : public FilterPolicy { } } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { - *val = nullptr; - if ((GetFreq(key, *val) + count) >= config_.filter_freq) { + *value_ptr = nullptr; + if ((GetFreq(key, *value_ptr) + count) >= config_.filter_freq) { + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + ev_->storage()->Insert(key, value_ptr); + s = Status::OK(); + } *is_filter = true; - return ev_->LookupOrCreateKey(key, val); + feat_desc_->AddFreq(*value_ptr, count); + } else { + *is_filter = false; + AddFreq(key, count); } - *is_filter = false; - AddFreq(key, count); return Status::OK(); } - int64 GetFreq(K key, ValuePtr*) override { + int64 GetFreq(K key, void* val) override { return GetBloomFreq(key); } @@ -210,7 +220,7 @@ class BloomFilterPolicy : public FilterPolicy { return bloom_counter_; } - bool is_admit(K key, ValuePtr* value_ptr) override { + bool is_admit(K key, void* value_ptr) override { if (value_ptr == nullptr) { return false; } else { @@ -326,8 +336,12 @@ class BloomFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; int64 new_freq = freq_buff[i]; + int64 import_version = -1; + if (config_.steps_to_live != 0 || config_.record_version) { + import_version = version_buff[i]; + } if (!is_filter) { if (freq_buff[i] >= config_.filter_freq) { SetBloomFreq(key_buff[i], freq_buff[i]); @@ -339,17 +353,9 @@ class BloomFilterPolicy : public FilterPolicy { SetBloomFreq(key_buff[i], freq_buff[i]); } if (new_freq >= config_.filter_freq){ - 
ev_->CreateKey(key_buff[i], &value_ptr, to_dram); - if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); - } - if (!is_filter){ - ev_->LookupOrCreateEmb(value_ptr, - value_buff + i * ev_->ValueLen()); - } else { - ev_->LookupOrCreateEmb(value_ptr, - ev_->GetDefaultValue(key_buff[i])); - } + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + new_freq, import_version, config_.emb_index); } } return Status::OK(); @@ -449,6 +455,7 @@ class BloomFilterPolicy : public FilterPolicy { } private: void* bloom_counter_; + embedding::FeatureDescriptor* feat_desc_; std::vector seeds_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/config.proto b/tensorflow/core/framework/embedding/config.proto index a8535347020..424fc5e1a38 100644 --- a/tensorflow/core/framework/embedding/config.proto +++ b/tensorflow/core/framework/embedding/config.proto @@ -50,11 +50,7 @@ enum EmbeddingVariableType { enum ValuePtrStatus { OK = 0; IS_DELETED = 1; -} - -enum ValuePosition { - IN_DRAM = 0; - NOT_IN_DRAM = 1; + NOT_IN_DRAM = 2; } enum IsSetInitialized { diff --git a/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h b/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h new file mode 100644 index 00000000000..e51166a2895 --- /dev/null +++ b/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h @@ -0,0 +1,272 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl: public FeatureDescriptorImpl { + public: + CounterFilterDescriptorImpl( + Allocator* alloc, + int64 slot_num, + bool need_record_freq, + bool need_record_version, + int64 filter_freq, + StorageType storage_type) + : FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version), + filter_freq_(filter_freq), + is_record_freq_(need_record_freq) { + if (filter_freq >= (1L << version_offset_bits_)) { + LOG(FATAL)<<"Filter frequency threshold shouldn't be bigger than 2^16."; + } + + if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { +#if GOOGLE_CUDA + feat_desc_impl_.reset( + new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); +#endif //GOOGLE_CUDA + } else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } + } + + CounterFilterDescriptorImpl(CounterFilterDescriptorImpl* feat_desc_impl) + : FeatureDescriptorImpl(feat_desc_impl), + filter_freq_(feat_desc_impl->filter_freq_) { +#if GOOGLE_CUDA + if (typeid(*(feat_desc_impl->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl)){ + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); + } else { +#endif //GOOGLE_CUDA + feat_desc_impl_.reset( + new 
NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); +#if GOOGLE_CUDA + } +#endif //GOOGLE_CUDA + } + + ~CounterFilterDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return feat_desc_impl_->InitSlotInfo( + emb_index, embedding_dim, default_value); + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + return feat_desc_impl_->InitSlotInfo(feat_desc_impl); + } + + V* GetEmbedding(void* val, int emb_index) override { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + bool IsAdmit(void* val) override { + return (GetFlag(val) == 0); + } + + void* Admit(void* val) override { + if (!IsAdmit(val)) { + return feat_desc_impl_->Allocate(); + } else { + LOG(FATAL)<<"Only unadmited feature could be admited."; + return nullptr; + } + } + + void* Allocate() override { + uint64* val = (uint64*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + uint64 flag = 1L << flag_offset_bits_; + uint64 version = (0xffffffffffffffff << version_offset_bits_); + uint64 freq = 0; + *val = version + freq; + val = (uint64*)((uint64)val | flag); + return (void*)val; + } + + void* Allocate(int64 freq) override { + if (freq < filter_freq_) { + return Allocate(); + } else { + return feat_desc_impl_->Allocate(); + } + } + + void Deallocate(void* val) override { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + + void Deallocate(const std::vector& vals) override { + for (auto val: vals) { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + } + + void AddFreq(void* val, int64 count) override { + uint64* tmp = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + __sync_fetch_and_add(tmp, count); + } else { + feat_desc_impl_->AddFreq(val, count); + } + } + + void 
SetAllocator(Allocator* alloc) override { + feat_desc_impl_->SetAllocator(alloc); + } + + void SetValue(void* val, int64 emb_index, V* value) { + if (IsAdmit(val)) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + } + + void SetDefaultValue(void* val, int64 key) override { + feat_desc_impl_->SetDefaultValue(val, key); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + feat_desc_impl_->SetDefaultValues( + keys, init_cursor, + value_ptrs, compute_stream, + event_mgr, gpu_device); + } +#endif + + int64 GetFreq(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + return *((uint64*)tmp) & + ((1L << version_offset_bits_) - 1); + } else { + if (is_record_freq_) { + return feat_desc_impl_->GetFreq(val); + } else { + return filter_freq_; + } + } + } + + int64 GetVersion(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + int64 version = *(uint64*)tmp >> version_offset_bits_; + if (version == 0xffffffffffff) { + version = -1; + } + return version; + } else { + return feat_desc_impl_->GetVersion(val); + } + } + + void UpdateVersion(void* val, int64 version) override { + if (!IsAdmit(val)) { + void* tmp_ptr = GetPtr(val); + uint64 shifted_version = (uint64)version << version_offset_bits_; + uint64 tmp_val = 0; + uint64 result = 0; + do { + tmp_val = *(uint64*)tmp_ptr; + uint64 freq = tmp_val & ((1L << version_offset_bits_) - 1); + result = shifted_version + freq; + } while(!__sync_bool_compare_and_swap((uint64*)tmp_ptr, tmp_val, result)); + } else { + feat_desc_impl_->UpdateVersion(val, version); + } + } + + void SetFreq(void* val, int64 freq) override { + uint64* tmp_ptr = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + uint64 tmp = *tmp_ptr; + uint64 new_val = ~((1L << version_offset_bits_) - 1) & tmp; + new_val += freq; + __sync_bool_compare_and_swap(tmp_ptr, tmp, new_val); + } else { + 
feat_desc_impl_->SetFreq(val, freq); + } + } + + int data_bytes() override { + return alloc_bytes_; + } + private: + uint64 GetFlag(void* val) { + return (uint64)val >> flag_offset_bits_; + } + + void* GetPtr(void* val) { + return (void*)((uint64)val & ((1L << flag_offset_bits_) - 1)); + } + + int64 filter_freq_; + int alloc_bytes_ = 8; + Allocator* alloc_ = ev_allocator(); + const int freq_offset_bits_ = 0; + const int version_offset_bits_ = 16; + const int flag_offset_bits_ = 48; + std::unique_ptr> feat_desc_impl_; + bool is_record_freq_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/counter_filter_policy.h b/tensorflow/core/framework/embedding/counter_filter_policy.h index c9f19f34cd2..19cd90ad01c 100644 --- a/tensorflow/core/framework/embedding/counter_filter_policy.h +++ b/tensorflow/core/framework/embedding/counter_filter_policy.h @@ -25,18 +25,19 @@ template class CounterFilterPolicy : public FilterPolicy { using FilterPolicy::ev_; using FilterPolicy::config_; - using FilterPolicy::LookupOrCreateEmbInternal; public: - CounterFilterPolicy(const EmbeddingConfig& config, EV* ev) : - FilterPolicy(config, ev) {} + CounterFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), + FilterPolicy(config, ev) {} Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); - if (s.ok() && GetFreq(key, value_ptr) >= config_.filter_freq) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + if (s.ok() && feat_desc_->IsAdmit(value_ptr)) { + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, 
sizeof(V) * ev_->ValueLen()); @@ -50,18 +51,18 @@ class CounterFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, keys, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; int64 freq = GetFreq(keys[i], value_ptr); - if (value_ptr != nullptr && freq >= config_.filter_freq) { + if (value_ptr != nullptr && feat_desc_->IsAdmit(value_ptr)) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission; } @@ -79,7 +80,7 @@ class CounterFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> @@ -90,36 +91,61 @@ class CounterFilterPolicy : public FilterPolicy { #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - if (GetFreq(key, *value_ptr) >= config_.filter_freq) { - V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + if (is_filter) { + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, 
sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); } } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { - Status s = ev_->LookupOrCreateKey(key, val); - *is_filter = (GetFreq(key, *val) + count) >= config_.filter_freq; + *is_filter = false; + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + if (count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + feat_desc_->Deallocate(*value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + ev_->storage()->Insert(key, value_ptr); + s = Status::OK(); + } else if (!feat_desc_->IsAdmit(*value_ptr)) { + int64 freq = feat_desc_->GetFreq(*value_ptr); + if (freq + count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetFreq(admit_value_ptr, freq); + feat_desc_->UpdateVersion( + admit_value_ptr, feat_desc_->GetVersion(*value_ptr)); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + ev_->storage()->UpdateValuePtr(key, admit_value_ptr, *value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + } else { + *is_filter = true; + } + feat_desc_->AddFreq(*value_ptr, count); return s; } - int64 GetFreq(K key, ValuePtr* value_ptr) override { - return value_ptr->GetFreq(); + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); } int64 GetFreq(K key) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetFreq(); - } - - bool is_admit(K key, ValuePtr* value_ptr) override { - return (GetFreq(key, value_ptr) >= config_.filter_freq); + return feat_desc_->GetFreq(value_ptr); } Status Restore(int64 key_num, int bucket_num, int64 partition_id, @@ -136,27 
+162,33 @@ class CounterFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); + int64 import_freq = 0; + int64 import_version = -1; if (!is_filter) { if (freq_buff[i] >= config_.filter_freq) { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } else { - value_ptr->SetFreq(config_.filter_freq); + import_freq = config_.filter_freq; } } else { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); - } - if (value_ptr->GetFreq() >= config_.filter_freq) { - LookupOrCreateEmbInternal(is_filter, to_dram, i, value_len, - value_ptr, value_buff, key_buff); + import_version = version_buff[i]; } + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); } return Status::OK(); } + + bool is_admit(K key, void* value_ptr) override { + return feat_desc_->IsAdmit(value_ptr); + } + + private: + embedding::FeatureDescriptor* feat_desc_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h index 600f6c20e44..8476c399c40 100644 --- a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h @@ -21,25 +21,25 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class LocklessHashMap : public KVInterface { public: - LocklessHashMap() { + LocklessHashMap(FeatureDescriptor* feat_desc): feat_desc_(feat_desc) { hash_map_.max_load_factor(0.8); hash_map_.set_empty_key_and_value( LocklessHashMap::EMPTY_KEY_, nullptr); hash_map_.set_counternum(16); hash_map_.set_deleted_key(LocklessHashMap::DELETED_KEY_); + pthread_key_create(&key_, NULL); } - ~LocklessHashMap() override {} + ~LocklessHashMap() override { + pthread_key_delete(key_); + } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { auto iter = hash_map_.find_wait_free(key); if (iter.first == LocklessHashMap::EMPTY_KEY_) { return errors::NotFound( @@ -60,10 +60,10 @@ class LocklessHashMap : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { auto iter = hash_map_.insert_lockless( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); + std::move(std::pair(key, + const_cast(value_ptr)))); // insert fail, exist key if ((*(iter.first)).second != value_ptr){ return errors::AlreadyExists( @@ -88,14 +88,40 @@ class LocklessHashMap : public KVInterface { } } + Status Commit(K key, const void* value_ptr) override { + auto iter = hash_map_.insert_lockless(std::move( + std::pair(key, + const_cast(value_ptr)))); + if ((*(iter.first)).second != value_ptr) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap( + &((*(iter.first)).second), + (*(iter.first)).second, + value_ptr); + } + return Status::OK(); + } + Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { + for(int i = 0; i < keys.size(); ++i) { + auto iter = hash_map_.insert_lockless(std::move( + std::pair(keys[i], + const_cast(value_ptrs[i])))); + 
if ((*(iter.first)).second != value_ptrs[i]) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap( + &((*(iter.first)).second), + (*(iter.first)).second, + value_ptrs[i]); + } + } return Status::OK(); } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { - std::pair*> *hash_map_dump; + std::vector* value_ptr_list) override { + std::pair *hash_map_dump; int64 bucket_count; auto it = hash_map_.GetSnapshot(); hash_map_dump = it.first; @@ -120,11 +146,50 @@ class LocklessHashMap : public KVInterface { return ""; } + void UpdateValuePtr( + K key, void* new_value_ptr, + void* old_value_ptr) override { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(key, old_value_ptr))); + bool flag = __sync_bool_compare_and_swap( + &((*(iter.first)).second), old_value_ptr, new_value_ptr); + if (flag) { + AppendToValuePtrQueue(old_value_ptr); + } else { + feat_desc_->Deallocate(new_value_ptr); + } + } + + private: + void AppendToValuePtrQueue(void* old_value_ptr) { + //A parameter that can be adjusted in the future + std::deque* value_ptr_queue = GetOutOfDateValuePtrQueue(); + if (value_ptr_queue->size() > CAP_INVALID_VALUEPTR) { + void* value_ptr = value_ptr_queue->front(); + feat_desc_->Deallocate(value_ptr); + value_ptr_queue->pop_front(); + } + value_ptr_queue->emplace_back(old_value_ptr); + } + + std::deque* GetOutOfDateValuePtrQueue() { + std::deque* value_ptr_queue = + static_cast*>(pthread_getspecific(key_)); + if (value_ptr_queue == nullptr) { + value_ptr_queue = new std::deque(); + pthread_setspecific(key_, value_ptr_queue); + } + return value_ptr_queue; + } + private: - typedef google::dense_hash_map_lockless*> LockLessHashMap; + typedef google::dense_hash_map_lockless LockLessHashMap; static const int EMPTY_KEY_; static const int DELETED_KEY_; LockLessHashMap hash_map_; + const int CAP_INVALID_VALUEPTR = 20000; + FeatureDescriptor* feat_desc_; + pthread_key_t key_; }; template const int 
LocklessHashMap::EMPTY_KEY_ = -1; diff --git a/tensorflow/core/framework/embedding/dense_hash_map_kv.h b/tensorflow/core/framework/embedding/dense_hash_map_kv.h index 92baf037721..ffaf2e335dc 100644 --- a/tensorflow/core/framework/embedding/dense_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/dense_hash_map_kv.h @@ -23,9 +23,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/kv_interface.h" namespace tensorflow { -template -class ValuePtr; - namespace embedding { template @@ -45,7 +42,7 @@ class DenseHashMap : public KVInterface { delete []hash_map_; } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { int64 l_id = std::abs(key)%partition_num_; spin_rd_lock l(hash_map_[l_id].mu); auto iter = hash_map_[l_id].hash_map.find(key); @@ -70,7 +67,7 @@ class DenseHashMap : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { int64 l_id = std::abs(key)%partition_num_; spin_wr_lock l(hash_map_[l_id].mu); auto iter = hash_map_[l_id].hash_map.find(key); @@ -80,8 +77,8 @@ class DenseHashMap : public KVInterface { "already exists Key: ", key, " in DenseHashMap."); } else { auto iter = hash_map_[l_id].hash_map.insert( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); + std::move(std::pair(key, + const_cast(value_ptr)))); return Status::OK(); } } @@ -109,7 +106,7 @@ class DenseHashMap : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector* >* value_ptr_list) override { + std::vector* value_ptr_list) override { dense_hash_map hash_map_dump[partition_num_]; for (int i = 0; i< partition_num_; i++) { spin_rd_lock l(hash_map_[i].mu); @@ -132,7 +129,7 @@ class DenseHashMap : public KVInterface { const int partition_num_ = 1000; struct dense_hash_map { mutable easy_spinrwlock_t mu = EASY_SPINRWLOCK_INITIALIZER; - google::dense_hash_map* > hash_map; + 
google::dense_hash_map hash_map; }; dense_hash_map* hash_map_; }; diff --git a/tensorflow/core/framework/embedding/dram_leveldb_storage.h b/tensorflow/core/framework/embedding/dram_leveldb_storage.h index fdb6697d541..2f9fbade6c5 100644 --- a/tensorflow/core/framework/embedding/dram_leveldb_storage.h +++ b/tensorflow/core/framework/embedding/dram_leveldb_storage.h @@ -21,9 +21,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/single_tier_storage.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,11 +28,12 @@ namespace embedding { template class DramLevelDBStore : public MultiTierStorage { public: - DramLevelDBStore(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_ = new DramStorage(sc, alloc, lc, new LocklessHashMap()); - leveldb_ = new LevelDBStore(sc, alloc, lc); + DramLevelDBStore(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + leveldb_ = new LevelDBStore(sc, feat_desc); } ~DramLevelDBStore() override { @@ -46,7 +44,7 @@ class DramLevelDBStore : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(DramLevelDBStore); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -63,23 +61,22 @@ class DramLevelDBStore : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramLevelDBStore."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) 
override { + dram_->CreateAndInsert(key, value_ptr); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramLevelDBStore can not be called."; + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -93,7 +90,7 @@ class DramLevelDBStore : public MultiTierStorage { leveldb_->DestroyValuePtr(*value_ptr); return dram_->Get(key, value_ptr); } - dram_->Insert(key, value_ptr, size); + dram_->CreateAndInsert(key, value_ptr); return Status::OK(); } @@ -146,15 +143,15 @@ class DramLevelDBStore : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_leveldb_key_list; - std::vector*> value_ptr_list, tmp_leveldb_value_list; + std::vector value_ptr_list, tmp_leveldb_value_list; TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); TF_CHECK_OK(leveldb_->GetSnapshot( &tmp_leveldb_key_list, &tmp_leveldb_value_list)); for (int64 i = 0; i < tmp_leveldb_value_list.size(); i++) { - tmp_leveldb_value_list[i]->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - tmp_leveldb_value_list[i]->SetInitialized(emb_config.primary_emb_index); + tmp_leveldb_value_list[i] = + (void*)((int64)tmp_leveldb_value_list[i] | (1L << kDramFlagOffset)); } std::vector leveldb_key_list; @@ -173,26 +170,34 @@ class DramLevelDBStore : public MultiTierStorage { { mutex_lock l(*(leveldb_->get_mutex())); + std::vector*> feat_desc_list(2); + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, + true, true, + {false, 0}); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = 
&hbm_feat_desc; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, value_iter))); } for (auto it: tmp_leveldb_value_list) { - delete it; + cpu_allocator()->DeallocateRaw((void*)((int64)it & 0xffffffffffff)); } - delete value_iter; return Status::OK(); } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); @@ -206,8 +211,8 @@ class DramLevelDBStore : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(leveldb_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); @@ -218,14 +223,20 @@ class DramLevelDBStore : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + protected: - void SetTotalDims(int64 total_dims) override { - leveldb_->SetTotalDims(total_dims); + int total_dim() override { + return dram_feat_desc_->total_dim(); } private: DramStorage* dram_; LevelDBStore* leveldb_; + FeatureDescriptor* dram_feat_desc_ = nullptr; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/dram_pmem_storage.h b/tensorflow/core/framework/embedding/dram_pmem_storage.h index fd19f75ab4c..e58d9450d96 100644 --- a/tensorflow/core/framework/embedding/dram_pmem_storage.h +++ b/tensorflow/core/framework/embedding/dram_pmem_storage.h @@ 
-15,14 +15,12 @@ limitations under the License. #ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ +#include "tensorflow/core/framework/embedding/cpu_hash_map_kv.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" -#include "tensorflow/core/framework/embedding/cpu_hash_map_kv.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,36 +29,36 @@ namespace embedding { template class DramPmemStorage : public MultiTierStorage { public: - DramPmemStorage(const StorageConfig& sc, Allocator* dram_alloc, - Allocator* pmem_alloc, LayoutCreator* lc, + DramPmemStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_ = new DramStorage(sc, dram_alloc, lc, new LocklessHashMap()); - pmem_ = new PmemLibpmemStorage(sc, pmem_alloc, lc); - value_ptr_size_ = - const_cast(sc.embedding_config).total_num( - Storage::GetAllocLen()); + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + pmem_feat_desc_ = new FeatureDescriptor(feat_desc); + pmem_feat_desc_->SetAllocator(experimental_pmem_allocator(sc.path, sc.size[0])); + + pmem_ = new PmemLibpmemStorage(sc, pmem_feat_desc_); } ~DramPmemStorage() override { MultiTierStorage::DeleteFromEvictionManager(); delete dram_; delete pmem_; + delete pmem_feat_desc_; } TF_DISALLOW_COPY_AND_ASSIGN(DramPmemStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; } s = pmem_->Get(key, value_ptr); + void* new_value_ptr = dram_->CreateValuePtr(); if (s.ok()) { - ValuePtr* new_value_ptr = dram_->CreateValuePtr(value_ptr_size_); - 
memcpy(new_value_ptr->GetPtr(), (*value_ptr)->GetPtr(), - sizeof(FixedLengthHeader) + sizeof(V) * value_ptr_size_); - *value_ptr = new_value_ptr; + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); s = dram_->TryInsert(key, *value_ptr); if (s.ok()) { return s; @@ -71,19 +69,19 @@ class DramPmemStorage : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramPmemStorage."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramPmemStorage can not be called."; + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } bool IsUseHbm() override { @@ -94,18 +92,16 @@ class DramPmemStorage : public MultiTierStorage { return false; } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; } s = pmem_->Get(key, value_ptr); - ValuePtr* new_value_ptr = dram_->CreateValuePtr(size); + void* new_value_ptr = dram_->CreateValuePtr(); if (s.ok()) { - memcpy(new_value_ptr->GetPtr(), (*value_ptr)->GetPtr(), - sizeof(FixedLengthHeader) + sizeof(V) * size); + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); } *value_ptr = new_value_ptr; @@ -159,7 +155,7 @@ class DramPmemStorage : public MultiTierStorage { int64 value_len, V* 
default_value) override { std::vector key_list, tmp_pmem_key_list; - std::vector*> value_ptr_list, tmp_pmem_value_list; + std::vector value_ptr_list, tmp_pmem_value_list; TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); dram_->Shrink(key_list, value_ptr_list, shrink_args, value_len); @@ -182,13 +178,14 @@ class DramPmemStorage : public MultiTierStorage { emb_config, value_len, default_value, key_list, - value_ptr_list))); + value_ptr_list, + pmem_feat_desc_))); return Status::OK(); } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); @@ -202,8 +199,8 @@ class DramPmemStorage : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(pmem_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); @@ -214,13 +211,26 @@ class DramPmemStorage : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + pmem_feat_desc_->InitSlotInfo(dram_feat_desc_); + MultiTierStorage::Init(); + } + protected: - void SetTotalDims(int64 total_dims) override {} + int total_dim() override { + return pmem_feat_desc_->total_dim(); + } private: DramStorage* dram_; PmemLibpmemStorage* pmem_; - int64 value_ptr_size_; + FeatureDescriptor* dram_feat_desc_ = nullptr; + FeatureDescriptor* pmem_feat_desc_ = nullptr; }; } // embedding } // 
tensorflow diff --git a/tensorflow/core/framework/embedding/dram_ssd_storage.h b/tensorflow/core/framework/embedding/dram_ssd_storage.h index 356a61d865f..ddd2d782e03 100644 --- a/tensorflow/core/framework/embedding/dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/dram_ssd_storage.h @@ -21,9 +21,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/single_tier_storage.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,11 +28,12 @@ namespace embedding { template class DramSsdHashStorage : public MultiTierStorage { public: - DramSsdHashStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_= new DramStorage(sc, alloc, lc, new LocklessHashMap()); - ssd_hash_ = new SsdHashStorage(sc, alloc, lc); + DramSsdHashStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_= new DramStorage(sc, feat_desc); + ssd_hash_ = new SsdHashStorage(sc, feat_desc); } ~DramSsdHashStorage() override { @@ -46,7 +44,7 @@ class DramSsdHashStorage : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(DramSsdHashStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -64,24 +62,22 @@ class DramSsdHashStorage : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramSsdHashStorage."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); } 
- Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramSsdStorage can not be called."; + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -96,7 +92,7 @@ class DramSsdHashStorage : public MultiTierStorage { ssd_hash_->DestroyValuePtr(*value_ptr); return dram_->Get(key, value_ptr); } - dram_->Insert(key, value_ptr, size); + dram_->CreateAndInsert(key, value_ptr); return Status::OK(); } @@ -164,7 +160,6 @@ class DramSsdHashStorage : public MultiTierStorage { Status RestoreSSD(int64 emb_index, int64 emb_slot_num, int64 value_len, const std::string& ssd_emb_file_name, EmbeddingVar* ev, RestoreSSDBuffer& restore_buff) override { - int64 alloc_len = Storage::ComputeAllocLen(value_len); std::map file_id_map; for (int64 i = 0; i < restore_buff.num_of_files; i++) { file_id_map[restore_buff.file_list_buf[i]] = i; @@ -185,7 +180,7 @@ class DramSsdHashStorage : public MultiTierStorage { } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); @@ -199,8 +194,8 @@ class DramSsdHashStorage : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(ssd_hash_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + 
MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); @@ -211,14 +206,25 @@ class DramSsdHashStorage : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + ssd_hash_->Init(); + MultiTierStorage::Init(); + } + protected: - void SetTotalDims(int64 total_dims) override { - ssd_hash_->SetTotalDims(total_dims); + int total_dim() override { + return dram_feat_desc_->total_dim(); } private: DramStorage* dram_ = nullptr; SsdHashStorage* ssd_hash_ = nullptr; + FeatureDescriptor* dram_feat_desc_; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h b/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h new file mode 100644 index 00000000000..c1fa878788b --- /dev/null +++ b/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h @@ -0,0 +1,214 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#include +#include +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +constexpr int COLUMN_BITSET_BYTES = 5; +constexpr int COLUMN_BITSET_SIZE = COLUMN_BITSET_BYTES * 8; + +struct MetaHeader { + volatile unsigned char embed_num; + unsigned char value_type; + unsigned char header_size; + unsigned char column_bitset[COLUMN_BITSET_BYTES]; + + static const int kEmbeddingNumStartIndex = 0; + static const int kValueTypeStartIndex = + kEmbeddingNumStartIndex + sizeof(char); + static const int kHeaderSizeStartIndex = + kValueTypeStartIndex + sizeof(char); + static const int kColumnBitsetIndex = + kHeaderSizeStartIndex + sizeof(char); + + inline unsigned int GetEmbeddingNum() { + return (unsigned int) embed_num; + } + + inline void SetEmbeddingNum(size_t s) { + embed_num = (unsigned char)s; + } + + inline std::bitset GetColumnBitset() { + unsigned long meta = ((unsigned long*)this)[0]; + std::bitset bs(meta >> (8 * kColumnBitsetIndex)); + return bs; + } + + inline void SetColumnBitset(const std::bitset& bs, + unsigned int embnum) { + ((unsigned long*)(this))[0] = + (bs.to_ulong() << (8 * kColumnBitsetIndex)) | + (header_size << (8 * kHeaderSizeStartIndex)) | + (value_type << (8 * kValueTypeStartIndex)) | + (embnum << (8 * kEmbeddingNumStartIndex)); + } + + inline unsigned int GetHeaderSize() { + return (unsigned int) header_size; + } + + inline void SetHeaderSize(size_t size) { + header_size = (unsigned char)size; + } +}; + +template +class DynmaicDimDescriptorImpl: public FeatureDescriptorImpl { +using FeatureDescriptorImpl::slot_infos_; + public: + DynmaicDimDescriptorImpl( + Allocator* alloc, + int64 slot_num) + : alloc_bytes_(sizeof(std::atomic_flag) + + sizeof(MetaHeader) + 
+ sizeof(V*) * slot_num), + header_offset_bytes_(sizeof(V*) * slot_num), + flag_offset_bytes_(sizeof(MetaHeader) + + sizeof(V*) * slot_num), + FeatureDescriptorImpl(slot_num, + false, + false) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + ~DynmaicDimDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + } + + V* GetEmbedding(void* val, int emb_index) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->embed_num; + auto metadata = meta->GetColumnBitset(); + + if (!metadata.test(emb_index)) { + std::atomic_flag* flag= (std::atomic_flag*)(val + flag_offset_bytes_); + while(flag->test_and_set(std::memory_order_acquire)); + metadata = meta->GetColumnBitset(); + if (metadata.test(emb_index)) { + flag->clear(std::memory_order_release); + return ((V**)val)[emb_index]; + } + embnum++ ; + int64 alloc_value_len = slot_infos_[emb_index].embedding_dim; + V* tensor_val = (V*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * alloc_value_len); + V* default_v = (V*)slot_infos_[emb_index].default_value; + memcpy(tensor_val, default_v, + sizeof(V) * slot_infos_[emb_index].default_value_len); + ((V**)val)[emb_index] = tensor_val; + + metadata.set(emb_index); + // NOTE:if we use ((unsigned long*)((char*)ptr_ + 1))[0] = metadata.to_ulong(); + // the ptr_ will be occaionally modified from 0x7f18700912a0 to 0x700912a0 + // must use ((V**)ptr_ + 1 + 1)[emb_index] = tensor_val; to avoid + //LOG(INFO)<<"emb_num: "<SetColumnBitset(metadata, embnum); + flag->clear(std::memory_order_release); + return tensor_val; + } else { + return ((V**)val)[emb_index]; + } + } + + bool IsAdmit(void* val) override { + return true; + } + + void* Admit(void* val) override {} + + void* Allocate() override { + void* val = 
alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + memset(val, 0, alloc_bytes_); + new ((char*)val + header_offset_bytes_) MetaHeader(); + return val; + } + + void Deallocate(void* val) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->GetEmbeddingNum(); + //LOG(INFO)<<"emb_num in deallocate: "<GetColumnBitset(); + for (int i = 0; i< embnum; i++) { + if (metadata.test(i)) { + V* val_ptr = ((V**)((int64*)val + meta->GetHeaderSize()))[i]; + if (val_ptr != nullptr) { + alloc_->DeallocateRaw(val_ptr); + } + } + } + } + + void Deallocate(const std::vector& vals) override { + for (auto val: vals) { + Deallocate(val); + } + } + + void AddFreq(void* val, int64 count) override {} + + void SetAllocator(Allocator* alloc) override { + alloc_ = alloc; + } + + void SetDefaultValue(void* val, int64 key) override {} + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy(val_ptr, value, + sizeof(V) * FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + + int64 GetFreq(void* val) override {} + + int64 GetVersion(void* val) override {} + + void UpdateVersion(void* val, int64 version) override {} + + void SetFreq(void* val, int64 freq) override {} + + int data_bytes() override { + return alloc_bytes_; + } + private: + int alloc_bytes_ = 0; + int header_offset_bytes_ = 0; + int flag_offset_bytes_ = 0; + Allocator* alloc_ = ev_allocator(); +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/embedding_config.h b/tensorflow/core/framework/embedding/embedding_config.h 
index d47d07d4205..a39d2dca303 100644 --- a/tensorflow/core/framework/embedding/embedding_config.h +++ b/tensorflow/core/framework/embedding/embedding_config.h @@ -23,7 +23,6 @@ struct EmbeddingConfig { DataType counter_type; int64 default_value_dim; float default_value_no_permission; - int normal_fix_flag; bool record_freq; bool record_version; bool is_inference; @@ -37,7 +36,6 @@ struct EmbeddingConfig { int64 filter_freq = 0, int64 max_freq = 999999, float l2_weight_threshold = -1.0, - const std::string& layout = "normal", int64 max_element_size = 0, float false_positive_probability = -1.0, DataType counter_type = DT_UINT64, @@ -58,7 +56,6 @@ struct EmbeddingConfig { counter_type(counter_type), default_value_dim(default_value_dim), default_value_no_permission(default_value_no_permission), - normal_fix_flag(0), record_freq(record_freq), record_version(record_version), is_inference(is_inference) { @@ -70,10 +67,6 @@ struct EmbeddingConfig { kHashFunc = 0; num_counter = 0; } - if (layout == "normal_contiguous" || - layout == "normal_contiguous_gpu") { - normal_fix_flag = 1; - } } int64 calc_num_counter(int64 max_element_size, @@ -105,21 +98,13 @@ struct EmbeddingConfig { } bool is_save_freq() const { - return filter_freq != 0 || - record_freq || - normal_fix_flag == 1; + return filter_freq != 0 || record_freq; } bool is_save_version() const { return steps_to_live != 0 || record_version; } - int64 total_num(int alloc_len) { - return block_num * - (1 + (1 - normal_fix_flag) * slot_num) * - (1 + normal_fix_flag * (alloc_len * (slot_num + 1) - 1)); - } - int64 get_filter_freq() { return filter_freq; } diff --git a/tensorflow/core/framework/embedding/embedding_memory_pool.h b/tensorflow/core/framework/embedding/embedding_memory_pool.h index 27b31ce1ed7..ef175151b00 100644 --- a/tensorflow/core/framework/embedding/embedding_memory_pool.h +++ b/tensorflow/core/framework/embedding/embedding_memory_pool.h @@ -18,9 +18,6 @@ limitations under the License. 
#include namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class EmbeddingMemoryPool { @@ -50,7 +47,7 @@ class EmbeddingMemoryPool { return ptr; } - void Deallocate(std::vector*> value_ptrs) { + void Deallocate(std::vector value_ptrs) { int64 prev_size = value_ptrs_queue_.size(); for (auto it : value_ptrs) { value_ptrs_queue_.emplace_back(it); @@ -59,9 +56,8 @@ class EmbeddingMemoryPool { int64 n = value_ptrs_queue_.size() - embs_per_block_; n = std::min(prev_size, n); for (int64 i = 0; i < n; i++) { - ValuePtr* val = value_ptrs_queue_.front(); - free_ptr_queue_.emplace_back(val->GetValue(0, 0)); - delete val; + void* val = value_ptrs_queue_.front(); + free_ptr_queue_.emplace_back((V*)val); value_ptrs_queue_.pop_front(); } } @@ -88,7 +84,7 @@ class EmbeddingMemoryPool { int64 embs_per_block_; Allocator* alloc_; std::deque free_ptr_queue_; - std::deque*> value_ptrs_queue_; + std::deque value_ptrs_queue_; std::vector block_list_; }; } //embedding diff --git a/tensorflow/core/framework/embedding/embedding_var.cu.cc b/tensorflow/core/framework/embedding/embedding_var.cu.cc index 0c0be83ec1d..f7162fd2c22 100644 --- a/tensorflow/core/framework/embedding/embedding_var.cu.cc +++ b/tensorflow/core/framework/embedding/embedding_var.cu.cc @@ -42,71 +42,6 @@ void SyncWithEventMgr(se::Stream* stream, while(!is_kernel_finish) {} } -template -void EmbeddingVar::SetDefaultValueOfNewFeatures( - const K* keys, int64 size, const std::list& init_cursor, - V** memcpy_address, se::Stream* compute_stream, EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device) { - if (init_cursor.size() > 0) { - int64 total = init_cursor.size(); - V** value_address = nullptr; - value_address = TypedAllocator::Allocate(cpu_allocator(), total * 2, - AllocationAttributes()); - V** default_value_address = value_address + total; - V** dev_value_address = nullptr; - dev_value_address = - TypedAllocator::Allocate(alloc_, total * 2, AllocationAttributes()); - V** 
dev_default_value_address = dev_value_address + total; - int64 i = 0; - auto it = init_cursor.cbegin(); - for (; it != init_cursor.cend(); ++it, ++i) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_address[i] = - *((V**)((char*)(value_ptr->GetPtr()) + sizeof(FixedLengthHeader))) + - storage_->GetOffset(emb_config_.emb_index); - default_value_address[i] = - default_value_ + - (keys[i] % emb_config_.default_value_dim) % value_len_; - } - DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * 2 * sizeof(V*)); - compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, - total * 2 * sizeof(V*)); - int block_dim = 128; - TF_CHECK_OK(GpuLaunchKernel( - embedding::CopyEmbedding, - (total * value_len_ + block_dim - 1) / block_dim, - block_dim, 0, gpu_device.stream(), dev_default_value_address, - dev_value_address, value_len_, total)); - SyncWithEventMgr(compute_stream, event_mgr); - // Set init meta of ValuePtrs - for (auto it = init_cursor.cbegin(); it != init_cursor.cend(); ++it) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_ptr->SetInitialized(emb_config_.emb_index); - memcpy_address[*it] = value_ptr->GetValue( - emb_config_.emb_index, - storage_->GetOffset(emb_config_.emb_index)); - } - TypedAllocator::Deallocate(alloc_, dev_value_address, total * 2); - TypedAllocator::Deallocate(cpu_allocator(), value_address, total * 2); - } -} - -#define REGISTER_KERNELS(ktype, vtype) \ - template void EmbeddingVar::SetDefaultValueOfNewFeatures( \ - const ktype*, int64, const std::list&, vtype**, \ - se::Stream*, EventMgr*, const Eigen::GpuDevice& gpu_device); -#define REGISTER_KERNELS_ALL(type) \ - REGISTER_KERNELS(int32, type); \ - REGISTER_KERNELS(int64, type) -#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) -TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) -#undef REGISTER_KERNELS_CPU - -#undef REGISTER_KERNELS_ALL -#undef REGISTER_KERNELS - template void EmbeddingVar::CopyEmbeddingsToBuffer( V* val_base, int64 
size, V** memcpy_address, @@ -136,85 +71,6 @@ void EmbeddingVar::CopyEmbeddingsToBuffer( TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS_CPU -#undef REGISTER_KERNELS_ALL -#undef REGISTER_KERNELS - -template -void EmbeddingVar::CopyEmbeddingsFromCPUToGPU( - const K* keys, const std::list& copyback_cursor, V** memcpy_address, - se::Stream* compute_stream, EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device, - const DeviceBase::CpuWorkerThreads* worker_threads, - int64* output_value_ptrs) { - if (copyback_cursor.size() > 0) { - int64 total = copyback_cursor.size(); - size_t value_len = emb_config_.total_num(storage_->GetAllocLen()); - V* memcpy_buffer_gpu = nullptr; - ValuePtr** gpu_value_ptrs = new ValuePtr*[total]; - memcpy_buffer_gpu = (V*)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, - total * value_len * sizeof(V)); - storage_->CopyEmbeddingsFromCPUToGPU( - total, keys, copyback_cursor, memcpy_address, value_len, gpu_value_ptrs, - memcpy_buffer_gpu, compute_stream, event_mgr, worker_threads); - - V** value_address = (V**)cpu_allocator()->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V*) * total); - V** dev_value_address = (V**)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, - sizeof(V*) * total); - std::vector copyback_keys(total); - int64 i = 0; - auto it = copyback_cursor.cbegin(); - for (; it != copyback_cursor.cend(); ++it, ++i) { - bool init; - // Get the curosr - int64 cursor = *it & 0x0fffffffffffffff; - gpu_value_ptrs[i]->SetInitialized(emb_config_.emb_index); - memcpy_address[cursor] = LookupOrCreateEmb(gpu_value_ptrs[i], init); - value_address[i] = memcpy_address[cursor]; - copyback_keys[i] = keys[cursor]; - } - DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * sizeof(V*)); - compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, total * sizeof(V*)); - - int block_dim = 128; - TF_CHECK_OK(GpuLaunchKernel( - embedding::BatchUnpack, (total + block_dim - 1) / block_dim * value_len, - block_dim, 0, 
gpu_device.stream(), dev_value_address, memcpy_buffer_gpu, - value_len, total)); - - auto do_insert = [this, copyback_keys, gpu_value_ptrs, value_len]( - int64 start, int64 limit) { - for (int64 i = start; i < limit; i++) - storage_->Insert(copyback_keys[i], gpu_value_ptrs[i]); - }; - Shard(worker_threads->num_threads, worker_threads->workers, - copyback_keys.size(), 100000, do_insert); - if (output_value_ptrs != nullptr) { - auto it = copyback_cursor.cbegin(); - for (int64 i = 0; it != copyback_cursor.cend(); ++it, ++i) { - int64 cursor = *it & 0x0fffffffffffffff; - output_value_ptrs[cursor] = (int64)gpu_value_ptrs[i]; - } - } - SyncWithEventMgr(compute_stream, event_mgr); - - alloc_->DeallocateRaw(dev_value_address); - alloc_->DeallocateRaw(memcpy_buffer_gpu); - cpu_allocator()->DeallocateRaw(value_address); - delete[] gpu_value_ptrs; - } -} -#define REGISTER_KERNELS(ktype, vtype) \ - template void EmbeddingVar::CopyEmbeddingsFromCPUToGPU( \ - const ktype*, const std::list&, vtype**, se::Stream*, EventMgr*, \ - const Eigen::GpuDevice&, const DeviceBase::CpuWorkerThreads*, int64*); -#define REGISTER_KERNELS_ALL(type) \ - REGISTER_KERNELS(int32, type); \ - REGISTER_KERNELS(int64, type) -#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) -TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) -#undef REGISTER_KERNELS_CPU - #undef REGISTER_KERNELS_ALL #undef REGISTER_KERNELS } // namespace tensorflow diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 28ce5094d87..487f595bf31 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -30,7 +30,6 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/embedding_var_context.h" #include "tensorflow/core/framework/embedding/embedding_var_restore.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/framework/embedding/filter_factory.h" #include "tensorflow/core/framework/embedding/gpu_hash_map_kv.h" #include "tensorflow/core/framework/embedding/embedding_config.h" @@ -57,7 +56,8 @@ class EmbeddingVar : public ResourceBase { EmbeddingVar(const string& name, embedding::Storage* storage, EmbeddingConfig emb_cfg, - Allocator* alloc): + Allocator* alloc, + embedding::FeatureDescriptor* feat_desc): name_(name), storage_(storage), default_value_(nullptr), @@ -65,27 +65,8 @@ class EmbeddingVar : public ResourceBase { value_len_(0), alloc_(alloc), default_value_alloc_(alloc), - emb_config_(emb_cfg) { - if (IsMultiLevel() || emb_config_.record_freq) { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) { - value_ptr->AddFreq(freq); - }; - } else if (emb_config_.is_counter_filter()) { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) { - if (value_ptr->GetFreq() < filter_freq) - value_ptr->AddFreq(freq); - }; - } else { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) {}; - } - if (emb_config_.steps_to_live != 0 || emb_config_.record_version) { - update_version_fn_ = [](ValuePtr* value_ptr, int64 gs) { - value_ptr->SetStep(gs); - }; - } else { - update_version_fn_ = [](ValuePtr* value_ptr, int64 gs) {}; - } - } + emb_config_(emb_cfg), + feat_desc_(feat_desc) {} Status Init(const Tensor& default_tensor, int64 default_value_dim) { if (storage_ == nullptr) { @@ -95,17 +76,11 @@ class EmbeddingVar : public ResourceBase { storage_type_ = storage_->GetStorageType(); filter_ = FilterFactory::CreateFilter>( - emb_config_, this, storage_); + emb_config_, this, storage_, feat_desc_); emb_config_.default_value_dim = default_value_dim; 
value_len_ = default_tensor.NumElements() / emb_config_.default_value_dim; - if (LayoutType::NORMAL_CONTIGUOUS == storage_->GetLayoutType() || - LayoutType::NORMAL_CONTIGUOUS_GPU == storage_->GetLayoutType() || - LayoutType::COMPACT == storage_->GetLayoutType()) { - storage_->SetAllocLen(value_len_, emb_config_.slot_num + 1); - } - if (storage_->IsUseHbm()) { #if GOOGLE_CUDA default_value_ = TypedAllocator::Allocate(alloc_, @@ -115,12 +90,6 @@ class EmbeddingVar : public ResourceBase { dev_addr_buffer_size_ = 0; cudaMemcpy(default_value_, &default_tensor_flat(0), default_tensor.TotalBytes(), cudaMemcpyDeviceToDevice); - storage_-> - CreateEmbeddingMemoryPool( - alloc_, - emb_config_.total_num( - storage_->GetAllocLen()), - 1024 * 1024 * 64); #endif // GOOGLE_CUDA } else if (storage_->IsSingleHbm()) { #if GOOGLE_CUDA @@ -147,6 +116,14 @@ class EmbeddingVar : public ResourceBase { emb_config_.default_value_no_permission); } } + bool is_all_slots_initialized = + feat_desc_->InitSlotInfo( + emb_config_.emb_index, value_len_, + std::pair( + default_value_, emb_config_.default_value_dim)); + if (is_all_slots_initialized) { + storage_->Init(); + } return Status::OK(); } @@ -159,57 +136,92 @@ class EmbeddingVar : public ResourceBase { return is_initialized_; } - Status LookupKey(K key, ValuePtr** value_ptr) { + Status LookupKey(K key, void** value_ptr) { return storage_->Get(key, value_ptr); } void BatchLookupKey(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys) { - storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys, - emb_config_.total_num(storage_->GetAllocLen())); + storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys); } - Status LookupOrCreateKey(K key, ValuePtr** value_ptr, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, bool indices_as_pointer, int64 count = 1) { if (indices_as_pointer) { - *value_ptr = (ValuePtr*)key; - *is_filter = (*value_ptr != nullptr); + 
*value_ptr = (void*)key; + *is_filter = filter_->is_admit(key, *value_ptr); return Status::OK(); } else { Status s = filter_->LookupOrCreateKey(key, value_ptr, is_filter, count); - add_freq_fn_(*value_ptr, count, emb_config_.filter_freq); return s; } } Status Insert(K key, V* value) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; CreateKey(key, &value_ptr, true); - LookupOrCreateEmb(value_ptr, value); + feat_desc_->SetValue(value_ptr, emb_config_.emb_index, value); return Status::OK(); } - Status LookupOrCreateKey(K key, ValuePtr** value_ptr) { - Status s = storage_->GetOrCreate(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen())); + Status LookupOrCreateKey(const EmbeddingVarContext& context, + const K* keys, + void** value_ptrs, + int64 num_of_keys, + int64* indices_counts, + bool indices_as_pointer = false) { + if (indices_as_pointer) { + auto lookup_key_and_set_version_fn = [keys, value_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + value_ptrs[i] = (void*)keys[i]; + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + lookup_key_and_set_version_fn); + } else { + filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); + } + + if (indices_counts != nullptr) { + auto add_freq_fn = [this, value_ptrs, indices_counts] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]); + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. 
+ auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + add_freq_fn); + } + return Status::OK(); + } + + + Status LookupOrCreateKey(K key, void** value_ptr) { + Status s = storage_->GetOrCreate(key, value_ptr); TF_CHECK_OK(s); return s; } - void CreateKey(K key, ValuePtr** value_ptr, bool to_dram) { - storage_->Insert(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen()), to_dram); + void CreateKey(K key, void** value_ptr, bool to_dram) { + storage_->CreateAndInsert(key, value_ptr, to_dram); } - void UpdateVersion(ValuePtr* value_ptr, int64 gs) { - update_version_fn_(value_ptr, gs); + void UpdateVersion(void* value_ptr, int64 gs) { + feat_desc_->UpdateVersion(value_ptr, gs); } void BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { TF_CHECK_OK(storage_->BatchCommit(keys, value_ptrs)); } @@ -218,9 +230,9 @@ class EmbeddingVar : public ResourceBase { } int64 GetVersion(K key) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; TF_CHECK_OK(LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetStep(); + return feat_desc_->GetVersion(value_ptr); } int64 GetFreq(K key) { @@ -261,11 +273,11 @@ class EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { V* default_v = default_value + i * value_len_; - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; filter_->LookupOrCreate( keys[i], output + i * value_len_, default_v, &value_ptr, 1, default_value_no_permission_); - add_freq_fn_(value_ptr, 1, emb_config_.filter_freq); + feat_desc_->AddFreq(value_ptr, 1); } }; auto worker_threads = context.worker_threads; @@ -276,7 +288,7 @@ class EmbeddingVar : public ResourceBase { void GetOrCreateKey(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, int64 num_of_keys) { const K* keys = 
(K*)keys_tensor.data(); auto do_work = [this, keys, value_ptrs] (int64 start, int64 limit) { @@ -295,7 +307,7 @@ class EmbeddingVar : public ResourceBase { void GatherEmbeddings(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, V* output, int64 num_of_keys) { const K* keys = (K*)keys_tensor.data(); @@ -303,13 +315,10 @@ class EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); - add_freq_fn_(value_ptrs[i], 1, emb_config_.filter_freq); V* value = nullptr; if (is_admit) { - V* default_v = - default_value_ + - (keys[i] % emb_config_.default_value_dim) * value_len_; - value = LookupOrCreateEmb(value_ptrs[i], default_v); + value = feat_desc_->GetEmbedding( + value_ptrs[i], emb_config_.emb_index); } else { value = default_value_no_permission_; } @@ -341,8 +350,9 @@ class EmbeddingVar : public ResourceBase { void GetOrCreateKey(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, - int64 num_of_keys) { + void** value_ptrs, + int64 num_of_keys, + bool indices_as_pointer = false) { const K* keys = (K*)keys_tensor.data(); filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); storage_->AddToCachePrefetchList(keys_tensor); @@ -351,17 +361,17 @@ class EmbeddingVar : public ResourceBase { void BatchLookupOrCreateKey( const EmbeddingVarContext& context, const K* keys, - ValuePtr** value_ptrs, + void** value_ptrs, int64 num_of_keys, std::vector>& not_found_cursor_list) { storage_->BatchGetOrCreate(context, keys, value_ptrs, num_of_keys, - emb_config_.total_num(storage_->GetAllocLen()), + value_len_, not_found_cursor_list); } void GatherEmbeddings(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, V* output, int64 num_of_keys) { std::vector embedding_ptr(num_of_keys); @@ -370,12 +380,10 @@ class 
EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); - add_freq_fn_(value_ptrs[i], 1, emb_config_.filter_freq); + feat_desc_->AddFreq(value_ptrs[i], 1); if (is_admit) { - V* default_v = - default_value_ + - (keys[i] % emb_config_.default_value_dim) * value_len_; - embedding_ptr[i] = LookupOrCreateEmb(value_ptrs[i], default_v); + embedding_ptr[i] = feat_desc_->GetEmbedding( + value_ptrs[i], emb_config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission_; } @@ -394,72 +402,8 @@ class EmbeddingVar : public ResourceBase { storage_->AddToCache(keys_tensor); } - - void BatchLookupOrCreateEmb( - const EmbeddingVarContext& ctx, - V** var_ptr, - ValuePtr** value_ptrs, - const K* indices, - int64 num_of_keys, - IntraThreadCopyIdAllocator* thread_copy_id_alloc) { - int num_worker_threads = ctx.worker_threads->num_threads; - std::vector> init_cursor_list( - num_worker_threads + 1); - uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); - - auto do_work_get_ptrs = [this, value_ptrs, &init_cursor_list, - &thread_copy_id_alloc, main_thread_id, var_ptr] (int64 start, int64 limit) { - int copy_id = - thread_copy_id_alloc->GetCopyIdOfThread(main_thread_id); - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - var_ptr[i] = LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - if (is_need_set_default_value) { - init_cursor_list[copy_id].emplace_back(i); - } - } - }; - const int64 unit_cost = 1000; - auto worker_threads = ctx.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, - num_of_keys, unit_cost, do_work_get_ptrs); - - // Merge copies of init_cursor_list - for (int i = 1; i < (worker_threads->num_threads + 1); i++) { - if (init_cursor_list[i].size() > 0) { - init_cursor_list[0].splice(init_cursor_list[0].end(), - init_cursor_list[i]); - } - } - - auto stream = 
ctx.compute_stream; - auto event_mgr = ctx.event_mgr; - - SetDefaultValueOfNewFeatures( - indices, num_of_keys, - init_cursor_list[0], - var_ptr, stream, event_mgr, - ctx.gpu_device); - } #endif - void LookupOrCreate(K key, V* val, V* default_v, int count = 1) { - const V* default_value_ptr = - (default_v == nullptr) ? default_value_ : default_v; - ValuePtr* value_ptr = nullptr; - filter_->LookupOrCreate(key, val, default_value_ptr, &value_ptr, count, - default_value_no_permission_); - add_freq_fn_(value_ptr, count, emb_config_.filter_freq); - } - - void BatchInitEmb(int64 size, V** memcpy_address, V* default_value, - bool* init_flags, int64 value_len) { - filter_->BatchInitEmb(size, memcpy_address, default_value, - init_flags, value_len); - } - #if GOOGLE_CUDA void CopyEmbeddingsToBuffer( V* val_base, int64 size, @@ -467,73 +411,18 @@ class EmbeddingVar : public ResourceBase { se::Stream* compute_stream, EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device); - - void SetDefaultValueOfNewFeatures( - const K* keys, int64 size, - const std::list& init_cursor, - V** memcpy_address, - se::Stream* compute_stream, - EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device); - - void CopyEmbeddingsFromCPUToGPU( - const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, - se::Stream* compute_stream, - EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device, - const DeviceBase::CpuWorkerThreads* worker_threads, - int64* output_value_ptrs = nullptr); - - void AllocateMemoryForNewFeatures( - V** memcpy_address, - const std::list& init_cursor) { - std::vector*> value_ptr_list; - for (auto it = init_cursor.cbegin(); - it != init_cursor.cend(); ++it) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_ptr_list.emplace_back(value_ptr); - } - storage_->AllocateMemoryForNewFeatures(value_ptr_list); - } #endif // GOOGLE_CUDA - V* LookupOrCreateEmb(ValuePtr* value_ptr, const V* default_v) { - return value_ptr->GetOrAllocate(alloc_, 
value_len_, default_v, - emb_config_.emb_index, storage_->GetOffset( - emb_config_.emb_index)); - } - - V* LookupOrCreateEmb(ValuePtr* value_ptr, const V* default_v, - Allocator* alloc) { - return value_ptr->GetOrAllocate(alloc, value_len_, default_v, - emb_config_.emb_index, storage_->GetOffset( - emb_config_.emb_index)); - } - - V* LookupOrCreateEmb(ValuePtr* value_ptr, bool &need_initialize) { - return value_ptr->GetOrAllocate(alloc_, value_len_, nullptr, - emb_config_.emb_index, - storage_->GetOffset(emb_config_.emb_index), - need_initialize); - } - - V* LookupPrimaryEmb(ValuePtr* value_ptr) { - V* primary_val = value_ptr->GetValue(emb_config_.primary_emb_index, - storage_->GetOffset(emb_config_.primary_emb_index)); - return primary_val; - } - - typename TTypes::Flat flat(ValuePtr* value_ptr, int64 index) { - V* default_v = - default_value_ + (index % emb_config_.default_value_dim) * value_len_; - V* val = LookupOrCreateEmb(value_ptr, default_v); + typename TTypes::Flat flat(void* value_ptr) { + V* val = feat_desc_->GetEmbedding(value_ptr, emb_config_.emb_index); Eigen::array dims({value_len_}); return typename TTypes::Flat(val, dims); } + V* GetValuePtr(void* ptr) { + return feat_desc_->GetEmbedding(ptr, emb_config_.emb_index); + } + int64 ValueLen() const { return value_len_; } @@ -602,25 +491,26 @@ class EmbeddingVar : public ResourceBase { std::vector* value_list, std::vector* version_list, std::vector* freq_list) { - std::vector*> value_ptr_list; + std::vector value_ptr_list; storage_->GetSnapshot(key_list, &value_ptr_list); bool is_save_freq = emb_config_.is_save_freq(); bool is_save_version = emb_config_.is_save_version(); for (int64 i = 0; i < key_list->size(); i++) { - V* val = value_ptr_list[i]->GetValue(emb_config_.emb_index, 0); - if (val != nullptr) { + if (feat_desc_->IsAdmit(value_ptr_list[i])) { + V* val = feat_desc_->GetEmbedding( + value_ptr_list[i], emb_config_.emb_index); value_list->emplace_back(val); } else { 
value_list->emplace_back(default_value_); } if(is_save_version) { - int64 dump_version = value_ptr_list[i]->GetStep(); + int64 dump_version = feat_desc_->GetVersion(value_ptr_list[i]); version_list->emplace_back(dump_version); } if(is_save_freq) { - int64 dump_freq = value_ptr_list[i]->GetFreq(); + int64 dump_freq = feat_desc_->GetFreq(value_ptr_list[i]); freq_list->emplace_back(dump_freq); } } @@ -634,6 +524,10 @@ class EmbeddingVar : public ResourceBase { return storage_; } + embedding::FeatureDescriptor* feature_descriptor() { + return feat_desc_; + } + Status Shrink(embedding::ShrinkArgs& shrink_args) { if (emb_config_.is_primary()) { shrink_args.value_len = value_len_; @@ -671,10 +565,6 @@ class EmbeddingVar : public ResourceBase { return alloc_; } - int64 GetAllocLen() { - return emb_config_.total_num(storage_->GetAllocLen()); - } - V** GetBuffer(int64 size) { if (dev_addr_buffer_size_ >= size) { return dev_addr_buffer_; @@ -756,16 +646,17 @@ class EmbeddingVar : public ResourceBase { return storage_->HashTable(); } - protected: FilterPolicy>* GetFilter() const { return filter_; } + protected: ~EmbeddingVar() override { // When dynamic dimension embedding is used, // there will be more than one primary slot if (emb_config_.is_primary() && emb_config_.primary_emb_index == 0) { delete storage_; + delete feat_desc_; } if (embedding::StorageType::HBM_DRAM == storage_type_) { alloc_->DeallocateRaw(dev_addr_buffer_); @@ -804,35 +695,6 @@ class EmbeddingVar : public ResourceBase { value_len_ * sizeof(V), do_work); } - V* GetAddressOfGpuValuePtr(ValuePtr* value_ptr, - int64 index, - bool copyback_flag, - std::list& init_cursor, - std::list& copyback_cursor) { - V* mem_addr = nullptr; - bool init_flag = false; - if (!copyback_flag) { - mem_addr = LookupOrCreateEmb(value_ptr, init_flag); - } else { - mem_addr = value_ptr->GetValue(0,0); - if (copyback_flag == - embedding::CopyBackFlag::COPYBACK_AND_DESTROY) { - delete value_ptr; - // If the 64th bit of cursor is set to 
1, - // the corresponding valueptr need to be deleted later. - int64 tmp = 1; - tmp = tmp << 63; - copyback_cursor.emplace_back(index | tmp); - } else { - copyback_cursor.emplace_back(index); - } - } - if (init_flag) { - init_cursor.emplace_back(index); - } - return mem_addr; - } - std::string name_; bool is_initialized_ = false; @@ -849,8 +711,7 @@ class EmbeddingVar : public ResourceBase { embedding::StorageType storage_type_; EmbeddingConfig emb_config_; FilterPolicy>* filter_; - std::function*, int64, int64)> add_freq_fn_; - std::function*, int64)> update_version_fn_; + embedding::FeatureDescriptor* feat_desc_; TF_DISALLOW_COPY_AND_ASSIGN(EmbeddingVar); }; diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc index c1b43a608b5..7dddf714b6b 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc @@ -21,42 +21,38 @@ namespace tensorflow { namespace embedding { template void EmbeddingVarCkptData::Emplace( - K key, ValuePtr* value_ptr, + K key, void* value_ptr, const EmbeddingConfig& emb_config, - V* default_value, int64 value_offset, + V* default_value, + FeatureDescriptor* feat_desc, bool is_save_freq, bool is_save_version, bool save_unfiltered_features) { if((int64)value_ptr == ValuePtrStatus::IS_DELETED) return; - V* primary_val = value_ptr->GetValue(0, 0); - bool is_not_admit = - primary_val == nullptr - && emb_config.filter_freq != 0; + bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); + bool is_admit = feat_desc->IsAdmit(value_ptr); - if (!is_not_admit) { + if (is_admit) { key_vec_.emplace_back(key); - if (primary_val == nullptr) { + if (!is_in_dram) { + value_ptr_vec_.emplace_back((V*)ValuePtrStatus::NOT_IN_DRAM); + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + } else if (feat_desc->GetEmbedding(value_ptr, 0) == nullptr) { 
value_ptr_vec_.emplace_back(default_value); - } else if ( - (int64)primary_val == ValuePosition::NOT_IN_DRAM) { - value_ptr_vec_.emplace_back((V*)ValuePosition::NOT_IN_DRAM); } else { - V* val = value_ptr->GetValue(emb_config.emb_index, - value_offset); + V* val = feat_desc->GetEmbedding(value_ptr, emb_config.emb_index); value_ptr_vec_.emplace_back(val); } - - if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); + int64 dump_version = feat_desc->GetVersion(value_ptr); version_vec_.emplace_back(dump_version); } if(is_save_freq) { - int64 dump_freq = value_ptr->GetFreq(); + int64 dump_freq = feat_desc->GetFreq(value_ptr); freq_vec_.emplace_back(dump_freq); } } else { @@ -66,18 +62,18 @@ void EmbeddingVarCkptData::Emplace( key_filter_vec_.emplace_back(key); if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); + int64 dump_version = feat_desc->GetVersion(value_ptr); version_filter_vec_.emplace_back(dump_version); } - int64 dump_freq = value_ptr->GetFreq(); + int64 dump_freq = feat_desc->GetFreq(value_ptr); freq_filter_vec_.emplace_back(dump_freq); } } #define REGISTER_KERNELS(ktype, vtype) \ template void EmbeddingVarCkptData::Emplace( \ - ktype, ValuePtr*, const EmbeddingConfig&, \ - vtype*, int64, bool, bool, bool); + ktype, void*, const EmbeddingConfig&, \ + vtype*, FeatureDescriptor*, bool, bool, bool); #define REGISTER_KERNELS_ALL_INDEX(type) \ REGISTER_KERNELS(int32, type) \ REGISTER_KERNELS(int64, type) diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h index 6d7b09e70b0..10bf0d0e43b 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h @@ -19,15 +19,19 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" namespace tensorflow { class BundleWriter; +namespace { + const int kSavedPartitionNum = 1000; + const int kDramFlagOffset = 49; +} namespace embedding { - template class EmbeddingVarCkptData { public: - void Emplace(K key, ValuePtr* value_ptr, + void Emplace(K key, void* value_ptr, const EmbeddingConfig& emb_config, - V* default_value, int64 value_offset, + V* default_value, + FeatureDescriptor* feat_desc, bool is_save_freq, bool is_save_version, bool save_unfiltered_features); diff --git a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h index 84c823a90dc..4c052b43c7e 100644 --- a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h +++ b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h @@ -57,7 +57,7 @@ class EV2dVectorDataDumpIterator: public DumpIterator { value_len_(value_len), col_idx_(0) { if (!valueptr_list.empty()) { - if ((int64)*curr_iter_ == ValuePosition::NOT_IN_DRAM) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { curr_ptr_ = val_iter_->Next(); } else { curr_ptr_ = *curr_iter_; @@ -75,7 +75,7 @@ class EV2dVectorDataDumpIterator: public DumpIterator { curr_iter_++; col_idx_ = 0; if (curr_iter_ != end_iter_) { - if ((int64)*curr_iter_ == ValuePosition::NOT_IN_DRAM) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { curr_ptr_ = val_iter_->Next(); } else { curr_ptr_ = *curr_iter_; diff --git a/tensorflow/core/framework/embedding/feature_descriptor.h b/tensorflow/core/framework/embedding/feature_descriptor.h new file mode 100644 index 00000000000..8808da353f4 --- /dev/null +++ b/tensorflow/core/framework/embedding/feature_descriptor.h @@ -0,0 +1,200 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/framework/embedding/config.pb.h" +#include "tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h" +#include "tensorflow/core/framework/embedding/normal_feature_descriptor.h" +#include + +namespace tensorflow { +namespace embedding { + +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl; + +template +class FeatureDescriptor { + public: + FeatureDescriptor( + int64 block_num, + int64 slot_num, + Allocator* alloc, + StorageType storage_type, + bool need_record_freq, + bool need_record_version, + const std::pair& filter_info) { + if (block_num > 1) { + feat_desc_impl_.reset( + new DynmaicDimDescriptorImpl( + alloc, block_num * slot_num)); + } else if (filter_info.first) { + feat_desc_impl_.reset( + new CounterFilterDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version, + filter_info.second, + storage_type)); + } else if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { + 
feat_desc_impl_.reset( + new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } + } + + FeatureDescriptor(FeatureDescriptor* feat_desc) { + if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(CounterFilterDescriptorImpl*)) { + feat_desc_impl_.reset( + new CounterFilterDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + else if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl)) { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + } + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + return feat_desc_impl_->InitSlotInfo( + emb_index, embedding_dim, default_value); + } + + bool InitSlotInfo(FeatureDescriptor* feat_desc) { + return feat_desc_impl_->InitSlotInfo(feat_desc->feat_desc_impl_.get()); + } + + V* GetEmbedding(void *val, int emb_index) { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + void* Allocate() { + return feat_desc_impl_->Allocate(); + } + + void* Allocate(int64 freq) { + return feat_desc_impl_->Allocate(freq); + } + + void Deallocate(void* val) { + feat_desc_impl_->Deallocate(val); + } + + void Deallocate(const std::vector& value_ptrs) { + feat_desc_impl_->Deallocate(value_ptrs); + } + + void SetDefaultValue(void* val, int64 index) { + feat_desc_impl_->SetDefaultValue(val, index); + } + + void SetValue(void* val, int64 emb_index, V* value) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* 
compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + reinterpret_cast*>(feat_desc_impl_.get())->SetDefaultValues( + keys, init_cursor, value_ptrs, + compute_stream, event_mgr, gpu_device); + } +#endif + + void SetAllocator(Allocator* alloc) { + feat_desc_impl_->SetAllocator(alloc); + } + + int data_bytes() { + return feat_desc_impl_->data_bytes(); + } + + int64 GetFreq(void* val) { + return feat_desc_impl_->GetFreq(val); + } + + int64 GetVersion(void* val) { + return feat_desc_impl_->GetVersion(val); + } + + void SetFreq(void* val, int64 freq) { + feat_desc_impl_->SetFreq(val, freq); + } + + void UpdateVersion(void* val, int64 version) { + feat_desc_impl_->UpdateVersion(val, version); + } + + void AddFreq(void* val, int64 freq) { + feat_desc_impl_->AddFreq(val, freq); + } + + int total_dim() { + return feat_desc_impl_->total_dim(); + } + + bool IsAdmit(void* val) { + return feat_desc_impl_->IsAdmit(val); + } + + void* Admit(void* val) { + return feat_desc_impl_->Admit(val); + } + + + protected: + std::unique_ptr> feat_desc_impl_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/feature_descriptor_impl.h b/tensorflow/core/framework/embedding/feature_descriptor_impl.h new file mode 100644 index 00000000000..6996d22f447 --- /dev/null +++ b/tensorflow/core/framework/embedding/feature_descriptor_impl.h @@ -0,0 +1,317 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#include "tensorflow/core/util/env_var.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { +namespace embedding { +struct SlotInfo { + int embedding_dim; + int embedding_offset; + void* default_value; + int64 default_value_dim; + int default_value_len; +}; + +class BaseFreqDescriptor { + public: + virtual int64 GetFreq(void* value_ptr) = 0; + virtual void AddFreq(void* value_ptr, int64 freq) {} + virtual void SetFreq(void* value_ptr, int64 freq) {} + virtual BaseFreqDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class FreqDescriptor: public BaseFreqDescriptor { + public: + explicit FreqDescriptor(int offset_byte) + : offset_byte_(offset_byte) {} + + int64 GetFreq(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void AddFreq(void* value_ptr, int64 freq) override { + __sync_fetch_and_add((int64*)(value_ptr + offset_byte_), freq); + } + + void SetFreq(void* value_ptr, int64 freq) override { + *(int64*)(value_ptr + offset_byte_) = freq; + } + + BaseFreqDescriptor* Clone() override { + return new FreqDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonFreqDescriptor: public BaseFreqDescriptor { + public: + int64 GetFreq(void* value_ptr) override { + LOG(FATAL)<<"Can not get freq from NonFreqCounter."; + } + + BaseFreqDescriptor* Clone() override { + return new NonFreqDescriptor(); + } +}; + +class BaseVersionDescriptor { + public: + virtual int64 
GetVersion(void* value_ptr) = 0; + virtual void UpdateVersion(void* value_ptr, int64 version) {} + virtual BaseVersionDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class VersionDescriptor: public BaseVersionDescriptor { + public: + explicit VersionDescriptor(int offset_byte) + : offset_byte_(offset_byte) {} + + int64 GetVersion(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void UpdateVersion(void* value_ptr, int64 version) override { + *(int64*)(value_ptr + offset_byte_) = version; + } + + BaseVersionDescriptor* Clone() override { + return new VersionDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonVersionDescriptor: public BaseVersionDescriptor { + public: + int64 GetVersion(void* value_ptr) override { + LOG(FATAL)<<"Can not get version from NonFreqCounter."; + } + + BaseVersionDescriptor* Clone() override { + return new NonVersionDescriptor(); + } +}; + +template +class FeatureDescriptorImpl { + public: + FeatureDescriptorImpl(int64 slot_num, + bool need_record_freq, + bool need_record_version) { + slot_infos_.resize(slot_num); + for (int i = 0; i < slot_infos_.size(); i++) { + slot_infos_[i].embedding_offset = EMPTY_OFFSET_VALUE; + } + + if (!need_record_freq) { + freq_desc_.reset(new NonFreqDescriptor()); + } + if (!need_record_version) { + version_desc_.reset(new NonVersionDescriptor()); + } + } + + FeatureDescriptorImpl(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + freq_desc_.reset( + feat_desc_impl->freq_desc_->Clone()); + version_desc_.reset( + feat_desc_impl->version_desc_->Clone()); + } + + virtual ~FeatureDescriptorImpl() {} + + virtual bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) = 0; + virtual bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + 
LOG(FATAL)<<"InitSlotInfo(feat_desc_impl) is not implemented."; + } + virtual V* GetEmbedding(void* val, int emb_index) = 0; + virtual void* Allocate() = 0; + virtual void* Allocate(int64 freq) {return Allocate();} + virtual void Deallocate(void* val) = 0; + virtual void Deallocate(const std::vector& val) = 0; + virtual void SetAllocator(Allocator* alloc) = 0; + virtual void SetDefaultValue(void* val, int64 key) = 0; + virtual void SetValue(void* val, int64 emb_index, V* value) {} + virtual bool IsAdmit(void* val) {return true;} + virtual void* Admit(void* val) {} +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + virtual int data_bytes() = 0; + + virtual int64 GetFreq(void* val) { + return freq_desc_->GetFreq(val); + } + + virtual int64 GetVersion(void* val) { + return version_desc_->GetVersion(val); + } + + virtual void SetFreq(void* val, int64 freq) { + freq_desc_->SetFreq(val, freq); + } + + virtual void UpdateVersion(void* val, int64 version) { + version_desc_->UpdateVersion(val, version); + } + + virtual void AddFreq(void* val, int64 freq) { + freq_desc_->AddFreq(val, freq); + } + + inline int total_dim() { + int64 slot_num = slot_infos_.size(); + return slot_infos_[slot_num - 1].embedding_offset + + slot_infos_[slot_num - 1].embedding_dim; + } + + protected: + bool SetEmbeddingInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + slot_infos_[emb_index].default_value = default_value.first; + slot_infos_[emb_index].default_value_dim = default_value.second; + slot_infos_[emb_index].default_value_len = embedding_dim; + + bool is_aligned = true; + TF_CHECK_OK(ReadBoolFromEnvVar("EV_DATA_ALIGNED", true, + &is_aligned)); + if (is_aligned) { + embedding_dim = ComputeAlignedDim(embedding_dim); + } + + //Avoid parallel consitency issue + __sync_bool_compare_and_swap( + 
&slot_infos_[emb_index].embedding_offset, + EMPTY_OFFSET_VALUE, embedding_dim); + slot_infos_[emb_index].embedding_dim = embedding_dim; + //Check whether all offsets are set + for (int i = 0; i < slot_infos_.size(); i++) { + if (slot_infos_[i].embedding_offset == EMPTY_OFFSET_VALUE) { + return false; + } + } + + ComputeEmbeddingOffsets(); + return true; + } + + void SetSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + } + + void ComputeAllocBytes(int* alloc_bytes) { + for(auto slot_info: slot_infos_) { + *alloc_bytes += slot_info.embedding_dim * sizeof(V); + } + } + + void CreateFreqAndVersionDescriptor(int* alloc_bytes) { + if (!freq_desc_) { + freq_desc_.reset(new FreqDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + if (!version_desc_) { + version_desc_.reset(new VersionDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + } + + void InitFreqAndVersion(void* val) { + freq_desc_->SetFreq(val, 0); + version_desc_->UpdateVersion(val, -1); + } + + void SetFreqAndVersionOffset(int* alloc_bytes) { + freq_desc_->SetOffset(alloc_bytes); + version_desc_->SetOffset(alloc_bytes); + } + + V* GetDefaultValuePtr(int64 emb_index, int64 key) { + V* default_value_base = (V*)slot_infos_[emb_index].default_value; + int64 default_value_offset = + (key % slot_infos_[emb_index].default_value_dim) * + slot_infos_[emb_index].default_value_len; + return default_value_base + default_value_offset; + } + + void SetDefaultValue(void* val, int64 emb_index, int64 key) { + memcpy(val, + GetDefaultValuePtr(emb_index, key), + slot_infos_[emb_index].default_value_len * sizeof(V)); + } + + private: + int64 ComputeAlignedDim(int64 embedding_dim) { + int padding_bytes = + ALIGN_BYTES - embedding_dim * sizeof(V) % ALIGN_BYTES; + if (padding_bytes == ALIGN_BYTES) { + return embedding_dim; + } else { + return embedding_dim + padding_bytes / sizeof(V); + } + } + + void ComputeEmbeddingOffsets() { + for (int i = slot_infos_.size() 
- 1 ; i >= 0; i--) { + slot_infos_[i].embedding_offset = 0; + for (int j = 0; j < i; j++) { + slot_infos_[i].embedding_offset += slot_infos_[j].embedding_offset; + } + } + } + + protected: + const int EMPTY_OFFSET_VALUE= -1; + const int ALIGN_BYTES = 16; + std::vector slot_infos_; + std::unique_ptr freq_desc_; + std::unique_ptr version_desc_; +}; + +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ diff --git a/tensorflow/core/framework/embedding/filter_factory.h b/tensorflow/core/framework/embedding/filter_factory.h index 5bb92467a51..0127e2c882a 100644 --- a/tensorflow/core/framework/embedding/filter_factory.h +++ b/tensorflow/core/framework/embedding/filter_factory.h @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/filter_policy.h" #include "tensorflow/core/framework/embedding/nullable_filter_policy.h" - namespace tensorflow { namespace embedding{ template @@ -34,22 +33,23 @@ class FilterFactory { template static FilterPolicy* CreateFilter( const EmbeddingConfig& config, EV* ev, - embedding::Storage* storage) { + embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) { if (config.filter_freq > 0) { if (config.kHashFunc != 0) { return new BloomFilterPolicy( - config, ev); + config, ev, feat_desc); } else { return new CounterFilterPolicy( - config, ev); + config, ev, feat_desc); } } else { return new NullableFilterPolicy( - config, ev, storage); + config, ev, storage, feat_desc); } } }; -} // tensorflow +} //namespace tensorflow #endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_FACTORY_H_ diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 559a6796246..256d3b044d4 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/emb_file.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" namespace tensorflow { @@ -45,9 +46,6 @@ struct RestoreBuffer { template class RestoreSSDBuffer; -template -class ValuePtr; - template class FilterPolicy { public: @@ -55,7 +53,7 @@ class FilterPolicy { config_(config), ev_(ev) {} virtual void LookupOrCreate(K key, V* val, - const V* default_value_ptr, ValuePtr** value_ptr, + const V* default_value_ptr, void** value_ptr, int count, const V* default_value_no_permission) = 0; virtual Status Lookup(K key, V* val, const V* default_value_ptr, @@ -70,53 +68,25 @@ class FilterPolicy { virtual void BatchLookupOrCreateKey( const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) = 0; #endif //GOOGLE_CUDA - virtual Status LookupOrCreateKey(K key, ValuePtr** val, + virtual Status LookupOrCreateKey(K key, void** val, bool* is_filter, int64 count) = 0; + + virtual Status LookupKey(K key, void** val, + bool* is_filter, int64 count) {} - virtual int64 GetFreq(K key, ValuePtr* value_ptr) = 0; - + virtual int64 GetFreq(K key, void* value_ptr) = 0; virtual int64 GetFreq(K key) = 0; - virtual bool is_admit(K key, ValuePtr* value_ptr) = 0; + virtual bool is_admit(K key, void* value_ptr) = 0; virtual Status Restore(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool to_dram, bool is_incr, RestoreBuffer& restore_buff) = 0; - protected: - void LookupOrCreateEmbInternal(bool is_filter, bool to_dram, - int i, int value_len, - ValuePtr* value_ptr, - V* value_src, K* key_src) { - - if (!is_filter) { - ev_->LookupOrCreateEmb(value_ptr, value_src + i * ev_->ValueLen()); - return; - } else { - if (to_dram) { -#if GOOGLE_CUDA - std::vector default_value_host; - default_value_host.resize(config_.default_value_dim * 
value_len); - cudaMemcpy(default_value_host.data(), ev_->GetDefaultValuePtr(), - sizeof(V) * config_.default_value_dim * value_len, - cudaMemcpyDeviceToHost); - ev_->LookupOrCreateEmb(value_ptr, - default_value_host.data() + - (key_src[i] % config_.default_value_dim) - * ev_->ValueLen()); -#endif - return; - } else { - ev_->LookupOrCreateEmb(value_ptr, ev_->GetDefaultValue(key_src[i])); - return; - } - } - } - protected: EmbeddingConfig config_; EV* ev_; diff --git a/tensorflow/core/framework/embedding/globalstep_shrink_policy.h b/tensorflow/core/framework/embedding/globalstep_shrink_policy.h index a2af6a2430a..b0950eff22d 100644 --- a/tensorflow/core/framework/embedding/globalstep_shrink_policy.h +++ b/tensorflow/core/framework/embedding/globalstep_shrink_policy.h @@ -18,25 +18,21 @@ limitations under the License. #include "tensorflow/core/framework/embedding/shrink_policy.h" namespace tensorflow { - -template -class ValuePtr; - namespace embedding { template class GlobalStepShrinkPolicy : public ShrinkPolicy { public: GlobalStepShrinkPolicy(int64 steps_to_live, - Allocator* alloc, + FeatureDescriptor* feat_desc, KVInterface* kv) : steps_to_live_(steps_to_live), kv_(kv), - ShrinkPolicy(alloc) {} + ShrinkPolicy(feat_desc) {} TF_DISALLOW_COPY_AND_ASSIGN(GlobalStepShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override { ShrinkPolicy::ReleaseValuePtrs(); FilterToDelete(shrink_args.global_step, @@ -46,16 +42,16 @@ class GlobalStepShrinkPolicy : public ShrinkPolicy { private: void FilterToDelete(int64 global_step, std::vector& key_list, - std::vector*>& value_list) { + std::vector& value_list) { for (int64 i = 0; i < key_list.size(); ++i) { - int64 version = value_list[i]->GetStep(); + int64 version = ShrinkPolicy::feat_desc_->GetVersion(value_list[i]); if (version == -1) { - value_list[i]->SetStep(global_step); + ShrinkPolicy::feat_desc_->UpdateVersion(value_list[i], 
global_step); } else { if (global_step - version > steps_to_live_) { kv_->Remove(key_list[i]); ShrinkPolicy::EmplacePointer(value_list[i]); - value_list[i] = (ValuePtr*)ValuePtrStatus::IS_DELETED; + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; } } } diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h index 1dd90d63a6e..fc4a2506313 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -204,29 +204,29 @@ class GPUHashMapKV : public KVInterface { } Status BatchLookupOrCreate(const K* keys, size_t n, - ValuePtr** value_ptrs) override { + void** value_ptrs) override { return Status::OK(); } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { return Status::OK(); } Status Contains(K key) override { return Status::OK(); } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { return Status::OK(); } Status Remove(K key) override { return Status::OK(); } Status BatchLookup(const K* keys, size_t size, - ValuePtr** value_ptrs) override { + void** value_ptrs) override { return Status::OK(); } Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return Status::OK(); } @@ -235,22 +235,20 @@ class GPUHashMapKV : public KVInterface { } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return Status::OK(); } int64 Size() const override { return 0; } - void SetTotalDims(int total_dims) override {} + void FreeValuePtr(void* value_ptr) override {} - void FreeValuePtr(ValuePtr* value_ptr) override {} - - Status Commit(K key, const ValuePtr* value_ptr) override { + Status Commit(K key, const void* value_ptr) override { return Status::OK(); } Status 
GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { return Status::OK(); } diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index 581f1f1cfaf..1056f4bbd78 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -3,7 +3,6 @@ #if GOOGLE_CUDA #define EIGEN_USE_GPU -#include "tensorflow/core/framework/embedding/lockless_hash_map_cpu.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" @@ -14,9 +13,6 @@ namespace tensorflow { using se::DeviceMemoryBase; using se::Stream; -template -class ValuePtr; - template class CheckpointLoader; @@ -26,15 +22,17 @@ namespace embedding { template class HbmDramSsdStorage : public MultiTierStorage { public: - HbmDramSsdStorage(const StorageConfig& sc, Allocator* gpu_alloc, - Allocator* cpu_alloc, LayoutCreator* lc, const std::string& name) - : cpu_alloc_(cpu_alloc), gpu_alloc_(gpu_alloc), + HbmDramSsdStorage(const StorageConfig& sc, + Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name), dram_capacity_(-1) { - hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc_, lc); - dram_ = new DramStorage(sc, cpu_alloc_, lc, - new LocklessHashMapCPU(gpu_alloc_)); - ssd_ = new SsdHashStorage(sc, cpu_alloc_, lc); + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); + ssd_ = new SsdHashStorage(sc, dram_feat_desc_); } ~HbmDramSsdStorage() override { @@ -46,29 +44,20 @@ class HbmDramSsdStorage : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(HbmDramSsdStorage); - void 
SetAllocLen(int64 value_len, int slot_num) override { - while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. - Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + ssd_->Init(); - MultiTierStorage::cache_capacity_ = - Storage::storage_config_.size[0] - / (Storage::total_dims_ * sizeof(V)); + MultiTierStorage::cache_capacity_ = + Storage::storage_config_.size[0] + / (total_dim() * sizeof(V)); - dram_capacity_ = Storage::storage_config_.size[1] - / (Storage::total_dims_ * sizeof(V)); - MultiTierStorage::ready_eviction_ = true; - } - Storage::flag_.clear(std::memory_order_release); + dram_capacity_ = Storage::storage_config_.size[1] + / (total_dim() * sizeof(V)); + MultiTierStorage::ready_eviction_ = true; } - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = hbm_->Get(key, value_ptr); if (s.ok()) { return s; @@ -88,13 +77,12 @@ class HbmDramSsdStorage : public MultiTierStorage { void BatchGet(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) override { + void** value_ptr_list, + int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); - std::vector*>> + std::vector> ssd_value_ptr_list(num_worker_threads + 1); BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, @@ -102,20 +90,20 @@ class HbmDramSsdStorage : public MultiTierStorage { CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursor_list[0], - ssd_value_ptr_list[0], value_len); + ssd_value_ptr_list[0]); } void 
BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_fountd_cursor_list) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); - std::vector*>> + std::vector> ssd_value_ptr_list(num_worker_threads + 1); BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, @@ -124,70 +112,27 @@ class HbmDramSsdStorage : public MultiTierStorage { CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursor_list[0], - ssd_value_ptr_list[0], value_len); + ssd_value_ptr_list[0]); CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], value_len); } - void Insert(K key, ValuePtr* value_ptr) override { + void Insert(K key, void** value_ptr) override { hbm_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { if (to_dram) { - dram_->Insert(key, value_ptr, alloc_len); + dram_->Insert(key, value_ptr); } else { - hbm_->Insert(key, value_ptr, alloc_len); + hbm_->Insert(key, value_ptr); } } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(size); - { - mutex_lock l(memory_pool_mu_); - gpu_value_ptr->SetPtr(embedding_mem_pool_->Allocate()); - *value_ptr = gpu_value_ptr; - } - s = hbm_->TryInsert(key, *value_ptr); - // Insert Failed - if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate((*value_ptr)->GetValue(0, 0)); - } - delete *value_ptr; - return hbm_->Get(key, value_ptr); - } else { - return s; - } - } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - 
Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - s = dram_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK; - return s; - } - s = ssd_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK_AND_DESTROY; - return s; - } - hbm_->Insert(key, value_ptr, size); - return Status::OK(); + Status GetOrCreate(K key, void** value_ptr) override { + LOG(FATAL)<<"Stroage with HBM only suppotrs batch APIs."; } void InitCache(embedding::CacheStrategy cache_strategy) override { @@ -195,66 +140,6 @@ class HbmDramSsdStorage : public MultiTierStorage { dram_cache_ = new LRUCache(); } - void CopyEmbeddingsFromCPUToGPU( - int total, const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, V* memcpy_buffer_gpu, - se::Stream* compute_stream, - EventMgr* event_mgr, - const DeviceBase::CpuWorkerThreads* worker_threads) override { - auto memcpy_buffer_cpu = TypedAllocator::Allocate(cpu_allocator(), - total * value_len, AllocationAttributes()); - int64* memory_index = new int64[total]; - int64 i = 0; - auto it = copyback_cursor.cbegin(); - { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursor.cend(); ++it, ++i) { - int64 j = *it & 0x0fffffffffffffff; - memory_index[i] = *it; - ValuePtr* gpu_value_ptr = - hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)memcpy_address[j] - sizeof(FixedLengthHeader), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - } - } - - auto do_work = [memory_index, memcpy_address, - memcpy_buffer_cpu, gpu_value_ptrs, - value_len, this] (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - int64 j = memory_index[i] & 0x0fffffffffffffff; - bool destroy_flag = (memory_index[i] >> 63) & 0x1; - 
memcpy(memcpy_buffer_cpu + i * value_len, - memcpy_address[j], value_len * sizeof(V)); - if (destroy_flag) { - ssd_->DestroyValuePtr(reinterpret_cast*>( - (char *)memcpy_address[j] - sizeof(FixedLengthHeader))); - } - } - }; - Shard(worker_threads->num_threads, worker_threads->workers, total, - 1000, do_work); - - DeviceMemoryBase gpu_dst_ptr( - memcpy_buffer_gpu, total * value_len * sizeof(V)); - compute_stream->ThenMemcpy( - &gpu_dst_ptr, memcpy_buffer_cpu, total * value_len * sizeof(V)); - SyncWithEventMgr(compute_stream, event_mgr); - TypedAllocator::Deallocate( - cpu_allocator(), memcpy_buffer_cpu, total * value_len); - delete[] memory_index; - } - Status Remove(K key) override { hbm_->Remove(key); dram_->Remove(key); @@ -311,25 +196,23 @@ class HbmDramSsdStorage : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_dram_key_list; - std::vector*> value_ptr_list, tmp_dram_value_list; + std::vector value_ptr_list, tmp_dram_value_list; TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); HbmValueIterator hbm_value_iter( key_list, value_ptr_list, - emb_config.emb_index, Storage::alloc_len_, - gpu_alloc_); + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); - std::vector*> tmp_hbm_value_ptrs(value_ptr_list.size()); for (int64 i = 0; i < value_ptr_list.size(); i++) { - ValuePtr* value_ptr = hbm_->CreateValuePtr(value_len); - memcpy((char *)value_ptr->GetPtr(), - (char *)value_ptr_list[i]->GetPtr(), - sizeof(FixedLengthHeader)); - value_ptr->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - value_ptr->SetInitialized(emb_config.primary_emb_index); - tmp_hbm_value_ptrs[i] = value_ptr; - value_ptr_list[i] = value_ptr; + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq( + value_ptr, hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + 
value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); } TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, @@ -347,17 +230,24 @@ class HbmDramSsdStorage : public MultiTierStorage { { mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, &hbm_value_iter))); } - for (auto it: tmp_hbm_value_ptrs) { - delete it; + for (auto value_ptr: value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); + } } ssd_->Save(tensor_name, prefix, writer, emb_config, @@ -368,7 +258,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status DramToSsdBatchCommit(std::shared_ptr> keys) { MultiTierStorage::ReleaseValuePtrs(dram_value_ptr_out_of_date_, - dram_->alloc_); + dram_feat_desc_); mutex_lock l(*(ssd_->get_mutex())); mutex_lock l1(*(dram_->get_mutex())); @@ -380,7 +270,7 @@ class HbmDramSsdStorage : public MultiTierStorage { k_size = std::min(k_size, DramEvictionSize); K dram_evic_ids[DramEvictionSize]; size_t true_size = dram_cache_->get_evic_ids(dram_evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < true_size; ++i) { if (dram_->Get(dram_evic_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_->Commit(dram_evic_ids[i], value_ptr)); @@ -408,22 +298,31 @@ class HbmDramSsdStorage : public MultiTierStorage { k_size = std::min(k_size, EvictionSize); size_t true_size = MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; std::shared_ptr> keys(new std::vector()); - std::vector*> value_ptrs; + std::vector hbm_value_ptrs; + std::vector dram_value_ptrs; for (int64 i = 0; i < 
true_size; ++i) { if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { keys->emplace_back(evic_ids[i]); - value_ptrs.emplace_back(value_ptr); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); } } - dram_->BatchCommit(*keys, value_ptrs); - { - //Mutex with main thread - mutex_lock l_mem(memory_pool_mu_); - embedding_mem_pool_->Deallocate(value_ptrs); - } + + CopyEmbeddingFromHbmToDram( + hbm_value_ptrs, + dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(*keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); for (auto it : *keys) { TF_CHECK_OK(hbm_->Remove(it)); } @@ -435,58 +334,14 @@ class HbmDramSsdStorage : public MultiTierStorage { } } - void CreateEmbeddingMemoryPool( - Allocator* alloc, - int64 value_len, - int64 block_size) override { - embedding_mem_pool_ = - new EmbeddingMemoryPool(alloc, value_len, block_size); - } - - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for (auto it : value_ptr_list) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = it->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - //Mutex with other ImportOps - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < num_of_value_ptrs; i++) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = value_ptr_list[i]->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, 
new_value_ptr, old_value_ptr); } protected: - void SetTotalDims(int64 total_dims) override { - dram_->SetTotalDims(total_dims); - ssd_->SetTotalDims(total_dims); - } - - void CopyToGpuValuePtr( - ValuePtr* gpu_ptr, - ValuePtr* cpu_ptr, - int64 size) { - V* cpu_data_address = cpu_ptr->GetValue(0, 0); - V* gpu_data_address = gpu_ptr->GetValue(0, 0); - cudaMemcpy(gpu_data_address, cpu_data_address, - size * sizeof(V), cudaMemcpyHostToDevice); - memcpy(gpu_ptr->GetPtr(), - cpu_ptr->GetPtr(), - sizeof(FixedLengthHeader)); + int total_dim() override { + return hbm_feat_desc_->total_dim(); } void Restore(const std::string& name_string, @@ -539,6 +394,10 @@ class HbmDramSsdStorage : public MultiTierStorage { (int64*)restore_buff.freq_buffer); return s; } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override {} private: void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { V* memcpy_buffer_cpu = new V[size * value_len]; @@ -551,46 +410,30 @@ class HbmDramSsdStorage : public MultiTierStorage { (V*)gpu_alloc_->AllocateRaw( Allocator::kAllocatorAlignment, size * sizeof(V*)); - ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; - ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; - { - //Mutex with other Import Ops - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < size; i++) { - dram_->Get(ids[i], &cpu_value_ptrs[i]); - gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - gpu_value_ptrs[i]->SetPtr(val_ptr); - memcpy((char *)gpu_value_ptrs[i]->GetPtr(), - (char *)cpu_value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], 
&gpu_value_ptrs[i]); } } //Split from above for loop for minize the cost of mutex lock //TODO: Speed up with intra parallelism - std::vector*> invalid_value_ptrs; + for (int64 i = 0; i < size; i++) { memcpy(memcpy_buffer_cpu + i * value_len, - cpu_value_ptrs[i]->GetValue(emb_index, - Storage::GetOffset(emb_index)), - value_len * sizeof(V)); - Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); - if (!s.ok()) { - invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); - hbm_->Get(ids[i], &gpu_value_ptrs[i]); - } - gpu_value_ptrs[i]->SetInitialized(emb_index); - value_address[i] = gpu_value_ptrs[i]->GetValue( - emb_index, Storage::GetOffset(emb_index)); + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); } cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, size * value_len * sizeof(V), cudaMemcpyHostToDevice); cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), cudaMemcpyHostToDevice); - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate(invalid_value_ptrs); - } int block_dim = 128; void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, (void*)&value_len, (void*)&size}; @@ -611,10 +454,10 @@ class HbmDramSsdStorage : public MultiTierStorage { void BatchGetValuePtrs( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, std::vector>& copyback_cursor_list, - std::vector*>>& ssd_value_ptr_list, + std::vector>& ssd_value_ptr_list, std::vector>* not_found_cursor_list = nullptr) { int num_worker_threads = ctx.worker_threads->num_threads; IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); @@ -688,39 +531,32 @@ class HbmDramSsdStorage : public MultiTierStorage { void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursors, - 
std::list*>& ssd_value_ptrs, - int64 value_len) { + std::list& ssd_value_ptrs) { int64 total = copyback_cursors.size(); - std::vector*> gpu_value_ptrs(total); + std::vector gpu_value_ptrs(total); std::vector copyback_keys(total); std::vector memory_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = copyback_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursors.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)value_ptr_list[j]->GetPtr(), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - copyback_keys[i] = keys[*it]; - } + int64 i = 0; + auto it = copyback_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion(gpu_value_ptr, + dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; } MultiTierStorage::CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursors, - memory_index, gpu_value_ptrs, value_len); + memory_index, gpu_value_ptrs, hbm_feat_desc_->total_dim(), + hbm_feat_desc_, dram_feat_desc_); //Insert copyback ids to hbm hash table. 
auto do_insert = [this, copyback_keys, gpu_value_ptrs, @@ -730,12 +566,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status s = hbm_->TryInsert( copyback_keys[i], gpu_value_ptrs[i]); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - gpu_value_ptrs[i]->GetValue(0, 0)); - } - delete gpu_value_ptrs[i]; + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); } } @@ -752,34 +583,31 @@ class HbmDramSsdStorage : public MultiTierStorage { void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& not_found_cursors, int64 value_len) { int64 total = not_found_cursors.size(); if (total > 0) { - std::vector*>> insert_pairs(total); + std::vector> insert_pairs(total); std::vector cursor_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = not_found_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != not_found_cursors.cend(); ++it, ++i) { - int64 j = *it; - cursor_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - value_ptr_list[j] = gpu_value_ptr; - insert_pairs[i].first = keys[j]; - insert_pairs[i].second = value_ptr_list[j]; - } + + int64 i = 0; + auto it = not_found_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; } + hbm_feat_desc_->SetDefaultValues( + keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, ctx.gpu_device); + //Insert copyback ids to hbm hash table. 
auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index] (int64 start, int64 limit) { @@ -787,12 +615,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status s = hbm_->TryInsert( insert_pairs[i].first, insert_pairs[i].second); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - insert_pairs[i].second->GetValue(0, 0)); - } - delete insert_pairs[i].second; + hbm_->DestroyValuePtr(insert_pairs[i].second); hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); } } @@ -804,29 +627,28 @@ class HbmDramSsdStorage : public MultiTierStorage { } void AddCopyBackFlagToValuePtr( - ValuePtr** value_ptr, CopyBackFlag copyback_flag) { + void** value_ptr, CopyBackFlag copyback_flag) { int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; tmp = ((int64)*value_ptr) | tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } - void RemoveCopyBackFlagInValuePtr(ValuePtr** value_ptr) { + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; tmp = ((int64)*value_ptr) & tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } private: HbmStorageWithCpuKv* hbm_ = nullptr; DramStorage* dram_ = nullptr; SsdHashStorage* ssd_ = nullptr; - EmbeddingMemoryPool* embedding_mem_pool_; Allocator* gpu_alloc_; - Allocator* cpu_alloc_; BatchCache* dram_cache_; int64 dram_capacity_; - std::deque*> dram_value_ptr_out_of_date_; - mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ + std::deque dram_value_ptr_out_of_date_; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; const int copyback_flag_offset_bits_ = 60; }; } // embedding diff --git a/tensorflow/core/framework/embedding/hbm_dram_storage.h b/tensorflow/core/framework/embedding/hbm_dram_storage.h index 518c39287e0..d058d95f05b 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_storage.h +++ 
b/tensorflow/core/framework/embedding/hbm_dram_storage.h @@ -17,7 +17,6 @@ limitations under the License. #if GOOGLE_CUDA #define EIGEN_USE_GPU -#include "tensorflow/core/framework/embedding/lockless_hash_map_cpu.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/framework/embedding/hbm_storage_iterator.h" @@ -29,9 +28,6 @@ namespace tensorflow { using se::DeviceMemoryBase; using se::Stream; -template -class ValuePtr; - template class CheckpointLoader; @@ -41,27 +37,27 @@ namespace embedding { template class HbmDramStorage : public MultiTierStorage { public: - HbmDramStorage(const StorageConfig& sc, Allocator* gpu_alloc, - Allocator* cpu_alloc, LayoutCreator* lc, - const std::string& name) - : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name) { - hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc, lc); - StorageConfig storage_config = StorageConfig(); - storage_config.layout_type = LayoutType::NORMAL_CONTIGUOUS; - dram_ = new DramStorage(sc, cpu_alloc, - LayoutCreatorFactory::Create(storage_config), - new LocklessHashMapCPU(gpu_alloc)); + HbmDramStorage(const StorageConfig& sc, + Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), + MultiTierStorage(sc, name) { + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); } ~HbmDramStorage() override { MultiTierStorage::DeleteFromEvictionManager(); delete hbm_; delete dram_; + delete dram_feat_desc_; } TF_DISALLOW_COPY_AND_ASSIGN(HbmDramStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = hbm_->Get(key, value_ptr); if (s.ok()) { return s; @@ -76,9 +72,8 @@ class HbmDramStorage : public MultiTierStorage { void BatchGet(const EmbeddingVarContext& ctx, const K* keys, - 
ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) override { + void** value_ptr_list, + int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); @@ -87,18 +82,17 @@ class HbmDramStorage : public MultiTierStorage { copyback_cursor_list); CopyEmbeddingsFromDramToHbm( - ctx, keys, value_ptr_list, copyback_cursor_list[0], - value_len); + ctx, keys, value_ptr_list, copyback_cursor_list[0]); } - void Insert(K key, ValuePtr* value_ptr) override { + void Insert(K key, void** value_ptr) override { hbm_->Insert(key, value_ptr); } void BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_fountd_cursor_list) override { @@ -110,115 +104,22 @@ class HbmDramStorage : public MultiTierStorage { copyback_cursor_list, ¬_fountd_cursor_list); CopyEmbeddingsFromDramToHbm( - ctx, keys, value_ptr_list, copyback_cursor_list[0], - value_len); - + ctx, keys, value_ptr_list, copyback_cursor_list[0]); CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], value_len); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) override { if (to_dram) { - dram_->Insert(key, value_ptr, alloc_len); + dram_->CreateAndInsert(key, value_ptr); } else { - hbm_->Insert(key, value_ptr, alloc_len); + hbm_->CreateAndInsert(key, value_ptr); } } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(size); - { - mutex_lock l(memory_pool_mu_); - gpu_value_ptr->SetPtr(embedding_mem_pool_->Allocate()); - *value_ptr = gpu_value_ptr; - } - s = hbm_->TryInsert(key, *value_ptr); - if (s.ok()) { - return s; - } - // Insert Failed, 
key already exist - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate((*value_ptr)->GetValue(0, 0)); - } - delete *value_ptr; - return hbm_->Get(key, value_ptr); - } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - s = dram_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK; - return s; - } - - hbm_->Insert(key, value_ptr, size); - return Status::OK(); - } - - void CopyEmbeddingsFromCPUToGPU( - int total, const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, V* memcpy_buffer_gpu, - se::Stream* compute_stream, - EventMgr* event_mgr, - const DeviceBase::CpuWorkerThreads* worker_threads) override { - auto memcpy_buffer_cpu = TypedAllocator::Allocate(cpu_allocator(), - total * value_len, AllocationAttributes()); - int64* memory_index = new int64[total]; - int64 i = 0; - auto it = copyback_cursor.cbegin(); - { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursor.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)memcpy_address[j] - sizeof(FixedLengthHeader), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - } - } - //Split from above for loop for minize the cost of mutex lock - auto do_work = [memory_index, memcpy_address, - memcpy_buffer_cpu, gpu_value_ptrs, - value_len, this] (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - int j = memory_index[i]; - memcpy(memcpy_buffer_cpu + i * value_len, - memcpy_address[j], value_len * sizeof(V)); - } - }; - 
Shard(worker_threads->num_threads, worker_threads->workers, total, - 1000, do_work); - DeviceMemoryBase gpu_dst_ptr( - memcpy_buffer_gpu, total * value_len * sizeof(V)); - compute_stream->ThenMemcpy( - &gpu_dst_ptr, memcpy_buffer_cpu, total * value_len * sizeof(V)); - SyncWithEventMgr(compute_stream, event_mgr); - TypedAllocator::Deallocate( - cpu_allocator(), memcpy_buffer_cpu, total * value_len); - delete[] memory_index; + Status GetOrCreate(K key, void** value_ptr) override { + LOG(FATAL)<<"Stroage with HBM only suppotrs batch APIs."; } Status Remove(K key) override { @@ -270,25 +171,23 @@ class HbmDramStorage : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_dram_key_list; - std::vector*> value_ptr_list, tmp_dram_value_list; + std::vector value_ptr_list, tmp_dram_value_list; TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); HbmValueIterator hbm_value_iter( key_list, value_ptr_list, - emb_config.emb_index, Storage::alloc_len_, - gpu_alloc_); - - std::vector*> tmp_hbm_value_ptrs(value_ptr_list.size()); + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); + for (int64 i = 0; i < value_ptr_list.size(); i++) { - ValuePtr* value_ptr = hbm_->CreateValuePtr(value_len); - memcpy((char *)value_ptr->GetPtr(), - (char *)value_ptr_list[i]->GetPtr(), - sizeof(FixedLengthHeader)); - value_ptr->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - value_ptr->SetInitialized(emb_config.primary_emb_index); - tmp_hbm_value_ptrs[i] = value_ptr; - value_ptr_list[i] = value_ptr; + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq( + value_ptr, hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); } 
TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, @@ -306,54 +205,26 @@ class HbmDramStorage : public MultiTierStorage { { mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, &hbm_value_iter))); } - for (auto it: tmp_hbm_value_ptrs) { - delete it; - } - return Status::OK(); - } - - void CreateEmbeddingMemoryPool( - Allocator* alloc, - int64 value_len, - int64 block_size) override { - embedding_mem_pool_ = - new EmbeddingMemoryPool(alloc, value_len, block_size); - } - - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for (auto it : value_ptr_list) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = it->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - //Mutex with other ImportOps - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < num_of_value_ptrs; i++) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = value_ptr_list[i]->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); + for (auto value_ptr: value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); } } + return Status::OK(); } void BatchEviction() override { @@ -372,22 +243,31 @@ class HbmDramStorage : public MultiTierStorage { k_size = std::min(k_size, EvictionSize); size_t true_size = MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; std::vector keys; - std::vector*> value_ptrs; + std::vector 
hbm_value_ptrs; + std::vector dram_value_ptrs; for (int64 i = 0; i < true_size; ++i) { if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { keys.emplace_back(evic_ids[i]); - value_ptrs.emplace_back(value_ptr); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); } } - dram_->BatchCommit(keys, value_ptrs); - { - //Mutex with main thread - mutex_lock l_mem(memory_pool_mu_); - embedding_mem_pool_->Deallocate(value_ptrs); - } + + CopyEmbeddingFromHbmToDram( + hbm_value_ptrs, + dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); for (auto it : keys) { TF_CHECK_OK(hbm_->Remove(it)); } @@ -430,6 +310,16 @@ class HbmDramStorage : public MultiTierStorage { } } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + MultiTierStorage::Init(); + } + protected: Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, @@ -447,14 +337,14 @@ class HbmDramStorage : public MultiTierStorage { return s; } - void SetTotalDims(int64 total_dims) override { - dram_->SetTotalDims(total_dims); + int total_dim() override { + return hbm_feat_desc_->total_dim(); } private: void BatchGetValuePtrs( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, std::vector>& copyback_cursor_list, std::vector>* not_found_cursor_list = nullptr) { @@ -522,38 +412,31 @@ class HbmDramStorage : public MultiTierStorage { void CopyEmbeddingsFromDramToHbm(const 
EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, - std::list& copyback_cursors, - int64 value_len) { + void** value_ptr_list, + std::list& copyback_cursors) { int64 total = copyback_cursors.size(); - std::vector*> gpu_value_ptrs(total); + std::vector gpu_value_ptrs(total); std::vector copyback_keys(total); std::vector memory_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = copyback_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursors.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)value_ptr_list[j]->GetPtr(), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - copyback_keys[i] = keys[*it]; - } + int64 i = 0; + auto it = copyback_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion(gpu_value_ptr, + dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; } MultiTierStorage::CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursors, - memory_index, gpu_value_ptrs, value_len); + memory_index, gpu_value_ptrs, hbm_feat_desc_->total_dim(), + hbm_feat_desc_, dram_feat_desc_); //Insert copyback ids to hbm hash table. 
auto do_insert = [this, copyback_keys, gpu_value_ptrs, @@ -563,12 +446,7 @@ class HbmDramStorage : public MultiTierStorage { Status s = hbm_->TryInsert( copyback_keys[i], gpu_value_ptrs[i]); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - gpu_value_ptrs[i]->GetValue(0, 0)); - } - delete gpu_value_ptrs[i]; + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); } } @@ -580,34 +458,29 @@ class HbmDramStorage : public MultiTierStorage { void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& not_found_cursors, int64 value_len) { int64 total = not_found_cursors.size(); if (total > 0) { - std::vector*>> insert_pairs(total); + std::vector> insert_pairs(total); std::vector cursor_index(total); - //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = not_found_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != not_found_cursors.cend(); ++it, ++i) { - int64 j = *it; - cursor_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - value_ptr_list[j] = gpu_value_ptr; - insert_pairs[i].first = keys[j]; - insert_pairs[i].second = value_ptr_list[j]; - } + //Create Hbm ValuePtrs. + int64 i = 0; + auto it = not_found_cursors.cbegin(); + for ( ; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; } + hbm_feat_desc_->SetDefaultValues( + keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, ctx.gpu_device); + //Insert copyback ids to hbm hash table. 
auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index] (int64 start, int64 limit) { @@ -615,12 +488,7 @@ class HbmDramStorage : public MultiTierStorage { Status s = hbm_->TryInsert( insert_pairs[i].first, insert_pairs[i].second); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - insert_pairs[i].second->GetValue(0, 0)); - } - delete insert_pairs[i].second; + hbm_->DestroyValuePtr(insert_pairs[i].second); hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); } } @@ -632,16 +500,22 @@ class HbmDramStorage : public MultiTierStorage { } void AddCopyBackFlagToValuePtr( - ValuePtr** value_ptr, CopyBackFlag copyback_flag) { + void** value_ptr, CopyBackFlag copyback_flag) { int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; tmp = ((int64)*value_ptr) | tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } - void RemoveCopyBackFlagInValuePtr(ValuePtr** value_ptr) { + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; tmp = ((int64)*value_ptr) & tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); + } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { @@ -655,45 +529,30 @@ class HbmDramStorage : public MultiTierStorage { (V*)gpu_alloc_->AllocateRaw( Allocator::kAllocatorAlignment, size * sizeof(V*)); - ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; - ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; - { - //Mutex with other Import Ops - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < size; i++) { - dram_->Get(ids[i], &cpu_value_ptrs[i]); - gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - gpu_value_ptrs[i]->SetPtr(val_ptr); - memcpy((char 
*)gpu_value_ptrs[i]->GetPtr(), - (char *)cpu_value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], &gpu_value_ptrs[i]); } } //Split from above for loop for minize the cost of mutex lock //TODO: Speed up with intra parallelism - std::vector*> invalid_value_ptrs; + for (int64 i = 0; i < size; i++) { memcpy(memcpy_buffer_cpu + i * value_len, - cpu_value_ptrs[i]->GetValue(emb_index, - Storage::GetOffset(emb_index)), value_len * sizeof(V)); - Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); - if (!s.ok()) { - invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); - hbm_->Get(ids[i], &gpu_value_ptrs[i]); - } - gpu_value_ptrs[i]->SetInitialized(emb_index); - value_address[i] = gpu_value_ptrs[i]->GetValue( - emb_index, Storage::GetOffset(emb_index)); + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); } cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, size * value_len * sizeof(V), cudaMemcpyHostToDevice); cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), cudaMemcpyHostToDevice); - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate(invalid_value_ptrs); - } int block_dim = 128; void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, (void*)&value_len, (void*)&size}; @@ -714,9 +573,9 @@ class HbmDramStorage : public MultiTierStorage { private: HbmStorageWithCpuKv* hbm_ = nullptr; DramStorage* dram_ = nullptr; - EmbeddingMemoryPool* embedding_mem_pool_ = nullptr; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; 
Allocator* gpu_alloc_; - mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ const int copyback_flag_offset_bits_ = 60; }; } // embedding diff --git a/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h b/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h new file mode 100644 index 00000000000..a3603a61550 --- /dev/null +++ b/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h @@ -0,0 +1,122 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/embedding_memory_pool.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +namespace embedding { +template +class NormalFeatureDescriptorImpl; + +template +class HbmMultiTierFeatureDescriptorImpl + : public FeatureDescriptorImpl { + public: + HbmMultiTierFeatureDescriptorImpl( + Allocator* alloc, int64 slot_num, + bool need_record_freq, + bool need_record_version) + : dram_alloc_bytes_(sizeof(V*)), + hbm_alloc_(alloc), + dram_alloc_(ev_allocator()), + FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&dram_alloc_bytes_); + } + + ~HbmMultiTierFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = + FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&hbm_alloc_bytes_); + embedding_mem_pool_.reset( + new EmbeddingMemoryPool(hbm_alloc_, + hbm_alloc_bytes_ / sizeof(V), + 1024 * 1024 * 64)); + } + return is_compute_alloc_bytes; + } + + V* GetEmbedding(void *val, int emb_index) override { + return *((V**)val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + void* Allocate() override { + void* val = dram_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, dram_alloc_bytes_); + mutex_lock l(memory_pool_mu_); + *((V**)val) = embedding_mem_pool_->Allocate(); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void 
Deallocate(void* val) override { + mutex_lock l(memory_pool_mu_); + embedding_mem_pool_->Deallocate(*((V**)val)); + dram_alloc_->DeallocateRaw(val); + } + + void Deallocate(const std::vector& value_ptrs) override { + mutex_lock l(memory_pool_mu_); + for (auto ptr: value_ptrs) { + embedding_mem_pool_->Deallocate(*((V**)ptr)); + dram_alloc_->DeallocateRaw(ptr); + } + } + void SetDefaultValue(void* val, int64 key) override { + LOG(FATAL)<<"Can't call SetDefaultValue(void* val, int64 key," + <<"int default_value_len) in HbmMultiTierFeatureDescriptor."; + } + + void SetAllocator(Allocator* alloc) override { + hbm_alloc_ = alloc; + } + + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device); + + int data_bytes() override { + return dram_alloc_bytes_; + } + public: + friend class NormalFeatureDescriptorImpl; + protected: + int dram_alloc_bytes_; + int hbm_alloc_bytes_ = 0; + mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ + Allocator* hbm_alloc_; + Allocator* dram_alloc_; + std::unique_ptr> embedding_mem_pool_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/hbm_storage_iterator.h b/tensorflow/core/framework/embedding/hbm_storage_iterator.h index 36d331e74aa..31dc4459a13 100644 --- a/tensorflow/core/framework/embedding/hbm_storage_iterator.h +++ b/tensorflow/core/framework/embedding/hbm_storage_iterator.h @@ -28,10 +28,11 @@ class HbmValueIterator: public ValueIterator { public: HbmValueIterator( const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, int64 emb_index, int64 value_len, - Allocator* alloc) + Allocator* alloc, + FeatureDescriptor* feat_desc) : value_len_(value_len), alloc_(alloc) { int64 emb_offset = value_len_ * 
emb_index; @@ -40,7 +41,7 @@ class HbmValueIterator: public ValueIterator { for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { if (key_list[i] % kSavedPartitionNum == part_id) { value_parts_vec[part_id].emplace_back( - value_ptr_list[i]->GetValue(emb_index, emb_offset)); + feat_desc->GetEmbedding(value_ptr_list[i], emb_index)); break; } } diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 5d1f20b581a..3659187c825 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ #include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { @@ -24,9 +25,6 @@ namespace { const char* kInferenceMode = "INFERENCE_MODE"; } -template -class ValuePtr; - template class GPUHashTable; @@ -43,19 +41,19 @@ template class KVInterface { public: virtual ~KVInterface() {} - virtual Status Lookup(K key, ValuePtr** value_ptr) = 0; + virtual Status Lookup(K key, void** value_ptr) = 0; virtual Status Contains(K key) = 0; - virtual Status Insert(K key, const ValuePtr* value_ptr) = 0; + virtual Status Insert(K key, const void* value_ptr) = 0; virtual Status Remove(K key) = 0; virtual Status BatchLookup(const K* keys, size_t size, - ValuePtr** value_ptrs) { + void** value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookup in KVInterface."); } // KV Batch Insert virtual Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchInsert in KVInterface."); } @@ -66,27 +64,30 @@ class KVInterface { } virtual Status BatchLookupOrCreate(const K* keys, size_t size, - ValuePtr** 
value_ptrs) { + void** value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookupOrInsert in KVInterface."); } + virtual void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) { + LOG(FATAL)<<"Unimplemented for UpdateValuePtr in KVInterface."; + } + virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) = 0; + const std::vector& value_ptrs) = 0; // KV Size virtual int64 Size() const = 0; - virtual void SetTotalDims(int total_dims) {} - - virtual void FreeValuePtr(ValuePtr* value_ptr) {} + virtual void FreeValuePtr(void* value_ptr) {} - virtual Status Commit(K key, const ValuePtr* value_ptr) { + virtual Status Commit(K key, const void* value_ptr) { return Status::OK(); } virtual Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) = 0; + std::vector* value_ptr_list) = 0; virtual std::string DebugString() const = 0; diff --git a/tensorflow/core/framework/embedding/l2weight_shrink_policy.h b/tensorflow/core/framework/embedding/l2weight_shrink_policy.h index 2af6b58f94b..9b0ea8aba3f 100644 --- a/tensorflow/core/framework/embedding/l2weight_shrink_policy.h +++ b/tensorflow/core/framework/embedding/l2weight_shrink_policy.h @@ -19,28 +19,23 @@ limitations under the License. 
namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class L2WeightShrinkPolicy : public ShrinkPolicy { public: L2WeightShrinkPolicy(float l2_weight_threshold, int64 index, - int64 offset, - Allocator* alloc, + FeatureDescriptor* feat_desc, KVInterface* kv) : index_(index), - offset_(offset), kv_(kv), l2_weight_threshold_(l2_weight_threshold), - ShrinkPolicy(alloc) {} + ShrinkPolicy(feat_desc) {} TF_DISALLOW_COPY_AND_ASSIGN(L2WeightShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override { ShrinkPolicy::ReleaseValuePtrs(); FilterToDelete(shrink_args.value_len, @@ -50,9 +45,9 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { private: void FilterToDelete(int64 value_len, std::vector& key_list, - std::vector*>& value_list) { + std::vector& value_list) { for (int64 i = 0; i < key_list.size(); ++i) { - V* val = value_list[i]->GetValue(index_, offset_); + V* val = ShrinkPolicy::feat_desc_->GetEmbedding(value_list[i], index_); if (val != nullptr) { V l2_weight = (V)0.0; for (int64 j = 0; j < value_len; j++) { @@ -61,7 +56,7 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { l2_weight *= (V)0.5; if (l2_weight < (V)l2_weight_threshold_) { kv_->Remove(key_list[i]); - value_list[i] = (ValuePtr*)ValuePtrStatus::IS_DELETED; + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; ShrinkPolicy::EmplacePointer(value_list[i]); } } @@ -70,7 +65,7 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { private: int64 index_; - int64 offset_; + //int64 offset_; KVInterface* kv_; float l2_weight_threshold_; }; diff --git a/tensorflow/core/framework/embedding/layout_creator.h b/tensorflow/core/framework/embedding/layout_creator.h deleted file mode 100644 index 07d50451bf0..00000000000 --- a/tensorflow/core/framework/embedding/layout_creator.h +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright 2022 The DeepRec Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -======================================================================*/ -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ - -#include "tensorflow/core/framework/embedding/cache.h" -#include "tensorflow/core/framework/embedding/config.pb.h" -#include "tensorflow/core/framework/embedding/storage_config.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { -template -class ValuePtr; - -namespace embedding { -template -class LayoutCreator { - public: - virtual ValuePtr* Create(Allocator* alloc, size_t size) = 0; -}; - -template -class NormalLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalValuePtr(alloc, size); - } -}; - -template -class LightLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new LightValuePtr(alloc, size); - } -}; - -template -class NormalContiguousLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalContiguousValuePtr(alloc, size); - } -}; - -template -class NormalContiguousGPULayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalGPUValuePtr(alloc, size); - } -}; - -template -class CompactLayoutCreator : public LayoutCreator { - public: 
- ValuePtr* Create(Allocator* alloc, size_t size) override { - return new CompactValuePtr(alloc, size); - } -}; - -class LayoutCreatorFactory { - public: - template - static LayoutCreator* Create(const StorageConfig& sc) { - switch (sc.layout_type) { - case LayoutType::NORMAL: - static NormalLayoutCreator normal_creator; - return &normal_creator; - case LayoutType::LIGHT: - static LightLayoutCreator light_creator; - return &light_creator; - case LayoutType::NORMAL_CONTIGUOUS: - static NormalContiguousLayoutCreator normal_contiguous_creator; - return &normal_contiguous_creator; - case LayoutType::NORMAL_CONTIGUOUS_GPU: - static NormalContiguousGPULayoutCreator - normal_contiguous_gpu_creator; - return &normal_contiguous_gpu_creator; - case LayoutType::COMPACT: - static CompactLayoutCreator compact_creator; - return &compact_creator; - default: - static NormalLayoutCreator default_creator; - return &default_creator; - } - } -}; -} // embedding -} // tensorflow - -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ diff --git a/tensorflow/core/framework/embedding/leveldb_kv.h b/tensorflow/core/framework/embedding/leveldb_kv.h index 8ea1fa63fc2..e488ab3776d 100644 --- a/tensorflow/core/framework/embedding/leveldb_kv.h +++ b/tensorflow/core/framework/embedding/leveldb_kv.h @@ -17,9 +17,7 @@ limitations under the License. 
#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LEVELDB_KV_H_ #include "tensorflow/core/lib/io/path.h" - #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/lib/core/status.h" #include "leveldb/db.h" @@ -35,9 +33,6 @@ using leveldb::WriteBatch; using leveldb::WriteOptions; namespace tensorflow { -template -class ValuePtr; - namespace embedding { template @@ -76,28 +71,21 @@ class SizeCounter { template class LevelDBKV : public KVInterface { public: - LevelDBKV(std::string path) { + LevelDBKV(std::string path, FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { path_ = io::JoinPath(path, "level_db_" + std::to_string(Env::Default()->NowMicros()));; options_.create_if_missing = true; leveldb::Status s = leveldb::DB::Open(options_, path_, &db_); CHECK(s.ok()); counter_ = new SizeCounter(8); - new_value_ptr_fn_ = [] (size_t size) { - return new NormalContiguousValuePtr(ev_allocator(), size); - }; - total_dims_ = 0; - } - - void SetTotalDims(int total_dims) { - total_dims_ = total_dims; } ~LevelDBKV() override { delete db_; } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { std::string val_str; leveldb::Slice db_key((char*)(&key), sizeof(void*)); leveldb::ReadOptions options; @@ -106,8 +94,8 @@ class LevelDBKV : public KVInterface { return errors::NotFound( "Unable to find Key: ", key, " in LevelDB."); } else { - ValuePtr* val = new_value_ptr_fn_(total_dims_); - memcpy((int64 *)(val->GetPtr()), &val_str[0], val_str.length()); + void* val = feat_desc_->Allocate(); + memcpy((int64 *)val, &val_str[0], val_str.length()); *value_ptr = val; return Status::OK(); } @@ -126,22 +114,22 @@ class LevelDBKV : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { counter_->add(key, 1); return Status::OK(); } Status BatchInsert(const 
std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return BatchCommit(keys, value_ptrs); } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { WriteBatch batch; for (int i = 0; i < keys.size(); i++) { - std::string value_res((char*)value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader) + total_dims_ * sizeof(V)); + std::string value_res((char*)value_ptrs[i], + feat_desc_->data_bytes()); leveldb::Slice db_key((char*)(&keys[i]), sizeof(void*)); batch.Put(db_key, value_res); delete value_ptrs[i]; @@ -150,9 +138,9 @@ class LevelDBKV : public KVInterface { return Status::OK(); } - Status Commit(K key, const ValuePtr* value_ptr) override { - std::string value_res((char*)value_ptr->GetPtr(), - sizeof(FixedLengthHeader) + total_dims_ * sizeof(V)); + Status Commit(K key, const void* value_ptr) override { + std::string value_res((char*)value_ptr, + feat_desc_->data_bytes()); leveldb::Slice db_key((char*)(&key), sizeof(void*)); leveldb::Status s = db_->Put(WriteOptions(), db_key, value_res); if (!s.ok()){ @@ -176,22 +164,32 @@ class LevelDBKV : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { ReadOptions options; options.snapshot = db_->GetSnapshot(); leveldb::Iterator* it = db_->NewIterator(options); + void* dram_value_ptr = feat_desc_->Allocate(); for (it->SeekToFirst(); it->Valid(); it->Next()) { K key; memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); key_list->emplace_back(key); - ValuePtr* value_ptr = - new NormalGPUValuePtr(ev_allocator(), 1); - memcpy((char *)value_ptr->GetPtr(), + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc.data_bytes()); + 
memcpy(dram_value_ptr, it->value().ToString().data(), - sizeof(FixedLengthHeader)); + feat_desc_->data_bytes()); + hbm_feat_desc.SetFreq( + value_ptr, feat_desc_->GetFreq(dram_value_ptr)); + hbm_feat_desc.UpdateVersion( + value_ptr, feat_desc_->GetVersion(dram_value_ptr)); value_ptr_list->emplace_back(value_ptr); } delete it; + feat_desc_->Deallocate(dram_value_ptr); return Status::OK(); } @@ -199,8 +197,8 @@ class LevelDBKV : public KVInterface { return counter_->size(); } - void FreeValuePtr(ValuePtr* value_ptr) override { - delete value_ptr; + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); } std::string DebugString() const override{ @@ -212,8 +210,7 @@ class LevelDBKV : public KVInterface { SizeCounter* counter_; Options options_; std::string path_; - std::function*(size_t)> new_value_ptr_fn_; - int total_dims_; + FeatureDescriptor* feat_desc_; }; template @@ -223,10 +220,12 @@ class DBValueIterator: public ValueIterator { const std::vector& key_list, int64 emb_index, int64 value_len, - LevelDBKV* leveldb_kv) + LevelDBKV* leveldb_kv, + FeatureDescriptor* feat_desc) : value_len_(value_len), emb_index_(emb_index), - leveldb_kv_(leveldb_kv) { + leveldb_kv_(leveldb_kv), + feat_desc_(feat_desc) { int64 emb_offset = value_len_ * emb_index; std::vector> keys_parts_vec(kSavedPartitionNum); for (int64 i = 0; i < key_list.size(); i++) { @@ -251,8 +250,7 @@ class DBValueIterator: public ValueIterator { V* Next() { if (value_ptr_ != nullptr) { - value_ptr_->Destroy(ev_allocator()); - delete value_ptr_; + feat_desc_->Deallocate(value_ptr_); } K key = *(keys_iter_++); @@ -260,16 +258,17 @@ class DBValueIterator: public ValueIterator { if (!s.ok()) { LOG(FATAL)<<"Not found value in LevelDB when Save."; } - return value_ptr_->GetValue(emb_index_, value_len_ * emb_index_); + return feat_desc_->GetEmbedding(value_ptr_, emb_index_); } private: int64 value_len_; int64 emb_index_; LevelDBKV* leveldb_kv_; + FeatureDescriptor* feat_desc_; std::list 
keys_; typename std::list::const_iterator keys_iter_; - ValuePtr* value_ptr_ = nullptr; + void* value_ptr_ = nullptr; int64 key_cursor_ = 0; }; diff --git a/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h b/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h deleted file mode 100644 index 8dcea81d4a1..00000000000 --- a/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright 2022 The DeepRec Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-=======================================================================*/ - -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ -#if GOOGLE_CUDA -#define EIGEN_USE_GPU - -#include "sparsehash/dense_hash_map_lockless" -#include "tensorflow/core/framework/embedding/batch.h" -#include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/stream_executor.h" - -namespace tensorflow { -using se::DeviceMemoryBase; -using se::Stream; - -namespace embedding { - -template -class LocklessHashMapCPU : public KVInterface { - public: - LocklessHashMapCPU(Allocator* gpu_alloc): gpu_alloc_(gpu_alloc) { - hash_map_.max_load_factor(0.8); - hash_map_.set_empty_key_and_value(EMPTY_KEY_, nullptr); - hash_map_.set_counternum(16); - hash_map_.set_deleted_key(DELETED_KEY_); - cudaEventCreate(&is_finish_); - } - - ~LocklessHashMapCPU() override { - cudaEventDestroy(is_finish_); - } - - Status Lookup(K key, ValuePtr** value_ptr) override { - auto iter = hash_map_.find_wait_free(key); - if (iter.first == EMPTY_KEY_) { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } else { - *value_ptr = iter.second; - return Status::OK(); - } - } - - Status Contains(K key) override { - auto iter = hash_map_.find_wait_free(key); - if (iter.first == EMPTY_KEY_) { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } else { - return Status::OK(); - } - } - - Status Insert(K key, const ValuePtr* value_ptr) override { - auto iter = hash_map_.insert_lockless( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); - // insert fail, exist key - if ((*(iter.first)).second != value_ptr){ - return errors::AlreadyExists( - "already exists Key: ", key, " in LocklessHashMap."); - } else { - return Status::OK(); - } - } - 
- // Other Method - int64 Size() const override { - return hash_map_.size_lockless(); - } - - // Remove KV - Status Remove(K key) override { - if (hash_map_.erase_lockless(key)) { - return Status::OK(); - } else { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } - } - - void SetTotalDims(int total_dims) override { - total_dims_ = total_dims; - } - - void AppendToValuePtrQueue(ValuePtr* old_value_ptr) { - //A parameter that can be adjusted in the future - if (value_ptr_out_of_date_.size() > CAP_INVALID_VALUEPTR) { - ValuePtr* value_ptr = value_ptr_out_of_date_.front(); - delete value_ptr; - value_ptr_out_of_date_.pop_front(); - } - value_ptr_out_of_date_.emplace_back(old_value_ptr); - } - - Status Commit(K key, const ValuePtr* value_ptr) override { - ValuePtr* cpu_value_ptr = - new NormalContiguousValuePtr(ev_allocator(), total_dims_); - cudaMemcpy((char *)cpu_value_ptr->GetPtr() + sizeof(FixedLengthHeader), - *(char **)((char*)value_ptr->GetPtr() + sizeof(FixedLengthHeader)), - total_dims_ * sizeof(V), - cudaMemcpyDeviceToHost); - memcpy((char *)cpu_value_ptr->GetPtr(), - (char*)value_ptr->GetPtr(), sizeof(FixedLengthHeader)); - auto iter = hash_map_.insert_lockless(std::move( - std::pair*>(key, - const_cast*>(cpu_value_ptr)))); - if ((*(iter.first)).second != cpu_value_ptr) { - AppendToValuePtrQueue((*(iter.first)).second); - (*(iter.first)).second = cpu_value_ptr; - } - return Status::OK(); - } - - Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { - int batch_size = keys.size(); - Allocator* cpu_alloc = cpu_allocator(); - V** value_address = (V **)cpu_alloc->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V*) * batch_size); - V** dev_value_address; - V* batch_data_place; - V* dev_batch_data_place; - dev_value_address = (V**)gpu_alloc_->AllocateRaw( - Allocator::kAllocatorAlignment, batch_size * sizeof(V*)); - dev_batch_data_place = (V*)gpu_alloc_->AllocateRaw( - 
Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dims_); - batch_data_place = (V *)cpu_alloc->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dims_); - - // Copy GPU addresses V* - for(int i = 0;i < batch_size;++i) { - value_address[i] = - *(V **)((char*)value_ptrs[i]->GetPtr() + sizeof(FixedLengthHeader)); - } - - cudaMemcpyAsync(dev_value_address, value_address, - sizeof(V*) * batch_size, - cudaMemcpyHostToDevice); - - // Launch Kernel,Copy data to continuous place - int block_dim = 128; - void* args[] = { (void*)&dev_value_address, - (void*)&dev_batch_data_place, (void*)&total_dims_, - (void*)&batch_size}; - - cudaLaunchKernel((void *)BatchCopy, - (batch_size * total_dims_ + block_dim - 1) / block_dim, - block_dim, args, 0, NULL); - - cudaMemcpyAsync(batch_data_place, dev_batch_data_place, - sizeof(V) * batch_size * total_dims_, - cudaMemcpyDeviceToHost); - - cudaEventRecord(is_finish_); - cudaEventSynchronize(is_finish_); - - // Copy data to ValuePtrs in memory;Insert it into hashmap - for(int i = 0; i < batch_size; ++i) { - ValuePtr* cpu_value_ptr = - new NormalContiguousValuePtr(ev_allocator(), total_dims_); - memcpy((char *)cpu_value_ptr->GetPtr() + sizeof(FixedLengthHeader), - &batch_data_place[i * total_dims_], total_dims_ * sizeof(V)); - memcpy((char *)cpu_value_ptr->GetPtr(), - (char *)value_ptrs[i]->GetPtr(), sizeof(FixedLengthHeader)); - auto iter = hash_map_.insert_lockless(std::move( - std::pair*>(keys[i], - const_cast*>(cpu_value_ptr)))); - if ((*(iter.first)).second != cpu_value_ptr) { - AppendToValuePtrQueue((*(iter.first)).second); - (*(iter.first)).second = cpu_value_ptr; - } - } - - gpu_alloc_->DeallocateRaw(dev_value_address); - gpu_alloc_->DeallocateRaw(dev_batch_data_place); - - cpu_alloc->DeallocateRaw(batch_data_place); - cpu_alloc->DeallocateRaw(value_address); - - return Status::OK(); - } - - Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { - std::pair*> 
*hash_map_dump; - int64 bucket_count; - auto it = hash_map_.GetSnapshot(); - hash_map_dump = it.first; - bucket_count = it.second; - for (int64 j = 0; j < bucket_count; j++) { - if (hash_map_dump[j].first != EMPTY_KEY_ && - hash_map_dump[j].first != DELETED_KEY_) { - key_list->emplace_back(hash_map_dump[j].first); - value_ptr_list->emplace_back(hash_map_dump[j].second); - } - } - free(hash_map_dump); - return Status::OK(); - } - - std::string DebugString() const override { - LOG(INFO) << "map info size:" << Size() - << "map info bucket_count:" << hash_map_.bucket_count() - << "map info load_factor:" << hash_map_.load_factor() - << "map info max_load_factor:" << hash_map_.max_load_factor() - << "map info min_load_factor:" << hash_map_.min_load_factor(); - return ""; - } - - private: - typedef google::dense_hash_map_lockless* > - LockLessHashMap; - static const int EMPTY_KEY_ = -1; - static const int DELETED_KEY_ = -2; - static constexpr int CAP_INVALID_VALUEPTR = 200000; - LockLessHashMap hash_map_; - std::deque*> value_ptr_out_of_date_; - int total_dims_; - Allocator* gpu_alloc_; - cudaEvent_t is_finish_; -}; -} // namespace embedding -} // namespace tensorflow - -#endif //GOOGLE_CUDA -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc b/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc index de275183d22..9745ab5fcc3 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc +++ b/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc @@ -15,8 +15,7 @@ limitations under the License. 
#if GOOGLE_CUDA #define EIGEN_USE_GPU #include "tensorflow/core/framework/embedding/multi_tier_storage.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" -#include "tensorflow/core/framework/embedding/batch.h" +#include "tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/op_kernel.h" @@ -44,11 +43,13 @@ template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursor, const std::vector& memory_index, - const std::vector*>& gpu_value_ptrs, - int value_len) { + const std::vector& gpu_value_ptrs, + int value_len, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc) { if (copyback_cursor.size() > 0) { int total = copyback_cursor.size(); //Alocate memcpy buffer on CPU and GPU. @@ -64,11 +65,13 @@ void MultiTierStorage::CopyEmbeddingsFromDramToHbm( auto do_work = [memory_index, memcpy_buffer_cpu, value_ptr_list, gpu_value_ptrs, + dram_feat_desc, value_len, this] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { int j = memory_index[i]; memcpy(memcpy_buffer_cpu + i * value_len, - value_ptr_list[j]->GetValue(0, 0), value_len * sizeof(V)); + dram_feat_desc->GetEmbedding(value_ptr_list[j], 0), + value_len * sizeof(V)); value_ptr_list[j] = gpu_value_ptrs[i]; } }; @@ -96,8 +99,7 @@ void MultiTierStorage::CopyEmbeddingsFromDramToHbm( for (; it != copyback_cursor.cend(); ++it, ++i) { // Get the cursor int64 cursor = *it; - gpu_value_ptrs[i]->SetInitialized(0); - value_address[i] = gpu_value_ptrs[i]->GetValue(0, 0); + value_address[i] = hbm_feat_desc->GetEmbedding(gpu_value_ptrs[i], 0); } DeviceMemoryBase gpu_addr_dst_ptr(dev_value_address, total * sizeof(V*)); compute_stream->ThenMemcpy(&gpu_addr_dst_ptr, value_address, total * sizeof(V*)); @@ -119,16 +121,71 @@ void 
MultiTierStorage::CopyEmbeddingsFromDramToHbm( } #define REGISTER_KERNELS(ktype, vtype) \ template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( \ - const EmbeddingVarContext&, const ktype*, ValuePtr**,\ + const EmbeddingVarContext&, const ktype*, void**,\ std::list&, const std::vector&,\ - const std::vector*>&, int); + const std::vector&, int, FeatureDescriptor*,\ + FeatureDescriptor*); #define REGISTER_KERNELS_ALL(type) \ REGISTER_KERNELS(int32, type); \ REGISTER_KERNELS(int64, type) #define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +template +void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( + const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + if (init_cursor.size() > 0) { + int64 total = init_cursor.size(); + TValue** value_address = nullptr; + value_address = TypedAllocator::Allocate(cpu_allocator(), total * 2, + AllocationAttributes()); + TValue** default_value_address = value_address + total; + TValue** dev_value_address = nullptr; + dev_value_address = + TypedAllocator::Allocate(hbm_alloc_, total * 2, AllocationAttributes()); + TValue** dev_default_value_address = dev_value_address + total; + for (int emb_index = 0; emb_index < FeatureDescriptorImpl::slot_infos_.size(); emb_index++) { + int64 i = 0; + auto it = init_cursor.cbegin(); + for (; it != init_cursor.cend(); ++it, ++i) { + value_address[i] = GetEmbedding(value_ptrs[*it], emb_index); + default_value_address[i] = + FeatureDescriptorImpl::GetDefaultValuePtr(emb_index, keys[i]); + } + DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * 2 * sizeof(TValue*)); + compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, + total * 2 * sizeof(TValue*)); + int block_dim = 128; + int value_len = 
FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len; + TF_CHECK_OK(GpuLaunchKernel( + embedding::CopyEmbedding, + (total * value_len + block_dim - 1) / block_dim, + block_dim, 0, gpu_device.stream(), dev_default_value_address, + dev_value_address, value_len, total)); + SyncWithEventMgr(compute_stream, event_mgr); + } + + TypedAllocator::Deallocate(hbm_alloc_, dev_value_address, total * 2); + TypedAllocator::Deallocate(cpu_allocator(), value_address, total * 2); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( \ + const ktype*, const std::list&, void**,\ + se::Stream*, EventMgr*, const Eigen::GpuDevice& gpu_device); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU #undef REGISTER_KERNELS_ALL #undef REGISTER_KERNELS } // namespace embedding diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index 8239d109e64..7955322aca6 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -31,10 +31,11 @@ limitations under the License. 
#include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/core/status.h" -namespace tensorflow { -template -class ValuePtr; +#if GOOGLE_CUDA +#include "tensorflow/core/framework/embedding/batch.h" +#endif +namespace tensorflow { template class EmbeddingVar; @@ -54,22 +55,10 @@ class MultiTierStorage : public Storage { TF_DISALLOW_COPY_AND_ASSIGN(MultiTierStorage); - void SetAllocLen(int64 value_len, int slot_num) override { - while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. - Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); - - cache_capacity_ = Storage::storage_config_.size[0] - / (Storage::total_dims_ * sizeof(V)); - ready_eviction_ = true; - } - Storage::flag_.clear(std::memory_order_release); + virtual void Init() override { + cache_capacity_ = Storage::storage_config_.size[0] + / (total_dim() * sizeof(V)); + ready_eviction_ = true; } int64 CacheSize() const override { @@ -90,13 +79,13 @@ class MultiTierStorage : public Storage { } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { LOG(FATAL)<<"BatchCommit isn't supported by MultiTierStorage."; return Status::OK(); } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { LOG(FATAL)<<"Can't get snapshot of MultiTierStorage."; } @@ -104,7 +93,7 @@ class MultiTierStorage : public Storage { int total, const K* keys, const std::list& copyback_cursor, V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -128,17 +117,6 @@ class 
MultiTierStorage : public Storage { return; } - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - return; - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - return; - } - void Schedule(std::function fn) override { cache_thread_pool_->Schedule(std::move(fn)); } @@ -223,50 +201,50 @@ class MultiTierStorage : public Storage { } return s; } - - virtual void SetTotalDims(int64 total_dims) = 0; + virtual int total_dim() = 0; void DeleteFromEvictionManager() { eviction_manager_->DeleteStorage(this); } - void ReleaseValuePtrs(std::deque*>& value_ptrs, - Allocator* allocator) { + void ReleaseValuePtrs(std::deque& value_ptrs, + FeatureDescriptor* feat_desc) { constexpr int CAP_INVALID_VALUEPTR = 64 * 1024; if (value_ptrs.size() > CAP_INVALID_VALUEPTR) { int64 num_of_deleted_value_ptrs = value_ptrs.size() - CAP_INVALID_VALUEPTR; for (int i = 0; i < num_of_deleted_value_ptrs; i++) { - ValuePtr* value_ptr = value_ptrs.front(); - value_ptr->Destroy(allocator); - delete value_ptr; + void* value_ptr = value_ptrs.front(); + feat_desc->Deallocate(value_ptr); value_ptrs.pop_front(); } } } - void ReleaseInvalidValuePtr(Allocator* allocator) { - ReleaseValuePtrs(value_ptr_out_of_date_, allocator); + void ReleaseInvalidValuePtr(FeatureDescriptor* feat_desc) { + ReleaseValuePtrs(value_ptr_out_of_date_, feat_desc); } - void KeepInvalidValuePtr(ValuePtr* value_ptr) { + void KeepInvalidValuePtr(void* value_ptr) { value_ptr_out_of_date_.emplace_back(value_ptr); } #if GOOGLE_CUDA void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& context, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursors, const std::vector& memory_index, - const std::vector*>& gpu_value_ptrs, - int value_len); + const std::vector& gpu_value_ptrs, + int value_len, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc); #endif //GOOGL_CUDA private: virtual 
Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) {} protected: - std::deque*> value_ptr_out_of_date_; + std::deque value_ptr_out_of_date_; BatchCache* cache_ = nullptr; EvictionManager* eviction_manager_; @@ -281,6 +259,70 @@ class MultiTierStorage : public Storage { std::string name_; std::vector mu_list_; }; + +#if GOOGLE_CUDA +template +void CopyEmbeddingFromHbmToDram( + const std::vector& hbm_value_ptrs, + const std::vector& dram_value_ptrs, + Allocator* gpu_alloc, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc) { + int batch_size = hbm_value_ptrs.size(); + V** dev_value_address; + + dev_value_address = (V**)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, batch_size * sizeof(V*)); + Allocator* cpu_alloc = ev_allocator(); + V** value_address = (V**)cpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V*) * batch_size); + + V* batch_data_place; + V* dev_batch_data_place; + int total_dim = dram_feat_desc->total_dim(); + dev_batch_data_place = (V*)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim); + batch_data_place = (V *)cpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim); + // Copy GPU addresses V* + for(int i = 0; i < batch_size; ++i) { + value_address[i] = hbm_feat_desc->GetEmbedding(hbm_value_ptrs[i], 0); + } + cudaMemcpyAsync(dev_value_address, value_address, + sizeof(V*) * batch_size, + cudaMemcpyHostToDevice); + + // Launch Kernel,Copy data to continuous place + int block_dim = 128; + void* args[] = { (void*)&dev_value_address, + (void*)&dev_batch_data_place, (void*)&total_dim, + (void*)&batch_size}; + + cudaLaunchKernel((void *)BatchCopy, + (batch_size * total_dim + block_dim - 1) / block_dim, + block_dim, args, 0, NULL); + + cudaMemcpyAsync(batch_data_place, dev_batch_data_place, + sizeof(V) * batch_size * total_dim, + cudaMemcpyDeviceToHost); + + cudaEvent_t is_finish_; + cudaEventCreate(&is_finish_); + 
cudaEventRecord(is_finish_); + cudaEventSynchronize(is_finish_); + cudaEventDestroy(is_finish_); + + for(int i = 0; i < batch_size; ++i) { + memcpy(dram_feat_desc->GetEmbedding(dram_value_ptrs[i], 0), + &batch_data_place[i * total_dim], total_dim * sizeof(V)); + } + + cpu_alloc->DeallocateRaw(value_address); + cpu_alloc->DeallocateRaw(batch_data_place); + gpu_alloc->DeallocateRaw(dev_value_address); + gpu_alloc->DeallocateRaw(dev_batch_data_place); +} +#endif //GOOGL_CUDA } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/normal_feature_descriptor.h b/tensorflow/core/framework/embedding/normal_feature_descriptor.h new file mode 100644 index 00000000000..817b33d058b --- /dev/null +++ b/tensorflow/core/framework/embedding/normal_feature_descriptor.h @@ -0,0 +1,134 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +#if GOOGLE_CUDA +template +class HbmMultiTierFeatureDescriptorImpl; +#endif + +template +class NormalFeatureDescriptorImpl: public FeatureDescriptorImpl { + public: + NormalFeatureDescriptorImpl(Allocator* alloc, int64 slot_num, + bool need_record_freq, + bool need_record_version) + : alloc_bytes_(0), + alloc_(alloc), + FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version) {} + + NormalFeatureDescriptorImpl(NormalFeatureDescriptorImpl* feat_desc_impl) + : alloc_(feat_desc_impl->alloc_), + FeatureDescriptorImpl(feat_desc_impl) {} + + NormalFeatureDescriptorImpl( + HbmMultiTierFeatureDescriptorImpl* feat_desc_impl) + : alloc_bytes_(0), + alloc_(feat_desc_impl->dram_alloc_), + FeatureDescriptorImpl(feat_desc_impl) {} + + ~NormalFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&alloc_bytes_); + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + return is_compute_alloc_bytes; + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + FeatureDescriptorImpl::SetSlotInfo(feat_desc_impl); + FeatureDescriptorImpl::ComputeAllocBytes(&alloc_bytes_); + FeatureDescriptorImpl::SetFreqAndVersionOffset(&alloc_bytes_); + return true; + } + + V* GetEmbedding(void *val, int emb_index) override { + return reinterpret_cast(val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + 
void* Allocate() override { + void* val = alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void Deallocate(void* val) override { + alloc_->DeallocateRaw(val); + } + + void Deallocate(const std::vector& value_ptrs) override { + for (auto val: value_ptrs) { + Deallocate(val); + } + } + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy(val_ptr, value, + sizeof(V) * FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + + void SetDefaultValue(void* val, int64 index) override { + for (int i = 0; i < FeatureDescriptorImpl::slot_infos_.size(); i++) { + V* val_ptr = GetEmbedding(val, i); + FeatureDescriptorImpl::SetDefaultValue((void*)val_ptr, i, index); + } + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + LOG(FATAL)<<"Can't call SetDefaultValue(const K*, const std::list&," + <<"void**, se::Stream*, EventMgr*, const Eigen::GpuDevice&)" + <<" in HbmMultiTierFeatureDescriptor."; + } +#endif + + void SetAllocator(Allocator* alloc) override { + alloc_ = alloc; + } + + int data_bytes() override { + return alloc_bytes_; + } + + private: + int alloc_bytes_; + Allocator* alloc_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/nullable_filter_policy.h b/tensorflow/core/framework/embedding/nullable_filter_policy.h index 0c5ce80886a..7e3ace0063d 100644 --- a/tensorflow/core/framework/embedding/nullable_filter_policy.h +++ b/tensorflow/core/framework/embedding/nullable_filter_policy.h @@ -30,19 +30,21 @@ template class NullableFilterPolicy : public FilterPolicy { using FilterPolicy::ev_; using 
FilterPolicy::config_; - using FilterPolicy::LookupOrCreateEmbInternal; public: NullableFilterPolicy(const EmbeddingConfig& config, - EV* ev, embedding::Storage* storage) : - FilterPolicy(config, ev), storage_(storage) {} + EV* ev, embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) + : storage_(storage), feat_desc_(feat_desc), + FilterPolicy(config, ev) {} Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); if (s.ok()) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + V* mem_val = feat_desc_->GetEmbedding( + value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_ptr, @@ -57,17 +59,17 @@ class NullableFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, keys, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; if (value_ptr != nullptr) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_ptr; } @@ -85,65 +87,55 @@ class NullableFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs, + const K* keys, void** value_ptrs, int64 num_of_keys) { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> 
not_found_cursor_list(num_worker_threads + 1); ev_->BatchLookupOrCreateKey(ctx, keys, value_ptrs, num_of_keys, not_found_cursor_list); - std::vector var_ptrs(num_of_keys); - auto do_work = [this, value_ptrs, &var_ptrs] - (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - var_ptrs[i] = ev_->LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - } - }; - auto worker_threads = ctx.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, num_of_keys, - 1000, do_work); - - ev_->SetDefaultValueOfNewFeatures( - keys, num_of_keys, - not_found_cursor_list[0], - var_ptrs.data(), ctx.compute_stream, - ctx.event_mgr, ctx.gpu_device); } #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { *is_filter = true; - return ev_->LookupOrCreateKey(key, val); + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + storage_->Insert(key, value_ptr); + s = Status::OK(); + } + feat_desc_->AddFreq(*value_ptr, count); + return s; } - int64 GetFreq(K key, ValuePtr* value_ptr) override { - if (storage_->GetLayoutType() != LayoutType::LIGHT) { - return value_ptr->GetFreq(); - }else { - return 0; - } + Status LookupKey(K key, void** val, + bool* is_filter, int64 count) override { + 
*is_filter = true; + return ev_->LookupKey(key, val); + } + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); } int64 GetFreq(K key) override { - if (storage_->GetLayoutType() != LayoutType::LIGHT) { - ValuePtr* value_ptr = nullptr; - TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetFreq(); - }else { + if (!config_.is_save_freq()) return 0; - } + void* value_ptr = nullptr; + TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); + return feat_desc_->GetFreq(value_ptr); } Status Restore(int64 key_num, int bucket_num, int64 partition_id, @@ -161,27 +153,30 @@ class NullableFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); + int64 import_freq = 0; + int64 import_version = -1; + if (config_.filter_freq !=0 || ev_->IsMultiLevel() || config_.record_freq) { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); + import_version = version_buff[i]; } - LookupOrCreateEmbInternal(is_filter, to_dram, i, value_len, - value_ptr, value_buff, key_buff); + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); } return Status::OK(); } - bool is_admit(K key, ValuePtr* value_ptr) override { + bool is_admit(K key, void* value_ptr) override { return true; } private: embedding::Storage* storage_; + embedding::FeatureDescriptor* feat_desc_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/shrink_policy.h b/tensorflow/core/framework/embedding/shrink_policy.h index ea063a113a3..a8d0d9ada75 100644 --- a/tensorflow/core/framework/embedding/shrink_policy.h +++ b/tensorflow/core/framework/embedding/shrink_policy.h @@ -15,14 +15,11 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { - -template -class ValuePtr; - class Allocator; namespace embedding { @@ -40,31 +37,29 @@ struct ShrinkArgs { template class ShrinkPolicy { public: - ShrinkPolicy(Allocator* alloc): alloc_(alloc) {} + ShrinkPolicy(FeatureDescriptor* feat_desc): feat_desc_(feat_desc) {} virtual ~ShrinkPolicy() {} TF_DISALLOW_COPY_AND_ASSIGN(ShrinkPolicy); virtual void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) = 0; protected: - void EmplacePointer(ValuePtr* value_ptr) { + void EmplacePointer(void* value_ptr) { to_delete_.emplace_back(value_ptr); } void ReleaseValuePtrs() { for (auto it : to_delete_) { - it->Destroy(alloc_); - delete it; + feat_desc_->Deallocate(it); } to_delete_.clear(); } protected: - std::vector*> to_delete_; - private: - Allocator* alloc_; + std::vector to_delete_; + FeatureDescriptor* feat_desc_; }; template @@ -74,7 +69,7 @@ class NonShrinkPolicy: public ShrinkPolicy { TF_DISALLOW_COPY_AND_ASSIGN(NonShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override {} }; } // embedding diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index f9de65df588..be08afd7f50 100644 --- a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -24,7 +24,6 @@ limitations under the License. 
#endif // GOOGLE_CUDA #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/framework/embedding/l2weight_shrink_policy.h" -#include "tensorflow/core/framework/embedding/layout_creator.h" #include "tensorflow/core/framework/embedding/leveldb_kv.h" #include "tensorflow/core/framework/embedding/ssd_hash_kv.h" #include "tensorflow/core/framework/embedding/storage_config.h" @@ -32,9 +31,6 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -62,24 +58,22 @@ class HbmDramSsdStorage; template class SingleTierStorage : public Storage { public: - SingleTierStorage(const StorageConfig& sc, Allocator* alloc, - KVInterface* kv, LayoutCreator* lc) - : kv_(kv), alloc_(alloc), layout_creator_(lc), + SingleTierStorage(const StorageConfig& sc, + KVInterface* kv, FeatureDescriptor* feat_desc) + : kv_(kv), feat_desc_(feat_desc), Storage(sc) { if (sc.embedding_config.steps_to_live != 0) { shrink_policy_ = new GlobalStepShrinkPolicy( sc.embedding_config.steps_to_live, - alloc_, + feat_desc_, kv_); } else if (sc.embedding_config.l2_weight_threshold != -1.0) { shrink_policy_ = new L2WeightShrinkPolicy( sc.embedding_config.l2_weight_threshold, sc.embedding_config.primary_emb_index, - Storage::GetOffset( - sc.embedding_config.primary_emb_index), - alloc_, + feat_desc_, kv_); } else { shrink_policy_ = new NonShrinkPolicy(); @@ -89,11 +83,10 @@ class SingleTierStorage : public Storage { ~SingleTierStorage() override { mutex_lock l(Storage::mu_); std::vector key_list; - std::vector*> value_ptr_list; + std::vector value_ptr_list; kv_->GetSnapshot(&key_list, &value_ptr_list); for (auto value_ptr : value_ptr_list) { - value_ptr->Destroy(alloc_); - delete value_ptr; + feat_desc_->Deallocate(value_ptr); } delete kv_; delete shrink_policy_; @@ -101,7 +94,7 @@ class SingleTierStorage : public Storage { TF_DISALLOW_COPY_AND_ASSIGN(SingleTierStorage); - Status 
Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { return kv_->Lookup(key, value_ptr); } @@ -109,47 +102,45 @@ class SingleTierStorage : public Storage { return kv_->Contains(key); } - virtual void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) override { do { - *value_ptr = layout_creator_->Create(alloc_, alloc_len); + *value_ptr = feat_desc_->Allocate(); Status s = kv_->Insert(key, *value_ptr); if (s.ok()) { break; } else { - (*value_ptr)->Destroy(alloc_); - delete *value_ptr; + feat_desc_->Deallocate(*value_ptr); } } while (!(kv_->Lookup(key, value_ptr)).ok()); } - virtual void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in SingleTireStorage."; + virtual void Insert(K key, void** value_ptr) override { + do { + Status s = kv_->Insert(key, *value_ptr); + if (s.ok()) { + break; + } else { + feat_desc_->Deallocate(*value_ptr); + } + } while (!(kv_->Lookup(key, value_ptr)).ok()); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = kv_->Lookup(key, value_ptr); if (s.ok()) { return s; } - *value_ptr = layout_creator_->Create(alloc_, size); + *value_ptr = feat_desc_->Allocate(); s = kv_->Insert(key, *value_ptr); if (s.ok()) { return s; } // Insert Failed, key already exist - (*value_ptr)->Destroy(alloc_); - delete *value_ptr; + feat_desc_->Deallocate(*value_ptr); return kv_->Lookup(key, value_ptr); } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - return GetOrCreate(key, value_ptr, size); - } Status Remove(K key) override { return kv_->Remove(key); @@ -180,7 +171,7 @@ class SingleTierStorage : public Storage { int total, const K* keys, const std::list& copyback_cursor, V** 
memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -198,13 +189,13 @@ class SingleTierStorage : public Storage { } virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { LOG(FATAL) << "Unsupport BatchCommit in Storage: " << typeid(this).name(); return Status::OK(); } - virtual Status Commit(K keys, const ValuePtr* value_ptr) { + virtual Status Commit(K keys, const void* value_ptr) { LOG(FATAL) << "Unsupport Commit in Storage: " << typeid(this).name(); return Status::OK(); @@ -222,19 +213,12 @@ class SingleTierStorage : public Storage { return; } - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - return; - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - return; - } + virtual void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override {} Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { mutex_lock l(Storage::mu_); return kv_->GetSnapshot(key_list, value_ptr_list); } @@ -247,7 +231,7 @@ class SingleTierStorage : public Storage { ShrinkArgs& shrink_args, int64 value_len, V* default_value) override { - std::vector*> value_ptr_list; + std::vector value_ptr_list; std::vector key_list_tmp; TF_CHECK_OK(kv_->GetSnapshot( &key_list_tmp, &value_ptr_list)); @@ -255,30 +239,16 @@ class SingleTierStorage : public Storage { if (emb_config.is_primary()) { Shrink(key_list_tmp, value_ptr_list, shrink_args, value_len); } - TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list_tmp, - value_ptr_list))); + value_ptr_list, + SingleTierStorage::feat_desc_))); return Status::OK(); } - void SetAllocLen(int64 value_len, int slot_num) override { - 
while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. - Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); - } - Storage::flag_.clear(std::memory_order_release); - } - bool IsMultiLevel() override { return false; } @@ -299,16 +269,22 @@ class SingleTierStorage : public Storage { LOG(FATAL) << "Unsupport Schedule in SingleTierStorage."; } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + kv_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + protected: - virtual void SetTotalDims(int64 total_dims) = 0; + virtual void* CreateValuePtr() { + return feat_desc_->Allocate(); + } - virtual ValuePtr* CreateValuePtr(int64 size) { - return layout_creator_->Create(alloc_, size); + virtual void DestroyValuePtr(void* value_ptr) { + feat_desc_->Deallocate(value_ptr); } - virtual void DestroyValuePtr(ValuePtr* value_ptr) { - value_ptr->Destroy(alloc_); - delete value_ptr; + FeatureDescriptor* feature_descriptor() { + return feat_desc_; } protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, @@ -324,7 +300,7 @@ class SingleTierStorage : public Storage { } virtual void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) { mutex_lock l(Storage::mu_); @@ -339,31 +315,40 @@ class SingleTierStorage : public Storage { KVInterface* kv_; ShrinkPolicy* shrink_policy_; Allocator* alloc_; - LayoutCreator* layout_creator_; + FeatureDescriptor* feat_desc_; }; template class DramStorage : public SingleTierStorage { public: - DramStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, - KVInterface* kv) - : SingleTierStorage(sc, 
alloc, kv, lc) {} + DramStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), feat_desc) {} ~DramStorage() override {} Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { return SingleTierStorage::kv_->BatchCommit(keys, value_ptrs); } - Status TryInsert(K key, ValuePtr* value_ptr) { + Status TryInsert(K key, void* value_ptr) { return SingleTierStorage::kv_->Insert(key, value_ptr); } - Status Commit(K keys, const ValuePtr* value_ptr) override{ + Status Commit(K keys, const void* value_ptr) override{ return SingleTierStorage::kv_->Commit(keys, value_ptr); } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + void* value_ptr = SingleTierStorage::feat_desc_->Allocate(freq); + SingleTierStorage::Insert(key, &value_ptr); + SingleTierStorage::feat_desc_->SetValue(value_ptr, emb_index, value); + SingleTierStorage::feat_desc_->SetFreq(value_ptr, freq); + SingleTierStorage::feat_desc_->UpdateVersion(value_ptr, version); + } TF_DISALLOW_COPY_AND_ASSIGN(DramStorage); public: @@ -375,12 +360,8 @@ class DramStorage : public SingleTierStorage { friend class HbmDramSsdStorage; #endif protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); - } - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -395,9 +376,10 @@ class DramStorage : public SingleTierStorage { template class HbmStorage : public SingleTierStorage { public: - HbmStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new GPUHashMapKV(sc.embedding_config, alloc), lc) { + HbmStorage(const StorageConfig& sc, Allocator* gpu_allocator, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new GPUHashMapKV( + 
sc.embedding_config, gpu_allocator), feat_desc) { } ~HbmStorage() override {} @@ -488,48 +470,27 @@ class HbmStorage : public SingleTierStorage { gpu_kv->Import(key_import, value_import, device, emb_config); return Status::OK(); } - - void SetTotalDims(int64 total_dims) override {} }; template class HbmStorageWithCpuKv: public SingleTierStorage { public: - HbmStorageWithCpuKv(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + HbmStorageWithCpuKv(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~HbmStorageWithCpuKv() override {} - void Insert(K key, ValuePtr* value_ptr) override { - do { - Status s = SingleTierStorage::kv_->Insert(key, value_ptr); - if (s.ok()) { - break; - } else { - value_ptr->Destroy(SingleTierStorage::alloc_); - delete value_ptr; - } - } while (!(SingleTierStorage::kv_->Lookup(key, &value_ptr)).ok()); - } - - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - SingleTierStorage::Insert(key, value_ptr, alloc_len, to_dram); - } - - Status TryInsert(K key, ValuePtr* value_ptr) { + Status TryInsert(K key, void* value_ptr) { return SingleTierStorage::kv_->Insert(key, value_ptr); } public: friend class HbmDramStorage; friend class HbmDramSsdStorage; protected: - void SetTotalDims(int64 total_dims) override {} - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -544,28 +505,25 @@ class HbmStorageWithCpuKv: public SingleTierStorage { template class PmemMemkindStorage : public SingleTierStorage { public: - PmemMemkindStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + PmemMemkindStorage(const StorageConfig& sc, + FeatureDescriptor* 
feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~PmemMemkindStorage() override {} TF_DISALLOW_COPY_AND_ASSIGN(PmemMemkindStorage); - - protected: - void SetTotalDims(int64 total_dims) override {} }; template class PmemLibpmemStorage : public SingleTierStorage { public: - PmemLibpmemStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + PmemLibpmemStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~PmemLibpmemStorage() override {} - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -573,10 +531,8 @@ class PmemLibpmemStorage : public SingleTierStorage { protected: friend class DramPmemStorage; - void SetTotalDims(int64 total_dims) override {} - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -590,15 +546,15 @@ class PmemLibpmemStorage : public SingleTierStorage { template class LevelDBStore : public SingleTierStorage { public: - LevelDBStore(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LevelDBKV(sc.path), lc) { + LevelDBStore(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LevelDBKV(sc.path, feat_desc), feat_desc) { } ~LevelDBStore() override {} TF_DISALLOW_COPY_AND_ASSIGN(LevelDBStore); - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -608,29 +564,25 @@ class LevelDBStore : public SingleTierStorage { LevelDBKV* leveldb_kv = reinterpret_cast*>(SingleTierStorage::kv_); return new DBValueIterator( - key_list, 
emb_index, value_len, leveldb_kv); + key_list, emb_index, value_len, + leveldb_kv, SingleTierStorage::feat_desc_); } public: friend class DramLevelDBStore; - - protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); - } }; template class SsdHashStorage : public SingleTierStorage { public: - SsdHashStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new SSDHashKV(sc.path, alloc), lc) { + SsdHashStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new SSDHashKV(sc.path, feat_desc), feat_desc) { } ~SsdHashStorage() override {} TF_DISALLOW_COPY_AND_ASSIGN(SsdHashStorage); - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -691,8 +643,9 @@ class SsdHashStorage : public SingleTierStorage { #endif protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); + void Init() override { + dynamic_cast*>( + SingleTierStorage::kv_)->Init(); } }; } // embedding diff --git a/tensorflow/core/framework/embedding/ssd_hash_kv.h b/tensorflow/core/framework/embedding/ssd_hash_kv.h index 8040421233e..f51c6904a50 100644 --- a/tensorflow/core/framework/embedding/ssd_hash_kv.h +++ b/tensorflow/core/framework/embedding/ssd_hash_kv.h @@ -25,17 +25,12 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/ssd_record_descriptor.h" #include "tensorflow/core/framework/embedding/emb_file_creator.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/util/env_var.h" namespace tensorflow { - -template -class ValuePtr; - namespace embedding { class EmbPosition { public: @@ -115,55 +110,6 @@ class SSDIterator { } } - virtual void Key(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - memcpy((char*)val, &((file_map_[f_id])[curr_vec_].first), dim); - } - - virtual void Value(char* val, int64 dim, int64 value_offset) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, dim, - posi->offset_ + value_offset + sizeof(FixedLengthHeader)); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_ + - value_offset + sizeof(FixedLengthHeader), dim); - } - } - - virtual void Freq(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, sizeof(FixedLengthHeader), - posi->offset_); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_, - sizeof(FixedLengthHeader)); - } - *((int64*)val) = - reinterpret_cast(val)->GetFreqCounter(); - } - - virtual void Version(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, sizeof(FixedLengthHeader), - posi->offset_); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_, - sizeof(FixedLengthHeader)); - } - *((int64*)val) = - 
reinterpret_cast(val)->GetGlobalStep(); - } - virtual K Key() { int64 f_id = file_id_vec_[curr_file_]; return (file_map_[f_id])[curr_vec_].first; @@ -192,8 +138,9 @@ class SSDIterator { template class SSDHashKV : public KVInterface { public: - explicit SSDHashKV(const std::string& path, Allocator* alloc) - : alloc_(alloc) { + explicit SSDHashKV(const std::string& path, + FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { path_ = io::JoinPath( path, "ssd_kv_" + std::to_string(Env::Default()->NowMicros()) + "_"); hash_map_.max_load_factor(0.8); @@ -205,9 +152,6 @@ class SSDHashKV : public KVInterface { evict_file_set_.set_counternum(16); evict_file_set_.set_deleted_key(DELETED_KEY); - new_value_ptr_fn_ = [this](size_t size) { - return new NormalContiguousValuePtr(alloc_, size); - }; is_async_compaction_ = true; TF_CHECK_OK(ReadBoolFromEnvVar("TF_SSDHASH_ASYNC_COMPACTION", true, &is_async_compaction_)); @@ -224,7 +168,7 @@ class SSDHashKV : public KVInterface { "Use Sync Compactor in SSDHashKV of Multi-tier Embedding Storage!"; compaction_fn_ = [this](){Compaction();}; check_buffer_fn_ = [this](){CheckBuffer();}; - save_kv_fn_ = [this](K key, const ValuePtr* value_ptr, + save_kv_fn_ = [this](K key, const void* value_ptr, bool is_compaction=false) { SaveKV(key, value_ptr, is_compaction); }; @@ -233,7 +177,7 @@ class SSDHashKV : public KVInterface { "Use Async Compactor in SSDHashKV of Multi-tier Embedding Storage!"; compaction_fn_ = [](){}; check_buffer_fn_ = [this](){CheckBufferAsync();}; - save_kv_fn_ = [this](K key, const ValuePtr* value_ptr, + save_kv_fn_ = [this](K key, const void* value_ptr, bool is_compaction=false) { SaveKVAsync(key, value_ptr, is_compaction); }; @@ -244,9 +188,8 @@ class SSDHashKV : public KVInterface { } } - void SetTotalDims(int total_dims) override { - total_dims_ = total_dims; - val_len_ = sizeof(FixedLengthHeader) + total_dims_ * sizeof(V); + void Init() { + val_len_ = feat_desc_->data_bytes(); max_app_count_ = BUFFER_SIZE / 
val_len_; write_buffer_ = new char[BUFFER_SIZE]; unsigned int max_key_count = 1 + int(BUFFER_SIZE / val_len_); @@ -334,18 +277,18 @@ class SSDHashKV : public KVInterface { return Status::OK(); } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { auto iter = hash_map_.find_wait_free(key); if (iter.first == EMPTY_KEY) { return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV."); } else { - ValuePtr* val = new_value_ptr_fn_(total_dims_); + void* val = feat_desc_->Allocate(); EmbPosition* posi = iter.second; if (posi->flushed_) { - emb_files_[posi->version_]->Read((char*)(val->GetPtr()), + emb_files_[posi->version_]->Read((char*)val, val_len_, posi->offset_); } else { - memcpy((char*)val->GetPtr(), + memcpy((char*)val, write_buffer_ + posi->buffer_offset_, val_len_); } *value_ptr = val; @@ -363,17 +306,17 @@ class SSDHashKV : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { return Status::OK(); } Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return BatchCommit(keys, value_ptrs); } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { compaction_fn_(); __sync_fetch_and_add(&total_app_count_, keys.size()); for (int i = 0; i < keys.size(); i++) { @@ -384,7 +327,7 @@ class SSDHashKV : public KVInterface { return Status::OK(); } - Status Commit(K key, const ValuePtr* value_ptr) override { + Status Commit(K key, const void* value_ptr) override { compaction_fn_(); __sync_fetch_and_add(&total_app_count_, 1); check_buffer_fn_(); @@ -402,7 +345,7 @@ class SSDHashKV : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { return Status::OK(); } @@ -467,8 +410,8 
@@ class SSDHashKV : public KVInterface { int64 Size() const override { return hash_map_.size_lockless(); } - void FreeValuePtr(ValuePtr* value_ptr) override { - delete value_ptr; + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); } private: @@ -555,10 +498,10 @@ class SSDHashKV : public KVInterface { } void AppendToWriteBuffer(size_t curr_buffer_offset, K key, - const ValuePtr* value_ptr) { + const void* value_ptr) { current_offset_ += val_len_; memcpy(write_buffer_ + curr_buffer_offset, - (char*)value_ptr->GetPtr(), val_len_); + (char*)value_ptr, val_len_); key_buffer_[buffer_cur_] = key; ++buffer_cur_; } @@ -582,7 +525,7 @@ class SSDHashKV : public KVInterface { return flag; } - void SaveKV(K key, const ValuePtr* value_ptr, + void SaveKV(K key, const void* value_ptr, bool is_compaction = false) { size_t curr_buffer_offset = buffer_cur_ * val_len_; EmbPosition* ep = new EmbPosition(current_offset_, current_version_, @@ -608,7 +551,7 @@ class SSDHashKV : public KVInterface { } } - void SaveKVAsync(K key, const ValuePtr* value_ptr, + void SaveKVAsync(K key, const void* value_ptr, bool is_compaction = false) { size_t curr_buffer_offset = buffer_cur_ * val_len_; EmbPosition* ep = new EmbPosition(current_offset_, evict_version_, @@ -681,21 +624,21 @@ class SSDHashKV : public KVInterface { } void MoveToNewFile() { - ValuePtr* val = new_value_ptr_fn_(total_dims_); + void* val = feat_desc_->Allocate(); for (auto it : evict_file_map_) { EmbFile* file = emb_files_[it.first]; total_app_count_ -= file->InvalidCount(); file->MapForRead(); for (auto it_vec : it.second) { EmbPosition* posi = it_vec.second; - file->ReadWithMemcpy((char*)(val->GetPtr()), val_len_, + file->ReadWithMemcpy((char*)val, val_len_, posi->offset_); CheckBuffer(); SaveKV(it_vec.first, val, true); } file->UnmapForRead(); } - delete val; + feat_desc_->Deallocate(val); } void MoveToNewFileAsync() { @@ -825,11 +768,10 @@ class SSDHashKV : public KVInterface { char* 
write_buffer_ = nullptr; K* key_buffer_ = nullptr; bool is_async_compaction_; - Allocator* alloc_ = nullptr; + FeatureDescriptor* feat_desc_; int total_dims_; std::string path_; - std::function*(size_t)> new_value_ptr_fn_; typedef google::dense_hash_map_lockless LockLessHashMap; LockLessHashMap hash_map_; @@ -857,7 +799,7 @@ class SSDHashKV : public KVInterface { std::function compaction_fn_; std::function check_buffer_fn_; - std::function*, bool)> save_kv_fn_; + std::function save_kv_fn_; EmbFileCreator* emb_file_creator_ = nullptr; }; template diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index bb949183492..1ffb435054b 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -40,9 +40,6 @@ using GPUDevice = Eigen::GpuDevice; template class CheckpointLoader; -template -class ValuePtr; - template class EmbeddingVar; @@ -57,9 +54,6 @@ class BundleReader; template struct EmbeddingVarContext; -namespace { - const int kSavedPartitionNum = 1000; -} namespace embedding { template @@ -67,42 +61,40 @@ class Storage { friend class CheckpointLoader; public: explicit Storage(const StorageConfig& storage_config) - : storage_config_(storage_config) {} + : storage_config_(storage_config) { + initialize_value_.resize(storage_config.embedding_config.slot_num + 1); + } virtual ~Storage() {} TF_DISALLOW_COPY_AND_ASSIGN(Storage); - virtual Status Get(K key, ValuePtr** value_ptr) = 0; + virtual Status Get(K key, void** value_ptr) = 0; #if GOOGLE_CUDA virtual void BatchGet(const EmbeddingVarContext& ctx, const K* key, - ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) {} + void** value_ptr_list, + int64 num_of_keys) {} virtual void BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* key, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_found_cursor_list) {} #endif //GOOGLE_CUDA virtual 
Status Contains(K key) = 0; - virtual void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) = 0; - virtual void Insert(K key, ValuePtr* value_ptr) = 0; - virtual void SetAllocLen(int64 value_len, int slot_num) = 0; + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) = 0; + virtual void Insert(K key, void** value_ptr) = 0; + virtual void Init() {} virtual void SetValueLen(int64 value_len) {} - virtual Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) = 0; - virtual Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) = 0; + virtual Status GetOrCreate(K key, void** value_ptr) = 0; virtual int LookupTier(K key) const = 0; virtual Status Remove(K key) = 0; virtual int64 Size() const = 0; virtual int64 Size(int level) const = 0; virtual Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) = 0; + std::vector* value_ptr_list) = 0; virtual Status Save( const string& tensor_name, const string& prefix, @@ -113,7 +105,7 @@ class Storage { V* default_value) = 0; virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) = 0; + const std::vector& value_ptrs) = 0; virtual Status Eviction(K* evict_ids, int64 evict_size) = 0; @@ -121,7 +113,7 @@ class Storage { int total, const K* keys, const std::list& copyback_cursor, V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -149,25 +141,11 @@ class Storage { Allocator* alloc, int64 value_len, int64 block_size) = 0; - virtual void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) = 0; - virtual void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, int64 num_of_value_ptrs) = 0; inline mutex* get_mutex() { return &mu_; } inline int64 GetAllocLen() { return alloc_len_; } inline int64 GetOffset(int64 index) { return alloc_len_ * index; } inline 
int64 GetTotalDims() { return total_dims_; } - inline int64 ComputeAllocLen(int64 value_len) { - if (LayoutType::COMPACT == storage_config_.layout_type) { - return value_len; - } else { - return (value_len * sizeof(V) % 16 == 0) - ? value_len - : value_len + (16 - (sizeof(V) * value_len) % 16) / sizeof(V); - } - } - inline LayoutType GetLayoutType() { return storage_config_.layout_type; } inline embedding::StorageType GetStorageType() { return storage_config_.type; } inline std::string GetStoragePath() { return storage_config_.path; } inline embedding::CacheStrategy @@ -183,7 +161,7 @@ class Storage { } inline void Insert(const std::vector& keys, - ValuePtr** value_ptrs) { + void** value_ptrs) { for (size_t i = 0; i < keys.size(); i++) { Insert(keys[i], value_ptrs[i]); } @@ -211,6 +189,13 @@ class Storage { reset_version, reader); restorer.RestoreCkpt(emb_config, device); }; + + virtual void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) = 0; + + virtual void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) = 0; protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, @@ -227,12 +212,7 @@ class Storage { const std::string& ssd_emb_file_name, EmbeddingVar* ev, RestoreSSDBuffer& restore_buff) { - int64 alloc_len = Storage::ComputeAllocLen(value_len); - auto* alloc = ev->GetAllocator(); for (int64 i = 0; i < restore_buff.num_of_keys; i++) { - ValuePtr* value_ptr = nullptr; - ev->LookupOrCreateKey(restore_buff.key_list_buf[i], &value_ptr); - value_ptr->SetInitialized(emb_index); int64 file_id = restore_buff.key_file_id_list_buf[i]; int64 key_offset = restore_buff.key_offset_list_buf[i]; // Read data from embedding files on SSD. 
Data are stored in @@ -240,32 +220,29 @@ class Storage { std::stringstream ss; ss << ssd_emb_file_name << "/" << file_id << ".emb"; int fd = open(ss.str().data(), O_RDONLY); + EmbeddingConfig& emb_config = storage_config_.embedding_config; + FeatureDescriptor normal_feat_desc( + emb_config.block_num, emb_config.slot_num + 1, + ev_allocator(), StorageType::DRAM, true, + true, {false, 0}); + void* value_ptr = normal_feat_desc.Allocate(); char* file_addr = (char*)mmap(nullptr, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1) + + normal_feat_desc.data_bytes() + key_offset, PROT_READ, MAP_PRIVATE, fd, 0); - - NormalContiguousValuePtr tmp_value_ptr(alloc, - alloc_len * (emb_slot_num + 1)); - void* ptr = tmp_value_ptr.GetPtr(); - memcpy(ptr, file_addr + key_offset, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1)); + memcpy(value_ptr, file_addr + key_offset, + normal_feat_desc.data_bytes()); munmap(file_addr, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1) + + normal_feat_desc.data_bytes() + key_offset); close(fd); // Copy Data to ValuePtr, data of slots are set by primary here. 
- for (int j = 0; j < emb_slot_num + 1; j++) { - V* value = tmp_value_ptr.GetValue(j, alloc_len * j); - if (value != nullptr) { - value_ptr->GetOrAllocate(alloc, value_len, value, j, alloc_len * j); - } - } - value_ptr->SetFreq(tmp_value_ptr.GetFreq()); - value_ptr->SetStep(tmp_value_ptr.GetStep()); + int64 import_freq = normal_feat_desc.GetFreq(value_ptr); + int64 import_version = normal_feat_desc.GetVersion(value_ptr); + V* value = normal_feat_desc.GetEmbedding(value_ptr, emb_index); + Import(restore_buff.key_list_buf[i], value, + import_freq, import_version, emb_index); + normal_feat_desc.Deallocate(value_ptr); } return Status::OK(); } @@ -273,10 +250,11 @@ class Storage { private: void GeneratePartitionedCkptData( const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, EmbeddingVarCkptData* partitioned_ckpt_data, const EmbeddingConfig& emb_config, - V* default_value) { + V* default_value, + FeatureDescriptor* feat_desc) { std::vector> ev_ckpt_data_parts(kSavedPartitionNum); @@ -293,7 +271,43 @@ class Storage { ev_ckpt_data_parts[part_id].Emplace( key_list[i], value_ptr_list[i], emb_config, default_value, - GetOffset(emb_config.emb_index), + feat_desc, + is_save_freq, + is_save_version, + save_unfiltered_features); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + void GeneratePartitionedCkptData( + const std::vector& key_list, + const std::vector& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data, + const EmbeddingConfig& emb_config, + V* default_value, + const std::vector*>& feat_desc) { + std::vector> + ev_ckpt_data_parts(kSavedPartitionNum); + + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar( + "TF_EV_SAVE_FILTERED_FEATURES", true, &save_unfiltered_features)); + + bool is_save_freq = emb_config.is_save_freq(); + bool is_save_version = emb_config.is_save_version(); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id 
= 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + int feat_desc_type = (int64)value_ptr_list[i] >> kDramFlagOffset; + ev_ckpt_data_parts[part_id].Emplace( + key_list[i], value_ptr_list[i], + emb_config, default_value, + feat_desc[feat_desc_type], is_save_freq, is_save_version, save_unfiltered_features); @@ -333,12 +347,33 @@ class Storage { int64 value_len, V* default_value, const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, + FeatureDescriptor* feat_desc, + ValueIterator* value_iter = nullptr) { + EmbeddingVarCkptData partitioned_ckpt_data; + GeneratePartitionedCkptData(key_list, value_ptr_list, + &partitioned_ckpt_data, emb_config, + default_value, feat_desc); + Status s = + partitioned_ckpt_data.ExportToCkpt( + tensor_name, writer, value_len, value_iter); + return Status::OK(); + } + + Status SaveToCheckpoint( + const string& tensor_name, + BundleWriter* writer, + const EmbeddingConfig& emb_config, + int64 value_len, + V* default_value, + const std::vector& key_list, + const std::vector& value_ptr_list, + const std::vector*>& feat_desc, ValueIterator* value_iter = nullptr) { EmbeddingVarCkptData partitioned_ckpt_data; GeneratePartitionedCkptData(key_list, value_ptr_list, &partitioned_ckpt_data, emb_config, - default_value); + default_value, feat_desc); Status s = partitioned_ckpt_data.ExportToCkpt( tensor_name, writer, value_len, value_iter); @@ -366,6 +401,7 @@ class Storage { mutex mu_; std::atomic_flag flag_ = ATOMIC_FLAG_INIT; + std::vector initialize_value_; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/storage_config.h b/tensorflow/core/framework/embedding/storage_config.h index 85e44879dcb..23babc9ef08 100644 --- a/tensorflow/core/framework/embedding/storage_config.h +++ b/tensorflow/core/framework/embedding/storage_config.h @@ -17,13 +17,11 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/embedding_config.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" namespace tensorflow { namespace embedding { struct StorageConfig { StorageConfig() : type(StorageType::DEFAULT), path(""), - layout_type(LayoutType::NORMAL), cache_strategy(CacheStrategy::LFU) { size = {1<<30,1<<30,1<<30,1<<30}; } @@ -31,32 +29,14 @@ struct StorageConfig { StorageConfig(StorageType t, const std::string& p, const std::vector& s, - const std::string& layout, const EmbeddingConfig& ec, const CacheStrategy cache_strategy_ = CacheStrategy::LFU) - : type(t), - path(p), - embedding_config(ec), - cache_strategy(cache_strategy_) { - if ("normal" == layout) { - layout_type = LayoutType::NORMAL; - } else if ("light" == layout) { - layout_type = LayoutType::LIGHT; - } else if ("normal_contiguous" == layout){ - layout_type = LayoutType::NORMAL_CONTIGUOUS; - } else if ("normal_contiguous_gpu" == layout){ - layout_type = LayoutType::NORMAL_CONTIGUOUS_GPU; - } else if ("compact" == layout){ - layout_type = LayoutType::COMPACT; - } else { - LOG(WARNING) << "Unknown layout: " - << layout << ", use LayoutType::NORMAL by default."; - layout_type = LayoutType::NORMAL; - } - size = s; - } + : type(t), + path(p), + size(s), + embedding_config(ec), + cache_strategy(cache_strategy_) {} StorageType type; - LayoutType layout_type; std::string path; std::vector size; CacheStrategy cache_strategy; diff --git a/tensorflow/core/framework/embedding/storage_factory.h b/tensorflow/core/framework/embedding/storage_factory.h index 10d2d52b83f..c585b058470 100644 --- a/tensorflow/core/framework/embedding/storage_factory.h +++ b/tensorflow/core/framework/embedding/storage_factory.h @@ -16,7 +16,6 @@ limitations under the License. 
#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ #include "tensorflow/core/framework/embedding/config.pb.h" -#include "tensorflow/core/framework/embedding/layout_creator.h" #include "tensorflow/core/framework/embedding/dram_leveldb_storage.h" #include "tensorflow/core/framework/embedding/dram_pmem_storage.h" #include "tensorflow/core/framework/embedding/dram_ssd_storage.h" @@ -34,50 +33,41 @@ class StorageFactory { public: template static Storage* Create(const StorageConfig& sc, - Allocator* gpu_allocator, const string& name) { - auto layout_creator = LayoutCreatorFactory::Create(sc); - + Allocator* gpu_allocator, FeatureDescriptor* feat_desc, + const string& name) { switch (sc.type) { case StorageType::DRAM: - return new DramStorage(sc, ev_allocator(), - layout_creator, new LocklessHashMap()); + return new DramStorage(sc, feat_desc); case StorageType::PMEM_MEMKIND: - return new PmemMemkindStorage(sc, pmem_allocator(), - layout_creator); + feat_desc->SetAllocator(pmem_allocator()); + return new PmemMemkindStorage(sc, feat_desc); case StorageType::PMEM_LIBPMEM: - return new PmemLibpmemStorage(sc, - experimental_pmem_allocator(sc.path, sc.size[0]), - layout_creator); + feat_desc->SetAllocator( + experimental_pmem_allocator(sc.path, sc.size[0])); + return new PmemLibpmemStorage(sc, feat_desc); case StorageType::DRAM_PMEM: - return new DramPmemStorage(sc, ev_allocator(), - experimental_pmem_allocator(sc.path, sc.size[0]), - layout_creator, name); + return new DramPmemStorage(sc, + feat_desc, name); case StorageType::LEVELDB: case StorageType::DRAM_LEVELDB: - return new DramLevelDBStore(sc, ev_allocator(), - layout_creator, name); + return new DramLevelDBStore(sc, feat_desc, name); case StorageType::SSDHASH: case StorageType::DRAM_SSDHASH: - return new DramSsdHashStorage(sc, ev_allocator(), - layout_creator, name); + return new DramSsdHashStorage(sc, feat_desc, name); case StorageType::HBM: #if GOOGLE_CUDA - return new HbmStorage(sc, gpu_allocator, - 
layout_creator); + return new HbmStorage(sc, gpu_allocator, feat_desc); #endif // GOOGLE_CUDA case StorageType::HBM_DRAM: #if GOOGLE_CUDA - return new HbmDramStorage(sc, gpu_allocator, - ev_allocator(), layout_creator, name); + return new HbmDramStorage(sc, gpu_allocator, feat_desc, name); #endif // GOOGLE_CUDA case StorageType::HBM_DRAM_SSDHASH: #if GOOGLE_CUDA - return new HbmDramSsdStorage(sc, gpu_allocator, - ev_allocator(), layout_creator, name); + return new HbmDramSsdStorage(sc, gpu_allocator, feat_desc, name); #endif // GOOGLE_CUDA default: - return new DramStorage(sc, ev_allocator(), - layout_creator, new LocklessHashMap()); + return new DramStorage(sc, feat_desc); } } }; diff --git a/tensorflow/core/framework/embedding/value_ptr.h b/tensorflow/core/framework/embedding/value_ptr.h deleted file mode 100644 index ca7d234ed61..00000000000 --- a/tensorflow/core/framework/embedding/value_ptr.h +++ /dev/null @@ -1,647 +0,0 @@ -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ - -#include -#include -#include -#include - -#include "tensorflow/core/framework/typed_allocator.h" -#if GOOGLE_CUDA -#include -#endif // GOOGLE_CUDA - -namespace tensorflow { - -enum class LayoutType { - LIGHT, - NORMAL, - LEVELDB, - NORMAL_CONTIGUOUS, - NORMAL_CONTIGUOUS_GPU, - COMPACT, -}; - -namespace { -constexpr int COLUMN_BITSET_BYTES = 5; -constexpr int COLUMN_BITSET_SIZE = COLUMN_BITSET_BYTES * 8; - -struct MetaHeader { - unsigned char embed_num; - unsigned char value_type; - unsigned char header_size; - unsigned char column_bitset[COLUMN_BITSET_BYTES]; - - static const int kEmbeddingNumStartIndex = 0; - static const int kValueTypeStartIndex = - kEmbeddingNumStartIndex + sizeof(char); - static const int kHeaderSizeStartIndex = - kValueTypeStartIndex + sizeof(char); - static const int kColumnBitsetIndex = - kHeaderSizeStartIndex + sizeof(char); - - inline unsigned int GetEmbeddingNum() { - return (unsigned int) 
embed_num; - } - - inline void SetEmbeddingNum(size_t s) { - embed_num = (unsigned char)s; - } - - inline std::bitset GetColumnBitset() { - unsigned long meta = ((unsigned long*)this)[0]; - std::bitset bs(meta >> (8 * kColumnBitsetIndex)); - return bs; - } - - inline void SetColumnBitset(const std::bitset& bs, - unsigned int embnum) { - ((unsigned long*)(this))[0] = - (bs.to_ulong() << (8 * kColumnBitsetIndex)) | - (header_size << (8 * kHeaderSizeStartIndex)) | - (value_type << (8 * kValueTypeStartIndex)) | - (embnum << (8 * kEmbeddingNumStartIndex)); - } - - inline unsigned int GetHeaderSize() { - return (unsigned int) header_size; - } - - inline void SetHeaderSize(size_t size) { - header_size = (unsigned char)size; - } - - inline void SetLayoutType(LayoutType vt) { - value_type = (unsigned char)vt; - } - - inline LayoutType GetLayoutType() { - return (LayoutType)value_type; - } -}; - -struct LightHeader { -/*__________________________________________________________________________________________ - | | | | | embedding | slot | - | number of | valueptr | header | each bit a V* | V* | V* | - | embedding | type | size | 1 valid | actually pointer | actually pointer |... - | columns | | | 0 no-valid | by alloctor | by alloctor | - | (8 bits) | (8 bits) | (8 bits) | (40 bits) | (8 bytes) | (8 bytes) | - -------------------------------------------------------------------------------------------- -*/ - MetaHeader meta; - LightHeader() { - memset(this, 0, sizeof(LightHeader)); - meta.SetLayoutType(LayoutType::LIGHT); - meta.SetHeaderSize(sizeof(LightHeader) / sizeof(int64)); - } -}; - -struct NormalHeader { -/*_________________________________________________________________________________________________________________________ - | | | | | | | embedding | slot | - | number of | valueptr | header | each bit a V* | global step | freq counter | V* | V* | - | embedding | type | size | 1 valid | | | actually pointer | actually pointer |... 
- | columns | | | 0 no-valid | int64 | int64 | by alloctor | by alloctor | - | (8 bits) | (8 bits) | (8 bits) | (40 bits) | (8 bytes) | (8 bytes) | (8 bytes) | (8 bytes) | - -------------------------------------------------------------------------------------------------------------------------- - */ - MetaHeader meta; - int64 global_step; - int64 freq_counter; - - NormalHeader() { - memset(this, 0, sizeof(NormalHeader)); - meta.SetLayoutType(LayoutType::NORMAL); - meta.SetHeaderSize(sizeof(NormalHeader) / sizeof(int64)); - SetGlobalStep(-1); - } - - inline int64 GetGlobalStep() { - return global_step; - } - - inline void SetGlobalStep(int64 gs) { - global_step = gs; - } - - inline int64 GetFreqCounter() { - return freq_counter; - } - - inline void SetFreqCounter(int64 fc) { - freq_counter = fc; - } - - inline void AddFreq() { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + 1); - } - - inline void AddFreq(int64 count) { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + count); - } -}; - -struct FixedLengthHeader { -/*_________________________________________________________________________________ - | | | embeddings | - | slotflag + global step | freq counter | V | - | | | actually value | - | int64 | int64 | by alloctor | - | (8 bytes) | (8 bytes) | (4 * slot_num * emb_dim bytes) | - --------------------------------------------------------------------------------- -*/ - int64 global_step; - int64 freq_counter; - - FixedLengthHeader() { - memset(this, 0, sizeof(FixedLengthHeader)); - SetGlobalStep(-1); - } - - inline int64 GetGlobalStep() { - return global_step & 0x0000ffffffffffff; - } - - inline void SetGlobalStep(int64 gs) { - int64 temp = global_step; - temp &= 0xffff000000000000; - gs &= 0x0000ffffffffffff; - temp |= gs; - global_step = temp; - } - - inline void SetInitialized(int64 emb_index) { - int64 temp = 1; - temp = temp << (48 + emb_index); - global_step |= temp; - } - - inline int64 
GetFreqCounter() { - return freq_counter; - } - - inline void SetFreqCounter(int64 fc) { - freq_counter = fc; - } - - inline void AddFreq() { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + 1); - } - - inline void AddFreq(int64 count) { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + count); - } -}; -} // namespace - -template -class ValuePtr { - public: - virtual ~ValuePtr() {} - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) = 0; - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) = 0; - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) = 0; - - virtual void Destroy(Allocator* allocator) = 0; - - virtual void* GetPtr() const = 0; - - // Global Step - virtual int64 GetStep() { - LOG(FATAL) << "Unsupport GlobalStep in subclass of ValuePtrBase"; - return 0; - } - - virtual void SetStep(int64 gs) {} - - // Frequency Counter - virtual int64 GetFreq() { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - return 0; - } - - virtual void SetFreq(int64 freq) {} - - virtual void AddFreq() { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - } - - virtual void AddFreq(int64 count) { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - } - - virtual void SetValue(V val, size_t size) { - LOG(FATAL) << "Unsupport SetValue in subclass of ValuePtrBase"; - } - - virtual void SetInitialized(int64 emb_index) { - LOG(FATAL) << "Unsupport SetInitialized in subclass of ValuePtrBase"; - } - - virtual bool SetPtr(V* ptr) { - LOG(FATAL) << "Unsupport SetInitialized in subclass of ValuePtrBase"; - return false; - } - -}; - -template -class LooseValuePtr : public ValuePtr { - public: - virtual ~LooseValuePtr() {} - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const 
V* default_v, int emb_index, int offset) { - MetaHeader* meta = (MetaHeader*)ptr_; - unsigned int embnum = (unsigned int)meta->embed_num; - auto metadata = meta->GetColumnBitset(); - - if (!metadata.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - metadata = meta->GetColumnBitset(); - if (metadata.test(emb_index)) { - this->flag_.clear(std::memory_order_release); - return ((V**)((int64*)ptr_ + - (unsigned int)meta->header_size))[emb_index]; - } - embnum++ ; - int64 alloc_value_len = value_len; - V* tensor_val = (V*)allocator->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V) * alloc_value_len); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index] = tensor_val; - - metadata.set(emb_index); - // NOTE:if we use ((unsigned long*)((char*)ptr_ + 1))[0] = metadata.to_ulong(); - // the ptr_ will be occaionally modified from 0x7f18700912a0 to 0x700912a0 - // must use ((V**)ptr_ + 1 + 1)[emb_index] = tensor_val; to avoid - meta->SetColumnBitset(metadata, embnum); - this->flag_.clear(std::memory_order_release); - return tensor_val; - } else { - return ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index]; - } - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) { - return nullptr; - } - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) { - MetaHeader* meta = (MetaHeader*)ptr_; - auto metadata = meta->GetColumnBitset(); - if (metadata.test(emb_index)) { - return ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index]; - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - MetaHeader* meta = (MetaHeader*)ptr_; - unsigned int embnum = (unsigned int)meta->embed_num; - auto metadata = meta->GetColumnBitset(); - for (int i = 0; i< embnum; i++) { - if (metadata.test(i)) { - V* val = ((V**)((int64*)ptr_ + 
meta->GetHeaderSize()))[i]; - if (val != nullptr) { - allocator->DeallocateRaw(val); - } - } - } - } - - virtual void* GetPtr() const { - return ptr_; - } - - protected: - void* ptr_; - std::atomic_flag flag_ = ATOMIC_FLAG_INIT; -}; - -template -class LightValuePtr : public LooseValuePtr { - public: - LightValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*)malloc( - sizeof(LightHeader) + sizeof(int64) * size); - memset(static_cast(this->ptr_) + sizeof(LightHeader), 0, sizeof(int64) * size); - new ((char*)this->ptr_) LightHeader(); - } - - ~LightValuePtr() { - free(this->ptr_); - } -}; - -template -class NormalValuePtr : public LooseValuePtr { - public: - NormalValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*) malloc(sizeof(NormalHeader) + sizeof(int64) * size); - memset(static_cast(this->ptr_) + sizeof(NormalHeader), 0, sizeof(int64) * size); - new ((char*)this->ptr_) NormalHeader(); - } - - ~NormalValuePtr() { - free(this->ptr_); - } - - int64 GetStep() { - return ((NormalHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((NormalHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((NormalHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((NormalHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - return ((NormalHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - return ((NormalHeader*)this->ptr_)->AddFreq(count); - } -}; - -template -class NormalContiguousValuePtr : public LooseValuePtr { - public: - NormalContiguousValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = allocator->AllocateRaw(Allocator::kAllocatorAlignment, - sizeof(FixedLengthHeader) + sizeof(V) * size); - memset(static_cast(this->ptr_) + sizeof(FixedLengthHeader), 0, sizeof(V) * size); - new ((char*)this->ptr_) FixedLengthHeader(); - } - - ~NormalContiguousValuePtr() { - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, 
- const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset); - } - V* tensor_val = - ((V*)this->ptr_ + sizeof(FixedLengthHeader) / sizeof(V) + offset); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - return tensor_val; - } else { - return (V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset; - } - } - - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset); - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - allocator->DeallocateRaw(this->ptr_); - } - - int64 GetStep() { - return ((FixedLengthHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((FixedLengthHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((FixedLengthHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((FixedLengthHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - ((FixedLengthHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - ((FixedLengthHeader*)this->ptr_)->AddFreq(count); - } - - void SetValue(V val, size_t size) { - for (int i = 0; i < size; ++i) { - *((V*)this->ptr_ + sizeof(FixedLengthHeader) / sizeof(V) + i) = val; - } - } -}; - -template -class NormalGPUValuePtr : public LooseValuePtr { - public: - NormalGPUValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*) malloc(sizeof(FixedLengthHeader) + sizeof(V *)); - *(V**)((char *)this->ptr_ + 
sizeof(FixedLengthHeader)) = nullptr; - new ((char*)this->ptr_) FixedLengthHeader(); - } - - ~NormalGPUValuePtr() { - free(this->ptr_); - } - -#if GOOGLE_CUDA - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - V* tensor_val = - *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - cudaMemcpy(tensor_val, default_v, value_len * sizeof(V), - cudaMemcpyDeviceToDevice); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - } - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } -#endif // GOOGLE_CUDA - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, - bool &need_initialize) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - need_initialize = 1; - this->flag_.clear(std::memory_order_release); - return reinterpret_cast(this); - } - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - return; - } - - int64 GetStep() { - return 
((FixedLengthHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((FixedLengthHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((FixedLengthHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((FixedLengthHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - ((FixedLengthHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - ((FixedLengthHeader*)this->ptr_)->AddFreq(count); - } - - bool SetPtr(V* ptr) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - V* value_ptr = *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)); - if (value_ptr == nullptr) { - *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) = ptr; - this->flag_.clear(std::memory_order_release); - return true; - } else { - this->flag_.clear(std::memory_order_release); - return false; - } - } - - void SetInitialized(int64 emb_index) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - ((FixedLengthHeader*)this->ptr_)->SetInitialized(emb_index); - this->flag_.clear(std::memory_order_release); - } - -}; - -template -class CompactValuePtr : public ValuePtr { - public: - CompactValuePtr(Allocator* allocator, size_t size) { - memset(static_cast(this->ptr_), 0, sizeof(V) * size + sizeof(int64)); - } - - ~CompactValuePtr() { - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset); - } - V* tensor_val = - ((V*)this->ptr_ + sizeof(int64) / sizeof(V) + offset); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - return 
tensor_val; - } else { - return (V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset; - } - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) { - return nullptr; - } - - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset); - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - allocator->DeallocateRaw(this->ptr_); - } - - virtual void* GetPtr() const { - return (void*)ptr_; - } - - private: - char ptr_[23]; - std::atomic_flag flag_ = ATOMIC_FLAG_INIT; -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 115e3c4bae6..0c08c30c30a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -439,7 +439,8 @@ tf_cc_test( tf_cuda_cc_test( name = "embedding_variable_ops_test", - srcs = ["embedding_variable_ops_test.cc"], + srcs = ["embedding_variable_ops_test.cc", + "embedding_variable_test.h"], extra_copts = ["-fexceptions", "-g"], deps = [ ":io", @@ -6497,7 +6498,7 @@ tf_kernel_library( "training_ali_ops_gpu.h", "training_ali_ops.h" ], - copts = tf_copts(), + copts = tf_copts() + ["-g"], deps = [ ":bounds_check", ":training_op_helpers", diff --git a/tensorflow/core/kernels/embedding_variable_memory_test.cc b/tensorflow/core/kernels/embedding_variable_memory_test.cc index 7ec6b1cf109..393e9a9754b 100644 --- a/tensorflow/core/kernels/embedding_variable_memory_test.cc +++ b/tensorflow/core/kernels/embedding_variable_memory_test.cc @@ -19,17 +19,22 @@ namespace embedding { float PerfMemory(Tensor& default_value, const std::vector& id_list, int value_size, int64 default_value_dim, - int64 filter_freq = 0) { + int64 filter_freq = 0, int64 
steps_to_live = 0, + int64 record_freq = false) { auto ev = CreateEmbeddingVar(value_size, default_value, - default_value_dim, filter_freq); - ValuePtr* value_ptr = nullptr; + default_value_dim, filter_freq, + steps_to_live, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + record_freq); + void* value_ptr = nullptr; bool is_filter = false; double start_mem, end_mem; start_mem = getResident() * getpagesize(); for (int i = 0; i < id_list.size(); i++) { ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); if (is_filter) - ev->flat(value_ptr, id_list[i]); + ev->flat(value_ptr); } end_mem = getResident() * getpagesize(); double used_mb = (end_mem - start_mem)/1000000; @@ -58,7 +63,7 @@ TEST(EmbeddingVariabelMemoryTest, TestMemory) { float used_mb = PerfMemory(default_value, id_list, value_size, default_value_dim); float theoritical_mb = - 50 + num_of_ids * (32 + 32 + value_size * sizeof(float))/ 1000000; + 50 + num_of_ids * (value_size * sizeof(float)) / 1000000; EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && (used_mb < theoritical_mb * 1.01)); @@ -68,9 +73,10 @@ TEST(EmbeddingVariabelMemoryTest, TestMemory) { used_mb = PerfMemory(default_value, id_list, value_size, default_value_dim, filter_freq); theoritical_mb = - 50 + num_of_ids * (32 + 32 + 16 + value_size * sizeof(float)/2)/ 1000000; + 50 + num_of_ids * (8 + value_size * sizeof(float) / 2 + + 4/*memory for ids_list*/) / 1000000; EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && - (used_mb < theoritical_mb * 1.01)); + (used_mb < theoritical_mb * 1.02)); } } //namespace embedding } //namespace tensorflow diff --git a/tensorflow/core/kernels/embedding_variable_ops_test.cc b/tensorflow/core/kernels/embedding_variable_ops_test.cc index 4839c171708..e30381fef07 100644 --- a/tensorflow/core/kernels/embedding_variable_ops_test.cc +++ b/tensorflow/core/kernels/embedding_variable_ops_test.cc @@ -21,6 +21,7 @@ #include "tensorflow/core/framework/tensor.h" #include 
"tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/embedding_variable_test.h" #include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/io/path.h" @@ -48,18 +49,6 @@ namespace { const int THREADNUM = 16; const int64 max = 2147483647; -template -class TestableEmbeddingVar : public EmbeddingVar { - public: - TestableEmbeddingVar(const string& name, - embedding::Storage* storage, - EmbeddingConfig emb_cfg = EmbeddingConfig(), - Allocator* alloc = nullptr) : EmbeddingVar( - name, storage, emb_cfg, alloc) {} - - using EmbeddingVar::GetFilter; -}; - struct ProcMemory { long size; // total program size long resident; // resident set size @@ -123,11 +112,7 @@ TEST(EmbeddingVariableTest, TestEmptyEV) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); { - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); LOG(INFO) << "size:" << variable->Size(); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); @@ -191,19 +176,14 @@ TEST(EmbeddingVariableTest, TestEVExportSmallLockless) { int64 value_size = 8; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddigVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(0, 0, 1, 1, "", 5), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); for (int64 i = 0; i 
< 5; i++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); vflat(i) = 5.0; } @@ -269,20 +249,15 @@ TEST(EmbeddingVariableTest, TestEVExportLargeLockless) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(0, 0, 1, 1, "", 5), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); int64 ev_size = 10048576; for (int64 i = 0; i < ev_size; i++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); } LOG(INFO) << "size:" << variable->Size(); @@ -344,9 +319,9 @@ TEST(EmbeddingVariableTest, TestEVExportLargeLockless) { void multi_insertion(EmbeddingVar* variable, int64 value_size){ for (long j = 0; j < 5; j++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(j, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, j); + typename TTypes::Flat vflat = variable->flat(value_ptr); } } @@ -355,12 +330,7 @@ TEST(EmbeddingVariableTest, TestMultiInsertion) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); 
- auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); std::vector insert_threads(THREADNUM); for (size_t i = 0 ; i < THREADNUM; i++) { @@ -375,54 +345,45 @@ TEST(EmbeddingVariableTest, TestMultiInsertion) { void InsertAndLookup(EmbeddingVar* variable, int64 *keys, long ReadLoops, int value_size){ - float *default_value_fake = (float *)malloc((value_size)*sizeof(float)); - for (int j = 0; j < value_size; j++) { - default_value_fake[j] = -1.0; - } for (long j = 0; j < ReadLoops; j++) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - float *default_value = (float *)malloc((value_size)*sizeof(float)); - for (int k = 0; k < value_size; k++) { - default_value[k] = (float)keys[j]; - } - variable->LookupOrCreate(keys[j], val, default_value); - variable->LookupOrCreate(keys[j], val, default_value_fake); - ASSERT_EQ(default_value[0] , val[0]); - free(val); - free(default_value); + void* val = nullptr; + void* val_1 = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(keys[j], &val, &is_filter, false); + variable->LookupOrCreateKey(keys[j], &val_1, &is_filter, false); + ASSERT_EQ(val, val_1); } - free(default_value_fake); } void MultiBloomFilter(EmbeddingVar* var, int value_size, int64 i) { for (long j = 0; j < 1; j++) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - var->LookupOrCreate(i+1, val, nullptr); + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(i+1, &val, &is_filter, false); } } TEST(EmbeddingVariableTest, TestBloomFilter) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); - test::FillValues(&value, std::vector(value_size, 10.0)); - float* fill_v = (float*)malloc(value_size * sizeof(float)); - - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new 
EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, "normal", 10, 0.01), - cpu_allocator()); - - var->Init(value, 1); - - float *val = (float *)malloc((value_size+1)*sizeof(float)); - float *default_value = (float *)malloc((value_size+1)*sizeof(float)); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(2, val, default_value); + std::vector default_value = + {0.0 ,1.0 ,2.0 ,3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; + test::FillValues(&value, default_value); + + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01); + + //float *val = (float *)malloc((value_size+1)*sizeof(float)); + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(2, &val, &is_filter, false); std::vector keylist; std::vector valuelist; @@ -437,14 +398,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt64) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal", 10, 0.01, DT_UINT64), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT64); float *val = (float 
*)malloc((value_size+1)*sizeof(float)); @@ -509,14 +467,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt32) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal", 10, 0.01, DT_UINT32), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT32); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -581,14 +536,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt16) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal_contiguous", 10, 0.01, DT_UINT16), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT16); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -654,14 +606,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt8) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal_contiguous", 10, 0.01, DT_UINT8), cpu_allocator()); - - 
var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT8); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -725,12 +674,7 @@ TEST(EmbeddingVariableTest, TestInsertAndLookup) { int64 value_size = 128; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); int64 InsertLoops = 1000; bool* flag = (bool *)malloc(sizeof(bool)*max); @@ -765,8 +709,9 @@ TEST(EmbeddingVariableTest, TestInsertAndLookup) { } void MultiFilter(EmbeddingVar* variable, int value_size) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - variable->LookupOrCreate(20, val, nullptr); + bool is_filter = true; + void* val; + variable->LookupOrCreateKey(20, &val, &is_filter, false); } TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { @@ -774,14 +719,8 @@ TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 7), - cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1, 7, 5); + float *val = (float *)malloc((value_size+1)*sizeof(float)); int thread_num = 5; std::vector insert_threads(thread_num); @@ -792,20 +731,16 @@ TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { t.join(); } - 
ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; var->LookupOrCreateKey(20, &value_ptr); - ASSERT_EQ(value_ptr->GetFreq(), thread_num); + ASSERT_EQ(var->GetFreq(20), thread_num); } EmbeddingVar* InitEV_Lockless(int64 value_size) { Tensor value(DT_INT64, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); + auto variable = CreateEmbeddingVar(value_size, value, 1); - variable->Init(value, 1); return variable; } @@ -813,7 +748,7 @@ void MultiLookup(EmbeddingVar* variable, int64 InsertLoop, int thread_num, int i) { for (int64 j = i * InsertLoop/thread_num; j < (i+1)*InsertLoop/thread_num; j++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(j, &value_ptr); } } @@ -829,9 +764,9 @@ void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) { float* fill_v = (float*)malloc(value_size * sizeof(float)); for (int64 i = 0; i < InsertLoop; i++){ - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); } testing::StartTiming(); @@ -848,58 +783,6 @@ void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) { } -void hybrid_process(EmbeddingVar* variable, - int64* keys, int64 InsertLoop, int thread_num, - int64 i, int64 value_size) { - float *val = (float *)malloc(sizeof(float)*(value_size + 1)); - for (int64 j = i * InsertLoop/thread_num; - j < (i+1) * InsertLoop/thread_num; j++) { - variable->LookupOrCreate(keys[j], val, nullptr); - } -} - -void BM_HYBRID_LOCKLESS(int iters, int thread_num) { - testing::StopTiming(); - testing::UseRealTime(); - - int64 value_size = 128; - auto variable = InitEV_Lockless(value_size); - int64 
InsertLoop = 1000000; - - srand((unsigned)time(NULL)); - int64 *keys = (int64 *)malloc(sizeof(int64)*InsertLoop); - - for (int64 i = 0; i < InsertLoop; i++) { - keys[i] = rand() % 1000; - } - - testing::StartTiming(); - while (iters--) { - std::vector insert_threads(thread_num); - for (size_t i = 0 ; i < thread_num; i++) { - insert_threads[i] = std::thread(hybrid_process, - variable, keys, InsertLoop, thread_num, i, value_size); - } - for (auto &t : insert_threads) { - t.join(); - } - } -} - -BENCHMARK(BM_MULTIREAD_LOCKLESS) - ->Arg(1) - ->Arg(2) - ->Arg(4) - ->Arg(8) - ->Arg(16); - -BENCHMARK(BM_HYBRID_LOCKLESS) - ->Arg(1) - ->Arg(2) - ->Arg(4) - ->Arg(8) - ->Arg(16); - TEST(EmbeddingVariableTest, TestAllocate) { int value_len = 8; @@ -923,23 +806,13 @@ TEST(EmbeddingVariableTest, TestEVStorageType_DRAM) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(/*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */1, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); int64 ev_size = 100; for (int64 i = 0; i < ev_size; i++) { - variable->LookupOrCreate(i, fill_v, nullptr); + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(i, &val, &is_filter, false); } LOG(INFO) << "size:" << variable->Size(); @@ -947,59 +820,20 @@ TEST(EmbeddingVariableTest, TestEVStorageType_DRAM) { void t1(KVInterface* hashmap) { for (int i = 0; 
i< 100; ++i) { - hashmap->Insert(i, new NormalValuePtr(ev_allocator(), 100)); + hashmap->Insert(i, nullptr); } } TEST(EmbeddingVariableTest, TestRemoveLockless) { - KVInterface* hashmap = new LocklessHashMap(); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(t1, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} - -TEST(EmbeddingVariableTest, TestBatchCommitofDBKV) { - int64 value_size = 4; + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM, + false, false, {false, 0}); KVInterface* hashmap = - new LevelDBKV(testing::TmpDir()); - hashmap->SetTotalDims(value_size); - - for (int64 i = 0; i < 6; ++i) { - const ValuePtr* tmp = - new NormalContiguousValuePtr(ev_allocator(), value_size); - hashmap->Commit(i, tmp); - } - - for(int64 i = 0; i < 6; i++) { - ValuePtr* tmp = nullptr; - Status s = hashmap->Lookup(i, &tmp); - ASSERT_EQ(s.ok(), true); - } -} - -void InsertAndCommit(KVInterface* hashmap) { - for (int64 i = 0; i< 100; ++i) { - const ValuePtr* tmp = - new NormalContiguousValuePtr(ev_allocator(), 100); - hashmap->Insert(i, tmp); - hashmap->Commit(i, tmp); - } -} - -TEST(EmbeddingVariableTest, TestSizeDBKV) { - KVInterface* hashmap = - new LevelDBKV(testing::TmpDir()); - hashmap->SetTotalDims(100); + new LocklessHashMap(feat_desc); + feat_desc->InitSlotInfo(0, 100, {nullptr, 1}); ASSERT_EQ(hashmap->Size(), 0); LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(InsertAndCommit, hashmap); + auto t = std::thread(t1, hashmap); t.join(); LOG(INFO) << "hashmap size: " << hashmap->Size(); ASSERT_EQ(hashmap->Size(), 100); @@ -1190,213 +1024,6 @@ TEST(EmbeddingVariableTest, TestLFUCache) { } } -TEST(EmbeddingVariableTest, 
TestCacheRestore) { - setenv("TF_SSDHASH_ASYNC_COMPACTION", "false", 1); - int64 value_size = 4; - Tensor value(DT_FLOAT, TensorShape({value_size})); - test::FillValues(&value, std::vector(value_size, 9.0)); - float* fill_v = (float*)malloc(value_size * sizeof(float)); - std::vector size; - size.emplace_back(64); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal_contiguous", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage= embedding::StorageFactory::Create( - embedding::StorageConfig(embedding::DRAM_SSDHASH, - testing::TmpDir(), - size, "normal_contiguous", - emb_config), - cpu_allocator(), - "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, emb_config, cpu_allocator()); - variable->Init(value, 1); - variable->InitCache(CacheStrategy::LFU); - - Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); - - int64 ev_size = 7; - int64 cache_size = 3; - for (int64 i = 1; i < cache_size; i++) { - ValuePtr* value_ptr = nullptr; - variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); - value_ptr->AddFreq(2); - } - for (int64 i = cache_size; i < ev_size; i++) { - ValuePtr* value_ptr = nullptr; - variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); - value_ptr->AddFreq(1); - } - - LOG(INFO) << "size:" << variable->Size(); - - BundleWriter writer(Env::Default(), Prefix("foo")); - embedding::ShrinkArgs shrink_args; - shrink_args.global_step = 1; - variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); - TF_ASSERT_OK(writer.Finish()); - variable->Unref(); - - auto imported_storage= embedding::StorageFactory::Create( - 
embedding::StorageConfig(embedding::DRAM_SSDHASH, - testing::TmpDir(), - size, "normal_contiguous", - emb_config), - cpu_allocator(), - "EmbeddingVar1"); - auto imported_variable = new EmbeddingVar("EmbeddingVar1", - imported_storage, emb_config, cpu_allocator()); - imported_variable->Init(value, 1); - imported_variable->InitCache(CacheStrategy::LFU); - - BundleReader reader(Env::Default(), Prefix("foo")); - std::string name_string("var"); - imported_variable->Restore(name_string, Prefix("foo"), 0, 1, false, &reader, false); - - ASSERT_EQ(imported_storage->Size(0), ev_size - cache_size); - ASSERT_EQ(imported_storage->Size(1), 2); - delete imported_storage; -} - -void t1_gpu(KVInterface* hashmap) { - for (int i = 0; i< 100; ++i) { - hashmap->Insert(i, new NormalGPUValuePtr(ev_allocator(), 100)); - } -} - -#if GOOGLE_CUDA -TEST(EmbeddingVariableTest,TestRemoveLocklessCPU) { - SessionOptions sops; - std::unique_ptr device = - DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0"); - Allocator* gpu_allocator = GPUProcessState::singleton()->GetGPUAllocator( - GPUOptions(), TfGpuId(0), 1 << 26); - KVInterface* hashmap = - new LocklessHashMapCPU(gpu_allocator); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(t1, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} -#endif // GOOGLE_CUDA - -/*void CommitGPU(KVInterface* hashmap) { - for (int64 i = 0; i< 100; ++i) { - ValuePtr* tmp= new NormalGPUValuePtr(ev_allocator(), 100); - hashmap->Commit(i, tmp); - } -} - -TEST(EmbeddingVariableTest, TestCommitHashMapCPU) { - KVInterface* hashmap = new LocklessHashMapCPU(); - hashmap->SetTotalDims(100); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = 
std::thread(CommitGPU, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} - -TEST(EmbeddingVariableTest, TestGPUValuePtr) { - int ev_list_size = 32; - ValuePtr* ptr_ = new NormalGPUValuePtr(ev_allocator(), ev_list_size); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float host_data[ev_list_size]; - float initial_data[ev_list_size]; - for(int i = 0;i < ev_list_size;++i){ - initial_data[i] = 10; - } - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << initial_data[i]; - } - cudaMemcpy(address, initial_data, ev_list_size * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(host_data, address, ev_list_size * sizeof(float), cudaMemcpyDeviceToHost); - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << host_data[i]; - } -}//Forbidden, due to no gpu allocator at that time - -TEST(EmbeddingVariableTest, TestCommitValue) { - int ev_list_size = 32; - ValuePtr* ptr_ = new NormalGPUValuePtr(ev_allocator(),ev_list_size); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float initial_data[ev_list_size]; - for(int i = 0;i < ev_list_size;++i){ - initial_data[i] = 10; - } - cudaMemcpy(address, initial_data, ev_list_size * sizeof(float), cudaMemcpyHostToDevice); - KVInterface* hashmap = new LocklessHashMapCPU(); - hashmap->SetTotalDims(ev_list_size); - hashmap->Commit(1, ptr_); - ValuePtr* check; - hashmap->Lookup(1,&check); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - float* tmp = (float *)((char *)check->GetPtr() + sizeof(FixedLengthHeader)); - - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << tmp[i]; - //ASSERT_EQ(tmp[i], 10); - }// -} - -TEST(EmbeddingVariableTest, TestBatchCommitofLocklessHashMapCPU) { - KVInterface* hashmap = new 
LocklessHashMapCPU(); - const int EmbeddingSize = 16; - const int BatchSize = 16; - - hashmap->SetTotalDims(EmbeddingSize); - std::vector*> value_ptr_list; - std::vector key_list; - - for(int64 i = 0; i < BatchSize; i++) { - key_list.emplace_back(i); - ValuePtr* ptr_ = new NormalGPUValuePtr(EmbeddingSize); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float initial_data[EmbeddingSize]; - for(int j = 0;j < EmbeddingSize;++j){ - initial_data[j] = i; - //LOG(INFO) << "initial[" << i << "][" << j << "]=" << initial_data[j]; - } - cudaMemcpy(address, initial_data, EmbeddingSize * sizeof(float), cudaMemcpyHostToDevice); - value_ptr_list.emplace_back(ptr_); - }//initialize V on GPU - - timespec start,end; - clock_gettime(CLOCK_MONOTONIC, &start); - hashmap->BatchCommit(key_list, value_ptr_list); - clock_gettime(CLOCK_MONOTONIC, &end); - std::cout << "time: " << ((double)(end.tv_sec - start.tv_sec)*1000000000 + end.tv_nsec - start.tv_nsec)/1000000 << "ms" << std::endl; - - for(int64 i = 0; i < BatchSize; i++) { - ValuePtr* check; - hashmap->Lookup(i,&check); - float* tmp = (float *)((char *)check->GetPtr() + sizeof(FixedLengthHeader)); - for(int j = 0;j < EmbeddingSize;++j){ - LOG(INFO) << "batch[" << i << "][" << j << "]=" << tmp[j]; - //ASSERT_EQ(tmp[j], i); - } - }//compare value after BatchCommit -} -*/ - const int total_size = 1024 * 8; const int th_num = 1; const int malloc_size = total_size / th_num; @@ -1466,17 +1093,11 @@ TEST(EmbeddingVariableTest, TestCPUGPUMalloc) { auto mem_pool = new EmbeddingMemoryPool(gpu_allocator, 256, 1024); float* ptr_1 = mem_pool->Allocate(); float* ptr_2 = mem_pool->Allocate(); - ValuePtr* value_ptr1 = new NormalGPUValuePtr(gpu_allocator, 256); - ValuePtr* value_ptr2 = new NormalGPUValuePtr(gpu_allocator, 256); - value_ptr1->SetPtr(ptr_1); - value_ptr2->SetPtr(ptr_2); - value_ptr1->SetInitialized(0); - value_ptr2->SetInitialized(0); - std::vector*> value_ptrs; - 
value_ptrs.emplace_back(value_ptr1); + std::vector value_ptrs; + value_ptrs.emplace_back(ptr_1); mem_pool->Deallocate(value_ptrs); value_ptrs.clear(); - value_ptrs.emplace_back(value_ptr2); + value_ptrs.emplace_back(ptr_2); mem_pool->Deallocate(value_ptrs); float* ptr_3 = mem_pool->Allocate(); ASSERT_EQ(ptr_1, ptr_3); @@ -1539,16 +1160,16 @@ TEST(EmbeddingVariableTest, TestEVMallocFree) { void SingleCommit(KVInterface* hashmap, std::vector keys, int bias) { - std::vector*> value_ptrs; + std::vector value_ptrs; for (int64 i = 0; i < keys.size(); ++i) { - ValuePtr* tmp = - new NormalContiguousValuePtr(cpu_allocator(), 124); - tmp->SetValue(float(keys[i] + bias), 124); + void* tmp = cpu_allocator()->AllocateRaw(0, 124 * sizeof(float) + 16); + for (int j = 0; j < 124; j++) { + ((float*)tmp)[j] = keys[i] + bias; + } value_ptrs.push_back(tmp); } ASSERT_EQ(keys.size(), value_ptrs.size()); uint64 start = Env::Default()->NowNanos(); - for (int64 i = 0; i < keys.size(); i++) { hashmap->Commit(keys[i], value_ptrs[i]); } @@ -1558,9 +1179,13 @@ void SingleCommit(KVInterface* hashmap, void TestCompaction() { std::string temp_dir = testing::TmpDir(); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, + true, true, {false, 0}); auto hashmap = new SSDHashKV( - temp_dir, cpu_allocator()); - hashmap->SetTotalDims(124); + temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); ASSERT_EQ(hashmap->Size(), 0); std::vector ids; for (int i = 0; i < 262144; i++) { @@ -1576,12 +1201,12 @@ void TestCompaction() { t1.join(); ids.clear(); sleep(1); - ValuePtr* val = nullptr; + void* val = nullptr; for (int i = 131073; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i+3); + ASSERT_EQ(v[j], i+3); } } for (int i = 131073; i < 262144; i++) { @@ -1596,16 +1221,16 @@ void TestCompaction() { 
sleep(1); for (int i = 0; i < 131073; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i + 1); + ASSERT_EQ(v[j], i + 1); } } for (int i = 131073; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i + 2); + ASSERT_EQ(v[j], i + 2); } } delete hashmap; @@ -1622,10 +1247,14 @@ TEST(KVInterfaceTest, TestSSDKVSyncCompaction) { } void TestReadEmbFile() { + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, + true, true, {false, 0}); std::string temp_dir = testing::TmpDir(); auto hashmap = new SSDHashKV( - temp_dir, cpu_allocator()); - hashmap->SetTotalDims(124); + temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); ASSERT_EQ(hashmap->Size(), 0); std::vector ids; for (int i = 0; i < 262145; i++) { @@ -1634,12 +1263,12 @@ void TestReadEmbFile() { SingleCommit(hashmap, ids, 3); sleep(1); ids.clear(); - ValuePtr* val = nullptr; + void* val = nullptr; for (int i = 0; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i+3); + ASSERT_EQ(v[j], i+3); } } delete hashmap; @@ -1666,9 +1295,10 @@ TEST(KVInterfaceTest, TestDirectIoFile) { void InsertKey(EmbeddingVar* variable, int value_size) { float *val = (float *)malloc((value_size+1)*sizeof(float)); for (int64 i = 0; i < 100000000; i++) { - variable->LookupOrCreate(20, val, nullptr); + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(20, &val, &is_filter, false); } - LOG(INFO)<<"Finish Insert"; } void RemoveKey(EmbeddingVar* variable) { @@ -1676,29 +1306,13 @@ void RemoveKey(EmbeddingVar* variable) { sleep(1); variable->storage()->Remove(20); } - LOG(INFO)<<"Remove thread finish"; } 
TEST(EmbeddingVariableTest, TestLookupRemoveConcurrency) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */2, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - emb_config, - cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1); int thread_num = 5; std::vector insert_threads(thread_num); for (size_t i = 0 ; i < thread_num - 1; i++) { @@ -1714,21 +1328,7 @@ TEST(EmbeddingVariableTest, TestInsertAndGetSnapshot) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - emb_config, - cpu_allocator()); - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1); float* set_value = (float*)malloc(value_size * sizeof(float)); //Insertion for (int i = 0; i < 100; i++) { diff --git a/tensorflow/core/kernels/embedding_variable_performance_test.cc 
b/tensorflow/core/kernels/embedding_variable_performance_test.cc index 9b01e35840b..16f4a894858 100644 --- a/tensorflow/core/kernels/embedding_variable_performance_test.cc +++ b/tensorflow/core/kernels/embedding_variable_performance_test.cc @@ -90,14 +90,21 @@ void GenerateSkewInput(int num_of_ids, float skew_factor, void thread_lookup_or_create( EmbeddingVar* ev, const int64* input_batch, + float* default_value, + int default_value_dim, float** outputs, int value_size, int start, int end) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = start; i < end; i++) { ev->LookupOrCreateKey(input_batch[i], &value_ptr, &is_filter, false); - auto val = ev->flat(value_ptr, input_batch[i]); - memcpy(outputs[i], &val(0), sizeof(float) * value_size); + if (is_filter) { + auto val = ev->flat(value_ptr); + memcpy(outputs[i], &val(0), sizeof(float) * value_size); + } else { + int default_value_index = input_batch[i] % default_value_dim; + memcpy(outputs[i], default_value + default_value_index * value_size, sizeof(float) * value_size); + } } } @@ -138,6 +145,8 @@ double PerfLookupOrCreate( for (int i = 0; i < num_thread; i++) { worker_threads[i] = std::thread(thread_lookup_or_create, ev, input_batches[k].data(), + default_value_matrix.data(), + default_value_dim, outputs.data(), value_size, thread_task_range[i].first, thread_task_range[i].second); @@ -201,11 +210,11 @@ void thread_lookup( const int64* input_batch, float** outputs, int value_size, int start, int end) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = start; i < end; i++) { ev->LookupKey(input_batch[i], &value_ptr); - auto val = ev->flat(value_ptr, input_batch[i]); + auto val = ev->flat(value_ptr); memcpy(outputs[i], &val(0), sizeof(float) * value_size); } } @@ -293,7 +302,7 @@ TEST(EmbeddingVariablePerformanceTest, TestLookup) { } } auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim); - 
ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = 0; i < hot_ids_list.size(); i++) { ev->LookupOrCreateKey(hot_ids_list[i], &value_ptr, &is_filter, false); @@ -339,13 +348,13 @@ void PerfSave(Tensor& default_value, value_size, default_value, default_value_dim, 0, steps_to_live, l2_weight_threshold); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; srand((unsigned)time(NULL)); for (int i = 0; i < id_list.size(); i++) { ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); - ev->flat(value_ptr, id_list[i]); + ev->flat(value_ptr); int64 global_step = rand() % 100; ev->UpdateVersion(value_ptr, global_step); } diff --git a/tensorflow/core/kernels/embedding_variable_test.h b/tensorflow/core/kernels/embedding_variable_test.h index d06304fb78a..07c34764fb0 100644 --- a/tensorflow/core/kernels/embedding_variable_test.h +++ b/tensorflow/core/kernels/embedding_variable_test.h @@ -107,35 +107,42 @@ EmbeddingVar* CreateEmbeddingVar( int value_size, Tensor& default_value, int64 default_value_dim, int64 filter_freq = 0, int64 steps_to_live = 0, - float l2_weight_threshold=-1.0) { - std::string layout_type = "light"; - if (filter_freq != 0) { - layout_type = "normal"; - } - - if (steps_to_live != 0) { - if (layout_type == "light") { - layout_type = "normal_contiguous"; - } - } + float l2_weight_threshold=-1.0, + embedding::StorageType storage_type = embedding::StorageType::DRAM, + std::vector storage_size = {1024*1024*1024, + 1024*1024*1024, + 1024*1024*1024, + 1024*1024*1024}, + bool record_freq = false, + int64 max_element_size = 0, + float false_positive_probability = -1.0, + DataType counter_type = DT_UINT64) { auto embedding_config = EmbeddingConfig( - 0, 0, 1, 0, "emb_var", steps_to_live, - filter_freq, 999999, l2_weight_threshold, layout_type, - 0, -1.0, DT_UINT64, default_value_dim, - 0.0, false, false, false); + 0, 0, 1, 0, "emb_var", steps_to_live, + filter_freq, 999999, 
l2_weight_threshold, + max_element_size, false_positive_probability, + counter_type, default_value_dim, + 0.0, record_freq, false, false); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), storage_type, + record_freq, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( - embedding::StorageType::DRAM, "", - {1024, 1024, 1024, 1024}, layout_type, + storage_type, "", + storage_size, embedding_config), cpu_allocator(), + feat_desc, "emb_var"); auto ev = new EmbeddingVar( "emb_var", storage, embedding_config, - cpu_allocator()); + cpu_allocator(), + feat_desc); ev->Init(default_value, default_value_dim); return ev; } diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc index 55dd40176a8..2f07e2ef537 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc @@ -774,7 +774,7 @@ class GroupEmbeddingVariableForWardOpTest : public OpsTestBase { embedding_var->Init(value, 1); for (int64 j = 0; j < nnz; ++j) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); typename TTypes::Flat vflat = embedding_var->flat(value_ptr); @@ -958,7 +958,7 @@ class GroupEmbeddingVariableBackWardOpTest : public OpsTestBase { embedding_var->Init(value, 1); for (int64 j = 0; j < nnz; ++j) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); typename TTypes::Flat vflat = embedding_var->flat(value_ptr); diff --git a/tensorflow/core/kernels/incr_save_restore_ops.h b/tensorflow/core/kernels/incr_save_restore_ops.h index 0582697ad16..d84838ae413 100644 --- 
a/tensorflow/core/kernels/incr_save_restore_ops.h +++ b/tensorflow/core/kernels/incr_save_restore_ops.h @@ -225,9 +225,9 @@ class IncrEVValueDumpIterator : public DumpIterator { keys_idx_++; col_idx_ = 0; } - ValuePtr* value_ptr = NULL; + void* value_ptr = NULL; TF_CHECK_OK(emb_var_->LookupOrCreateKey(*keys_iter_, &value_ptr)); - return emb_var_->flat(value_ptr, *keys_iter_)(col_idx_++); + return emb_var_->flat(value_ptr)(col_idx_++); } private: diff --git a/tensorflow/core/kernels/kv_variable_lookup_ops.cc b/tensorflow/core/kernels/kv_variable_lookup_ops.cc index c69aec8ebb9..7e40dfff7ac 100644 --- a/tensorflow/core/kernels/kv_variable_lookup_ops.cc +++ b/tensorflow/core/kernels/kv_variable_lookup_ops.cc @@ -121,7 +121,7 @@ class KvResourceLookupIDOp : public OpKernel { const int64 indices_size = static_cast(indices_flat.dimension(0)); EmbeddingVarContext ev_ctx(c); ev->GetOrCreateKey(ev_ctx, indices, - reinterpret_cast**>(out_base), + reinterpret_cast(out_base), indices_size); } } @@ -203,7 +203,7 @@ class KvResourceCollectEmbeddingOp : public OpKernel { const size_t slice_bytes = slice_elems * sizeof(TValue); EmbeddingVarContext ev_ctx(c); ev->GatherEmbeddings(ev_ctx, indices, - (ValuePtr**)pointer.data(), + (void**)pointer.data(), out_base, N); } } diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc index 8a01a7bf2cd..5cd0ef140bd 100644 --- a/tensorflow/core/kernels/kv_variable_ops.cc +++ b/tensorflow/core/kernels/kv_variable_ops.cc @@ -214,16 +214,16 @@ class InitializeKvVariableOp : public OpKernel { int64 storage_type = 0; OP_REQUIRES_OK(c, c->GetAttr("storage_type", &storage_type)); storage_type_ = static_cast(storage_type); - auto device_type_str = c->device_type().type_string(); + device_type_str_ = c->device_type().type_string(); if (storage_type_ == embedding::DEFAULT) { - if (device_type_str == "CPU") { + if (device_type_str_ == "CPU") { storage_type_ = embedding::DRAM; } else { storage_type_ = 
embedding::HBM; } } - bool if_op_on_gpu = (device_type_str == "GPU"); + bool if_op_on_gpu = (device_type_str_ == "GPU"); bool if_embedding_on_hbm = (storage_type_ == embedding::HBM || storage_type_ == embedding::HBM_DRAM || storage_type_ == embedding::HBM_DRAM_SSDHASH); @@ -238,57 +238,14 @@ class InitializeKvVariableOp : public OpKernel { filter_freq_ = 0; } - OP_REQUIRES_OK(c, c->GetAttr("layout", &layout_)); - if (!layout_.empty()) { - // use layout by user configuration - } else if ((filter_freq_ != 0 && max_element_size_ == 0) - || steps_to_live_ != 0 || record_freq_ - || record_version_ || storage_type > 5) { - if (block_num_ > 1 || (filter_freq_ != 0 && storage_type <= 5)) { - layout_ = "normal"; - } else { - if (storage_type == embedding::HBM_DRAM || - storage_type == embedding::HBM_DRAM_SSDHASH) { - layout_ = "normal_contiguous_gpu"; - } else { - layout_ = "normal_contiguous"; - } - } - } else { - layout_ = "light"; - } - - CHECK(block_num_ == 1 || layout_ != "normal_contiguous"); - - if ("compact" == layout_) { - OP_REQUIRES(c, shape_.dim_size(0) == 1 && - storage_type_ == embedding::StorageType::DRAM, - errors::InvalidArgument("embedding_dim must be 1 and storage type" - " should be DRAM when layout is 'compact'.")); - } + record_freq_ |= (storage_type > 5); + record_version_ |= (storage_type > 5); OP_REQUIRES(c, steps_to_live_ >= 0, errors::InvalidArgument( "steps_to_live must >= 0, ", std::to_string(steps_to_live_))); OP_REQUIRES_OK(c, c->GetAttr("ht_type", &ht_type_)); - if (embedding::StorageType::LEVELDB == storage_type_) { - ht_type_ = "leveldb_kv"; - if (layout_ != "normal_contiguous") - LOG(WARNING) - << "layout must be NORAML_CONTIGUOUS when storage type is LEVELDB"; - layout_ = "normal_contiguous"; - } - - if (embedding::StorageType::PMEM_LIBPMEM == storage_type_ || - embedding::StorageType::PMEM_MEMKIND == storage_type_){ - if (layout_ != "normal_contiguous"){ - LOG(WARNING) - << "layout must be NORAML_CONTIGUOUS" - << " when storage type is 
PMEM_LIBPMEM or PMEM_MEMKIND"; - } - layout_ = "normal_contiguous"; - } OP_REQUIRES_OK(c, c->GetAttr("ht_partition_num", &ht_partition_num_)); } @@ -314,35 +271,43 @@ class InitializeKvVariableOp : public OpKernel { context, handle_self, &ev, [this, default_values, opname, context, handle_self](EmbeddingVar** ptr) { - Allocator* gpu_allocator = + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); auto embedding_config = EmbeddingConfig( emb_index_ + block_num_ * slot_index_, emb_index_, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, + l2_weight_threshold_, max_element_size_, false_positive_probability_, counter_type_, default_value_dim_, default_value_no_permission_, record_freq_, record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - gpu_allocator, + alloc_for_ev, + feat_desc, handle_self.name()); *ptr = new EmbeddingVar( handle_self.name(), storage, embedding_config, - gpu_allocator); - return Status::OK(); - })); - ev->Init(default_values, default_value_dim_); + alloc_for_ev, + feat_desc); + return (*ptr)->Init(default_values, default_value_dim_); + })); } else { EmbeddingVar* primary_variable = nullptr; OP_REQUIRES_OK( @@ -352,30 +317,38 @@ class InitializeKvVariableOp : public OpKernel { [this, default_values, opname, handle_primary, context](EmbeddingVar** ptr) { int64 primary_slot_index(0), primary_emb_index(0); - Allocator* gpu_allocator = context->device()->GetAllocator(AllocatorAttributes()); - //Allocator* 
gpu_allocator = context->get_allocator(AllocatorAttributes()); + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); auto embedding_config = EmbeddingConfig( primary_emb_index + block_num_ * primary_slot_index, primary_emb_index, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, + l2_weight_threshold_, max_element_size_, false_positive_probability_, counter_type_, 0, record_freq_, record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - gpu_allocator, + alloc_for_ev, + feat_desc, handle_primary.name()); *ptr = new EmbeddingVar( handle_primary.name(), storage, embedding_config, - gpu_allocator); + alloc_for_ev, + feat_desc); // default_values is slot value, should not to initialize primary value return Status::OK(); })); @@ -386,20 +359,26 @@ class InitializeKvVariableOp : public OpKernel { context, handle_self, &ev, [this, default_values, opname, primary_variable, handle_self, context](EmbeddingVar** ptr) { + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, + emb_index_, + block_num_, slot_num_, opname, + steps_to_live_, filter_freq_, + max_freq_, l2_weight_threshold_, + max_element_size_, + false_positive_probability_, + counter_type_, default_value_dim_, + default_value_no_permission_, + record_freq_, record_version_, + is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; *ptr = new EmbeddingVar(handle_self.name(), primary_variable->storage(), - EmbeddingConfig(emb_index_ + block_num_ * slot_index_, - emb_index_, - block_num_, slot_num_, opname, - steps_to_live_, filter_freq_, - max_freq_, l2_weight_threshold_, - layout_, max_element_size_, - false_positive_probability_, - counter_type_, default_value_dim_, - default_value_no_permission_, - record_freq_, record_version_, - is_inference_), - primary_variable->GetAllocator()); + embedding_config, + alloc_for_ev, + primary_variable->feature_descriptor()); return (*ptr)->Init(default_values, default_value_dim_); })); core::ScopedUnref unref_me(primary_variable); @@ -424,7 +403,6 @@ class InitializeKvVariableOp : public OpKernel { int64 filter_freq_; int64 max_freq_; float l2_weight_threshold_; - std::string layout_; int64 max_element_size_; float false_positive_probability_; embedding::StorageType storage_type_; @@ -436,6 +414,7 @@ class InitializeKvVariableOp : public OpKernel { bool record_version_; bool is_inference_; bool is_set_initialized_; + std::string device_type_str_; }; #define REGISTER_KERNELS(ktype, vtype) \ diff --git a/tensorflow/core/kernels/kv_variable_ops.h b/tensorflow/core/kernels/kv_variable_ops.h index 8e3572443ba..3202e6d12bf 100644 --- a/tensorflow/core/kernels/kv_variable_ops.h +++ b/tensorflow/core/kernels/kv_variable_ops.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/embedding/cache_factory.h" #include "tensorflow/core/framework/embedding/embedding_var.h" #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index 23a504eea5d..3b10c2521b9 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -120,20 +120,6 @@ class KvResourceImportV2Op: public AsyncOpKernel { OP_REQUIRES_OK(c, c->GetAttr("record_version", &record_version_)); OP_REQUIRES_OK(c, c->GetAttr("reset_version", &reset_version_)); - if ((filter_freq_ != 0 && max_element_size_ == 0) - || steps_to_live_ != -1 || record_freq_ - || record_version_ || storage_type > 5) { - if (block_num_ > 1 || (filter_freq_ != 0 && storage_type <= 5)) { - layout_ = "normal"; - } else { - layout_ = "normal_contiguous"; - } - } else { - layout_ = "light"; - } - - CHECK(block_num_ == 1 || layout_ != "normal_contiguous"); - TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_EV_ASYNC_RESTORE", true, &ev_async_restore_)); } @@ -170,24 +156,33 @@ class KvResourceImportV2Op: public AsyncOpKernel { block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, l2_weight_threshold_, - layout_, max_element_size_, + max_element_size_, false_positive_probability_, counter_type_, default_value_dim_, default_value_no_permission_, record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - allocator, + alloc_for_ev, + feat_desc, handle_self.name()); *ptr = new EmbeddingVar( handle_self.name(), storage, embedding_config, - allocator); + alloc_for_ev, + feat_desc); return Status::OK(); })); ev->Init(default_values, default_value_dim_); @@ -207,19 +202,27 @@ class KvResourceImportV2Op: public AsyncOpKernel { primary_emb_index, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, l2_weight_threshold_, - layout_, max_element_size_, + max_element_size_, false_positive_probability_, counter_type_, 0, record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - allocator, + alloc_for_ev, + feat_desc, handle_primary.name()); *ptr = new EmbeddingVar(handle_primary.name(), - storage, embedding_config, allocator); + storage, embedding_config, alloc_for_ev, feat_desc); // default_values is slot value, should not to initialize primary value return Status::OK(); })); @@ -232,17 +235,22 @@ class KvResourceImportV2Op: public AsyncOpKernel { handle_self, context](EmbeddingVar** ptr) { Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, + emb_index_, block_num_, slot_num_, opname, + steps_to_live_, filter_freq_, max_freq_, + l2_weight_threshold_, max_element_size_, + false_positive_probability_, + counter_type_, default_value_dim_, + default_value_no_permission_, + record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; *ptr = new EmbeddingVar(handle_self.name(), primary_variable->storage(), - EmbeddingConfig(emb_index_ + block_num_ * slot_index_, - emb_index_, block_num_, slot_num_, opname, - steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, max_element_size_, - false_positive_probability_, - counter_type_, default_value_dim_, - default_value_no_permission_, - record_freq_, record_version_), - allocator); + embedding_config, + alloc_for_ev, + primary_variable->feature_descriptor()); return (*ptr)->Init(default_values, default_value_dim_); })); core::ScopedUnref unref_me(primary_variable); @@ -290,7 +298,6 @@ class KvResourceImportV2Op: public AsyncOpKernel { int64 slot_num_; int64 filter_freq_; float l2_weight_threshold_; - std::string layout_; int64 max_freq_; embedding::StorageType storage_type_; std::string storage_path_; @@ -301,6 +308,7 @@ class KvResourceImportV2Op: public AsyncOpKernel { bool record_version_; bool reset_version_; bool ev_async_restore_; + std::string device_type_str_; }; #define REGISTER_KERNELS(dev, ktype, vtype) \ diff --git a/tensorflow/core/kernels/save_restore_tensor.h b/tensorflow/core/kernels/save_restore_tensor.h index 4f69ebe3fb5..da58e17e1bb 100644 --- a/tensorflow/core/kernels/save_restore_tensor.h +++ b/tensorflow/core/kernels/save_restore_tensor.h @@ -23,7 +23,6 @@ limitations under the License. 
#include "tensorflow/core/framework/hash_table/hash_table.h" #include "tensorflow/core/framework/hash_table/bloom_filter_strategy.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/training_ali_op_helpers.h b/tensorflow/core/kernels/training_ali_op_helpers.h index e013a6a2bae..12948de24a4 100644 --- a/tensorflow/core/kernels/training_ali_op_helpers.h +++ b/tensorflow/core/kernels/training_ali_op_helpers.h @@ -121,55 +121,54 @@ EmbeddingVariableInputLockHolder MaybeLockEmbeddingVariableInputMutexesInO template void LookupKeyAndSetVersion( OpKernelContext* ctx, EmbeddingVar* var, - ValuePtr** value_ptrs, Tstep gs, const K* indices, + void** value_ptrs, Tstep gs, const K* indices, int64 task_size, bool indices_as_pointer, int counts_index) { + EmbeddingVarContext ev_ctx(ctx); int64* indices_counts = nullptr; std::function get_count_fn = 0; if (counts_index != -1) { const Tensor& counts_tensor = ctx->input(counts_index); indices_counts = (int64*)counts_tensor.data(); - get_count_fn = [](int64* counts, int64 index) { - return counts[index];}; - } else { - get_count_fn = [](int64* counts, int64 index) {return 1;}; } + var->LookupOrCreateKey(ev_ctx, indices, value_ptrs, + task_size, indices_counts, + indices_as_pointer); - auto lookup_key_and_set_version_fn = [var, value_ptrs, gs, - indices, indices_as_pointer, - indices_counts, get_count_fn] (int64 start, int64 limit) { - ValuePtr* value_ptr = nullptr; + auto update_version_fn = [var, value_ptrs, gs] + (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - bool is_filter = false; - int64 count = get_count_fn(indices_counts, i); - var->LookupOrCreateKey(indices[i], &value_ptr, - &is_filter, indices_as_pointer, count); - value_ptrs[i] = value_ptr; - var->UpdateVersion(value_ptr, gs); + var->UpdateVersion(value_ptrs[i], gs); } }; const int64 unit_cost = 1000; //very unreliable 
estimate for cost per step. auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); Shard(worker_threads->num_threads, worker_threads->workers, task_size, unit_cost, - lookup_key_and_set_version_fn); + update_version_fn); } template -void LookupOrCreateEmbedding( +void LookupEmbedding( OpKernelContext* ctx, std::vector*, V**>>& vars, - ValuePtr** value_ptrs, + void** value_ptrs, const K* indices, - int64 num_of_keys, - IntraThreadCopyIdAllocator* thread_copy_id_alloc) { + int64 num_of_keys) { for (auto it: vars) { EmbeddingVar* var = it.first; V** var_ptr = it.second; - EmbeddingVarContext ev_ctx(ctx); - var->BatchLookupOrCreateEmb( - ev_ctx, var_ptr, value_ptrs, - indices, num_of_keys, thread_copy_id_alloc); + auto lookup_emb_fn = [var, var_ptr, value_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + var_ptr[i] = var->GetValuePtr(value_ptrs[i]); + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + lookup_emb_fn); } } @@ -180,12 +179,12 @@ void GetEmbeddingPointers( const K* indices, Tstep gs, bool indices_as_pointer, int counts_index, int64 num_of_keys, IntraThreadCopyIdAllocator* thread_copy_id_alloc) { - std::vector*> value_ptrs(num_of_keys); + std::vector value_ptrs(num_of_keys); LookupKeyAndSetVersion(ctx, vars[0].first, value_ptrs.data(), gs, indices, num_of_keys, indices_as_pointer, counts_index); - LookupOrCreateEmbedding(ctx, vars, value_ptrs.data(), - indices, num_of_keys, thread_copy_id_alloc); + LookupEmbedding(ctx, vars, value_ptrs.data(), + indices, num_of_keys); } } // end namespace tensorflow diff --git a/tensorflow/core/kernels/training_ali_ops.cc b/tensorflow/core/kernels/training_ali_ops.cc index 839ce82feef..546b30e29dd 100644 --- a/tensorflow/core/kernels/training_ali_ops.cc +++ 
b/tensorflow/core/kernels/training_ali_ops.cc @@ -141,16 +141,16 @@ class KvSparseApplyAdagradOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const TKey index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto a = accum->flat(value_ptr, index); + auto a = accum->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); a += g.square(); v -= g.constant(lr_scalar) * g * a.rsqrt(); } @@ -542,15 +542,15 @@ class KvSparseApplyFtrlOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const TKey index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var_->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); if (is_filter) { - auto var = var_->flat(value_ptr, index); - auto accum = accum_->flat(value_ptr, index); - auto linear = linear_->flat(value_ptr, index); + auto var = var_->flat(value_ptr); + auto accum = accum_->flat(value_ptr); + auto linear = linear_->flat(value_ptr); auto grad = grad_flat.template chip<0>(i); // Use a macro to implement the computation here due to the templating of the @@ -1301,19 +1301,19 @@ class KvSparseApplyAdagradDecayOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, 
indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto a = accum->flat(value_ptr, index); + auto a = accum->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); - auto accum_decay_power = accum_decay_power_var->flat(value_ptr, index); + auto v = var->flat(value_ptr); + auto accum_decay_power = accum_decay_power_var->flat(value_ptr); if (gs / decay_step_scalar > accum_decay_power(0)) { a *= a.constant(decay_rate_scalar); @@ -1505,19 +1505,18 @@ class KvSparseApplyAdamOp : public OpKernel { auto indices_vec = indices.vec(); int64 gs = global_step.scalar()(); - for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter =false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto var_i = var->flat(value_ptr, index); - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); m_a += (g - m_a) * (static_cast(1) - beta1_scalar); @@ -2412,15 +2411,15 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { Tstep gs = global_step.scalar()(); for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto v_ = v->flat(value_ptr, index); - auto m_ = m->flat(value_ptr, index); + auto v_ = v->flat(value_ptr); + auto m_ = m->flat(value_ptr); auto grad_ = grad_flat.template chip<0>(i); v_ = 
v_ * v_.constant(beta2_scalar) + @@ -2429,7 +2428,7 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { (v_ + v_.constant(epsilon_scalar)).rsqrt() * v_.constant(lr_scalar) * grad_; - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); v -= m_; } } @@ -2461,17 +2460,17 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto var_i = var->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); m_a = m_a * beta1_scalar + g * (static_cast(1) - beta1_scalar); v_a = v_a * beta2_scalar + g.square() * (static_cast(1) - beta2_scalar); @@ -2939,7 +2938,7 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, @@ -2947,7 +2946,7 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { var->UpdateVersion(value_ptr, gs); if (is_filter) { auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); v -= g.constant(lr_scalar) * g; } } @@ -3136,16 +3135,16 @@ class KvSparseApplyAdamWOp : public OpKernel { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* 
value_ptr = nullptr; bool is_filter =false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto var_i = var->flat(value_ptr, index); - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); // m_a = beta1 * m + (1 - beta1) * g m_a += (g - m_a) * (static_cast(1) - beta1_scalar); diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 2a56634206c..e89b095aff1 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -6132,6 +6132,8 @@ class GraphKeys(object): TRAINABLE_VARIABLES = "trainable_variables" # Indicate EmbeddingVariable in CollectionDef EMBEDDING_VARIABLES = "embedding_variables" + # Collection for dependencies of EmbeddingVariable's restore op + EMBEDDING_VARIABLE_RESTORE_DEPENDENCY = "embedding_variable_restore_dependency" # Key to collect summaries. SUMMARIES = "summaries" # Key to collect QueueRunners. 
diff --git a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py index 240938e8675..d47d94d0d99 100644 --- a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py @@ -47,69 +47,6 @@ class EmbeddingVariableGpuTest(test_util.TensorFlowTestCase): - def testDynamicDimensionEmbeddingVariable(self): - print("testDynamicDimensionEmbeddingVariable") - with ops.device('/gpu:0'): - def runTestAdagrad(self, var, g): - if isinstance(var, kv_variable_ops.EmbeddingVariable): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - else: - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64), blocknums=[2,2,2,2,2,2]) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad.AdagradOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session(graph=g) as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.device('/gpu:0'), ops.Graph().as_default() as g: - emb_var = variable_scope.get_embedding_variable("var_1", - initializer=init_ops.ones_initializer(dtypes.float32), - embedding_dim = 8, - ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM)), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - emb1 = runTestAdagrad(self, emb_var, g) - with 
ops.device('/gpu:0'), ops.Graph().as_default() as g: - var = variable_scope.get_dynamic_dimension_embedding_variable("var_dist", - embedding_block_dimension=4, - embedding_block_num=2, - storage_type=config_pb2.StorageType.HBM, - initializer=init_ops.ones_initializer(dtypes.float32)) - emb2 = runTestAdagrad(self, var, g) - for i in range(0, 6): - for j in range(0, 8): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - def testDynamicEmbeddingVariableForInitFromProto(self): - print("testDynamicEmbeddingVariableForInitFromProto") - with ops.device('/gpu:0'): - embedding = variable_scope.get_dynamic_dimension_embedding_variable("var_dist", - embedding_block_dimension=4, - embedding_block_num=2, - storage_type=config_pb2.StorageType.HBM, - initializer=init_ops.ones_initializer(dtypes.float32)) - emb = embedding_ops.embedding_lookup(embedding, math_ops.cast([0,1,2,5,6,7], dtypes.int64), blocknums=[2,2,2,2,2,2]) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - graph = ops.get_default_graph() - meta_graph_def = saver_module.export_meta_graph() - ops.reset_default_graph() - with self.test_session() as sess: - res = saver_module.import_meta_graph(meta_graph_def) - def testEmbeddingVariableForInitFromProto(self): print("testEmbeddingVariableForInitFromProto") with ops.device('/gpu:0'): @@ -235,43 +172,6 @@ def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): print(sess.run([emb, train_op,loss])) print(sess.run([emb, train_op,loss])) - def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): - print("testEmbeddingVariableForFeatureFilterFromContribFeatureColumn") - columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, - ev_option = 
variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) - with ops.device("/gpu:0"): - W = feature_column.embedding_column(sparse_id_column=columns, - dimension=3, - initializer=init_ops.ones_initializer(dtypes.float32)) - ids={} - ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) - emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) - - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - - opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - emb1, top, l = sess.run([emb, train_op, loss]) - for val1 in emb1.tolist(): - for val in val1: - self.assertEqual(val, .0) - emb1, top, l = sess.run([emb, train_op, loss]) - for index, val1 in enumerate(emb1.tolist()): - if index < 7: - for val in val1: - self.assertNotEqual(val, 1.0) - else: - for val in val1: - self.assertEqual(val, .0) - def testEmbeddingVariableForSparseColumnEmbeddingCol(self): columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM))) @@ -870,6 +770,66 @@ def testSaveV3(self): result = sess.run([emb1]) print(result) + def testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm(self): + print("testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm") + checkpoint_directory = self.get_temp_dir() + 
with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + fun = math_ops.multiply(emb, 1.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver(sharded=True) + init = variables.global_variables_initializer() + graph = ops.get_default_graph() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + sess.run(train_op) + emb_ori = sess.run(emb) + save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + + with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + fun = math_ops.multiply(emb, 1.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + emb_val = sess.run(emb) + self.assertAllEqual(emb_ori, emb_val) + 
save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + if "Adagrad-values" in name: + value = checkpoint_utils.load_variable(checkpoint_directory, name) + for i in range(0, shape[0]): + for j in range(0, shape[1]): + self.assertAlmostEqual(1.1, value[i][j]) + def testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm(self): print("testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm") checkpoint_directory = self.get_temp_dir() @@ -894,8 +854,8 @@ def testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm(self): emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 0.0, name='multiply') - fun1 = math_ops.multiply(emb2, 0.0, name='multiply_1') + fun = math_ops.multiply(emb, 1.0, name='multiply') + fun1 = math_ops.multiply(emb2, 1.0, name='multiply_1') loss = math_ops.reduce_sum(fun + fun1, name='reduce_sum') gs = training_util.get_or_create_global_step() opt = adagrad.AdagradOptimizer(0.1) diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index c6cdf951a1e..81b315e2e43 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -120,7 +120,7 @@ def _CounterFilterTestTemplate(self, optimizer): initializer=init_ops.ones_initializer(dtypes.float32), ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) - emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1], dtypes.int64)) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64)) fun = math_ops.multiply(emb, 2.0, name='multiply') loss = math_ops.reduce_sum(fun, name='reduce_sum') gs = training_util.get_or_create_global_step() @@ 
-133,11 +133,18 @@ def _CounterFilterTestTemplate(self, optimizer): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertEqual(val, .0) + + for val1 in emb1.tolist(): + for val in val1: + self.assertEqual(val, .0) emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertNotEqual(val, 1.0) + for index, val1 in enumerate(emb1.tolist()): + if index < 7: + for val in val1: + self.assertNotEqual(val, 1.0) + else: + for val in val1: + self.assertEqual(val, .0) def _RecordFreqTestTemplate(self, optimizer): checkpoint_directory = self.get_temp_dir() @@ -720,20 +727,11 @@ def testEmbeddingVariableForL2FeatureEviction(self): sess.run([init]) emb_ori = sess.run([emb, train_op]) save_path = saver.save(sess, os.path.join(checkpoint_directory, "model1.ckpt"), global_step=12345) - #for name, shape in checkpoint_utils.list_variables(checkpoint_directory): - # print('loading... 
', name, shape) - with self.test_session() as sess: - saver.restore(sess, os.path.join(checkpoint_directory, "model1.ckpt-12345")) - emb_right = [[0.8282884, 0.8282884, 0.8282884], - [0.8282884, 0.8282884, 0.8282884], - [0.8282884, 0.8282884, 0.8282884], - [0.7927219, 0.7927219, 0.7927219], - [0.7927219, 0.7927219, 0.7927219], - [1.0, 1.0, 1.0]] - emb_ori = sess.run(emb) - for i in range(6): - for j in range(3): - self.assertAlmostEqual(emb_ori[i][j], emb_right[i][j]) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + if name == "var_1-keys": + self.assertEqual(shape[0], 2) + keys = checkpoint_utils.load_variable(checkpoint_directory, name) + self.assertAllEqual(keys, [0, 1]) def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): columns_list=[] @@ -764,14 +762,15 @@ def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): print("testEmbeddingVariableForFeatureFilterFromContribFeatureColumn") - columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) - W = feature_column.embedding_column(sparse_id_column=columns, - dimension=3, - initializer=init_ops.ones_initializer(dtypes.float32)) - ids={} - ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) - emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) + with ops.device("/cpu:0"): + columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) + W = feature_column.embedding_column(sparse_id_column=columns, + dimension=3, + 
initializer=init_ops.ones_initializer(dtypes.float32)) + ids={} + ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) + emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) fun = math_ops.multiply(emb, 2.0, name='multiply') loss = math_ops.reduce_sum(fun, name='reduce_sum') @@ -786,6 +785,7 @@ def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1, top, l = sess.run([emb, train_op, loss]) + for val1 in emb1.tolist(): for val in val1: self.assertEqual(val, .0) @@ -1328,66 +1328,6 @@ def testEmbeddingVariableForHTPartitionNum(self): print(sess.run([emb, train_op,loss])) print(sess.run([emb, train_op,loss])) - def testEmbeddingVariableForLayout(self): - print("testEmbeddingVariableForLayout") - def runTestAdagrad(self, var, g): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad.AdagradOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session(graph=g) as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - 
initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagrad(self, emb_var, g) - emb2 = runTestAdagrad(self, var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1), - steps_to_live=5) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagrad(self, emb_var, g) - emb2 = runTestAdagrad(self, var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1), - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=5))) - emb1 = runTestAdagrad(self, emb_var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], .0) - def testEVInitializerWithKeyFetch(self): print("testEVInitializerWithKeyFetch") with ops.Graph().as_default() as g, ops.device('/cpu:0'): @@ -2391,7 +2331,7 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): "model1.ckpt") with self.test_session() as sess: sess.run([init]) - sess.run([emb, train_op]) + sess.run([train_op]) save_path = saver.save(sess, model_path) for name, shape in checkpoint_utils.list_variables(model_path): if name == 
"var_1-keys": @@ -2403,6 +2343,37 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): name == "var_1-freqs_filtered": self.assertEqual(0, shape[0]) del os.environ["TF_EV_SAVE_FILTERED_FEATURES"] + + def testEmbeddingVariableForSaveUnfilterFeature(self): + checkpoint_directory = self.get_temp_dir() + with ops.device("/cpu:0"): + emb_var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) + emb = embedding_ops.embedding_lookup(emb_var, math_ops.cast([1, 1, 1, 2, 2, 3], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + model_path = os.path.join(checkpoint_directory, + "model1.ckpt") + with self.test_session() as sess: + sess.run([init]) + sess.run([train_op]) + save_path = saver.save(sess, model_path) + for name, shape in checkpoint_utils.list_variables(model_path): + if name == "var_1-keys": + keys = checkpoint_utils.load_variable(model_path, name) + self.assertEqual(1, len(keys)) + self.assertEqual(1, keys[0]) + if name == "var_1-keys_filtered" or \ + name == "var_1-freqs_filtered": + self.assertEqual(2, shape[0]) def testEmbeddingVariableForMultiTierInference(self): print("testEmbeddingVariableForMultiTierInference") @@ -2716,7 +2687,55 @@ def testCPUFbjOpt(self): def testCPUFbjOptWithCounterFilter(self): print("testCPUFbjOpt") os.environ["TF_EMBEDDING_FBJ_OPT"] = "True" - self._CounterFilterTestTemplate("Adagrad") + with ops.device("/cpu:0"): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + 
initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = self._CreateOptimizer("Adagrad") + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + emb1, top, l = sess.run([emb, train_op, loss]) + emb_list = emb1.tolist() + emb_right = [[.0, .0, .0], + [.0, .0, .0], + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + [.0, .0, .0], + [.0, .0, .0], + [1.0, 1.0, 1.0], + [.0, .0, .0], + [.0, .0, .0], + [.0, .0, .0]] + + for i in range(6): + for j in range(3): + self.assertAlmostEqual(emb_list[i][j], emb_right[i][j]) + + emb1= sess.run(emb) + emb_right = [[0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90122706, 0.90122706, 0.90122706], + [0.90122706, 0.90122706, 0.90122706], + [0.90122706, 0.90122706, 0.90122706], + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + [.0, .0, .0]] + for i in range(6): + for j in range(3): + self.assertAlmostEqual(emb1[i][j], emb_right[i][j]) del os.environ["TF_EMBEDDING_FBJ_OPT"] def testCPUFbjOptWithBloomFilter(self): diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 96329ca345b..1ef9550ef6d 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -373,6 +373,8 @@ def 
_init_from_args(self, self._slot_num = 0 else: self._slot_num = evconfig.slot_num + if self._is_primary: + self._import_dependency_ops = [] with ops.name_scope("IsInitialized"): self._is_initialized_op = ( gen_kv_variable_ops.kv_var_is_initialized_op(self._handle, @@ -488,6 +490,7 @@ def create_init_op_for_restore(self, name, initial_value, invalid_key, rank): set_attr_ops.append(set_cache_op) with ops.control_dependencies(set_attr_ops + [self._initializer_for_restore]): self._init_op_for_restore = control_flow_ops.no_op() + self.collect_restore_denpendencies() def need_counts(self): return (self._record_freq or (self._filter_freq > 0) or self._is_multi_tier) @@ -612,8 +615,19 @@ def _init_from_proto(self, variable_def, import_scope=None): else: self._is_primary = False + self.collect_restore_denpendencies() # LINT.ThenChange(//tensorflow/python/eager/graph_callable.py) + def collect_restore_denpendencies(self): + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY) + if len(restore_dependency) == 0: + ops.add_to_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY, {}) + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY) + dependency_dict = restore_dependency[0] + if not dependency_dict.__contains__(self._primary_handle): + dependency_dict[self._primary_handle] = [] + dependency_dict[self._primary_handle].append(self._init_op_for_restore) + def set_init_data_source_initializer(self, init_data_source): import pkgutil try: diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py index 0d8bfe87022..650b1a5e272 100644 --- a/tensorflow/python/training/saving/saveable_object_util.py +++ b/tensorflow/python/training/saving/saveable_object_util.py @@ -195,7 +195,8 @@ def restore(self, restored_tensors, unused_restored_shapes): if self.var._init_data_source is not None: return 
self.var.recover_from_init_data_source(self.var._init_data_source, self.partition_id, self.partition_num) else: - with ops.control_dependencies([self.var._init_op_for_restore]): + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY)[0] + with ops.control_dependencies(restore_dependency[self.var._primary_handle]): rank = self.op.initial_value.get_shape().rank - 1 restore_op = gen_kv_variable_ops.kv_resource_import_v3( restored_tensors[0],