Skip to content

Commit

Permalink
cpu: x64: improve tbb bnorm realtime inference performance
Browse files Browse the repository at this point in the history
Try to generate at least nthr tasks in tbb bnorm for realtime inference
scenarios.
  • Loading branch information
kwiersch authored and tprimak committed Oct 21, 2022
1 parent d7a781e commit cd953e4
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion src/cpu/x64/jit_uni_tbb_batch_normalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2168,7 +2168,11 @@ struct driver_t : public c_compatible {

dim_t total_size = size_src_dst + size_stats_ss_tensors;

dim_t n_chunks = total_size / platform::get_per_core_cache_size(2);
// Try to create at least nthr_ chunks for realtime inference
const int n_chunks_min = nthr_ <= 4 ? nstl::min(4, nthr_) : 1;
const size_t l2_per_core = platform::get_per_core_cache_size(2);
dim_t n_chunks
= nstl::max<dim_t>(n_chunks_min, total_size / l2_per_core);

// we prioritize parallelization on N, then S, and finally C
nthr.N = utils::saturate<dim_t>(1, N_, n_chunks);
Expand Down

0 comments on commit cd953e4

Please sign in to comment.